
[Xen-changelog] [xen-unstable] merge



# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxx
# Node ID 01345b08d0122830f17624bf8d19b0ba48744de0
# Parent  f2151423f729a49281c719308be4cd76f6c934c4
# Parent  e66352312acb9a8beacb7a8faae9a68442e9fb31
merge
---
 xen/arch/x86/audit.c                     |  984 ------
 xen/arch/x86/shadow.c                    | 4150 ----------------------------
 xen/arch/x86/shadow32.c                  | 3782 --------------------------
 xen/arch/x86/shadow_guest32.c            |   16 
 xen/arch/x86/shadow_guest32pae.c         |   16 
 xen/arch/x86/shadow_public.c             | 2143 --------------
 xen/include/asm-x86/shadow_64.h          |  587 ----
 xen/include/asm-x86/shadow_ops.h         |  138 
 xen/include/asm-x86/shadow_public.h      |   61 
 .hgtags                                  |   10 
 tools/examples/xmexample.hvm             |    4 
 tools/libxc/xc_domain.c                  |   13 
 tools/libxc/xc_hvm_build.c               |   13 
 tools/libxc/xc_linux_build.c             |    2 
 tools/libxc/xc_linux_save.c              |   18 
 tools/libxc/xenctrl.h                    |    2 
 tools/misc/xc_shadow.c                   |    2 
 tools/python/xen/lowlevel/xc/xc.c        |   69 
 tools/python/xen/xend/XendDomain.py      |   24 
 tools/python/xen/xend/XendDomainInfo.py  |   47 
 tools/python/xen/xend/image.py           |   17 
 tools/python/xen/xm/create.py            |    9 
 xen/arch/x86/Makefile                    |   16 
 xen/arch/x86/dom0_ops.c                  |    2 
 xen/arch/x86/domain.c                    |  106 
 xen/arch/x86/domain_build.c              |   13 
 xen/arch/x86/hvm/hvm.c                   |   25 
 xen/arch/x86/hvm/platform.c              |    9 
 xen/arch/x86/hvm/svm/svm.c               |  265 -
 xen/arch/x86/hvm/svm/vmcb.c              |    4 
 xen/arch/x86/hvm/vlapic.c                |    3 
 xen/arch/x86/hvm/vmx/vmcs.c              |   15 
 xen/arch/x86/hvm/vmx/vmx.c               |  228 -
 xen/arch/x86/mm.c                        |  485 +--
 xen/arch/x86/setup.c                     |    2 
 xen/arch/x86/shadow2-common.c            | 3394 +++++++++++++++++++++++
 xen/arch/x86/shadow2.c                   | 4469 +++++++++++++++++++++++++++++++
 xen/arch/x86/smpboot.c                   |    2 
 xen/arch/x86/traps.c                     |   32 
 xen/arch/x86/x86_32/domain_page.c        |   33 
 xen/arch/x86/x86_32/mm.c                 |    3 
 xen/arch/x86/x86_64/mm.c                 |    3 
 xen/arch/x86/x86_64/traps.c              |   14 
 xen/common/acm_ops.c                     |    1 
 xen/common/grant_table.c                 |    4 
 xen/common/keyhandler.c                  |   19 
 xen/common/memory.c                      |   11 
 xen/drivers/char/console.c               |   50 
 xen/include/asm-x86/bitops.h             |   18 
 xen/include/asm-x86/config.h             |   22 
 xen/include/asm-x86/domain.h             |   99 
 xen/include/asm-x86/grant_table.h        |    2 
 xen/include/asm-x86/hvm/hvm.h            |   25 
 xen/include/asm-x86/hvm/support.h        |   11 
 xen/include/asm-x86/hvm/vcpu.h           |    6 
 xen/include/asm-x86/hvm/vmx/vmcs.h       |    1 
 xen/include/asm-x86/hvm/vmx/vmx.h        |   49 
 xen/include/asm-x86/mm.h                 |  140 
 xen/include/asm-x86/msr.h                |    4 
 xen/include/asm-x86/page-guest32.h       |    7 
 xen/include/asm-x86/page.h               |   37 
 xen/include/asm-x86/perfc_defn.h         |   53 
 xen/include/asm-x86/processor.h          |    1 
 xen/include/asm-x86/shadow.h             | 1791 ------------
 xen/include/asm-x86/shadow2-multi.h      |  116 
 xen/include/asm-x86/shadow2-private.h    |  612 ++++
 xen/include/asm-x86/shadow2-types.h      |  705 ++++
 xen/include/asm-x86/shadow2.h            |  627 ++++
 xen/include/asm-x86/x86_32/page-2level.h |    1 
 xen/include/asm-x86/x86_32/page-3level.h |    3 
 xen/include/asm-x86/x86_64/page.h        |    5 
 xen/include/public/dom0_ops.h            |   16 
 xen/include/xen/domain_page.h            |   13 
 xen/include/xen/lib.h                    |    4 
 xen/include/xen/list.h                   |   10 
 xen/include/xen/sched.h                  |    5 
 76 files changed, 11149 insertions(+), 14549 deletions(-)

diff -r f2151423f729 -r 01345b08d012 .hgtags
--- a/.hgtags   Wed Aug 16 16:48:45 2006 +0100
+++ b/.hgtags   Wed Aug 16 17:11:56 2006 +0100
@@ -15,3 +15,13 @@ c8fdb0caa77b429cf47f9707926e83947778cb48
 c8fdb0caa77b429cf47f9707926e83947778cb48 RELEASE-3.0.0
 af0573e9e5258db0a9d28aa954dd302ddd2c2d23 3.0.2-rc
 d0d3fef37685be264a7f52201f8ef44c030daad3 3.0.2-branched
+6e864d7de9db066f92bea505d256bfe286200fed last-code-review
+a898a6510c5db4e3d1f69d40fcacb540643b0f22 mainline
+bfa6f4a0c594bc0ebd896437d69857b58dab0988 last-code-review
+fc6cbf31bd883bc76ceb97f4b817ac88078d696a latest patch to unstable
+8e55c5c1147589b7a6a1875384d4317aec7ccf84 mainline
+2d2ed4d9b1c14aeee29dfdd77acd6017d31290cd mainline
+0e32095a7b4611d18a82052a9d5b23e474f91af9 mainline
+88e6bd5e2b5439f97e1d50a8724103c619aeaadf mainline
+5233c4b076b9aa073eff63508461b7bfa597737c mainline
+fda70200da01b89d5339342df6c0db372369a16d mainline
diff -r f2151423f729 -r 01345b08d012 tools/examples/xmexample.hvm
--- a/tools/examples/xmexample.hvm      Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/examples/xmexample.hvm      Wed Aug 16 17:11:56 2006 +0100
@@ -26,6 +26,10 @@ builder='hvm'
 #          memory errors. The domain needs enough memory to boot kernel
 #          and modules. Allocating less than 32MBs is not recommended.
 memory = 128
+
+# Shadow pagetable memory for the domain, in MB.
+# Should be at least 2KB per MB of domain memory, plus a few MB per vcpu.
+shadow_memory = 8
 
 # A name for your domain. All domains must have different names.
 name = "ExampleHVMDomain"
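
The new shadow_memory option follows the sizing guidance in the comment above: at
least 2KB of shadow per MB of guest memory, plus a few MB per vcpu.  A minimal C
sketch of that arithmetic (the helper name and the 4MB-per-vcpu figure are
assumptions for illustration, not part of the patch):

#include <stdio.h>

/* Suggest a shadow_memory value in MB: 2KB of shadow per MB of guest RAM,
 * rounded up to a whole MB, plus an assumed 4MB per vcpu. */
static unsigned long suggest_shadow_mb(unsigned long mem_mb, unsigned int vcpus)
{
    unsigned long from_ram = (mem_mb * 2 + 1023) / 1024;
    return from_ram + 4UL * vcpus;
}

int main(void)
{
    /* The example's memory = 128 with a single vcpu gives 1 + 4 = 5MB;
     * the shadow_memory = 8 default comfortably exceeds that. */
    printf("suggested shadow_memory = %luMB\n", suggest_shadow_mb(128, 1));
    return 0;
}
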
diff -r f2151423f729 -r 01345b08d012 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/libxc/xc_domain.c   Wed Aug 16 17:11:56 2006 +0100
@@ -213,21 +213,28 @@ int xc_shadow_control(int xc_handle,
                       unsigned int sop,
                       unsigned long *dirty_bitmap,
                       unsigned long pages,
-                      xc_shadow_control_stats_t *stats )
+                      unsigned long *mb,
+                      uint32_t mode,
+                      xc_shadow_control_stats_t *stats)
 {
     int rc;
     DECLARE_DOM0_OP;
     op.cmd = DOM0_SHADOW_CONTROL;
     op.u.shadow_control.domain = (domid_t)domid;
     op.u.shadow_control.op     = sop;
+    op.u.shadow_control.pages  = pages;
+    op.u.shadow_control.mb     = mb ? *mb : 0;
+    op.u.shadow_control.mode   = mode;
     set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap);
-    op.u.shadow_control.pages  = pages;
 
     rc = do_dom0_op(xc_handle, &op);
 
     if ( stats )
         memcpy(stats, &op.u.shadow_control.stats,
                sizeof(xc_shadow_control_stats_t));
+    
+    if ( mb ) 
+        *mb = op.u.shadow_control.mb;
 
     return (rc == 0) ? op.u.shadow_control.pages : rc;
 }
@@ -391,7 +398,7 @@ int xc_domain_memory_populate_physmap(in
 
     if ( err > 0 )
     {
-        DPRINTF("Failed deallocation for dom %d: %ld pages order %d\n",
+        DPRINTF("Failed allocation for dom %d: %ld pages order %d\n",
                 domid, nr_extents, extent_order);
         errno = EBUSY;
         err = -1;
diff -r f2151423f729 -r 01345b08d012 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/libxc/xc_hvm_build.c        Wed Aug 16 17:11:56 2006 +0100
@@ -395,6 +395,19 @@ static int xc_hvm_build_internal(int xc_
         PERROR("Could not get info on domain");
         goto error_out;
     }
+
+    /* HVM domains must be put into shadow2 mode at the start of day */
+    if ( xc_shadow_control(xc_handle, domid, DOM0_SHADOW2_CONTROL_OP_ENABLE,
+                           NULL, 0, NULL, 
+                           DOM0_SHADOW2_CONTROL_FLAG_ENABLE 
+                           | DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT
+                           | DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE
+                           | DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL, 
+                           NULL) ) 
+    {
+        PERROR("Could not enable shadow paging for domain.\n");
+        goto error_out;
+    }        
 
     memset(ctxt, 0, sizeof(*ctxt));
 
diff -r f2151423f729 -r 01345b08d012 tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c      Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/libxc/xc_linux_build.c      Wed Aug 16 17:11:56 2006 +0100
@@ -972,7 +972,7 @@ static int setup_guest(int xc_handle,
         /* Enable shadow translate mode */
         if ( xc_shadow_control(xc_handle, dom,
                                DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE,
-                               NULL, 0, NULL) < 0 )
+                               NULL, 0, NULL, 0, NULL) < 0 )
         {
             PERROR("Could not enable translation mode");
             goto error_out;
diff -r f2151423f729 -r 01345b08d012 tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c       Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/libxc/xc_linux_save.c       Wed Aug 16 17:11:56 2006 +0100
@@ -338,13 +338,13 @@ static int analysis_phase(int xc_handle,
         int i;
 
         xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
-                          arr, max_pfn, NULL);
+                          arr, max_pfn, NULL, 0, NULL);
         DPRINTF("#Flush\n");
         for ( i = 0; i < 40; i++ ) {
             usleep(50000);
             now = llgettimeofday();
             xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK,
-                              NULL, 0, &stats);
+                              NULL, 0, NULL, 0, &stats);
 
             DPRINTF("now= %lld faults= %" PRId32 " dirty= %" PRId32
                     " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n",
@@ -727,7 +727,7 @@ int xc_linux_save(int xc_handle, int io_
 
         if (xc_shadow_control(xc_handle, dom,
                               DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
-                              NULL, 0, NULL ) < 0) {
+                              NULL, 0, NULL, 0, NULL) < 0) {
             ERR("Couldn't enable shadow mode");
             goto out;
         }
@@ -879,7 +879,7 @@ int xc_linux_save(int xc_handle, int io_
                but this is fast enough for the moment. */
             if (!last_iter && xc_shadow_control(
                     xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK,
-                    to_skip, max_pfn, NULL) != max_pfn) {
+                    to_skip, max_pfn, NULL, 0, NULL) != max_pfn) {
                 ERR("Error peeking shadow bitmap");
                 goto out;
             }
@@ -1084,8 +1084,9 @@ int xc_linux_save(int xc_handle, int io_
                         (unsigned long)ctxt.user_regs.edx);
             }
 
-            if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN,
-                                  to_send, max_pfn, &stats ) != max_pfn) {
+            if (xc_shadow_control(xc_handle, dom, 
+                                  DOM0_SHADOW_CONTROL_OP_CLEAN, to_send, 
+                                  max_pfn, NULL, 0, &stats) != max_pfn) {
                 ERR("Error flushing shadow PT");
                 goto out;
             }
@@ -1174,8 +1175,9 @@ int xc_linux_save(int xc_handle, int io_
  out:
 
     if (live) {
-        if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF,
-                             NULL, 0, NULL ) < 0) {
+        if(xc_shadow_control(xc_handle, dom, 
+                             DOM0_SHADOW_CONTROL_OP_OFF,
+                             NULL, 0, NULL, 0, NULL) < 0) {
             DPRINTF("Warning - couldn't disable shadow mode");
         }
     }
diff -r f2151423f729 -r 01345b08d012 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/libxc/xenctrl.h     Wed Aug 16 17:11:56 2006 +0100
@@ -323,6 +323,8 @@ int xc_shadow_control(int xc_handle,
                       unsigned int sop,
                       unsigned long *dirty_bitmap,
                       unsigned long pages,
+                      unsigned long *mb,
+                      uint32_t mode,
                       xc_shadow_control_stats_t *stats);
 
 int xc_bvtsched_global_set(int xc_handle,
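
The xc_shadow_control() prototype gains two parameters: an in/out mb pointer for
the size of the shadow pool and a mode word for the enable flags; existing callers
pass NULL and 0 for them.  A sketch of a caller using the new allocation ops
(illustration only; xc_handle and domid are assumed to come from the usual libxc
setup, and the op constants are from the updated dom0 interface):

#include <stdio.h>
#include <xenctrl.h>

/* Raise a domain's shadow allocation to at least want_mb megabytes. */
static int bump_shadow_allocation(int xc_handle, uint32_t domid,
                                  unsigned long want_mb)
{
    unsigned long mb = 0;

    /* GET_ALLOCATION: dirty_bitmap/pages/mode are unused; mb is an out param. */
    if ( xc_shadow_control(xc_handle, domid,
                           DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION,
                           NULL, 0, &mb, 0, NULL) < 0 )
        return -1;

    if ( mb >= want_mb )
        return 0;

    /* SET_ALLOCATION: mb is in/out; Xen writes back the value actually set. */
    mb = want_mb;
    if ( xc_shadow_control(xc_handle, domid,
                           DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION,
                           NULL, 0, &mb, 0, NULL) < 0 )
        return -1;

    printf("dom %lu now has %lu MB of shadow memory\n",
           (unsigned long)domid, mb);
    return 0;
}
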
diff -r f2151423f729 -r 01345b08d012 tools/misc/xc_shadow.c
--- a/tools/misc/xc_shadow.c    Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/misc/xc_shadow.c    Wed Aug 16 17:11:56 2006 +0100
@@ -60,6 +60,8 @@ int main(int argc, char *argv[])
                            mode, 
                            NULL,
                            0,
+                           NULL,
+                           0,
                            NULL) < 0 )
     {    
         fprintf(stderr, "Error reseting performance counters: %d (%s)\n",
diff -r f2151423f729 -r 01345b08d012 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Aug 16 17:11:56 2006 +0100
@@ -669,6 +669,59 @@ static PyObject *pyxc_sedf_domain_get(Xc
                          "weight",    weight);
 }
 
+static PyObject *pyxc_shadow_control(PyObject *self,
+                                     PyObject *args,
+                                     PyObject *kwds)
+{
+    XcObject *xc = (XcObject *)self;
+
+    uint32_t dom;
+    int op=0;
+
+    static char *kwd_list[] = { "dom", "op", NULL };
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, 
+                                      &dom, &op) )
+        return NULL;
+    
+    if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, 0, NULL) 
+         < 0 )
+        return PyErr_SetFromErrno(xc_error);
+    
+    Py_INCREF(zero);
+    return zero;
+}
+
+static PyObject *pyxc_shadow_mem_control(PyObject *self,
+                                         PyObject *args,
+                                         PyObject *kwds)
+{
+    XcObject *xc = (XcObject *)self;
+    int op;
+    uint32_t dom;
+    int mbarg = -1;
+    unsigned long mb;
+
+    static char *kwd_list[] = { "dom", "mb", NULL };
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, 
+                                      &dom, &mbarg) )
+        return NULL;
+    
+    if ( mbarg < 0 ) 
+        op = DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION;
+    else 
+    {
+        mb = mbarg;
+        op = DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION;
+    }
+    if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, &mb, 0, NULL) < 0 )
+        return PyErr_SetFromErrno(xc_error);
+    
+    mbarg = mb;
+    return Py_BuildValue("i", mbarg);
+}
+
 static PyObject *pyxc_sched_credit_domain_set(XcObject *self,
                                               PyObject *args,
                                               PyObject *kwds)
@@ -1118,6 +1171,22 @@ static PyMethodDef pyxc_methods[] = {
       "Get information about the Xen host\n"
       "Returns [dict]: information about Xen"
       "        [None]: on failure.\n" },
+
+    { "shadow_control", 
+      (PyCFunction)pyxc_shadow_control, 
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "Set parameter for shadow pagetable interface\n"
+      " dom [int]:   Identifier of domain.\n"
+      " op [int, 0]: operation\n\n"
+      "Returns: [int] 0 on success; -1 on error.\n" },
+
+    { "shadow_mem_control", 
+      (PyCFunction)pyxc_shadow_mem_control, 
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "Set or read shadow pagetable memory use\n"
+      " dom [int]:   Identifier of domain.\n"
+      " mb [int, -1]: MB of shadow memory this domain should have.\n\n"
+      "Returns: [int] MB of shadow memory in use by this domain.\n" },
 
     { "domain_setmaxmem", 
       (PyCFunction)pyxc_domain_setmaxmem, 
diff -r f2151423f729 -r 01345b08d012 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py       Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/python/xen/xend/XendDomain.py       Wed Aug 16 17:11:56 2006 +0100
@@ -532,6 +532,30 @@ class XendDomain:
         except Exception, ex:
             raise XendError(str(ex))
 
+    def domain_shadow_control(self, domid, op):
+        """Shadow page control."""
+        dominfo = self.domain_lookup(domid)
+        try:
+            return xc.shadow_control(dominfo.getDomid(), op)
+        except Exception, ex:
+            raise XendError(str(ex))
+
+    def domain_shadow_mem_get(self, domid):
+        """Get shadow pagetable memory allocation."""
+        dominfo = self.domain_lookup(domid)
+        try:
+            return xc.shadow_mem_control(dominfo.getDomid())
+        except Exception, ex:
+            raise XendError(str(ex))
+
+    def domain_shadow_mem_set(self, domid, mb):
+        """Set shadow pagetable memory allocation."""
+        dominfo = self.domain_lookup(domid)
+        try:
+            return xc.shadow_mem_control(dominfo.getDomid(), mb=mb)
+        except Exception, ex:
+            raise XendError(str(ex))
+
     def domain_sched_credit_get(self, domid):
         """Get credit scheduler parameters for a domain.
         """
diff -r f2151423f729 -r 01345b08d012 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py   Wed Aug 16 17:11:56 2006 +0100
@@ -30,6 +30,7 @@ import time
 import time
 import threading
 import os
+import math
 
 import xen.lowlevel.xc
 from xen.util import asserts
@@ -126,16 +127,17 @@ VM_CONFIG_PARAMS = [
 # don't come out of xc in the same form as they are specified in the config
 # file, so those are handled separately.
 ROUNDTRIPPING_CONFIG_ENTRIES = [
-    ('uuid',       str),
-    ('vcpus',      int),
-    ('vcpu_avail', int),
-    ('cpu_weight', float),
-    ('memory',     int),
-    ('maxmem',     int),
-    ('bootloader', str),
+    ('uuid',            str),
+    ('vcpus',           int),
+    ('vcpu_avail',      int),
+    ('cpu_weight',      float),
+    ('memory',          int),
+    ('shadow_memory',   int),
+    ('maxmem',          int),
+    ('bootloader',      str),
     ('bootloader_args', str),
-    ('features', str),
-    ('localtime', int),
+    ('features',        str),
+    ('localtime',       int),
     ]
 
 ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS
@@ -146,12 +148,13 @@ ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFI
 # entries written to the store that cannot be reconfigured on-the-fly.
 #
 VM_STORE_ENTRIES = [
-    ('uuid',       str),
-    ('vcpus',      int),
-    ('vcpu_avail', int),
-    ('memory',     int),
-    ('maxmem',     int),
-    ('start_time', float),
+    ('uuid',          str),
+    ('vcpus',         int),
+    ('vcpu_avail',    int),
+    ('memory',        int),
+    ('shadow_memory', int),
+    ('maxmem',        int),
+    ('start_time',    float),
     ]
 
 VM_STORE_ENTRIES += VM_CONFIG_PARAMS
@@ -572,6 +575,7 @@ class XendDomainInfo:
             defaultInfo('vcpu_avail',   lambda: (1 << self.info['vcpus']) - 1)
 
             defaultInfo('memory',       lambda: 0)
+            defaultInfo('shadow_memory', lambda: 0)
             defaultInfo('maxmem',       lambda: 0)
             defaultInfo('bootloader',   lambda: None)
             defaultInfo('bootloader_args', lambda: None)            
@@ -1280,7 +1284,18 @@ class XendDomainInfo:
             xc.domain_setmaxmem(self.domid, self.info['maxmem'] * 1024)
 
             m = self.image.getDomainMemory(self.info['memory'] * 1024)
-            balloon.free(m)
+
+            # get the domain's shadow memory requirement
+            sm = int(math.ceil(self.image.getDomainShadowMemory(m) / 1024.0))
+            if self.info['shadow_memory'] > sm:
+                sm = self.info['shadow_memory']
+
+            # Make sure there's enough RAM available for the domain
+            balloon.free(m + sm * 1024)
+
+            # Set up the shadow memory
+            sm = xc.shadow_mem_control(self.domid, mb=sm)
+            self.info['shadow_memory'] = sm
 
             init_reservation = self.info['memory'] * 1024
             if os.uname()[4] in ('ia64', 'ppc64'):
diff -r f2151423f729 -r 01345b08d012 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/python/xen/xend/image.py    Wed Aug 16 17:11:56 2006 +0100
@@ -152,6 +152,12 @@ class ImageHandler:
             if 'hvm' in xc.xeninfo()['xen_caps']:
                 mem_kb += 4*1024;
         return mem_kb
+
+    def getDomainShadowMemory(self, mem_kb):
+        """@return The minimum shadow memory required, in KiB, for a domain 
+        with mem_kb KiB of RAM."""
+        # PV domains don't need any shadow memory
+        return 0
 
     def buildDomain(self):
         """Build the domain. Define in subclass."""
@@ -364,6 +370,17 @@ class HVMImageHandler(ImageHandler):
             extra_pages = int( math.ceil( extra_mb*1024 / page_kb ))
         return mem_kb + extra_pages * page_kb
 
+    def getDomainShadowMemory(self, mem_kb):
+        """@return The minimum shadow memory required, in KiB, for a domain 
+        with mem_kb KiB of RAM."""
+        if os.uname()[4] in ('ia64', 'ppc64'):
+            # Explicit shadow memory is not a concept 
+            return 0
+        else:
+            # 1MB per vcpu plus 4Kib/Mib of RAM.  This is higher than 
+            # the minimum that Xen would allocate if no value were given.
+            return 1024 * self.vm.getVCpuCount() + mem_kb / 256
+
     def register_shutdown_watch(self):
         """ add xen store watch on control/shutdown """
         self.shutdownWatch = xswatch(self.vm.dompath + "/control/shutdown", \
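
For HVM guests, getDomainShadowMemory() above sizes the default shadow pool as
1MB per vcpu plus 4KiB per MiB of guest RAM (mem_kb / 256).  The same rule as a
C sketch, purely to show the arithmetic (the helper is hypothetical):

/* Minimum shadow pool in KiB: 1MB per vcpu plus 4KiB per MiB of guest RAM. */
static unsigned long hvm_min_shadow_kb(unsigned long mem_kb, unsigned int vcpus)
{
    return 1024UL * vcpus + mem_kb / 256;
}
/* e.g. a 512MB, 2-vcpu guest: 1024*2 + 524288/256 = 4096 KiB, i.e. 4MB. */
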
diff -r f2151423f729 -r 01345b08d012 tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Wed Aug 16 16:48:45 2006 +0100
+++ b/tools/python/xen/xm/create.py     Wed Aug 16 17:11:56 2006 +0100
@@ -157,6 +157,10 @@ gopts.var('maxmem', val='MEMORY',
 gopts.var('maxmem', val='MEMORY',
           fn=set_int, default=None,
           use="Maximum domain memory in MB.")
+
+gopts.var('shadow_memory', val='MEMORY',
+          fn=set_int, default=0,
+          use="Domain shadow memory in MB.")
 
 gopts.var('cpu', val='CPU',
           fn=set_int, default=None,
@@ -666,8 +670,9 @@ def make_config(vals):
             if v:
                 config.append([n, v])
 
-    map(add_conf, ['name', 'memory', 'maxmem', 'restart', 'on_poweroff',
-                   'on_reboot', 'on_crash', 'vcpus', 'features'])
+    map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
+                   'restart', 'on_poweroff', 'on_reboot', 'on_crash',
+                   'vcpus', 'features'])
 
     if vals.uuid is not None:
         config.append(['uuid', vals.uuid])
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/Makefile     Wed Aug 16 17:11:56 2006 +0100
@@ -8,7 +8,6 @@ subdir-$(x86_64) += x86_64
 subdir-$(x86_64) += x86_64
 
 obj-y += apic.o
-obj-y += audit.o
 obj-y += bitops.o
 obj-y += compat.o
 obj-y += delay.o
@@ -41,12 +40,21 @@ obj-y += x86_emulate.o
 obj-y += x86_emulate.o
 
 ifneq ($(pae),n)
-obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o
 else
-obj-$(x86_32) += shadow32.o
+obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o
 endif
 
-obj-$(x86_64) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o
+obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \
+                 shadow2_g2_on_s3.o
+
+guest_levels  = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1))))))
+shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \
+                -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1))
+
+shadow2_%.o: shadow2.c $(HDRS) Makefile
+       $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@
 
 obj-$(crash_debug) += gdbstub.o
 
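
The pattern rule above compiles shadow2.c once per guest/shadow paging-level pair,
deriving the -D values from the object name: shadow2_g2_on_s3.o is built with
-DGUEST_PAGING_LEVELS=2 -DSHADOW_PAGING_LEVELS=3, shadow2_g4_on_s4.o with 4 and 4,
and so on.  A rough sketch of how such a multiply-compiled translation unit can
key off those defines (illustrative only; the real shadow2.c elsewhere in this
changeset is the authority):

/* Compiled once per combination; e.g. the shadow2_g2_on_s3.o build sees
 * GUEST_PAGING_LEVELS == 2 and SHADOW_PAGING_LEVELS == 3, so level-specific
 * code is selected at compile time rather than at run time. */
#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS == 3
/* ... 2-level guest pagetables on 3-level (PAE) shadows ... */
#elif GUEST_PAGING_LEVELS == 4 && SHADOW_PAGING_LEVELS == 4
/* ... 4-level (64-bit) guests on 4-level shadows ... */
#endif
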
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c   Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/dom0_ops.c   Wed Aug 16 17:11:56 2006 +0100
@@ -89,7 +89,7 @@ long arch_do_dom0_op(struct dom0_op *op,
         d = find_domain_by_id(op->u.shadow_control.domain);
         if ( d != NULL )
         {
-            ret = shadow_mode_control(d, &op->u.shadow_control);
+            ret = shadow2_control_op(d, &op->u.shadow_control, u_dom0_op);
             put_domain(d);
             copy_to_guest(u_dom0_op, op, 1);
         } 
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/domain.c     Wed Aug 16 17:11:56 2006 +0100
@@ -134,13 +134,6 @@ struct vcpu *alloc_vcpu_struct(struct do
     v->arch.perdomain_ptes =
         d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT);
 
-    v->arch.guest_vtable  = __linear_l2_table;
-    v->arch.shadow_vtable = __shadow_linear_l2_table;
-#if defined(__x86_64__)
-    v->arch.guest_vl3table = __linear_l3_table;
-    v->arch.guest_vl4table = __linear_l4_table;
-#endif
-
     pae_l3_cache_init(&v->arch.pae_l3_cache);
 
     return v;
@@ -155,9 +148,7 @@ int arch_domain_create(struct domain *d)
 {
     l1_pgentry_t gdt_l1e;
     int vcpuid, pdpt_order;
-#ifdef __x86_64__
     int i;
-#endif
 
     pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t));
     d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order);
@@ -202,8 +193,12 @@ int arch_domain_create(struct domain *d)
 
 #endif /* __x86_64__ */
 
-    shadow_lock_init(d);
-    INIT_LIST_HEAD(&d->arch.free_shadow_frames);
+    shadow2_lock_init(d);
+    for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ )
+        INIT_LIST_HEAD(&d->arch.shadow2_freelists[i]);
+    INIT_LIST_HEAD(&d->arch.shadow2_p2m_freelist);
+    INIT_LIST_HEAD(&d->arch.shadow2_p2m_inuse);
+    INIT_LIST_HEAD(&d->arch.shadow2_toplevel_shadows);
 
     if ( !is_idle_domain(d) )
     {
@@ -234,6 +229,8 @@ int arch_domain_create(struct domain *d)
 
 void arch_domain_destroy(struct domain *d)
 {
+    shadow2_final_teardown(d);
+
     free_xenheap_pages(
         d->arch.mm_perdomain_pt,
         get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)));
@@ -328,31 +325,35 @@ int arch_set_info_guest(
         if ( !hvm_initialize_guest_resources(v) )
             return -EINVAL;
     }
-    else if ( shadow_mode_refcounts(d) )
-    {
-        if ( !get_page(mfn_to_page(cr3_pfn), d) )
+    else
+    {
+        if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
+                                PGT_base_page_table) )
         {
             destroy_gdt(v);
             return -EINVAL;
         }
-    }
-    else
-    {
-        if ( !get_page_and_type(mfn_to_page(cr3_pfn), d,
-                                PGT_base_page_table) )
-        {
-            destroy_gdt(v);
-            return -EINVAL;
-        }
-    }
-
-    update_pagetables(v);
+    }    
+
+    /* Shadow2: make sure the domain has enough shadow memory to
+     * boot another vcpu */
+    if ( shadow2_mode_enabled(d) 
+         && d->arch.shadow2_total_pages < shadow2_min_acceptable_pages(d) )
+    {
+        destroy_gdt(v);
+        return -ENOMEM;
+    }
 
     if ( v->vcpu_id == 0 )
         update_domain_wallclock_time(d);
 
     /* Don't redo final setup */
     set_bit(_VCPUF_initialised, &v->vcpu_flags);
+
+    if ( shadow2_mode_enabled(d) )
+        shadow2_update_paging_modes(v);
+
+    update_cr3(v);
 
     return 0;
 }
@@ -669,7 +670,6 @@ static void __context_switch(void)
             loaddebug(&n->arch.guest_context, 6);
             loaddebug(&n->arch.guest_context, 7);
         }
-
         n->arch.ctxt_switch_to(n);
     }
 
@@ -927,29 +927,34 @@ void domain_relinquish_resources(struct 
     /* Drop the in-use references to page-table bases. */
     for_each_vcpu ( d, v )
     {
-        if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 )
-        {
-            if ( !shadow_mode_refcounts(d) )
-                put_page_type(mfn_to_page(pfn));
-            put_page(mfn_to_page(pfn));
-
+        /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling,
+         * or sh2_update_paging_modes()) */
+        pfn = pagetable_get_pfn(v->arch.guest_table);
+        if ( pfn != 0 )
+        {
+            if ( shadow2_mode_refcounts(d) )
+                put_page(mfn_to_page(pfn));
+            else
+                put_page_and_type(mfn_to_page(pfn));
             v->arch.guest_table = pagetable_null();
         }
 
-        if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 )
-        {
-            if ( !shadow_mode_refcounts(d) )
-                put_page_type(mfn_to_page(pfn));
-            put_page(mfn_to_page(pfn));
-
+#ifdef __x86_64__
+        /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
+        pfn = pagetable_get_pfn(v->arch.guest_table_user);
+        if ( pfn != 0 )
+        {
+            put_page_and_type(mfn_to_page(pfn));
             v->arch.guest_table_user = pagetable_null();
         }
+#endif
     }
 
     if ( d->vcpu[0] && hvm_guest(d->vcpu[0]) )
         hvm_relinquish_guest_resources(d);
 
-    shadow_mode_disable(d);
+    /* Tear down shadow mode stuff. */
+    shadow2_teardown(d);
 
     /*
      * Relinquish GDT mappings. No need for explicit unmapping of the LDT as
@@ -964,26 +969,23 @@ void domain_relinquish_resources(struct 
 
     /* Free page used by xen oprofile buffer */
     free_xenoprof_pages(d);
-
 }
 
 void arch_dump_domain_info(struct domain *d)
 {
-    if ( shadow_mode_enabled(d) )
-    {
-        printk("    shadow mode: ");
-        if ( shadow_mode_refcounts(d) )
+    if ( shadow2_mode_enabled(d) )
+    {
+        printk("    shadow2 mode: ");
+        if ( d->arch.shadow2_mode & SHM2_enable )
+            printk("enabled ");
+        if ( shadow2_mode_refcounts(d) )
             printk("refcounts ");
-        if ( shadow_mode_write_all(d) )
-            printk("write_all ");
-        if ( shadow_mode_log_dirty(d) )
+        if ( shadow2_mode_log_dirty(d) )
             printk("log_dirty ");
-        if ( shadow_mode_translate(d) )
+        if ( shadow2_mode_translate(d) )
             printk("translate ");
-        if ( shadow_mode_external(d) )
+        if ( shadow2_mode_external(d) )
             printk("external ");
-        if ( shadow_mode_wr_pt_pte(d) )
-            printk("wr_pt_pte ");
         printk("\n");
     }
 }
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/domain_build.c       Wed Aug 16 17:11:56 2006 +0100
@@ -683,8 +683,11 @@ int construct_dom0(struct domain *d,
     for ( i = 1; i < opt_dom0_max_vcpus; i++ )
         (void)alloc_vcpu(d, i, i);
 
-    /* Set up monitor table */
-    update_pagetables(v);
+    /* Set up CR3 value for write_ptbase */
+    if ( shadow2_mode_enabled(v->domain) )
+        shadow2_update_paging_modes(v);
+    else
+        update_cr3(v);
 
     /* Install the new page tables. */
     local_irq_disable();
@@ -796,10 +799,8 @@ int construct_dom0(struct domain *d,
     new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start);
 
     if ( opt_dom0_shadow )
-    {
-        shadow_mode_enable(d, SHM_enable);
-        update_pagetables(v);
-    }
+        if ( shadow2_test_enable(d) == 0 ) 
+            shadow2_update_paging_modes(v);
 
     if ( supervisor_mode_kernel )
     {
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/hvm/hvm.c    Wed Aug 16 17:11:56 2006 +0100
@@ -30,6 +30,7 @@
 #include <xen/hypercall.h>
 #include <xen/guest_access.h>
 #include <xen/event.h>
+#include <xen/shadow.h>
 #include <asm/current.h>
 #include <asm/e820.h>
 #include <asm/io.h>
@@ -42,10 +43,6 @@
 #include <asm/spinlock.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
 #include <public/sched.h>
 #include <public/hvm/ioreq.h>
 #include <public/version.h>
@@ -61,7 +58,7 @@ static void hvm_zap_mmio_range(
 static void hvm_zap_mmio_range(
     struct domain *d, unsigned long pfn, unsigned long nr_pfn)
 {
-    unsigned long i, val = INVALID_MFN;
+    unsigned long i;
 
     ASSERT(d == current->domain);
 
@@ -70,7 +67,8 @@ static void hvm_zap_mmio_range(
         if ( pfn + i >= 0xfffff )
             break;
 
-        __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
+        if ( VALID_MFN(gmfn_to_mfn(d, pfn + i)) )
+            guest_remove_page(d, pfn + i);
     }
 }
 
@@ -262,11 +260,13 @@ void hvm_setup_platform(struct domain* d
     if ( !hvm_guest(v) || (v->vcpu_id != 0) )
         return;
 
+#if 0 /* SHADOW2 does not have this */
     if ( shadow_direct_map_init(d) == 0 )
     {
         printk("Can not allocate shadow direct map for HVM domain.\n");
         domain_crash_synchronous();
     }
+#endif
 
     hvm_zap_iommu_pages(d);
 
@@ -351,6 +351,7 @@ void hvm_hlt(unsigned long rflags)
     struct periodic_time *pt = &v->domain->arch.hvm_domain.pl_time.periodic_tm;
     s_time_t next_pit = -1, next_wakeup;
 
+#if 0 /* This seems to fire at unwelcome times in Linux */
     /*
      * Detect machine shutdown.  Only do this for vcpu 0, to avoid potentially 
      * shutting down the domain early. If we halt with interrupts disabled, 
@@ -364,6 +365,7 @@ void hvm_hlt(unsigned long rflags)
         domain_shutdown(current->domain, SHUTDOWN_poweroff);
         return;
     }
+#endif /* 0 */
 
     if ( !v->vcpu_id )
         next_pit = get_scheduled(v, pt->irq, pt);
@@ -380,6 +382,8 @@ void hvm_hlt(unsigned long rflags)
  */
 int hvm_copy(void *buf, unsigned long vaddr, int size, int dir)
 {
+    struct vcpu *v = current;
+    unsigned long gfn;
     unsigned long mfn;
     char *addr;
     int count;
@@ -389,10 +393,9 @@ int hvm_copy(void *buf, unsigned long va
         if (count > size)
             count = size;
 
-        if (hvm_paging_enabled(current))
-            mfn = gva_to_mfn(vaddr);
-        else
-            mfn = get_mfn_from_gpfn(vaddr >> PAGE_SHIFT);
+        gfn = shadow2_gva_to_gfn(v, vaddr);
+        mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn));
+
         if (mfn == INVALID_MFN)
             return 0;
 
@@ -545,7 +548,7 @@ void hvm_do_hypercall(struct cpu_user_re
         return;
     }
 
-    if ( current->domain->arch.ops->guest_paging_levels == PAGING_L4 )
+    if ( current->arch.shadow2->guest_levels == 4 )
     {
         pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi,
                                                        pregs->rsi,
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/platform.c
--- a/xen/arch/x86/hvm/platform.c       Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/hvm/platform.c       Wed Aug 16 17:11:56 2006 +0100
@@ -21,7 +21,7 @@
 #include <xen/config.h>
 #include <xen/types.h>
 #include <xen/mm.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
 #include <xen/domain_page.h>
 #include <asm/page.h>
 #include <xen/event.h>
@@ -35,9 +35,6 @@
 #include <xen/lib.h>
 #include <xen/sched.h>
 #include <asm/current.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
 
 #define DECODE_success  1
 #define DECODE_failure  0
@@ -724,7 +721,7 @@ void send_pio_req(struct cpu_user_regs *
 
     if (pvalid) {
         if (hvm_paging_enabled(current))
-            p->u.pdata = (void *) gva_to_gpa(value);
+            p->u.data = shadow2_gva_to_gpa(current, value);
         else
             p->u.pdata = (void *) value; /* guest VA == guest PA */
     } else
@@ -774,7 +771,7 @@ void send_mmio_req(
 
     if (pvalid) {
         if (hvm_paging_enabled(v))
-            p->u.pdata = (void *) gva_to_gpa(value);
+            p->u.data = shadow2_gva_to_gpa(v, value);
         else
             p->u.pdata = (void *) value; /* guest VA == guest PA */
     } else
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/hvm/svm/svm.c        Wed Aug 16 17:11:56 2006 +0100
@@ -26,9 +26,10 @@
 #include <xen/irq.h>
 #include <xen/softirq.h>
 #include <xen/hypercall.h>
+#include <xen/domain_page.h>
 #include <asm/current.h>
 #include <asm/io.h>
-#include <asm/shadow.h>
+#include <asm/shadow2.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -43,10 +44,6 @@
 #include <asm/hvm/svm/emulate.h>
 #include <asm/hvm/svm/vmmcall.h>
 #include <asm/hvm/svm/intr.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
 #include <public/sched.h>
 
 #define SVM_EXTRA_DEBUG
@@ -414,7 +411,7 @@ static int svm_realmode(struct vcpu *v)
     return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE);
 }
 
-static int svm_instruction_length(struct vcpu *v)
+int svm_guest_x86_mode(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
@@ -423,10 +420,20 @@ static int svm_instruction_length(struct
         mode = vmcb->cs.attributes.fields.l ? 8 : 4;
     else
         mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
-    return svm_instrlen(guest_cpu_user_regs(), mode);
-}
-
-static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
+    return mode;
+}
+
+int svm_instruction_length(struct vcpu *v)
+{
+    return svm_instrlen(guest_cpu_user_regs(), svm_guest_x86_mode(v));
+}
+
+void svm_update_host_cr3(struct vcpu *v)
+{
+    /* SVM doesn't have a HOST_CR3 equivalent to update. */
+}
+
+unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
 {
     switch ( num )
     {
@@ -436,6 +443,8 @@ static unsigned long svm_get_ctrl_reg(st
         return v->arch.hvm_svm.cpu_cr2;
     case 3:
         return v->arch.hvm_svm.cpu_cr3;
+    case 4:
+        return v->arch.hvm_svm.cpu_shadow_cr4;
     default:
         BUG();
     }
@@ -524,8 +533,6 @@ static void svm_init_hypercall_page(stru
     /* Don't support HYPERVISOR_iret at the moment */
     *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
 }
-
-
 
 
 int svm_dbg_on = 0;
@@ -647,6 +654,11 @@ static void svm_load_cpu_guest_regs(
     svm_load_cpu_user_regs(v, regs);
 }
 
+int svm_long_mode_enabled(struct vcpu *v)
+{
+    return SVM_LONG_GUEST(v);
+}
+
 
 
 static void arch_svm_do_launch(struct vcpu *v) 
@@ -726,7 +738,6 @@ static void svm_final_setup_guest(struct
 static void svm_final_setup_guest(struct vcpu *v)
 {
     struct domain *d = v->domain;
-    struct vcpu *vc;
 
     v->arch.schedule_tail    = arch_svm_do_launch;
     v->arch.ctxt_switch_from = svm_ctxt_switch_from;
@@ -735,9 +746,12 @@ static void svm_final_setup_guest(struct
     if ( v != d->vcpu[0] )
         return;
 
-    /* Initialize monitor page table */
-    for_each_vcpu( d, vc )
-        vc->arch.monitor_table = pagetable_null();
+    if ( !shadow2_mode_external(d) )
+    {
+        DPRINTK("Can't init HVM for dom %u vcpu %u: "
+                "not in shadow2 external mode\n", d->domain_id, v->vcpu_id);
+        domain_crash(d);
+    }
 
     /* 
      * Required to do this once per domain
@@ -745,13 +759,6 @@ static void svm_final_setup_guest(struct
      */
     memset(&d->shared_info->evtchn_mask[0], 0xff, 
            sizeof(d->shared_info->evtchn_mask));       
-
-    /* 
-     * Put the domain in shadow mode even though we're going to be using
-     * the shared 1:1 page table initially. It shouldn't hurt 
-     */
-    shadow_mode_enable(d, SHM_enable|SHM_refcounts|
-                       SHM_translate|SHM_external|SHM_wr_pt_pte);
 }
 
 
@@ -809,9 +816,13 @@ int start_svm(void)
 
     hvm_funcs.realmode = svm_realmode;
     hvm_funcs.paging_enabled = svm_paging_enabled;
+    hvm_funcs.long_mode_enabled = svm_long_mode_enabled;
+    hvm_funcs.guest_x86_mode = svm_guest_x86_mode;
     hvm_funcs.instruction_length = svm_instruction_length;
     hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
 
+    hvm_funcs.update_host_cr3 = svm_update_host_cr3;
+    
     hvm_funcs.stts = svm_stts;
     hvm_funcs.set_tsc_offset = svm_set_tsc_offset;
 
@@ -834,7 +845,6 @@ static void svm_relinquish_guest_resourc
             continue;
 
         destroy_vmcb(&v->arch.hvm_svm);
-        free_monitor_pagetable(v);
         kill_timer(&v->arch.hvm_vcpu.hlt_timer);
         if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) 
         {
@@ -851,8 +861,6 @@ static void svm_relinquish_guest_resourc
 
     if ( d->arch.hvm_domain.buffered_io_va )
         unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
-    shadow_direct_map_clean(d);
 }
 
 
@@ -894,7 +902,6 @@ static int svm_do_page_fault(unsigned lo
 {
     struct vcpu *v = current;
     unsigned long eip;
-    unsigned long gpa; /* FIXME: PAE */
     int result;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
@@ -907,43 +914,7 @@ static int svm_do_page_fault(unsigned lo
             va, eip, (unsigned long)regs->error_code);
 //#endif
 
-    if ( !svm_paging_enabled(v) )
-    {
-        if ( shadow_direct_map_fault(va, regs) ) 
-            return 1;
-
-        handle_mmio(va, va);
-        return 1;
-    }
-
-
-    gpa = gva_to_gpa(va);
-
-    /* Use 1:1 page table to identify MMIO address space */
-    if (mmio_space(gpa))
-    {
-        /* No support for APIC */
-        if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
-        { 
-            int inst_len;
-            inst_len = svm_instruction_length(v);
-            if (inst_len == -1)
-            {
-                printf("%s: INST_LEN - Unable to decode properly\n", __func__);
-                domain_crash_synchronous();
-            }
-
-            __update_guest_eip(vmcb, inst_len);
-
-            return 1;
-        }
-
-        handle_mmio(va, gpa);
-
-        return 1;
-    }
-    
-    result = shadow_fault(va, regs);
+    result = shadow2_fault(va, regs); 
 
     if( result ) {
         /* Let's make sure that the Guest TLB is flushed */
@@ -1035,19 +1006,12 @@ static void svm_vmexit_do_cpuid(struct v
             clear_bit(X86_FEATURE_APIC, &edx);
         }
 
-#if CONFIG_PAGING_LEVELS < 3
-        clear_bit(X86_FEATURE_PAE, &edx);
-        clear_bit(X86_FEATURE_PSE, &edx);
+#if CONFIG_PAGING_LEVELS >= 3
+        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
+#endif
+            clear_bit(X86_FEATURE_PAE, &edx);
         clear_bit(X86_FEATURE_PSE36, &edx);
-#else
-        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-        {
-            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-                clear_bit(X86_FEATURE_PAE, &edx);
-            clear_bit(X86_FEATURE_PSE, &edx);
-            clear_bit(X86_FEATURE_PSE36, &edx);
-        }
-#endif
+
         /* Clear out reserved bits. */
         ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED;
         edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED;
@@ -1097,23 +1061,12 @@ static void svm_vmexit_do_cpuid(struct v
         clear_bit(X86_FEATURE_SYSCALL & 31, &edx);
 #endif
 
-#if CONFIG_PAGING_LEVELS < 3
-        clear_bit(X86_FEATURE_NX & 31, &edx);
-        clear_bit(X86_FEATURE_PAE, &edx);
-        clear_bit(X86_FEATURE_PSE, &edx);
+
+#if CONFIG_PAGING_LEVELS >= 3
+        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
+#endif
+            clear_bit(X86_FEATURE_PAE, &edx);
         clear_bit(X86_FEATURE_PSE36, &edx);
-#else
-        if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-        {
-            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-            {
-                clear_bit(X86_FEATURE_NX & 31, &edx);
-                clear_bit(X86_FEATURE_PAE, &edx);
-            }
-            clear_bit(X86_FEATURE_PSE, &edx);
-            clear_bit(X86_FEATURE_PSE36, &edx);
-        }
-#endif
 
         /* Make SVM feature invisible to the guest. */
         clear_bit(X86_FEATURE_SVME & 31, &ecx);
@@ -1555,6 +1508,7 @@ static int svm_set_cr0(unsigned long val
     unsigned long mfn;
     int paging_enabled;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    unsigned long old_base_mfn;
   
     ASSERT(vmcb);
 
@@ -1600,54 +1554,21 @@ static int svm_set_cr0(unsigned long val
             set_bit(SVM_CPU_STATE_LMA_ENABLED,
                     &v->arch.hvm_svm.cpu_state);
             vmcb->efer |= (EFER_LMA | EFER_LME);
-            if (!shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
-            {
-                printk("Unsupported guest paging levels\n");
-                domain_crash_synchronous(); /* need to take a clean path */
-            }
-        }
-        else
+        }
 #endif  /* __x86_64__ */
-        {
-#if CONFIG_PAGING_LEVELS >= 3
-            /* seems it's a 32-bit or 32-bit PAE guest */
-            if ( test_bit(SVM_CPU_STATE_PAE_ENABLED,
-                        &v->arch.hvm_svm.cpu_state) )
-            {
-                /* The guest enables PAE first and then it enables PG, it is
-                 * really a PAE guest */
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous();
-                }
-            }
-            else
-            {
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
-            }
-#endif
-        }
 
         /* Now arch.guest_table points to machine physical. */
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
         v->arch.guest_table = pagetable_from_pfn(mfn);
-        update_pagetables(v);
+        if ( old_base_mfn )
+            put_page(mfn_to_page(old_base_mfn));
+        shadow2_update_paging_modes(v);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", 
                 (unsigned long) (mfn << PAGE_SHIFT));
 
+        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
         set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-        vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
-
-        /* arch->shadow_table should hold the next CR3 for shadow */
-        HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx\n", 
-                    v->arch.hvm_svm.cpu_cr3, mfn);
-
-        return 1;
     }
 
     if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled )
@@ -1667,17 +1588,16 @@ static int svm_set_cr0(unsigned long val
             svm_inject_exception(v, TRAP_gp_fault, 1, 0);
             return 0;
         }
-
-        clear_all_shadow_status( v->domain );
+        shadow2_update_paging_modes(v);
+        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
         set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-        vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
     }
     else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
     {
         /* we should take care of this kind of situation */
-        clear_all_shadow_status(v->domain);
+        shadow2_update_paging_modes(v);
+        vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3;
         set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-        vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
     }
 
     return 1;
@@ -1786,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr, 
             mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
             if (mfn != pagetable_get_pfn(v->arch.guest_table))
                 __hvm_bug(regs);
-            shadow_sync_all(v->domain);
+            shadow2_update_cr3(v);
         }
         else 
         {
@@ -1812,14 +1732,10 @@ static int mov_to_cr(int gpreg, int cr, 
             /*
              * arch.shadow_table should now hold the next CR3 for shadow
              */
-#if CONFIG_PAGING_LEVELS >= 3
-            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
-                shadow_sync_all(v->domain);
-#endif
             v->arch.hvm_svm.cpu_cr3 = value;
-            update_pagetables(v);
+            update_cr3(v);
+            vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
             HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
-            vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
         }
         break;
     }
@@ -1838,12 +1754,6 @@ static int mov_to_cr(int gpreg, int cr, 
                 /* The guest is a 32-bit PAE guest. */
 #if CONFIG_PAGING_LEVELS >= 3
                 unsigned long mfn, old_base_mfn;
-
-                if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
 
                 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
                                     v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)) ||
@@ -1853,21 +1763,20 @@ static int mov_to_cr(int gpreg, int cr, 
                     domain_crash_synchronous(); /* need to take a clean path */
                 }
 
-                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-                if ( old_base_mfn )
-                    put_page(mfn_to_page(old_base_mfn));
-
                 /*
                  * Now arch.guest_table points to machine physical.
                  */
 
+                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
                 v->arch.guest_table = pagetable_from_pfn(mfn);
-                update_pagetables(v);
+                if ( old_base_mfn )
+                    put_page(mfn_to_page(old_base_mfn));
+                shadow2_update_paging_modes(v);
 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                             (unsigned long) (mfn << PAGE_SHIFT));
 
-                vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table);
+                vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
 
                 /*
                  * arch->shadow_table should hold the next CR3 for shadow
@@ -1876,33 +1785,6 @@ static int mov_to_cr(int gpreg, int cr, 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, 
                             "Update CR3 value = %lx, mfn = %lx",
                             v->arch.hvm_svm.cpu_cr3, mfn);
-#endif
-            }
-            else
-            {
-                /*  The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
-                if ( (v->domain->arch.ops != NULL) &&
-                        v->domain->arch.ops->guest_paging_levels == PAGING_L2)
-                {
-                    /* Seems the guest first enables PAE without enabling PG,
-                     * it must enable PG after that, and it is a 32-bit PAE
-                     * guest */
-
-                    if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3))
-                    {
-                        printk("Unsupported guest paging levels\n");
-                        domain_crash_synchronous();
-                    }                   
-                }
-                else
-                {
-                    if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4))
-                    {
-                        printk("Unsupported guest paging levels\n");
-                        domain_crash_synchronous();
-                    }
-                }
 #endif
             }
         }
@@ -1926,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr, 
         if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE))
         {
             set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-            shadow_sync_all(v->domain);
+            shadow2_update_paging_modes(v);
         }
         break;
     }
@@ -2267,7 +2149,7 @@ void svm_handle_invlpg(const short invlp
 
     /* Overkill, we may not this */
     set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-    shadow_invlpg(v, g_vaddr);
+    shadow2_invlpg(v, g_vaddr);
 }
 
 
@@ -2638,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned l
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned long gpa;
 
-    gpa = gva_to_gpa( gva );
+    gpa = shadow2_gva_to_gpa(current, gva);
     printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 );
     if( !svm_paging_enabled(v) || mmio_space(gpa) )
        return;
@@ -2662,8 +2544,12 @@ void walk_shadow_and_guest_pt(unsigned l
     __copy_from_user(&gpte, &linear_pg_table[ l1_linear_offset(gva) ],
                      sizeof(gpte) );
     printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) );
-    __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ],
+
+    BUG(); // need to think about this, and convert usage of
+           // phys_to_machine_mapping to use pagetable format...
+    __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ], 
                       sizeof(spte) );
+
     printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte));
 }
 #endif /* SVM_WALK_GUEST_PAGES */
@@ -2704,7 +2590,8 @@ asmlinkage void svm_vmexit_handler(struc
 
     if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) 
     {
-        if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
+        if (svm_paging_enabled(v) && 
+            !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2)))
         {
             printk("I%08ld,ExC=%s(%d),IP=%x:%llx,I1=%llx,I2=%llx,INT=%llx, "
                    "gpa=%llx\n", intercepts_counter,
@@ -2713,7 +2600,7 @@ asmlinkage void svm_vmexit_handler(struc
                    (unsigned long long) vmcb->exitinfo1,
                    (unsigned long long) vmcb->exitinfo2,
                    (unsigned long long) vmcb->exitintinfo.bytes,
-            (unsigned long long) gva_to_gpa( vmcb->exitinfo2 ) );
+            (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2));
         }
         else 
         {
@@ -2757,7 +2644,7 @@ asmlinkage void svm_vmexit_handler(struc
         && ( ( vmcb->exitinfo2 == vmcb->rip )
         || vmcb->exitintinfo.bytes) )
     {
-       if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
+       if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2)))
            walk_shadow_and_guest_pt( vmcb->exitinfo2 );
     }
 #endif
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c       Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/hvm/svm/vmcb.c       Wed Aug 16 17:11:56 2006 +0100
@@ -380,8 +380,8 @@ void svm_do_launch(struct vcpu *v)
         printk("%s: phys_table   = %lx\n", __func__, pt);
     }
 
-    /* At launch we always use the phys_table */
-    vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table);
+    /* Set cr3 from hw_cr3 even when guest-visible paging is not enabled */
+    vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; 
 
     if (svm_dbg_on) 
     {
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/hvm/vlapic.c Wed Aug 16 17:11:56 2006 +0100
@@ -21,7 +21,8 @@
 #include <xen/types.h>
 #include <xen/mm.h>
 #include <xen/xmalloc.h>
-#include <asm/shadow.h>
+#include <xen/shadow.h>
+#include <xen/domain_page.h>
 #include <asm/page.h>
 #include <xen/event.h>
 #include <xen/trace.h>
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Wed Aug 16 17:11:56 2006 +0100
@@ -34,12 +34,8 @@
 #include <asm/flushtlb.h>
 #include <xen/event.h>
 #include <xen/kernel.h>
-#include <asm/shadow.h>
 #include <xen/keyhandler.h>
-
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
 
 static int vmcs_size;
 static int vmcs_order;
@@ -238,7 +234,7 @@ static void vmx_set_host_env(struct vcpu
 
 static void vmx_do_launch(struct vcpu *v)
 {
-/* Update CR3, GDT, LDT, TR */
+/* Update CR3, CR0, CR4, GDT, LDT, TR */
     unsigned int  error = 0;
     unsigned long cr0, cr4;
 
@@ -276,8 +272,11 @@ static void vmx_do_launch(struct vcpu *v
     error |= __vmwrite(GUEST_TR_BASE, 0);
     error |= __vmwrite(GUEST_TR_LIMIT, 0xff);
 
-    __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
-    __vmwrite(HOST_CR3, pagetable_get_paddr(v->arch.monitor_table));
+    shadow2_update_paging_modes(v);
+    printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n",
+           __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3);
+    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+    __vmwrite(HOST_CR3, v->arch.cr3);
 
     v->arch.schedule_tail = arch_vmx_do_resume;
 
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Aug 16 17:11:56 2006 +0100
@@ -26,9 +26,9 @@
 #include <xen/softirq.h>
 #include <xen/domain_page.h>
 #include <xen/hypercall.h>
+#include <xen/perfc.h>
 #include <asm/current.h>
 #include <asm/io.h>
-#include <asm/shadow.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -40,10 +40,7 @@
 #include <asm/hvm/vmx/vmx.h>
 #include <asm/hvm/vmx/vmcs.h>
 #include <asm/hvm/vmx/cpu.h>
-#include <asm/shadow.h>
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/shadow_64.h>
-#endif
+#include <asm/shadow2.h>
 #include <public/sched.h>
 #include <public/hvm/ioreq.h>
 #include <asm/hvm/vpic.h>
@@ -69,11 +66,16 @@ static int vmx_initialize_guest_resource
     if ( v->vcpu_id != 0 )
         return 1;
 
+    if ( !shadow2_mode_external(d) )
+    {
+        DPRINTK("Can't init HVM for dom %u vcpu %u: "
+                "not in shadow2 external mode\n", 
+                d->domain_id, v->vcpu_id);
+        domain_crash(d);
+    }
+
     for_each_vcpu ( d, vc )
     {
-        /* Initialize monitor page table */
-        vc->arch.monitor_table = pagetable_null();
-
         memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct));
 
         if ( (rc = vmx_create_vmcs(vc)) != 0 )
@@ -107,6 +109,7 @@ static int vmx_initialize_guest_resource
 
         vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a;
         vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b;
+
     }
 
     /*
@@ -116,11 +119,6 @@ static int vmx_initialize_guest_resource
     memset(&d->shared_info->evtchn_mask[0], 0xff,
            sizeof(d->shared_info->evtchn_mask));
 
-    /* Put the domain in shadow mode even though we're going to be using
-     * the shared 1:1 page table initially. It shouldn't hurt */
-    shadow_mode_enable(
-        d, SHM_enable|SHM_refcounts|SHM_translate|SHM_external|SHM_wr_pt_pte);
-
     return 1;
 }
 
@@ -133,7 +131,6 @@ static void vmx_relinquish_guest_resourc
         vmx_destroy_vmcs(v);
         if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
             continue;
-        free_monitor_pagetable(v);
         kill_timer(&v->arch.hvm_vcpu.hlt_timer);
         if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) )
         {
@@ -153,8 +150,6 @@ static void vmx_relinquish_guest_resourc
 
     if ( d->arch.hvm_domain.buffered_io_va )
         unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va);
-
-    shadow_direct_map_clean(d);
 }
 
 #ifdef __x86_64__
@@ -595,14 +590,6 @@ static void vmx_load_cpu_guest_regs(stru
     vmx_vmcs_exit(v);
 }
 
-static int vmx_realmode(struct vcpu *v)
-{
-    unsigned long rflags;
-
-    __vmread(GUEST_RFLAGS, &rflags);
-    return rflags & X86_EFLAGS_VM;
-}
-
 static int vmx_instruction_length(struct vcpu *v)
 {
     unsigned long inst_len;
@@ -622,6 +609,8 @@ static unsigned long vmx_get_ctrl_reg(st
         return v->arch.hvm_vmx.cpu_cr2;
     case 3:
         return v->arch.hvm_vmx.cpu_cr3;
+    case 4:
+        return v->arch.hvm_vmx.cpu_shadow_cr4;
     default:
         BUG();
     }
@@ -753,8 +742,12 @@ static void vmx_setup_hvm_funcs(void)
 
     hvm_funcs.realmode = vmx_realmode;
     hvm_funcs.paging_enabled = vmx_paging_enabled;
+    hvm_funcs.long_mode_enabled = vmx_long_mode_enabled;
+    hvm_funcs.guest_x86_mode = vmx_guest_x86_mode;
     hvm_funcs.instruction_length = vmx_instruction_length;
     hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
+
+    hvm_funcs.update_host_cr3 = vmx_update_host_cr3;
 
     hvm_funcs.stts = vmx_stts;
     hvm_funcs.set_tsc_offset = vmx_set_tsc_offset;
@@ -855,53 +848,25 @@ static void inline __update_guest_eip(un
     __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
 }
 
-
 static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
 {
-    unsigned long gpa; /* FIXME: PAE */
     int result;
 
 #if 0 /* keep for debugging */
     {
-        unsigned long eip;
-
+        unsigned long eip, cs;
+
+        __vmread(GUEST_CS_BASE, &cs);
         __vmread(GUEST_RIP, &eip);
         HVM_DBG_LOG(DBG_LEVEL_VMMU,
-                    "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx",
-                    va, eip, (unsigned long)regs->error_code);
+                    "vmx_do_page_fault = 0x%lx, cs_base=%lx, "
+                    "eip = %lx, error_code = %lx\n",
+                    va, cs, eip, (unsigned long)regs->error_code);
     }
 #endif
 
-    if ( !vmx_paging_enabled(current) )
-    {
-        /* construct 1-to-1 direct mapping */
-        if ( shadow_direct_map_fault(va, regs) ) 
-            return 1;
-
-        handle_mmio(va, va);
-        TRACE_VMEXIT (2,2);
-        return 1;
-    }
-    gpa = gva_to_gpa(va);
-
-    /* Use 1:1 page table to identify MMIO address space */
-    if ( mmio_space(gpa) ){
-        struct vcpu *v = current;
-        /* No support for APIC */
-        if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) { 
-            u32 inst_len;
-            __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len));
-            __update_guest_eip(inst_len);
-            return 1;
-        }
-        TRACE_VMEXIT (2,2);
-        /* in the case of MMIO, we are more interested in gpa than in va */
-        TRACE_VMEXIT (4,gpa);
-        handle_mmio(va, gpa);
-        return 1;
-    }
-
-    result = shadow_fault(va, regs);
+    result = shadow2_fault(va, regs);
+
     TRACE_VMEXIT (2,result);
 #if 0
     if ( !result )
@@ -972,23 +937,11 @@ static void vmx_vmexit_do_cpuid(struct c
                 clear_bit(X86_FEATURE_APIC, &edx);
             }
     
-#if CONFIG_PAGING_LEVELS < 3
-            edx &= ~(bitmaskof(X86_FEATURE_PAE)  |
-                     bitmaskof(X86_FEATURE_PSE)  |
-                     bitmaskof(X86_FEATURE_PSE36));
-#else
-            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-            {
-                if ( v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-                    clear_bit(X86_FEATURE_PSE36, &edx);
-                else
-                {
-                    clear_bit(X86_FEATURE_PAE, &edx);
-                    clear_bit(X86_FEATURE_PSE, &edx);
-                    clear_bit(X86_FEATURE_PSE36, &edx);
-                }
-            }
+#if CONFIG_PAGING_LEVELS >= 3
+            if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
 #endif
+                clear_bit(X86_FEATURE_PAE, &edx);
+            clear_bit(X86_FEATURE_PSE36, &edx);
 
             ebx &= NUM_THREADS_RESET_MASK;  
 
@@ -1086,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigne
      * We do the safest things first, then try to update the shadow
      * copying from guest
      */
-    shadow_invlpg(v, va);
+    shadow2_invlpg(v, va);
 }
 
 
@@ -1307,11 +1260,8 @@ vmx_world_restore(struct vcpu *v, struct
 
     error |= __vmwrite(CR0_READ_SHADOW, c->cr0);
 
-    if (!vmx_paging_enabled(v)) {
-        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+    if (!vmx_paging_enabled(v))
         goto skip_cr3;
-    }
 
     if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) {
         /*
@@ -1325,7 +1275,6 @@ vmx_world_restore(struct vcpu *v, struct
             domain_crash_synchronous();
             return 0;
         }
-        shadow_sync_all(v->domain);
     } else {
         /*
          * If different, make a shadow. Check if the PDBR is valid
@@ -1348,12 +1297,16 @@ vmx_world_restore(struct vcpu *v, struct
          * arch.shadow_table should now hold the next CR3 for shadow
          */
         v->arch.hvm_vmx.cpu_cr3 = c->cr3;
-        update_pagetables(v);
+    }
+
+ skip_cr3:
+
+    shadow2_update_paging_modes(v);
+    if (!vmx_paging_enabled(v))
+        HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table");
+    else
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3);
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
-    }
-
- skip_cr3:
+    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
 
     error |= __vmread(CR4_READ_SHADOW, &old_cr4);
     error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK));
@@ -1485,6 +1438,7 @@ static int vmx_set_cr0(unsigned long val
     int paging_enabled;
     unsigned long vm_entry_value;
     unsigned long old_cr0;
+    unsigned long old_base_mfn;
 
     /*
      * CR0: We don't want to lose PE and PG.
@@ -1514,7 +1468,8 @@ static int vmx_set_cr0(unsigned long val
             v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
              !get_page(mfn_to_page(mfn), v->domain) )
         {
-            printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3);
+            printk("Invalid CR3 value = %lx (mfn=%lx)\n", 
+                   v->arch.hvm_vmx.cpu_cr3, mfn);
             domain_crash_synchronous(); /* need to take a clean path */
         }
 
@@ -1539,51 +1494,22 @@ static int vmx_set_cr0(unsigned long val
             __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
             vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
             __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
-
-            if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) )
-            {
-                printk("Unsupported guest paging levels\n");
-                domain_crash_synchronous(); /* need to take a clean path */
-            }
-        }
-        else
-#endif  /* __x86_64__ */
-        {
-#if CONFIG_PAGING_LEVELS >= 3
-            /* seems it's a 32-bit or 32-bit PAE guest */
-
-            if ( test_bit(VMX_CPU_STATE_PAE_ENABLED,
-                        &v->arch.hvm_vmx.cpu_state) )
-            {
-                /* The guest enables PAE first and then it enables PG, it is
-                 * really a PAE guest */
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous();
-                }
-            }
-            else
-            {
-                if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
-            }
+        }
 #endif
-        }
 
         /*
          * Now arch.guest_table points to machine physical.
          */
+        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
         v->arch.guest_table = pagetable_from_pfn(mfn);
-        update_pagetables(v);
+        if (old_base_mfn)
+            put_page(mfn_to_page(old_base_mfn));
+        shadow2_update_paging_modes(v);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                     (unsigned long) (mfn << PAGE_SHIFT));
 
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+        __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
         /*
          * arch->shadow_table should hold the next CR3 for shadow
          */
@@ -1625,7 +1551,6 @@ static int vmx_set_cr0(unsigned long val
             }
         }
 
-        clear_all_shadow_status(v->domain);
         if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) {
             set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state);
             __vmread(GUEST_RIP, &eip);
@@ -1651,9 +1576,8 @@ static int vmx_set_cr0(unsigned long val
     }
     else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE )
     {
-        /* we should take care of this kind of situation */
-        clear_all_shadow_status(v->domain);
-        __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table));
+        __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
+        shadow2_update_paging_modes(v);
     }
 
     return 1;
@@ -1738,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, str
             mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT);
             if (mfn != pagetable_get_pfn(v->arch.guest_table))
                 __hvm_bug(regs);
-            shadow_sync_all(v->domain);
+            shadow2_update_cr3(v);
         } else {
             /*
              * If different, make a shadow. Check if the PDBR is valid
@@ -1759,16 +1683,11 @@ static int mov_to_cr(int gp, int cr, str
             /*
              * arch.shadow_table should now hold the next CR3 for shadow
              */
-#if CONFIG_PAGING_LEVELS >= 3
-            if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 )
-                shadow_sync_all(v->domain);
-#endif
-
             v->arch.hvm_vmx.cpu_cr3 = value;
-            update_pagetables(v);
+            update_cr3(v);
             HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx",
                         value);
-            __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+            __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
         }
         break;
     }
@@ -1785,12 +1704,6 @@ static int mov_to_cr(int gp, int cr, str
                 /* The guest is a 32-bit PAE guest. */
 #if CONFIG_PAGING_LEVELS >= 3
                 unsigned long mfn, old_base_mfn;
-
-                if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) )
-                {
-                    printk("Unsupported guest paging levels\n");
-                    domain_crash_synchronous(); /* need to take a clean path */
-                }
 
                 if ( !VALID_MFN(mfn = get_mfn_from_gpfn(
                                     v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) ||
@@ -1800,21 +1713,20 @@ static int mov_to_cr(int gp, int cr, str
                     domain_crash_synchronous(); /* need to take a clean path */
                 }
 
-                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-                if ( old_base_mfn )
-                    put_page(mfn_to_page(old_base_mfn));
 
                 /*
                  * Now arch.guest_table points to machine physical.
                  */
 
+                old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
                 v->arch.guest_table = pagetable_from_pfn(mfn);
-                update_pagetables(v);
+                if ( old_base_mfn )
+                    put_page(mfn_to_page(old_base_mfn));
 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx",
                             (unsigned long) (mfn << PAGE_SHIFT));
 
-                __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table));
+                __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);
 
                 /*
                  * arch->shadow_table should hold the next CR3 for shadow
@@ -1822,27 +1734,6 @@ static int mov_to_cr(int gp, int cr, str
 
                 HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx",
                             v->arch.hvm_vmx.cpu_cr3, mfn);
-#endif
-            }
-            else
-            {
-                /*  The guest is a 64 bit or 32-bit PAE guest. */
-#if CONFIG_PAGING_LEVELS >= 3
-                if ( (v->domain->arch.ops != NULL) &&
-                        v->domain->arch.ops->guest_paging_levels == PAGING_L2)
-                {
-                    /* Seems the guest first enables PAE without enabling PG,
-                     * it must enable PG after that, and it is a 32-bit PAE
-                     * guest */
-
-                    if ( !shadow_set_guest_paging_levels(v->domain,
-                                                            PAGING_L3) )
-                    {
-                        printk("Unsupported guest paging levels\n");
-                        /* need to take a clean path */
-                        domain_crash_synchronous();
-                    }
-                }
 #endif
             }
         }
@@ -1864,8 +1755,7 @@ static int mov_to_cr(int gp, int cr, str
          * all TLB entries except global entries.
          */
         if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
-            shadow_sync_all(v->domain);
-
+            shadow2_update_paging_modes(v);
         break;
     }
     default:
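
The vmx.c hunks above all follow the same pattern when the guest loads a new
CR3: install the new top-level table in v->arch.guest_table, drop the
reference on the previous base MFN, let shadow2 recompute the paging mode,
and only then program GUEST_CR3 from the cached v->arch.hvm_vcpu.hw_cr3
rather than reading v->arch.shadow_table directly. A minimal sketch of that
sequence, assuming the helpers already used in the hunks
(example_switch_guest_cr3 itself is hypothetical):

/* Sketch of the CR3-switch pattern used throughout the vmx.c changes above.
 * Not part of the patch; assumes the helpers visible in the hunks. */
static void example_switch_guest_cr3(struct vcpu *v, unsigned long mfn)
{
    unsigned long old_base_mfn = pagetable_get_pfn(v->arch.guest_table);

    v->arch.guest_table = pagetable_from_pfn(mfn);    /* new guest top level */
    if ( old_base_mfn )
        put_page(mfn_to_page(old_base_mfn));          /* drop the old reference */

    shadow2_update_paging_modes(v);                   /* refreshes hw_cr3 */
    __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3);    /* load shadow2's choice */
}
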
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/mm.c Wed Aug 16 17:11:56 2006 +0100
@@ -137,7 +137,7 @@ static void free_l1_table(struct page_in
 
 static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long,
                         unsigned long type);
-static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
+static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn);
 
 /* Used to defer flushing of memory structures. */
 struct percpu_mm_info {
@@ -274,9 +274,9 @@ void share_xen_page_with_privileged_gues
 #else
 /*
  * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
- * We cannot safely shadow the idle page table, nor shadow-mode page tables
+ * We cannot safely shadow the idle page table, nor shadow (v1) page tables
  * (detected by lack of an owning domain). As required for correctness, we
- * always shadow PDPTs aboive 4GB.
+ * always shadow PDPTs above 4GB.
  */
 #define l3tab_needs_shadow(mfn)                         \
     (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
@@ -297,17 +297,21 @@ static int __init cache_pae_fixmap_addre
 }
 __initcall(cache_pae_fixmap_address);
 
-static void __write_ptbase(unsigned long mfn)
+static DEFINE_PER_CPU(u32, make_cr3_timestamp);
+
+void make_cr3(struct vcpu *v, unsigned long mfn)
+/* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
+ * necessary, and sets v->arch.cr3 to the value to load in CR3. */
 {
     l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
-    struct pae_l3_cache *cache = &current->arch.pae_l3_cache;
+    struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
     unsigned int cpu = smp_processor_id();
 
-    /* Fast path 1: does this mfn need a shadow at all? */
+    /* Fast path: does this mfn need a shadow at all? */
     if ( !l3tab_needs_shadow(mfn) )
     {
-        write_cr3(mfn << PAGE_SHIFT);
-        /* Cache is no longer in use or valid (/after/ write to %cr3). */
+        v->arch.cr3 = mfn << PAGE_SHIFT;
+        /* Cache is no longer in use or valid */
         cache->high_mfn = 0;
         return;
     }
@@ -315,13 +319,6 @@ static void __write_ptbase(unsigned long
     /* Caching logic is not interrupt safe. */
     ASSERT(!in_irq());
 
-    /* Fast path 2: is this mfn already cached? */
-    if ( cache->high_mfn == mfn )
-    {
-        write_cr3(__pa(cache->table[cache->inuse_idx]));
-        return;
-    }
-
     /* Protects against pae_flush_pgd(). */
     spin_lock(&cache->lock);
 
@@ -330,29 +327,33 @@ static void __write_ptbase(unsigned long
 
     /* Map the guest L3 table and copy to the chosen low-memory cache. */
     *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
+    /* First check the previous high mapping can't be in the TLB. 
+     * (i.e. have we loaded CR3 since we last did this?) */
+    if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
+        local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
     highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
     lowmem_l3tab  = cache->table[cache->inuse_idx];
     memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
     *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
-
-    /* Install the low-memory L3 table in CR3. */
-    write_cr3(__pa(lowmem_l3tab));
+    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
+
+    v->arch.cr3 = __pa(lowmem_l3tab);
 
     spin_unlock(&cache->lock);
 }
 
 #else /* !CONFIG_X86_PAE */
 
-static void __write_ptbase(unsigned long mfn)
-{
-    write_cr3(mfn << PAGE_SHIFT);
+void make_cr3(struct vcpu *v, unsigned long mfn)
+{
+    v->arch.cr3 = mfn << PAGE_SHIFT;
 }
 
 #endif /* !CONFIG_X86_PAE */
 
 void write_ptbase(struct vcpu *v)
 {
-    __write_ptbase(pagetable_get_pfn(v->arch.monitor_table));
+    write_cr3(v->arch.cr3);
 }
 
 void invalidate_shadow_ldt(struct vcpu *v)
@@ -423,8 +424,6 @@ int map_ldt_shadow_page(unsigned int off
 
     BUG_ON(unlikely(in_irq()));
 
-    shadow_sync_va(v, gva);
-
     TOGGLE_MODE();
     __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)],
                      sizeof(l1e));
@@ -440,12 +439,12 @@ int map_ldt_shadow_page(unsigned int off
 
     res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
 
-    if ( !res && unlikely(shadow_mode_refcounts(d)) )
-    {
-        shadow_lock(d);
-        shadow_remove_all_write_access(d, gmfn, mfn);
+    if ( !res && unlikely(shadow2_mode_refcounts(d)) )
+    {
+        shadow2_lock(d);
+        shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0);
         res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page);
-        shadow_unlock(d);
+        shadow2_unlock(d);
     }
 
     if ( unlikely(!res) )
@@ -513,7 +512,7 @@ get_linear_pagetable(
     struct page_info *page;
     unsigned long pfn;
 
-    ASSERT( !shadow_mode_refcounts(d) );
+    ASSERT( !shadow2_mode_refcounts(d) );
 
     if ( (root_get_flags(re) & _PAGE_RW) )
     {
@@ -576,7 +575,8 @@ get_page_from_l1e(
 
         if ( !iomem_access_permitted(d, mfn, mfn) )
         {
-            MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn);
+            MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx", 
+                    d->domain_id, mfn);
             return 0;
         }
 
@@ -587,9 +587,14 @@ get_page_from_l1e(
         d = dom_io;
     }
 
-    okay = ((l1e_get_flags(l1e) & _PAGE_RW) ?
-            get_page_and_type(page, d, PGT_writable_page) :
-            get_page(page, d));
+    /* Foreign mappings into guests in shadow2 external mode don't
+     * contribute to writeable mapping refcounts.  (This allows the
+     * qemu-dm helper process in dom0 to map the domain's memory without
+     * messing up the count of "real" writable mappings.) */
+    okay = (((l1e_get_flags(l1e) & _PAGE_RW) && 
+             !(unlikely(shadow2_mode_external(d) && (d != current->domain))))
+            ? get_page_and_type(page, d, PGT_writable_page)
+            : get_page(page, d));
     if ( !okay )
     {
         MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
@@ -609,8 +614,6 @@ get_page_from_l2e(
     struct domain *d, unsigned long vaddr)
 {
     int rc;
-
-    ASSERT(!shadow_mode_refcounts(d));
 
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
         return 1;
@@ -641,8 +644,6 @@ get_page_from_l3e(
 {
     int rc;
 
-    ASSERT(!shadow_mode_refcounts(d));
-
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         return 1;
 
@@ -668,8 +669,6 @@ get_page_from_l4e(
     struct domain *d, unsigned long vaddr)
 {
     int rc;
-
-    ASSERT( !shadow_mode_refcounts(d) );
 
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return 1;
@@ -727,7 +726,10 @@ void put_page_from_l1e(l1_pgentry_t l1e,
         domain_crash(d);
     }
 
-    if ( l1e_get_flags(l1e) & _PAGE_RW )
+    /* Remember we didn't take a type-count of foreign writable mappings
+     * to shadow2 external domains */
+    if ( (l1e_get_flags(l1e) & _PAGE_RW) && 
+         !(unlikely((e != d) && shadow2_mode_external(e))) )
     {
         put_page_and_type(page);
     }
@@ -784,7 +786,7 @@ static int alloc_l1_table(struct page_in
     l1_pgentry_t  *pl1e;
     int            i;
 
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
 
     pl1e = map_domain_page(pfn);
 
@@ -832,6 +834,8 @@ static int create_pae_xen_mappings(l3_pg
      *  2. Cannot appear in another page table's L3:
      *     a. alloc_l3_table() calls this function and this check will fail
      *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
+     *
+     * XXX -- this needs revisiting for shadow2_mode_refcount()==true...
      */
     page = l3e_get_page(l3e3);
     BUG_ON(page->u.inuse.type_info & PGT_pinned);
@@ -955,11 +959,7 @@ static int alloc_l2_table(struct page_in
     l2_pgentry_t  *pl2e;
     int            i;
 
-    /* See the code in shadow_promote() to understand why this is here. */
-    if ( (PGT_base_page_table == PGT_l2_page_table) &&
-         unlikely(shadow_mode_refcounts(d)) )
-        return 1;
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
     
     pl2e = map_domain_page(pfn);
 
@@ -1009,11 +1009,7 @@ static int alloc_l3_table(struct page_in
     l3_pgentry_t  *pl3e;
     int            i;
 
-    /* See the code in shadow_promote() to understand why this is here. */
-    if ( (PGT_base_page_table == PGT_l3_page_table) &&
-         shadow_mode_refcounts(d) )
-        return 1;
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
 
 #ifdef CONFIG_X86_PAE
     /*
@@ -1072,11 +1068,7 @@ static int alloc_l4_table(struct page_in
     unsigned long vaddr;
     int            i;
 
-    /* See the code in shadow_promote() to understand why this is here. */
-    if ( (PGT_base_page_table == PGT_l4_page_table) &&
-         shadow_mode_refcounts(d) )
-        return 1;
-    ASSERT(!shadow_mode_refcounts(d));
+    ASSERT(!shadow2_mode_refcounts(d));
 
     for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
     {
@@ -1183,51 +1175,61 @@ static void free_l4_table(struct page_in
 
 static inline int update_l1e(l1_pgentry_t *pl1e, 
                              l1_pgentry_t  ol1e, 
-                             l1_pgentry_t  nl1e)
-{
+                             l1_pgentry_t  nl1e,
+                             unsigned long gl1mfn,
+                             struct vcpu *v)
+{
+    int rv = 1;
+    if ( unlikely(shadow2_mode_enabled(v->domain)) )
+        shadow2_lock(v->domain);
 #ifndef PTE_UPDATE_WITH_CMPXCHG
-    return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));
+    rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e)));
 #else
-    intpte_t o = l1e_get_intpte(ol1e);
-    intpte_t n = l1e_get_intpte(nl1e);
-
-    for ( ; ; )
-    {
-        if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
-        {
-            MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
-                    ": saw %" PRIpte,
-                    l1e_get_intpte(ol1e),
-                    l1e_get_intpte(nl1e),
-                    o);
-            return 0;
-        }
-
-        if ( o == l1e_get_intpte(ol1e) )
-            break;
-
-        /* Allowed to change in Accessed/Dirty flags only. */
-        BUG_ON((o ^ l1e_get_intpte(ol1e)) &
-               ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
-        ol1e = l1e_from_intpte(o);
-    }
-
-    return 1;
+    {
+        intpte_t o = l1e_get_intpte(ol1e);
+        intpte_t n = l1e_get_intpte(nl1e);
+        
+        for ( ; ; )
+        {
+            if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
+            {
+                MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte
+                        ": saw %" PRIpte,
+                        l1e_get_intpte(ol1e),
+                        l1e_get_intpte(nl1e),
+                        o);
+                rv = 0;
+                break;
+            }
+
+            if ( o == l1e_get_intpte(ol1e) )
+                break;
+
+            /* Allowed to change in Accessed/Dirty flags only. */
+            BUG_ON((o ^ l1e_get_intpte(ol1e)) &
+                   ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY));
+            ol1e = l1e_from_intpte(o);
+        }
+    }
 #endif
+    if ( unlikely(shadow2_mode_enabled(v->domain)) )
+    {
+        shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e);
+        shadow2_unlock(v->domain);    
+    }
+    return rv;
 }
 
 
 /* Update the L1 entry at pl1e to new value nl1e. */
-static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, 
+                        unsigned long gl1mfn)
 {
     l1_pgentry_t ol1e;
     struct domain *d = current->domain;
 
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
         return 0;
-
-    if ( unlikely(shadow_mode_refcounts(d)) )
-        return update_l1e(pl1e, ol1e, nl1e);
 
     if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
     {
@@ -1239,13 +1241,13 @@ static int mod_l1_entry(l1_pgentry_t *pl
         }
 
         /* Fast path for identical mapping, r/w and presence. */
-        if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
-            return update_l1e(pl1e, ol1e, nl1e);
+        if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) )
+            return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current);
 
         if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) )
             return 0;
         
-        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
         {
             put_page_from_l1e(nl1e, d);
             return 0;
@@ -1253,7 +1255,7 @@ static int mod_l1_entry(l1_pgentry_t *pl
     }
     else
     {
-        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) )
             return 0;
     }
 
@@ -1262,9 +1264,9 @@ static int mod_l1_entry(l1_pgentry_t *pl
 }
 
 #ifndef PTE_UPDATE_WITH_CMPXCHG
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; })
 #else
-#define UPDATE_ENTRY(_t,_p,_o,_n) ({                            \
+#define _UPDATE_ENTRY(_t,_p,_o,_n) ({                            \
     for ( ; ; )                                                 \
     {                                                           \
         intpte_t __o = cmpxchg((intpte_t *)(_p),                \
@@ -1279,6 +1281,18 @@ static int mod_l1_entry(l1_pgentry_t *pl
     }                                                           \
     1; })
 #endif
+#define UPDATE_ENTRY(_t,_p,_o,_n,_m)  ({                            \
+    int rv;                                                         \
+    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
+        shadow2_lock(current->domain);                              \
+    rv = _UPDATE_ENTRY(_t, _p, _o, _n);                             \
+    if ( unlikely(shadow2_mode_enabled(current->domain)) )          \
+    {                                                               \
+        shadow2_validate_guest_entry(current, _mfn(_m), (_p));      \
+        shadow2_unlock(current->domain);                            \
+    }                                                               \
+    rv;                                                             \
+})
 
 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
 static int mod_l2_entry(l2_pgentry_t *pl2e, 
@@ -1309,19 +1323,19 @@ static int mod_l2_entry(l2_pgentry_t *pl
 
         /* Fast path for identical mapping and presence. */
         if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
+            return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn);
 
         if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) ||
              unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) )
             return 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
         {
             put_page_from_l2e(nl2e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) )
+    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) )
     {
         return 0;
     }
@@ -1329,7 +1343,6 @@ static int mod_l2_entry(l2_pgentry_t *pl
     put_page_from_l2e(ol2e, pfn);
     return 1;
 }
-
 
 #if CONFIG_PAGING_LEVELS >= 3
 
@@ -1356,7 +1369,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
      */
     if ( pgentry_ptr_to_slot(pl3e) >= 3 )
         return 0;
-#endif
+#endif 
 
     if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
         return 0;
@@ -1372,7 +1385,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
 
         /* Fast path for identical mapping and presence. */
         if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e);
+            return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn);
 
 #if CONFIG_PAGING_LEVELS >= 4
         if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) ||
@@ -1383,15 +1396,15 @@ static int mod_l3_entry(l3_pgentry_t *pl
             << L3_PAGETABLE_SHIFT;
         if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) )
             return 0;
-#endif
-
-        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+#endif 
+
+        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
         {
             put_page_from_l3e(nl3e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) )
+    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) )
     {
         return 0;
     }
@@ -1438,19 +1451,19 @@ static int mod_l4_entry(l4_pgentry_t *pl
 
         /* Fast path for identical mapping and presence. */
         if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
-            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e);
+            return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn);
 
         if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) ||
              unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) )
             return 0;
 
-        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
         {
             put_page_from_l4e(nl4e, pfn);
             return 0;
         }
     }
-    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) )
+    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) )
     {
         return 0;
     }
@@ -1506,18 +1519,21 @@ void free_page_type(struct page_info *pa
          */
         this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS;
 
-        if ( unlikely(shadow_mode_enabled(owner)) )
+        if ( unlikely(shadow2_mode_enabled(owner)
+                 && !shadow2_lock_is_acquired(owner)) )
         {
             /* Raw page tables are rewritten during save/restore. */
-            if ( !shadow_mode_translate(owner) )
+            if ( !shadow2_mode_translate(owner) )
                 mark_dirty(owner, page_to_mfn(page));
 
-            if ( shadow_mode_refcounts(owner) )
+            if ( shadow2_mode_refcounts(owner) )
                 return;
 
             gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
             ASSERT(VALID_M2P(gmfn));
-            remove_shadow(owner, gmfn, type & PGT_type_mask);
+            shadow2_lock(owner);
+            shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn));
+            shadow2_unlock(owner);
         }
     }
 
@@ -1573,9 +1589,6 @@ void put_page_type(struct page_info *pag
 
         if ( unlikely((nx & PGT_count_mask) == 0) )
         {
-            /* Record TLB information for flush later. Races are harmless. */
-            page->tlbflush_timestamp = tlbflush_current_time();
-            
             if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
                  likely(nx & PGT_validated) )
             {
@@ -1593,6 +1606,9 @@ void put_page_type(struct page_info *pag
                 x  &= ~PGT_validated;
                 nx &= ~PGT_validated;
             }
+
+            /* Record TLB information for flush later. */
+            page->tlbflush_timestamp = tlbflush_current_time();
         }
         else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) == 
                            (PGT_pinned|PGT_l1_page_table|1)) )
@@ -1682,7 +1698,7 @@ int get_page_type(struct page_info *page
 #endif
                     /* Fixme: add code to propagate va_unknown to subtables. */
                     if ( ((type & PGT_type_mask) >= PGT_l2_page_table) &&
-                         !shadow_mode_refcounts(page_get_owner(page)) )
+                         !shadow2_mode_refcounts(page_get_owner(page)) )
                         return 0;
                     /* This table is possibly mapped at multiple locations. */
                     nx &= ~PGT_va_mask;
@@ -1729,7 +1745,10 @@ int new_guest_cr3(unsigned long mfn)
     int okay;
     unsigned long old_base_mfn;
 
-    if ( shadow_mode_refcounts(d) )
+    if ( hvm_guest(v) && !hvm_paging_enabled(v) )
+        domain_crash_synchronous();
+
+    if ( shadow2_mode_refcounts(d) )
     {
         okay = get_page_from_pagenr(mfn, d);
         if ( unlikely(!okay) )
@@ -1747,7 +1766,7 @@ int new_guest_cr3(unsigned long mfn)
             MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn);
             old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
             v->arch.guest_table = pagetable_null();
-            update_pagetables(v);
+            update_cr3(v);
             write_cr3(__pa(idle_pg_table));
             if ( old_base_mfn != 0 )
                 put_page_and_type(mfn_to_page(old_base_mfn));
@@ -1769,30 +1788,20 @@ int new_guest_cr3(unsigned long mfn)
     invalidate_shadow_ldt(v);
 
     old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+
     v->arch.guest_table = pagetable_from_pfn(mfn);
-    update_pagetables(v); /* update shadow_table and monitor_table */
+    update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */
 
     write_ptbase(v);
 
     if ( likely(old_base_mfn != 0) )
     {
-        if ( shadow_mode_refcounts(d) )
+        if ( shadow2_mode_refcounts(d) )
             put_page(mfn_to_page(old_base_mfn));
         else
             put_page_and_type(mfn_to_page(old_base_mfn));
     }
 
-    /* CR3 also holds a ref to its shadow... */
-    if ( shadow_mode_enabled(d) )
-    {
-        if ( v->arch.monitor_shadow_ref )
-            put_shadow_ref(v->arch.monitor_shadow_ref);
-        v->arch.monitor_shadow_ref =
-            pagetable_get_pfn(v->arch.monitor_table);
-        ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
-        get_shadow_ref(v->arch.monitor_shadow_ref);
-    }
-
     return 1;
 }
 
@@ -1807,8 +1816,6 @@ static void process_deferred_ops(void)
 
     if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) )
     {
-        if ( shadow_mode_enabled(d) )
-            shadow_sync_all(d);
         if ( deferred_ops & DOP_FLUSH_ALL_TLBS )
             flush_tlb_mask(d->domain_dirty_cpumask);
         else
@@ -1974,7 +1981,7 @@ int do_mmuext_op(
             type = PGT_root_page_table;
 
         pin_page:
-            if ( shadow_mode_refcounts(FOREIGNDOM) )
+            if ( shadow2_mode_refcounts(FOREIGNDOM) )
                 break;
 
             okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
@@ -1996,7 +2003,7 @@ int do_mmuext_op(
             break;
 
         case MMUEXT_UNPIN_TABLE:
-            if ( shadow_mode_refcounts(d) )
+            if ( shadow2_mode_refcounts(d) )
                 break;
 
             if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) )
@@ -2009,6 +2016,12 @@ int do_mmuext_op(
             {
                 put_page_and_type(page);
                 put_page(page);
+                if ( shadow2_mode_enabled(d) )
+                {
+                    shadow2_lock(d);
+                    shadow2_remove_all_shadows(v, _mfn(mfn));
+                    shadow2_unlock(d);
+                }
             }
             else
             {
@@ -2050,9 +2063,9 @@ int do_mmuext_op(
             break;
     
         case MMUEXT_INVLPG_LOCAL:
-            if ( shadow_mode_enabled(d) )
-                shadow_invlpg(v, op.arg1.linear_addr);
-            local_flush_tlb_one(op.arg1.linear_addr);
+            if ( !shadow2_mode_enabled(d) 
+                 || shadow2_invlpg(v, op.arg1.linear_addr) != 0 )
+                local_flush_tlb_one(op.arg1.linear_addr);
             break;
 
         case MMUEXT_TLB_FLUSH_MULTI:
@@ -2098,7 +2111,7 @@ int do_mmuext_op(
             unsigned long ptr  = op.arg1.linear_addr;
             unsigned long ents = op.arg2.nr_ents;
 
-            if ( shadow_mode_external(d) )
+            if ( shadow2_mode_external(d) )
             {
                 MEM_LOG("ignoring SET_LDT hypercall from external "
                         "domain %u", d->domain_id);
@@ -2171,9 +2184,6 @@ int do_mmu_update(
 
     LOCK_BIGLOCK(d);
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(v, "pre-mmu"); /* debug */
-
     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
     {
         count &= ~MMU_UPDATE_PREEMPTED;
@@ -2248,7 +2258,12 @@ int do_mmu_update(
             case PGT_l3_page_table:
             case PGT_l4_page_table:
             {
-                ASSERT(!shadow_mode_refcounts(d));
+                if ( shadow2_mode_refcounts(d) )
+                {
+                    DPRINTK("mmu update on shadow-refcounted domain!");
+                    break;
+                }
+
                 if ( unlikely(!get_page_type(
                     page, type_info & (PGT_type_mask|PGT_va_mask))) )
                     goto not_a_pt;
@@ -2258,10 +2273,7 @@ int do_mmu_update(
                 case PGT_l1_page_table:
                 {
                     l1_pgentry_t l1e = l1e_from_intpte(req.val);
-                    okay = mod_l1_entry(va, l1e);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l1_normal_pt_update(
-                            d, req.ptr, l1e, &sh_mapcache);
+                    okay = mod_l1_entry(va, l1e, mfn);
                 }
                 break;
                 case PGT_l2_page_table:
@@ -2269,9 +2281,6 @@ int do_mmu_update(
                     l2_pgentry_t l2e = l2e_from_intpte(req.val);
                     okay = mod_l2_entry(
                         (l2_pgentry_t *)va, l2e, mfn, type_info);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l2_normal_pt_update(
-                            d, req.ptr, l2e, &sh_mapcache);
                 }
                 break;
 #if CONFIG_PAGING_LEVELS >= 3
@@ -2279,9 +2288,6 @@ int do_mmu_update(
                 {
                     l3_pgentry_t l3e = l3e_from_intpte(req.val);
                     okay = mod_l3_entry(va, l3e, mfn, type_info);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l3_normal_pt_update(
-                            d, req.ptr, l3e, &sh_mapcache);
                 }
                 break;
 #endif
@@ -2290,9 +2296,6 @@ int do_mmu_update(
                 {
                     l4_pgentry_t l4e = l4e_from_intpte(req.val);
                     okay = mod_l4_entry(va, l4e, mfn, type_info);
-                    if ( okay && unlikely(shadow_mode_enabled(d)) )
-                        shadow_l4_normal_pt_update(
-                            d, req.ptr, l4e, &sh_mapcache);
                 }
                 break;
 #endif
@@ -2308,19 +2311,17 @@ int do_mmu_update(
                 if ( unlikely(!get_page_type(page, PGT_writable_page)) )
                     break;
 
-                if ( shadow_mode_enabled(d) )
-                {
-                    shadow_lock(d);
-                    __mark_dirty(d, mfn);
-                    if ( page_is_page_table(page) && !page_out_of_sync(page) )
-                        shadow_mark_mfn_out_of_sync(v, gmfn, mfn);
-                }
+                if ( unlikely(shadow2_mode_enabled(d)) )
+                    shadow2_lock(d);
 
                 *(intpte_t *)va = req.val;
                 okay = 1;
 
-                if ( shadow_mode_enabled(d) )
-                    shadow_unlock(d);
+                if ( unlikely(shadow2_mode_enabled(d)) )
+                {
+                    shadow2_validate_guest_entry(v, _mfn(mfn), va);
+                    shadow2_unlock(d);
+                }
 
                 put_page_type(page);
             }
@@ -2333,12 +2334,6 @@ int do_mmu_update(
             break;
 
         case MMU_MACHPHYS_UPDATE:
-
-            if ( shadow_mode_translate(FOREIGNDOM) )
-            {
-                MEM_LOG("can't mutate m2p table of translate mode guest");
-                break;
-            }
 
             mfn = req.ptr >> PAGE_SHIFT;
             gpfn = req.val;
@@ -2349,9 +2344,13 @@ int do_mmu_update(
                 break;
             }
 
-            set_gpfn_from_mfn(mfn, gpfn);
+            if ( shadow2_mode_translate(FOREIGNDOM) )
+                shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn);
+            else 
+                set_gpfn_from_mfn(mfn, gpfn);
             okay = 1;
 
+            // Mark the new gfn dirty...
             mark_dirty(FOREIGNDOM, mfn);
 
             put_page(mfn_to_page(mfn));
@@ -2381,9 +2380,6 @@ int do_mmu_update(
     done += i;
     if ( unlikely(!guest_handle_is_null(pdone)) )
         copy_to_guest(pdone, &done, 1);
-
-    if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(v, "post-mmu"); /* debug */
 
     UNLOCK_BIGLOCK(d);
     return rc;
@@ -2402,7 +2398,6 @@ static int create_grant_pte_mapping(
     struct domain *d = v->domain;
 
     ASSERT(spin_is_locked(&d->big_lock));
-    ASSERT(!shadow_mode_refcounts(d));
 
     gmfn = pte_addr >> PAGE_SHIFT;
     mfn = gmfn_to_mfn(d, gmfn);
@@ -2418,7 +2413,7 @@ static int create_grant_pte_mapping(
     page = mfn_to_page(mfn);
 
     type_info = page->u.inuse.type_info;
-    if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||
+    if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) ||         
          !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) )
     {
         MEM_LOG("Grant map attempted to update a non-L1 page");
@@ -2427,28 +2422,22 @@ static int create_grant_pte_mapping(
     }
 
     ol1e = *(l1_pgentry_t *)va;
-    if ( !update_l1e(va, ol1e, _nl1e) )
+    if ( !update_l1e(va, ol1e, _nl1e, mfn, v) )
     {
         put_page_type(page);
         rc = GNTST_general_error;
         goto failed;
     } 
 
-    put_page_from_l1e(ol1e, d);
-
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        struct domain_mmap_cache sh_mapcache;
-        domain_mmap_cache_init(&sh_mapcache);
-        shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache);
-        domain_mmap_cache_destroy(&sh_mapcache);
-    }
+    if ( !shadow2_mode_refcounts(d) )
+        put_page_from_l1e(ol1e, d);
 
     put_page_type(page);
  
  failed:
     unmap_domain_page(va);
     put_page(page);
+
     return rc;
 }
 
@@ -2462,8 +2451,6 @@ static int destroy_grant_pte_mapping(
     u32 type_info;
     l1_pgentry_t ol1e;
 
-    ASSERT(!shadow_mode_refcounts(d));
-
     gmfn = addr >> PAGE_SHIFT;
     mfn = gmfn_to_mfn(d, gmfn);
 
@@ -2504,7 +2491,9 @@ static int destroy_grant_pte_mapping(
     }
 
     /* Delete pagetable entry. */
-    if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) )
+    if ( unlikely(!update_l1e(
+                      (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, 
+                      d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) )
     {
         MEM_LOG("Cannot delete PTE entry at %p", va);
         put_page_type(page);
@@ -2512,14 +2501,6 @@ static int destroy_grant_pte_mapping(
         goto failed;
     }
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        struct domain_mmap_cache sh_mapcache;
-        domain_mmap_cache_init(&sh_mapcache);
-        shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache);
-        domain_mmap_cache_destroy(&sh_mapcache);
-    }
-
     put_page_type(page);
 
  failed:
@@ -2536,31 +2517,22 @@ static int create_grant_va_mapping(
     struct domain *d = v->domain;
     
     ASSERT(spin_is_locked(&d->big_lock));
-    ASSERT(!shadow_mode_refcounts(d));
-
-    /*
-     * This is actually overkill - we don't need to sync the L1 itself,
-     * just everything involved in getting to this L1 (i.e. we need
-     * linear_pg_table[l1_linear_offset(va)] to be in sync)...
-     */
-    __shadow_sync_va(v, va);
 
     pl1e = &linear_pg_table[l1_linear_offset(va)];
 
     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ||
-         !update_l1e(pl1e, ol1e, _nl1e) )
+         !update_l1e(pl1e, ol1e, _nl1e, 
+                    l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) )
         return GNTST_general_error;
 
-    put_page_from_l1e(ol1e, d);
-
-    if ( unlikely(shadow_mode_enabled(d)) )
-        shadow_do_update_va_mapping(va, _nl1e, v);
+    if ( !shadow2_mode_refcounts(d) )
+        put_page_from_l1e(ol1e, d);
 
     return GNTST_okay;
 }
 
 static int destroy_grant_va_mapping(
-    unsigned long addr, unsigned long frame)
+    unsigned long addr, unsigned long frame, struct domain *d)
 {
     l1_pgentry_t *pl1e, ol1e;
     
@@ -2584,12 +2556,14 @@ static int destroy_grant_va_mapping(
     }
 
     /* Delete pagetable entry. */
-    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) )
+    if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), 
+                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]),
+                      d->vcpu[0] /* Change for per-vcpu shadows */)) )
     {
         MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e);
         return GNTST_general_error;
     }
-    
+
     return 0;
 }
 
@@ -2597,7 +2571,7 @@ int create_grant_host_mapping(
     unsigned long addr, unsigned long frame, unsigned int flags)
 {
     l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS);
-        
+
     if ( (flags & GNTMAP_application_map) )
         l1e_add_flags(pte,_PAGE_USER);
     if ( !(flags & GNTMAP_readonly) )
@@ -2613,7 +2587,7 @@ int destroy_grant_host_mapping(
 {
     if ( flags & GNTMAP_contains_pte )
         return destroy_grant_pte_mapping(addr, frame, current->domain);
-    return destroy_grant_va_mapping(addr, frame);
+    return destroy_grant_va_mapping(addr, frame, current->domain);
 }
 
 int steal_page(
@@ -2675,46 +2649,44 @@ int do_update_va_mapping(unsigned long v
 
     perfc_incrc(calls_to_update_va);
 
-    if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) )
+    if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) )
         return -EINVAL;
 
+    if ( unlikely(shadow2_mode_refcounts(d)) )
+    {
+        DPRINTK("Grant op on a shadow-refcounted domain\n");
+        return -EINVAL; 
+    }
+
     LOCK_BIGLOCK(d);
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-        check_pagetable(v, "pre-va"); /* debug */
-
-    if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)],
-                                val)) )
-        rc = -EINVAL;
-
-    if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) )
+    if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) )
     {
         if ( unlikely(this_cpu(percpu_mm_info).foreign &&
-                      (shadow_mode_translate(d) ||
-                       shadow_mode_translate(
+                      (shadow2_mode_translate(d) ||
+                       shadow2_mode_translate(
                            this_cpu(percpu_mm_info).foreign))) )
         {
             /*
              * The foreign domain's pfn's are in a different namespace. There's
-             * not enough information in just a gpte to figure out how to
+             * not enough information in just a gpte to figure out how to   
              * (re-)shadow this entry.
              */
             domain_crash(d);
         }
+    }
+
+    if ( unlikely(!mod_l1_entry(
+                      &linear_pg_table[l1_linear_offset(va)], val,
+                      l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) )
+        rc = -EINVAL;
     
-        rc = shadow_do_update_va_mapping(va, val, v);
-
-        check_pagetable(v, "post-va"); /* debug */
-    }
-
     switch ( flags & UVMF_FLUSHTYPE_MASK )
     {
     case UVMF_TLB_FLUSH:
         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
         {
         case UVMF_LOCAL:
-            if ( unlikely(shadow_mode_enabled(d)) )
-                shadow_sync_all(d);
             local_flush_tlb();
             break;
         case UVMF_ALL:
@@ -2733,9 +2705,9 @@ int do_update_va_mapping(unsigned long v
         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
         {
         case UVMF_LOCAL:
-            if ( unlikely(shadow_mode_enabled(d)) )
-                shadow_invlpg(current, va);
-            local_flush_tlb_one(va);
+            if ( !shadow2_mode_enabled(d) 
+                 || (shadow2_invlpg(current, va) != 0) ) 
+                local_flush_tlb_one(va);
             break;
         case UVMF_ALL:
             flush_tlb_one_mask(d->domain_dirty_cpumask, va);
@@ -2807,8 +2779,6 @@ long set_gdt(struct vcpu *v,
 
     if ( entries > FIRST_RESERVED_GDT_ENTRY )
         return -EINVAL;
-
-    shadow_sync_all(d);
 
     /* Check the pages in the new GDT. */
     for ( i = 0; i < nr_pages; i++ ) {
@@ -2912,24 +2882,13 @@ long do_update_descriptor(u64 pa, u64 de
         break;
     }
 
-    if ( shadow_mode_enabled(dom) )
-    {
-        shadow_lock(dom);
-
-        __mark_dirty(dom, mfn);
-
-        if ( page_is_page_table(page) && !page_out_of_sync(page) )
-            shadow_mark_mfn_out_of_sync(current, gmfn, mfn);
-    }
+    mark_dirty(dom, mfn);
 
     /* All is good so make the update. */
     gdt_pent = map_domain_page(mfn);
     memcpy(&gdt_pent[offset], &d, 8);
     unmap_domain_page(gdt_pent);
 
-    if ( shadow_mode_enabled(dom) )
-        shadow_unlock(dom);
-
     put_page_type(page);
 
     ret = 0; /* success */
@@ -2981,8 +2940,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
         default:
             break;
         }
-        
-        if ( !shadow_mode_translate(d) || (mfn == 0) )
+
+        if ( !shadow2_mode_translate(d) || (mfn == 0) )
         {
             put_domain(d);
             return -EINVAL;
@@ -3011,7 +2970,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
         guest_physmap_add_page(d, xatp.gpfn, mfn);
 
         UNLOCK_BIGLOCK(d);
-
+        
         put_domain(d);
 
         break;
@@ -3136,7 +3095,8 @@ static int ptwr_emulated_update(
     unsigned long pfn;
     struct page_info *page;
     l1_pgentry_t pte, ol1e, nl1e, *pl1e;
-    struct domain *d = current->domain;
+    struct vcpu *v = current;
+    struct domain *d = v->domain;
 
     /* Aligned access only, thank you. */
     if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) )
@@ -3196,25 +3156,36 @@ static int ptwr_emulated_update(
         return X86EMUL_UNHANDLEABLE;
     }
 
+
     /* Checked successfully: do the update (write or cmpxchg). */
     pl1e = map_domain_page(page_to_mfn(page));
     pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK));
     if ( do_cmpxchg )
     {
+        if ( shadow2_mode_enabled(d) )
+            shadow2_lock(d);
         ol1e = l1e_from_intpte(old);
         if ( cmpxchg((intpte_t *)pl1e, old, val) != old )
         {
+            if ( shadow2_mode_enabled(d) )
+                shadow2_unlock(d);
             unmap_domain_page(pl1e);
             put_page_from_l1e(nl1e, d);
             return X86EMUL_CMPXCHG_FAILED;
         }
+        if ( unlikely(shadow2_mode_enabled(v->domain)) )
+        {
+            shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e);
+            shadow2_unlock(v->domain);    
+        }
     }
     else
     {
         ol1e = *pl1e;
-        if ( !update_l1e(pl1e, ol1e, nl1e) )
+        if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) )
             BUG();
     }
+
     unmap_domain_page(pl1e);
 
     /* Finally, drop the old PTE. */
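
The update_l1e()/UPDATE_ENTRY() rework in the mm.c hunks above brackets every
guest page-table write the same way: take the shadow2 lock if the domain is
shadowed, perform the raw PTE update, then let shadow2 re-validate the
touched entry before unlocking. A minimal sketch of that bracket, using only
names that appear in the hunks (example_write_guest_pte is hypothetical):

/* Sketch of the lock/write/validate bracket now applied by update_l1e()
 * and UPDATE_ENTRY().  Illustrative only; not part of the patch. */
static int example_write_guest_pte(struct vcpu *v, unsigned long gmfn,
                                   l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
{
    int rv;

    if ( unlikely(shadow2_mode_enabled(v->domain)) )
        shadow2_lock(v->domain);

    rv = !__copy_to_user(pl1e, &nl1e, sizeof(nl1e));  /* the raw update */

    if ( unlikely(shadow2_mode_enabled(v->domain)) )
    {
        /* Tell shadow2 which entry changed so its shadows can be fixed up. */
        shadow2_validate_guest_entry(v, _mfn(gmfn), pl1e);
        shadow2_unlock(v->domain);
    }
    return rv;
}
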
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/setup.c      Wed Aug 16 17:11:56 2006 +0100
@@ -532,8 +532,6 @@ void __init __start_xen(multiboot_info_t
     if ( opt_watchdog ) 
         watchdog_enable();
 
-    shadow_mode_init();
-
     /* initialize access control security module */
     acm_init(&initrdidx, mbi, initial_images_start);
 
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/smpboot.c    Wed Aug 16 17:11:56 2006 +0100
@@ -896,7 +896,7 @@ static int __devinit do_boot_cpu(int api
        v = alloc_idle_vcpu(cpu);
        BUG_ON(v == NULL);
 
-       v->arch.monitor_table = pagetable_from_paddr(__pa(idle_pg_table));
+       v->arch.cr3 = __pa(idle_pg_table);
 
        /* start_eip had better be page-aligned! */
        start_eip = setup_trampoline();
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/traps.c      Wed Aug 16 17:11:56 2006 +0100
@@ -277,6 +277,21 @@ void show_stack(struct cpu_user_regs *re
     show_trace(regs);
 }
 
+void show_xen_trace(void)
+{
+    struct cpu_user_regs regs;
+#ifdef __x86_64__
+    __asm__("movq %%rsp,%0" : "=m" (regs.rsp));
+    __asm__("movq %%rbp,%0" : "=m" (regs.rbp));
+    __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip));
+#else
+    __asm__("movl %%esp,%0" : "=m" (regs.esp));
+    __asm__("movl %%ebp,%0" : "=m" (regs.ebp));
+    __asm__("call 1f; 1: popl %0" : "=a" (regs.eip));
+#endif
+    show_trace(&regs);
+}
+
 void show_stack_overflow(unsigned long esp)
 {
 #ifdef MEMORY_GUARD
@@ -861,8 +876,8 @@ static int fixup_page_fault(unsigned lon
 
     if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
     {
-        if ( shadow_mode_external(d) && guest_mode(regs) )
-            return shadow_fault(addr, regs);
+        if ( shadow2_mode_external(d) && guest_mode(regs) )
+            return shadow2_fault(addr, regs);
         if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
             return handle_gdt_ldt_mapping_fault(
                 addr - GDT_LDT_VIRT_START, regs);
@@ -873,14 +888,14 @@ static int fixup_page_fault(unsigned lon
         return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0);
     }
 
-    if ( unlikely(shadow_mode_enabled(d)) )
-        return shadow_fault(addr, regs);
-
     if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) &&
          guest_kernel_mode(v, regs) &&
          ((regs->error_code & (PGERR_write_access|PGERR_page_present)) ==
           (PGERR_write_access|PGERR_page_present)) )
         return ptwr_do_page_fault(d, addr, regs) ? EXCRET_fault_fixed : 0;
+
+    if ( shadow2_mode_enabled(d) )
+        return shadow2_fault(addr, regs);
 
     return 0;
 }
@@ -905,6 +920,13 @@ asmlinkage int do_page_fault(struct cpu_
     DEBUGGER_trap_entry(TRAP_page_fault, regs);
 
     perfc_incrc(page_faults);
+
+    if ( shadow2_mode_enabled(current->domain) )
+        debugtrace_printk("%s %s %d dom=%d eip=%p cr2=%p code=%d cs=%x\n",
+                          __func__, __FILE__, __LINE__,
+                          current->domain->domain_id,
+                          (void *)regs->eip, (void *)addr, regs->error_code,
+                          regs->cs);
 
     if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) )
         return rc;
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/x86_32/domain_page.c
--- a/xen/arch/x86/x86_32/domain_page.c Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/x86_32/domain_page.c Wed Aug 16 17:11:56 2006 +0100
@@ -15,6 +15,7 @@
 #include <asm/current.h>
 #include <asm/flushtlb.h>
 #include <asm/hardirq.h>
+#include <asm/hvm/support.h>
 
 static inline struct vcpu *mapcache_current_vcpu(void)
 {
@@ -58,10 +59,10 @@ void *map_domain_page(unsigned long pfn)
     cache = &v->domain->arch.mapcache;
 
     hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)];
-    if ( hashent->pfn == pfn )
-    {
-        idx = hashent->idx;
+    if ( hashent->pfn == pfn && (idx = hashent->idx) != MAPHASHENT_NOTINUSE )
+    {
         hashent->refcnt++;
+        ASSERT(idx < MAPCACHE_ENTRIES);
         ASSERT(hashent->refcnt != 0);
         ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn);
         goto out;
@@ -178,6 +179,30 @@ void mapcache_init(struct domain *d)
                 MAPHASHENT_NOTINUSE;
 }
 
+paddr_t mapped_domain_page_to_maddr(void *va) 
+/* Convert a pointer in a mapped domain page to a machine address. 
+ * Takes any pointer that's valid for use in unmap_domain_page() */
+{
+    unsigned int idx;
+    struct vcpu *v;
+    struct mapcache *cache;
+    unsigned long pfn;
+
+    ASSERT(!in_irq());
+
+    ASSERT((void *)MAPCACHE_VIRT_START <= va);
+    ASSERT(va < (void *)MAPCACHE_VIRT_END);
+
+    v = mapcache_current_vcpu();
+
+    cache = &v->domain->arch.mapcache;
+
+    idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT;
+    pfn = l1e_get_pfn(cache->l1tab[idx]);
+    return ((paddr_t) pfn << PAGE_SHIFT 
+            | ((unsigned long) va & ~PAGE_MASK));
+}
+
 #define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT))
 static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)];
 static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)];
@@ -233,6 +258,8 @@ void unmap_domain_page_global(void *va)
     l1_pgentry_t *pl1e;
     unsigned int idx;
 
+    ASSERT((__va >= IOREMAP_VIRT_START) && (__va <= (IOREMAP_VIRT_END - 1)));
+
     /* /First/, we zap the PTE. */
     pl2e = virt_to_xen_l2e(__va);
     pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(__va);
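For context, a minimal sketch of how the new mapped_domain_page_to_maddr() helper is expected to be used ('mfn' is an assumed example frame number, not part of the patch):

    void *p = map_domain_page(mfn);               /* mapcache virtual address */
    paddr_t ma = mapped_domain_page_to_maddr(p);  /* machine address of *p    */
    /* here ma == ((paddr_t)mfn << PAGE_SHIFT), since p points at offset 0 */
    unmap_domain_page(p);
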
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/x86_32/mm.c
--- a/xen/arch/x86/x86_32/mm.c  Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/x86_32/mm.c  Wed Aug 16 17:11:56 2006 +0100
@@ -75,8 +75,7 @@ void __init paging_init(void)
     printk("PAE disabled.\n");
 #endif
 
-    idle_vcpu[0]->arch.monitor_table =
-        pagetable_from_paddr(__pa(idle_pg_table));
+    idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
 
     if ( cpu_has_pge )
     {
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/x86_64/mm.c
--- a/xen/arch/x86/x86_64/mm.c  Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/x86_64/mm.c  Wed Aug 16 17:11:56 2006 +0100
@@ -81,8 +81,7 @@ void __init paging_init(void)
     l2_pgentry_t *l2_ro_mpt;
     struct page_info *pg;
 
-    idle_vcpu[0]->arch.monitor_table =
-        pagetable_from_paddr(__pa(idle_pg_table));
+    idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table);
 
     /* Create user-accessible L2 directory to map the MPT for guests. */
     l3_ro_mpt = alloc_xenheap_page();
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/arch/x86/x86_64/traps.c       Wed Aug 16 17:11:56 2006 +0100
@@ -84,7 +84,8 @@ void show_page_walk(unsigned long addr)
     l4e = l4t[l4_table_offset(addr)];
     mfn = l4e_get_pfn(l4e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk(" L4 = %"PRIpte" %016lx\n", l4e_get_intpte(l4e), pfn);
+    printk(" L4[0x%lx] = %"PRIpte" %016lx\n",
+           l4_table_offset(addr), l4e_get_intpte(l4e), pfn);
     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
         return;
 
@@ -92,7 +93,8 @@ void show_page_walk(unsigned long addr)
     l3e = l3t[l3_table_offset(addr)];
     mfn = l3e_get_pfn(l3e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk("  L3 = %"PRIpte" %016lx\n", l3e_get_intpte(l3e), pfn);
+    printk("  L3[0x%lx] = %"PRIpte" %016lx\n",
+           l3_table_offset(addr), l3e_get_intpte(l3e), pfn);
     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
         return;
 
@@ -100,7 +102,8 @@ void show_page_walk(unsigned long addr)
     l2e = l2t[l2_table_offset(addr)];
     mfn = l2e_get_pfn(l2e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk("   L2 = %"PRIpte" %016lx %s\n", l2e_get_intpte(l2e), pfn,
+    printk("   L2[0x%lx] = %"PRIpte" %016lx %s\n",
+           l2_table_offset(addr), l2e_get_intpte(l2e), pfn,
            (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : "");
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
          (l2e_get_flags(l2e) & _PAGE_PSE) )
@@ -110,7 +113,8 @@ void show_page_walk(unsigned long addr)
     l1e = l1t[l1_table_offset(addr)];
     mfn = l1e_get_pfn(l1e);
     pfn = get_gpfn_from_mfn(mfn);
-    printk("    L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
+    printk("    L1[0x%lx] = %"PRIpte" %016lx\n",
+           l1_table_offset(addr), l1e_get_intpte(l1e), pfn);
 }
 
 asmlinkage void double_fault(void);
@@ -162,7 +166,7 @@ void toggle_guest_mode(struct vcpu *v)
 {
     v->arch.flags ^= TF_kernel_mode;
     __asm__ __volatile__ ( "swapgs" );
-    update_pagetables(v);
+    update_cr3(v);
     write_ptbase(v);
 }
 
diff -r f2151423f729 -r 01345b08d012 xen/common/acm_ops.c
--- a/xen/common/acm_ops.c      Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/common/acm_ops.c      Wed Aug 16 17:11:56 2006 +0100
@@ -26,7 +26,6 @@
 #include <xen/trace.h>
 #include <xen/console.h>
 #include <xen/guest_access.h>
-#include <asm/shadow.h>
 #include <public/sched_ctl.h>
 #include <acm/acm_hooks.h>
 
diff -r f2151423f729 -r 01345b08d012 xen/common/grant_table.c
--- a/xen/common/grant_table.c  Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/common/grant_table.c  Wed Aug 16 17:11:56 2006 +0100
@@ -434,7 +434,7 @@ __gnttab_unmap_grant_ref(
 
     /* If just unmapped a writable mapping, mark as dirtied */
     if ( !(flags & GNTMAP_readonly) )
-         gnttab_log_dirty(rd, frame);
+         gnttab_mark_dirty(rd, frame);
 
     if ( ((act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0) &&
          !(flags & GNTMAP_readonly) )
@@ -731,7 +731,7 @@ __release_grant_for_copy(
     const unsigned long r_frame = act->frame;
 
     if ( !readonly )
-        gnttab_log_dirty(rd, r_frame);
+        gnttab_mark_dirty(rd, r_frame);
 
     spin_lock(&rd->grant_table->lock);
     if ( readonly )
diff -r f2151423f729 -r 01345b08d012 xen/common/keyhandler.c
--- a/xen/common/keyhandler.c   Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/common/keyhandler.c   Wed Aug 16 17:11:56 2006 +0100
@@ -241,9 +241,6 @@ static void read_clocks(unsigned char ke
 }
 
 extern void dump_runq(unsigned char key);
-#ifndef NDEBUG
-extern void audit_domains_key(unsigned char key);
-#endif
 
 #ifdef PERF_COUNTERS
 extern void perfc_printall(unsigned char key);
@@ -261,10 +258,16 @@ static void do_debug_key(unsigned char k
 #ifndef NDEBUG
 static void debugtrace_key(unsigned char key)
 {
-    debugtrace_send_to_console = !debugtrace_send_to_console;
-    debugtrace_dump();
-    printk("debugtrace_printk now writing to %s.\n",
-           debugtrace_send_to_console ? "console" : "buffer");
+    debugtrace_toggle();
+}
+
+static void shadow2_audit_key(unsigned char key)
+{
+    extern int shadow2_audit_enable;
+
+    shadow2_audit_enable = !shadow2_audit_enable;
+    printk("%s shadow2_audit_enable=%d\n",
+           __func__, shadow2_audit_enable);
 }
 #endif
 
@@ -288,7 +291,7 @@ void initialize_keytable(void)
 
 #ifndef NDEBUG
     register_keyhandler(
-        'o', audit_domains_key,  "audit domains >0 EXPERIMENTAL");
+        'O', shadow2_audit_key,  "toggle shadow2 audits");
     register_keyhandler(
         'T', debugtrace_key, "toggle debugtrace to console/buffer");
 #endif
diff -r f2151423f729 -r 01345b08d012 xen/common/memory.c
--- a/xen/common/memory.c       Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/common/memory.c       Wed Aug 16 17:11:56 2006 +0100
@@ -126,6 +126,11 @@ populate_physmap(
             for ( j = 0; j < (1 << extent_order); j++ )
                 guest_physmap_add_page(d, gpfn + j, mfn + j);
         }
+        else if ( unlikely(shadow2_mode_translate(d)) )
+        {
+            for ( j = 0; j < (1 << extent_order); j++ )
+                shadow2_guest_physmap_add_page(d, gpfn + j, mfn + j);
+        }
         else
         {
             for ( j = 0; j < (1 << extent_order); j++ )
@@ -153,7 +158,7 @@ guest_remove_page(
     if ( unlikely(!mfn_valid(mfn)) )
     {
         DPRINTK("Domain %u page number %lx invalid\n",
-                d->domain_id, mfn);
+                d->domain_id, gmfn);
         return 0;
     }
             
@@ -179,7 +184,7 @@ guest_remove_page(
                 (unsigned long)page->count_info, page->u.inuse.type_info);
     }
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    shadow2_guest_physmap_remove_page(d, gmfn, mfn);
 
     put_page(page);
 
@@ -250,7 +255,7 @@ translate_gpfn_list(
     if ( (d = find_domain_by_id(op.domid)) == NULL )
         return -ESRCH;
 
-    if ( !shadow_mode_translate(d) )
+    if ( !(shadow_mode_translate(d) || shadow2_mode_translate(d)) )
     {
         put_domain(d);
         return -EINVAL;
diff -r f2151423f729 -r 01345b08d012 xen/drivers/char/console.c
--- a/xen/drivers/char/console.c        Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/drivers/char/console.c        Wed Aug 16 17:11:56 2006 +0100
@@ -569,7 +569,7 @@ int console_getc(void)
 #ifndef NDEBUG
 
 /* Send output direct to console, or buffer it? */
-int debugtrace_send_to_console;
+static volatile int debugtrace_send_to_console;
 
 static char        *debugtrace_buf; /* Debug-trace buffer */
 static unsigned int debugtrace_prd; /* Producer index     */
@@ -578,16 +578,10 @@ static DEFINE_SPINLOCK(debugtrace_lock);
 static DEFINE_SPINLOCK(debugtrace_lock);
 integer_param("debugtrace", debugtrace_kilobytes);
 
-void debugtrace_dump(void)
-{
-    unsigned long flags;
-
+static void debugtrace_dump_worker(void)
+{
     if ( (debugtrace_bytes == 0) || !debugtrace_used )
         return;
-
-    watchdog_disable();
-
-    spin_lock_irqsave(&debugtrace_lock, flags);
 
     printk("debugtrace_dump() starting\n");
 
@@ -602,15 +596,47 @@ void debugtrace_dump(void)
     memset(debugtrace_buf, '\0', debugtrace_bytes);
 
     printk("debugtrace_dump() finished\n");
+}
+
+void debugtrace_toggle(void)
+{
+    unsigned long flags;
+
+    watchdog_disable();
+    spin_lock_irqsave(&debugtrace_lock, flags);
+
+    // dump the buffer *before* toggling, in case the act of dumping the
+    // buffer itself causes more printk's...
+    //
+    printk("debugtrace_printk now writing to %s.\n",
+           !debugtrace_send_to_console ? "console": "buffer");
+    if ( !debugtrace_send_to_console )
+        debugtrace_dump_worker();
+
+    debugtrace_send_to_console = !debugtrace_send_to_console;
 
     spin_unlock_irqrestore(&debugtrace_lock, flags);
-
     watchdog_enable();
+
+}
+
+void debugtrace_dump(void)
+{
+    unsigned long flags;
+
+    watchdog_disable();
+    spin_lock_irqsave(&debugtrace_lock, flags);
+
+    debugtrace_dump_worker();
+
+    spin_unlock_irqrestore(&debugtrace_lock, flags);
+    watchdog_enable();
 }
 
 void debugtrace_printk(const char *fmt, ...)
 {
     static char    buf[1024];
+    static u32 count;
 
     va_list       args;
     char         *p;
@@ -625,8 +651,10 @@ void debugtrace_printk(const char *fmt, 
 
     ASSERT(debugtrace_buf[debugtrace_bytes - 1] == 0);
 
+    sprintf(buf, "%u ", ++count);
+
     va_start(args, fmt);
-    (void)vsnprintf(buf, sizeof(buf), fmt, args);
+    (void)vsnprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), fmt, args);
     va_end(args);
 
     if ( debugtrace_send_to_console )
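For context, the point of the split above is that debugtrace_toggle() dumps the buffer *before* redirecting output, so nothing already buffered is lost and any printks caused by the dump itself still reach the console. A hedged usage sketch (the format arguments are assumed examples):

    debugtrace_printk("dom%d: fault at %p\n", d->domain_id, (void *)addr);
    /* later, the 'T' debug key calls debugtrace_toggle(): the buffered
     * messages are printed first, then new output switches destination */
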
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/bitops.h
--- a/xen/include/asm-x86/bitops.h      Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/bitops.h      Wed Aug 16 17:11:56 2006 +0100
@@ -75,6 +75,24 @@ static __inline__ void clear_bit(int nr,
                :"=m" (ADDR)
                :"dIr" (nr));
 }
+
+/**
+ * __clear_bit - Clears a bit in memory
+ * @nr: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * Unlike clear_bit(), this function is non-atomic and may be reordered.
+ * If it's called on the same region of memory simultaneously, the effect
+ * may be that only one operation succeeds.
+ */
+static __inline__ void __clear_bit(int nr, volatile void * addr)
+{
+       __asm__(
+               "btrl %1,%0"
+               :"=m" (ADDR)
+               :"dIr" (nr));
+}
+
 #define smp_mb__before_clear_bit()     barrier()
 #define smp_mb__after_clear_bit()      barrier()
 
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/config.h      Wed Aug 16 17:11:56 2006 +0100
@@ -79,9 +79,14 @@
 
 #ifndef __ASSEMBLY__
 extern unsigned long _end; /* standard ELF symbol */
+
+static inline void FORCE_CRASH(void) __attribute__((noreturn,always_inline)); 
+static inline void FORCE_CRASH(void) 
+{
+    __asm__ __volatile__ ( "ud2" );
+    while(1);
+}
 #endif /* __ASSEMBLY__ */
-
-#define FORCE_CRASH() __asm__ __volatile__ ( "ud2" )
 
 #if defined(__x86_64__)
 
@@ -149,9 +154,14 @@ extern unsigned long _end; /* standard E
 /* Slot 256: read-only guest-accessible machine-to-phys translation table. */
 #define RO_MPT_VIRT_START       (PML4_ADDR(256))
 #define RO_MPT_VIRT_END         (RO_MPT_VIRT_START + PML4_ENTRY_BYTES/2)
+
+// currently unused?
+#if 0
 /* Slot 257: read-only guest-accessible linear page table. */
 #define RO_LINEAR_PT_VIRT_START (PML4_ADDR(257))
 #define RO_LINEAR_PT_VIRT_END   (RO_LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
+#endif
+
 /* Slot 258: linear page table (guest table). */
 #define LINEAR_PT_VIRT_START    (PML4_ADDR(258))
 #define LINEAR_PT_VIRT_END      (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
@@ -175,7 +185,7 @@ extern unsigned long _end; /* standard E
 #define DIRECTMAP_VIRT_START    (PML4_ADDR(262))
 #define DIRECTMAP_VIRT_END      (DIRECTMAP_VIRT_START + PML4_ENTRY_BYTES*2)
 
-#define PGT_base_page_table PGT_l4_page_table
+#define PGT_base_page_table     PGT_l4_page_table
 
 #define __HYPERVISOR_CS64 0xe010
 #define __HYPERVISOR_CS32 0xe008
@@ -274,9 +284,9 @@ extern unsigned long _end; /* standard E
     (L2_PAGETABLE_LAST_XEN_SLOT - L2_PAGETABLE_FIRST_XEN_SLOT + 1)
 
 #ifdef CONFIG_X86_PAE
-# define PGT_base_page_table PGT_l3_page_table
-#else
-# define PGT_base_page_table PGT_l2_page_table
+# define PGT_base_page_table     PGT_l3_page_table
+#else
+# define PGT_base_page_table     PGT_l2_page_table
 #endif
 
 #define __HYPERVISOR_CS 0xe008
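For context, turning FORCE_CRASH() into a noreturn inline lets the compiler see that control never continues past it. A minimal sketch of the kind of code this helps (the function name is an assumed example):

    static int consistency_check(int ok)
    {
        if ( ok )
            return 0;
        FORCE_CRASH();   /* noreturn: no "missing return" warning below */
    }
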
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/domain.h      Wed Aug 16 17:11:56 2006 +0100
@@ -73,42 +73,42 @@ struct arch_domain
     /* I/O-port admin-specified access capabilities. */
     struct rangeset *ioport_caps;
 
-    /* Shadow mode status and controls. */
-    struct shadow_ops *ops;
-    unsigned int shadow_mode;  /* flags to control shadow table operation */
-    unsigned int shadow_nest;  /* Recursive depth of shadow_lock() nesting */
-
-    /* shadow hashtable */
-    struct shadow_status *shadow_ht;
-    struct shadow_status *shadow_ht_free;
-    struct shadow_status *shadow_ht_extras; /* extra allocation units */
-    unsigned int shadow_extras_count;
-
-    /* shadow dirty bitmap */
+    /* HVM stuff */
+    struct hvm_domain   hvm_domain;
+
+    /* Shadow-translated guest: Pseudophys base address of reserved area. */
+    unsigned long first_reserved_pfn;
+
+    /* Shadow2 stuff */
+    u32               shadow2_mode;  /* flags to control shadow operation */
+    spinlock_t        shadow2_lock;  /* shadow2 domain lock */
+    int               shadow2_locker; /* processor which holds the lock */
+    const char       *shadow2_locker_function; /* Func that took it */
+    struct list_head  shadow2_freelists[SHADOW2_MAX_ORDER + 1]; 
+    struct list_head  shadow2_p2m_freelist;
+    struct list_head  shadow2_p2m_inuse;
+    struct list_head  shadow2_toplevel_shadows;
+    unsigned int      shadow2_total_pages;  /* number of pages allocated */
+    unsigned int      shadow2_free_pages;   /* number of pages on freelists */
+    unsigned int      shadow2_p2m_pages;    /* number of pages in p2m map */
+
+    /* Shadow2 hashtable */
+    struct shadow2_hash_entry *shadow2_hash_table;
+    struct shadow2_hash_entry *shadow2_hash_freelist;
+    struct shadow2_hash_entry *shadow2_hash_allocations;
+    int shadow2_hash_walking;  /* Some function is walking the hash table */
+
+    /* Shadow log-dirty bitmap */
     unsigned long *shadow_dirty_bitmap;
     unsigned int shadow_dirty_bitmap_size;  /* in pages, bit per page */
 
-    /* shadow mode stats */
-    unsigned int shadow_page_count;
-    unsigned int hl2_page_count;
-    unsigned int snapshot_page_count;
-
+    /* Shadow log-dirty mode stats */
     unsigned int shadow_fault_count;
     unsigned int shadow_dirty_count;
 
-    /* full shadow mode */
-    struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */
-    struct out_of_sync_entry *out_of_sync_free;
-    struct out_of_sync_entry *out_of_sync_extras;
-    unsigned int out_of_sync_extras_count;
-
-    struct list_head free_shadow_frames;
-
-    pagetable_t         phys_table;         /* guest 1:1 pagetable */
-    struct hvm_domain   hvm_domain;
-
-    /* Shadow-translated guest: Pseudophys base address of reserved area. */
-    unsigned long first_reserved_pfn;
+    /* Shadow translated domain: P2M mapping */
+    pagetable_t phys_table;
+
 } __cacheline_aligned;
 
 #ifdef CONFIG_X86_PAE
@@ -166,25 +166,34 @@ struct arch_vcpu
      */
     l1_pgentry_t *perdomain_ptes;
 
-    pagetable_t  guest_table_user;      /* x86/64: user-space pagetable. */
-    pagetable_t  guest_table;           /* (MA) guest notion of cr3 */
-    pagetable_t  shadow_table;          /* (MA) shadow of guest */
-    pagetable_t  monitor_table;         /* (MA) used in hypervisor */
-
-    l2_pgentry_t *guest_vtable;         /* virtual address of pagetable */
-    l2_pgentry_t *shadow_vtable;        /* virtual address of shadow_table */
-    l2_pgentry_t *monitor_vtable;              /* virtual address of monitor_table */
-    l1_pgentry_t *hl2_vtable;                  /* virtual address of hl2_table */
-
 #ifdef CONFIG_X86_64
-    l3_pgentry_t *guest_vl3table;
-    l4_pgentry_t *guest_vl4table;
-#endif
-
-    unsigned long monitor_shadow_ref;
+    pagetable_t guest_table_user;       /* (MFN) x86/64 user-space pagetable */
+#endif
+    pagetable_t guest_table;            /* (MFN) guest notion of cr3 */
+    /* guest_table holds a ref to the page, and also a type-count unless
+     * shadow refcounts are in use */
+    pagetable_t shadow_table;           /* (MFN) shadow of guest */
+    pagetable_t monitor_table;          /* (MFN) hypervisor PT (for HVM) */
+    unsigned long cr3;                     /* (MA) value to install in HW CR3 */
+
+    void *guest_vtable;                 /* virtual address of pagetable */
+    void *shadow_vtable;                /* virtual address of shadow_table */
+    root_pgentry_t *monitor_vtable;            /* virtual address of monitor_table */
 
     /* Current LDT details. */
     unsigned long shadow_ldt_mapcnt;
+
+    /* Shadow2 stuff */
+    /* -- pointers to mode-specific entry points */
+    struct shadow2_entry_points *shadow2; 
+    unsigned long last_emulated_mfn;    /* last mfn we emulated a write to */
+    u8 shadow2_propagate_fault;         /* emulated fault needs to be */
+                                        /* propagated to guest */
+#if CONFIG_PAGING_LEVELS >= 3
+    u8 shadow2_pae_flip_pending;        /* shadow update requires this PAE cpu
+                                         * to recopy/install its L3 table.
+                                         */
+#endif
 } __cacheline_aligned;
 
 /* shorthands to improve code legibility */
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/grant_table.h
--- a/xen/include/asm-x86/grant_table.h Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/grant_table.h Wed Aug 16 17:11:56 2006 +0100
@@ -31,7 +31,7 @@ int destroy_grant_host_mapping(
 #define gnttab_shared_gmfn(d, t, i)                     \
     (mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i)))
 
-#define gnttab_log_dirty(d, f) mark_dirty((d), (f))
+#define gnttab_mark_dirty(d, f) mark_dirty((d), (f))
 
 static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr)
 {
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/hvm/hvm.h     Wed Aug 16 17:11:56 2006 +0100
@@ -56,8 +56,15 @@ struct hvm_function_table {
      */
     int (*realmode)(struct vcpu *v);
     int (*paging_enabled)(struct vcpu *v);
+    int (*long_mode_enabled)(struct vcpu *v);
+    int (*guest_x86_mode)(struct vcpu *v);
     int (*instruction_length)(struct vcpu *v);
     unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
+
+    /* 
+     * Re-set the value of CR3 that Xen runs on when handling VM exits
+     */
+    void (*update_host_cr3)(struct vcpu *v);
 
     /*
      * Update specifics of the guest state:
@@ -134,9 +141,27 @@ hvm_paging_enabled(struct vcpu *v)
 }
 
 static inline int
+hvm_long_mode_enabled(struct vcpu *v)
+{
+    return hvm_funcs.long_mode_enabled(v);
+}
+
+static inline int
+hvm_guest_x86_mode(struct vcpu *v)
+{
+    return hvm_funcs.guest_x86_mode(v);
+}
+
+static inline int
 hvm_instruction_length(struct vcpu *v)
 {
     return hvm_funcs.instruction_length(v);
+}
+
+static inline void
+hvm_update_host_cr3(struct vcpu *v)
+{
+    hvm_funcs.update_host_cr3(v);
 }
 
 void hvm_hypercall_page_initialise(struct domain *d,
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/support.h
--- a/xen/include/asm-x86/hvm/support.h Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/hvm/support.h Wed Aug 16 17:11:56 2006 +0100
@@ -116,10 +116,13 @@ enum hval_bitmaps {
 #define DBG_LEVEL_IOAPIC            (1 << 9)
 
 extern unsigned int opt_hvm_debug_level;
-#define HVM_DBG_LOG(level, _f, _a...)           \
-    if ( (level) & opt_hvm_debug_level )        \
-        printk("[HVM:%d.%d] <%s> " _f "\n",     \
-               current->domain->domain_id, current->vcpu_id, __func__, ## _a)
+#define HVM_DBG_LOG(level, _f, _a...)                                         \
+    do {                                                                      \
+        if ( (level) & opt_hvm_debug_level )                                  \
+            printk("[HVM:%d.%d] <%s> " _f "\n",                               \
+                   current->domain->domain_id, current->vcpu_id, __func__,    \
+                   ## _a);                                                    \
+    } while (0)
 #else
 #define HVM_DBG_LOG(level, _f, _a...)
 #endif
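For context, wrapping HVM_DBG_LOG() in do { ... } while (0) makes it behave as a single statement and avoids the classic dangling-else problem of the old bare 'if'. A minimal sketch ('some_condition' is an assumed example):

    if ( some_condition )
        HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "taken");
    else
        HVM_DBG_LOG(DBG_LEVEL_IOAPIC, "not taken");   /* now binds correctly */
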
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h    Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/hvm/vcpu.h    Wed Aug 16 17:11:56 2006 +0100
@@ -29,6 +29,7 @@
 #define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI     1
 
 struct hvm_vcpu {
+    unsigned long       hw_cr3;     /* value we give to HW to use */
     unsigned long       ioflags;
     struct hvm_io_op    io_op;
     struct vlapic       *vlapic;
@@ -39,6 +40,11 @@ struct hvm_vcpu {
     unsigned long       init_sipi_sipi_state;
 
     int                 xen_port;
+
+#if CONFIG_PAGING_LEVELS >= 3
+    l3_pgentry_t hvm_lowmem_l3tab[4]
+    __attribute__((__aligned__(32)));
+#endif
 
     /* Flags */
     int                 flag_dr_dirty;
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h        Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h        Wed Aug 16 17:11:56 2006 +0100
@@ -87,6 +87,7 @@ struct arch_vmx_struct {
 
     unsigned long        cpu_cr0; /* copy of guest CR0 */
     unsigned long        cpu_shadow_cr0; /* copy of guest read shadow CR0 */
+    unsigned long        cpu_shadow_cr4; /* copy of guest read shadow CR4 */
     unsigned long        cpu_cr2; /* save CR2 */
     unsigned long        cpu_cr3;
     unsigned long        cpu_state;
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h Wed Aug 16 17:11:56 2006 +0100
@@ -298,6 +298,9 @@ static always_inline void __vmwrite_vcpu
     case GUEST_CR0:
         v->arch.hvm_vmx.cpu_cr0 = value;
         break;
+    case CR4_READ_SHADOW:
+        v->arch.hvm_vmx.cpu_shadow_cr4 = value;
+        break;
     case CPU_BASED_VM_EXEC_CONTROL:
         v->arch.hvm_vmx.cpu_based_exec_control = value;
         break;
@@ -317,11 +320,14 @@ static always_inline void __vmread_vcpu(
     case GUEST_CR0:
         *value = v->arch.hvm_vmx.cpu_cr0;
         break;
+    case CR4_READ_SHADOW:
+        *value = v->arch.hvm_vmx.cpu_shadow_cr4;
+        break;
     case CPU_BASED_VM_EXEC_CONTROL:
         *value = v->arch.hvm_vmx.cpu_based_exec_control;
         break;
     default:
-        printk("__vmread_cpu: invalid field %lx\n", field);
+        printk("__vmread_vcpu: invalid field %lx\n", field);
         break;
     }
 }
@@ -342,6 +348,7 @@ static inline int __vmwrite(unsigned lon
     switch ( field ) {
     case CR0_READ_SHADOW:
     case GUEST_CR0:
+    case CR4_READ_SHADOW:
     case CPU_BASED_VM_EXEC_CONTROL:
         __vmwrite_vcpu(v, field, value);
         break;
@@ -402,6 +409,46 @@ static inline int vmx_paging_enabled(str
 
     __vmread_vcpu(v, CR0_READ_SHADOW, &cr0);
     return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG);
+}
+
+/* Works only for vcpu == current */
+static inline int vmx_long_mode_enabled(struct vcpu *v)
+{
+    ASSERT(v == current);
+    return VMX_LONG_GUEST(current);
+}
+
+/* Works only for vcpu == current */
+static inline int vmx_realmode(struct vcpu *v)
+{
+    unsigned long rflags;
+    ASSERT(v == current);
+
+    __vmread(GUEST_RFLAGS, &rflags);
+    return rflags & X86_EFLAGS_VM;
+}
+
+/* Works only for vcpu == current */
+static inline void vmx_update_host_cr3(struct vcpu *v)
+{
+    ASSERT(v == current);
+    __vmwrite(HOST_CR3, v->arch.cr3);
+}
+
+static inline int vmx_guest_x86_mode(struct vcpu *v)
+{
+    unsigned long cs_ar_bytes;
+    ASSERT(v == current);
+
+    if ( vmx_long_mode_enabled(v) )
+    {
+        __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+        return (cs_ar_bytes & (1u<<13)) ? 8 : 4;
+    }
+    if ( vmx_realmode(v) )
+        return 2;
+    __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes);
+    return (cs_ar_bytes & (1u<<14)) ? 4 : 2;
 }
 
 static inline int vmx_pgbit_test(struct vcpu *v)
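For context, a hedged reading of vmx_guest_x86_mode() above: bits 13 and 14 of GUEST_CS_AR_BYTES are the CS.L and CS.D/B bits, so the return value is the guest's current code size:

    /* 8 - 64-bit code segment (CS.L set)
     * 4 - 32-bit code segment (CS.D/B set)
     * 2 - real/vm86 mode, or a 16-bit code segment */
    int mode = hvm_guest_x86_mode(current);
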
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/mm.h  Wed Aug 16 17:11:56 2006 +0100
@@ -20,7 +20,11 @@ struct page_info
 struct page_info
 {
     /* Each frame can be threaded onto a doubly-linked list. */
-    struct list_head list;
+    union {
+        struct list_head list;
+        /* Shadow2 uses this field as an up-pointer in lower-level shadows */
+        paddr_t up;
+    };
 
     /* Reference count and various PGC_xxx flags and fields. */
     u32 count_info;
@@ -46,8 +50,20 @@ struct page_info
 
     } u;
 
-    /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
-    u32 tlbflush_timestamp;
+    union {
+        /* Timestamp from 'TLB clock', used to reduce need for safety
+         * flushes.  Only valid on a) free pages, and b) guest pages with a
+         * zero type count. */
+        u32 tlbflush_timestamp;
+
+        /* Only used on guest pages with a shadow.
+         * Guest pages with a shadow must have a non-zero type count, so this
+         * does not conflict with the tlbflush timestamp. */
+        u32 shadow2_flags;
+
+        // XXX -- we expect to add another field here, to be used for min/max
+        // purposes, which is only used for shadow pages.
+    };
 };
 
  /* The following page types are MUTUALLY EXCLUSIVE. */
@@ -60,6 +76,7 @@ struct page_info
 #define PGT_ldt_page        (6U<<29) /* using this page in an LDT? */
 #define PGT_writable_page   (7U<<29) /* has writable mappings of this page? */
 
+#ifndef SHADOW2
 #define PGT_l1_shadow       PGT_l1_page_table
 #define PGT_l2_shadow       PGT_l2_page_table
 #define PGT_l3_shadow       PGT_l3_page_table
@@ -69,14 +86,16 @@ struct page_info
 #define PGT_writable_pred   (7U<<29) /* predicted gpfn with writable ref */
 
 #define PGT_fl1_shadow      (5U<<29)
+#endif
+
 #define PGT_type_mask       (7U<<29) /* Bits 29-31. */
 
+ /* Owning guest has pinned this page to its current type? */
+#define _PGT_pinned         28
+#define PGT_pinned          (1U<<_PGT_pinned)
  /* Has this page been validated for use as its current type? */
-#define _PGT_validated      28
+#define _PGT_validated      27
 #define PGT_validated       (1U<<_PGT_validated)
- /* Owning guest has pinned this page to its current type? */
-#define _PGT_pinned         27
-#define PGT_pinned          (1U<<_PGT_pinned)
 #if defined(__i386__)
  /* The 11 most significant bits of virt address if this is a page table. */
 #define PGT_va_shift        16
@@ -98,6 +117,7 @@ struct page_info
  /* 16-bit count of uses of this frame as its current type. */
 #define PGT_count_mask      ((1U<<16)-1)
 
+#ifndef SHADOW2
 #ifdef __x86_64__
 #define PGT_high_mfn_shift  52
 #define PGT_high_mfn_mask   (0xfffUL << PGT_high_mfn_shift)
@@ -112,19 +132,53 @@ struct page_info
 #define PGT_score_shift     23
 #define PGT_score_mask      (((1U<<4)-1)<<PGT_score_shift)
 #endif
+#endif /* SHADOW2 */
 
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated      31
 #define PGC_allocated       (1U<<_PGC_allocated)
- /* Set when fullshadow mode marks a page out-of-sync */
+ /* Set on a *guest* page to mark it out-of-sync with its shadow */
 #define _PGC_out_of_sync     30
 #define PGC_out_of_sync     (1U<<_PGC_out_of_sync)
- /* Set when fullshadow mode is using a page as a page table */
+ /* Set when a page is in use as a page table */
 #define _PGC_page_table      29
 #define PGC_page_table      (1U<<_PGC_page_table)
  /* 29-bit count of references to this frame. */
 #define PGC_count_mask      ((1U<<29)-1)
 
+/* shadow2 uses the count_info on shadow pages somewhat differently */
+/* NB: please coordinate any changes here with the SH2F's in shadow2.h */
+#define PGC_SH2_none           (0U<<28) /* on the shadow2 free list */
+#define PGC_SH2_min_shadow     (1U<<28)
+#define PGC_SH2_l1_32_shadow   (1U<<28) /* shadowing a 32-bit L1 guest page */
+#define PGC_SH2_fl1_32_shadow  (2U<<28) /* L1 shadow for a 32b 4M superpage */
+#define PGC_SH2_l2_32_shadow   (3U<<28) /* shadowing a 32-bit L2 guest page */
+#define PGC_SH2_l1_pae_shadow  (4U<<28) /* shadowing a pae L1 page */
+#define PGC_SH2_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */
+#define PGC_SH2_l2_pae_shadow  (6U<<28) /* shadowing a pae L2-low page */
+#define PGC_SH2_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */
+#define PGC_SH2_l3_pae_shadow  (8U<<28) /* shadowing a pae L3 page */
+#define PGC_SH2_l1_64_shadow   (9U<<28) /* shadowing a 64-bit L1 page */
+#define PGC_SH2_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */
+#define PGC_SH2_l2_64_shadow  (11U<<28) /* shadowing a 64-bit L2 page */
+#define PGC_SH2_l3_64_shadow  (12U<<28) /* shadowing a 64-bit L3 page */
+#define PGC_SH2_l4_64_shadow  (13U<<28) /* shadowing a 64-bit L4 page */
+#define PGC_SH2_max_shadow    (13U<<28)
+#define PGC_SH2_p2m_table     (14U<<28) /* in use as the p2m table */
+#define PGC_SH2_monitor_table (15U<<28) /* in use as a monitor table */
+#define PGC_SH2_unused        (15U<<28)
+
+#define PGC_SH2_type_mask     (15U<<28)
+#define PGC_SH2_type_shift          28
+
+#define PGC_SH2_pinned         (1U<<27)
+
+#define _PGC_SH2_log_dirty          26
+#define PGC_SH2_log_dirty      (1U<<26)
+
+/* 26 bit ref count for shadow pages */
+#define PGC_SH2_count_mask    ((1U<<26) - 1)
+
 /* We trust the slab allocator in slab.c, and our use of it. */
 #define PageSlab(page)     (1)
 #define PageSetSlab(page)   ((void)0)
@@ -134,14 +188,22 @@ struct page_info
 
 #if defined(__i386__)
 #define pickle_domptr(_d)   ((u32)(unsigned long)(_d))
-#define unpickle_domptr(_d) ((struct domain *)(unsigned long)(_d))
+static inline struct domain *unpickle_domptr(u32 _domain)
+{ return (_domain & 1) ? NULL : (void *)_domain; }
 #define PRtype_info "08lx" /* should only be used for printk's */
 #elif defined(__x86_64__)
 static inline struct domain *unpickle_domptr(u32 _domain)
-{ return (_domain == 0) ? NULL : __va(_domain); }
+{ return ((_domain == 0) || (_domain & 1)) ? NULL : __va(_domain); }
 static inline u32 pickle_domptr(struct domain *domain)
 { return (domain == NULL) ? 0 : (u32)__pa(domain); }
 #define PRtype_info "016lx"/* should only be used for printk's */
+#endif
+
+/* The order of the largest allocation unit we use for shadow pages */
+#if CONFIG_PAGING_LEVELS == 2
+#define SHADOW2_MAX_ORDER 0 /* Only ever need 4k allocations */
+#else  
+#define SHADOW2_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 #endif
 
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
@@ -165,7 +227,7 @@ extern int shadow_remove_all_write_acces
 extern int shadow_remove_all_write_access(
     struct domain *d, unsigned long gmfn, unsigned long mfn);
 extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn);
-extern int _shadow_mode_refcounts(struct domain *d);
+extern int _shadow2_mode_refcounts(struct domain *d);
 
 static inline void put_page(struct page_info *page)
 {
@@ -197,8 +259,8 @@ static inline int get_page(struct page_i
              unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
              unlikely(d != _domain) )                /* Wrong owner? */
         {
-            if ( !_shadow_mode_refcounts(domain) )
-                DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%"
+            if ( !_shadow2_mode_refcounts(domain) )
+                DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" 
                         PRtype_info "\n",
                         page_to_mfn(page), domain, unpickle_domptr(d),
                         x, page->u.inuse.type_info);
@@ -254,6 +316,16 @@ static inline int page_is_removable(stru
     ASSERT(((_p)->count_info & PGC_count_mask) != 0);          \
     ASSERT(page_get_owner(_p) == (_d))
 
+// Quick test for whether a given page can be represented directly in CR3.
+//
+#if CONFIG_PAGING_LEVELS == 3
+#define MFN_FITS_IN_CR3(_MFN) !(mfn_x(_MFN) >> 20)
+
+/* returns a lowmem machine address of the copied L3 root table */
+unsigned long
+pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab);
+#endif /* CONFIG_PAGING_LEVELS == 3 */
+
 int check_descriptor(struct desc_struct *d);
 
 /*
@@ -271,29 +343,44 @@ int check_descriptor(struct desc_struct 
 #define set_gpfn_from_mfn(mfn, pfn) (machine_to_phys_mapping[(mfn)] = (pfn))
 #define get_gpfn_from_mfn(mfn)      (machine_to_phys_mapping[(mfn)])
 
+
+#define mfn_to_gmfn(_d, mfn)                            \
+    ( (shadow2_mode_translate(_d))                      \
+      ? get_gpfn_from_mfn(mfn)                          \
+      : (mfn) )
+
+#define gmfn_to_mfn(_d, gpfn)  mfn_x(sh2_gfn_to_mfn(_d, gpfn))
+
+
 /*
  * The phys_to_machine_mapping is the reversed mapping of MPT for full
  * virtualization.  It is only used by shadow_mode_translate()==true
  * guests, so we steal the address space that would have normally
  * been used by the read-only MPT map.
  */
-#define phys_to_machine_mapping ((unsigned long *)RO_MPT_VIRT_START)
-#define NR_P2M_TABLE_ENTRIES    ((unsigned long *)RO_MPT_VIRT_END \
-                                 - phys_to_machine_mapping)
+#define phys_to_machine_mapping ((l1_pgentry_t *)RO_MPT_VIRT_START)
 #define INVALID_MFN             (~0UL)
 #define VALID_MFN(_mfn)         (!((_mfn) & (1U<<31)))
 
-#define set_mfn_from_gpfn(pfn, mfn) (phys_to_machine_mapping[(pfn)] = (mfn))
 static inline unsigned long get_mfn_from_gpfn(unsigned long pfn)
 {
-    unsigned long mfn;
-
-    if ( unlikely(pfn >= NR_P2M_TABLE_ENTRIES) ||
-         unlikely(__copy_from_user(&mfn, &phys_to_machine_mapping[pfn],
-                                   sizeof(mfn))) )
-       mfn = INVALID_MFN;
-
-    return mfn;
+    l1_pgentry_t l1e = l1e_empty();
+    int ret;
+
+#if CONFIG_PAGING_LEVELS > 2
+    if ( pfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof (l1_pgentry_t) ) 
+        /* This pfn is higher than the p2m map can hold */
+        return INVALID_MFN;
+#endif
+
+    ret = __copy_from_user(&l1e,
+                               &phys_to_machine_mapping[pfn],
+                               sizeof(l1e));
+
+    if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) )
+        return l1e_get_pfn(l1e);
+
+    return INVALID_MFN;
 }
 
 #ifdef MEMORY_GUARD
@@ -333,6 +420,7 @@ void audit_domains(void);
 #endif
 
 int new_guest_cr3(unsigned long pfn);
+void make_cr3(struct vcpu *v, unsigned long mfn);
 
 void propagate_page_fault(unsigned long addr, u16 error_code);
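For context, a minimal sketch of how the reworked get_mfn_from_gpfn() is used for a translated guest (the 'gpfn' variable and the error handling are assumed examples):

    unsigned long mfn = get_mfn_from_gpfn(gpfn);
    if ( mfn == INVALID_MFN )
        return -EINVAL;   /* pfn beyond the p2m map, or entry not present */
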
 
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/msr.h
--- a/xen/include/asm-x86/msr.h Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/msr.h Wed Aug 16 17:11:56 2006 +0100
@@ -112,6 +112,10 @@ static inline void wrmsrl(unsigned int m
 #define MSR_IA32_VMX_EXIT_CTLS_MSR              0x483
 #define MSR_IA32_VMX_ENTRY_CTLS_MSR             0x484
 #define MSR_IA32_VMX_MISC_MSR                   0x485
+#define MSR_IA32_VMX_CR0_FIXED0                 0x486
+#define MSR_IA32_VMX_CR0_FIXED1                 0x487
+#define MSR_IA32_VMX_CR4_FIXED0                 0x488
+#define MSR_IA32_VMX_CR4_FIXED1                 0x489
 #define IA32_FEATURE_CONTROL_MSR                0x3a
 #define IA32_FEATURE_CONTROL_MSR_LOCK           0x1
 #define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON   0x4
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/page-guest32.h
--- a/xen/include/asm-x86/page-guest32.h        Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/page-guest32.h        Wed Aug 16 17:11:56 2006 +0100
@@ -89,15 +89,8 @@ static inline l2_pgentry_32_t l2e_from_p
 
 #define linear_l1_table_32                                                 \
     ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table_32                                                 \
-    ((l2_pgentry_32_t *)(LINEAR_PT_VIRT_START +                            \
-                     (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0))))
 
 #define linear_pg_table_32 linear_l1_table_32
-#define linear_l2_table_32(_ed) ((_ed)->arch.guest_vtable)
-
-#define va_to_l1mfn_32(_ed, _va) \
-    (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT]))
 
 #endif /* __X86_PAGE_GUEST_H__ */
 
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h        Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/page.h        Wed Aug 16 17:11:56 2006 +0100
@@ -233,26 +233,18 @@ typedef struct { u64 pfn; } pagetable_t;
      + DOMAIN_ENTRIES_PER_L4_PAGETABLE)
 #endif
 
-#define LINEAR_PT_OFFSET (LINEAR_PT_VIRT_START & VADDR_MASK)
-#define linear_l1_table                                             \
-    ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table                                           \
-    ((l2_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0))))
-#define __linear_l3_table                                           \
-    ((l3_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) +   \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1))))
-#define __linear_l4_table                                           \
-    ((l4_pgentry_t *)(LINEAR_PT_VIRT_START +                        \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) +   \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)) +   \
-                     (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<2))))
-
+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
+#define linear_l1_table __linear_l1_table
 #define linear_pg_table linear_l1_table
-#define linear_l2_table(v) ((v)->arch.guest_vtable)
-#define linear_l3_table(v) ((v)->arch.guest_vl3table)
-#define linear_l4_table(v) ((v)->arch.guest_vl4table)
+#define linear_l2_table(v) ((l2_pgentry_t *)(v)->arch.guest_vtable)
 
 #ifndef __ASSEMBLY__
 #if CONFIG_PAGING_LEVELS == 3
@@ -294,6 +286,7 @@ extern void paging_init(void);
 #define _PAGE_AVAIL1   0x400U
 #define _PAGE_AVAIL2   0x800U
 #define _PAGE_AVAIL    0xE00U
+#define _PAGE_PSE_PAT 0x1000U
 
 /*
  * Debug option: Ensure that granted mappings are not implicitly unmapped.
@@ -307,9 +300,9 @@ extern void paging_init(void);
 #endif
 
 /*
- * Disallow unused flag bits plus PAT, PSE and GLOBAL. Also disallow GNTTAB
- * if we are using it for grant-table debugging. Permit the NX bit if the
- * hardware supports it.
+ * Disallow unused flag bits plus PAT, PSE and GLOBAL.
+ * Also disallow GNTTAB if we are using it for grant-table debugging.
+ * Permit the NX bit if the hardware supports it.
  */
 #define BASE_DISALLOW_MASK ((0xFFFFF180U | _PAGE_GNTTAB) & ~_PAGE_NX)
 
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/perfc_defn.h  Wed Aug 16 17:11:56 2006 +0100
@@ -144,4 +144,57 @@ PERFCOUNTER_CPU(remove_write_bad_predict
 PERFCOUNTER_CPU(remove_write_bad_prediction, "remove_write bad prediction")
 PERFCOUNTER_CPU(update_hl2e_invlpg,     "update_hl2e calls invlpg")
 
+/* Shadow2 counters */
+PERFCOUNTER_CPU(shadow2_alloc,          "calls to shadow2_alloc")
+PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs")
+PERFSTATUS(shadow2_alloc_count,         "number of shadow pages in use")
+PERFCOUNTER_CPU(shadow2_free,           "calls to shadow2_free")
+PERFCOUNTER_CPU(shadow2_prealloc_1,     "shadow2 recycles old shadows")
+PERFCOUNTER_CPU(shadow2_prealloc_2,     "shadow2 recycles in-use shadows")
+PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map")
+PERFCOUNTER_CPU(shadow2_a_update,       "shadow2 A bit update")
+PERFCOUNTER_CPU(shadow2_ad_update,      "shadow2 A&D bit update")
+PERFCOUNTER_CPU(shadow2_fault,          "calls to shadow2_fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn")
+PERFCOUNTER_CPU(shadow2_fault_bail_not_present, 
+                                        "shadow2_fault guest not-present")
+PERFCOUNTER_CPU(shadow2_fault_bail_nx,  "shadow2_fault guest NX fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault")
+PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor, 
+                                        "shadow2_fault guest U/S fault")
+PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read")
+PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write")
+PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails")
+PERFCOUNTER_CPU(shadow2_fault_mmio,     "shadow2_fault handled as mmio")
+PERFCOUNTER_CPU(shadow2_fault_fixed,    "shadow2_fault fixed fault")
+PERFCOUNTER_CPU(shadow2_ptwr_emulate,   "shadow2 causes ptwr to emulate")
+PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e")
+PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e")
+PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e")
+PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e")
+PERFCOUNTER_CPU(shadow2_hash_lookups,   "calls to shadow2_hash_lookup")
+PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head")
+PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses")
+PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status")
+PERFCOUNTER_CPU(shadow2_hash_inserts,   "calls to shadow2_hash_insert")
+PERFCOUNTER_CPU(shadow2_hash_deletes,   "calls to shadow2_hash_delete")
+PERFCOUNTER_CPU(shadow2_writeable,      "shadow2 removes write access")
+PERFCOUNTER_CPU(shadow2_writeable_h_1,  "shadow2 writeable: 32b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_2,  "shadow2 writeable: 32pae w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_3,  "shadow2 writeable: 64b w2k3")
+PERFCOUNTER_CPU(shadow2_writeable_h_4,  "shadow2 writeable: 32b linux low")
+PERFCOUNTER_CPU(shadow2_writeable_bf,   "shadow2 writeable brute-force")
+PERFCOUNTER_CPU(shadow2_mappings,       "shadow2 removes all mappings")
+PERFCOUNTER_CPU(shadow2_mappings_bf,    "shadow2 rm-mappings brute-force")
+PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit")
+PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit")
+PERFCOUNTER_CPU(shadow2_unshadow,       "shadow2 unshadows a page")
+PERFCOUNTER_CPU(shadow2_up_pointer,     "shadow2 unshadow by up-pointer")
+PERFCOUNTER_CPU(shadow2_unshadow_bf,    "shadow2 unshadow brute-force")
+PERFCOUNTER_CPU(shadow2_get_page_fail,  "shadow2_get_page_from_l1e failed")
+PERFCOUNTER_CPU(shadow2_guest_walk,     "shadow2 walks guest tables")
+PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits")
+PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses")
+
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h   Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/processor.h   Wed Aug 16 17:11:56 2006 +0100
@@ -545,6 +545,7 @@ extern always_inline void prefetchw(cons
 #endif
 
 void show_stack(struct cpu_user_regs *regs);
+void show_xen_trace(void);
 void show_stack_overflow(unsigned long esp);
 void show_registers(struct cpu_user_regs *regs);
 void show_execution_state(struct cpu_user_regs *regs);
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/shadow.h
--- a/xen/include/asm-x86/shadow.h      Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/shadow.h      Wed Aug 16 17:11:56 2006 +0100
@@ -1,8 +1,7 @@
 /******************************************************************************
  * include/asm-x86/shadow.h
  * 
- * Copyright (c) 2005 Michael A Fetterman
- * Based on an earlier implementation by Ian Pratt et al
+ * Copyright (c) 2006 by XenSource Inc.
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,1782 +21,28 @@
 #ifndef _XEN_SHADOW_H
 #define _XEN_SHADOW_H
 
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/perfc.h>
-#include <xen/sched.h>
-#include <xen/mm.h>
-#include <xen/domain_page.h>
-#include <asm/current.h>
-#include <asm/flushtlb.h>
-#include <asm/processor.h>
-#include <asm/hvm/hvm.h>
-#include <asm/hvm/support.h>
-#include <asm/regs.h>
-#include <public/dom0_ops.h>
-#include <asm/shadow_public.h>
-#include <asm/page-guest32.h>
-#include <asm/shadow_ops.h>
+/* This file is just a wrapper around the new Shadow2 header,
+ * providing names that must be defined in any shadow implementation. */
 
-/* Shadow PT operation mode : shadow-mode variable in arch_domain. */
+#include <asm/shadow2.h>
 
-#define SHM_enable    (1<<0) /* we're in one of the shadow modes */
-#define SHM_refcounts (1<<1) /* refcounts based on shadow tables instead of
-                                guest tables */
-#define SHM_write_all (1<<2) /* allow write access to all guest pt pages,
-                                regardless of pte write permissions */
-#define SHM_log_dirty (1<<3) /* enable log dirty mode */
-#define SHM_translate (1<<4) /* Xen does p2m translation, not guest */
-#define SHM_external  (1<<5) /* Xen does not steal address space from the
-                                domain for its own booking; requires VT or
-                                similar mechanisms */
-#define SHM_wr_pt_pte (1<<6) /* guest allowed to set PAGE_RW bit in PTEs which
-                                point to page table pages. */
+/* How to make sure a page is not referred to in a shadow PT */
+/* This will need to be a for_each_vcpu if we go to per-vcpu shadows */ 
+#define shadow_drop_references(_d, _p)                      \
+    shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
+#define shadow_sync_and_drop_references(_d, _p)             \
+    shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p)))
 
-#define shadow_mode_enabled(_d)   ((_d)->arch.shadow_mode)
-#define shadow_mode_refcounts(_d) ((_d)->arch.shadow_mode & SHM_refcounts)
-#define shadow_mode_write_l1(_d)  (VM_ASSIST(_d, VMASST_TYPE_writable_pagetables))
-#define shadow_mode_write_all(_d) ((_d)->arch.shadow_mode & SHM_write_all)
-#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty)
-#define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate)
-#define shadow_mode_external(_d)  ((_d)->arch.shadow_mode & SHM_external)
-#define shadow_mode_wr_pt_pte(_d) ((_d)->arch.shadow_mode & SHM_wr_pt_pte)
+/* Whether we are translating the domain's frame numbers for it */
+#define shadow_mode_translate(d)  shadow2_mode_translate(d)
 
-#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
-#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
-     (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-#define shadow_linear_l2_table(_v) ((_v)->arch.shadow_vtable)
+/* ...and if so, how to add and remove entries in the mapping */
+#define guest_physmap_add_page(_d, _p, _m)                  \
+    shadow2_guest_physmap_add_page((_d), (_p), (_m))
+#define guest_physmap_remove_page(_d, _p, _m)               \
+    shadow2_guest_physmap_remove_page((_d), (_p), (_m))
 
-// easy access to the hl2 table (for translated but not external modes only)
-#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \
-     (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT))))
-
-/*
- * For now we use the per-domain BIGLOCK rather than a shadow-specific lock.
- * We usually have the BIGLOCK already acquired anyway, so this is unlikely
- * to cause much unnecessary extra serialisation. Also it's a recursive
- * lock, and there are some code paths containing nested shadow_lock().
- * The #if0'ed code below is therefore broken until such nesting is removed.
- */
-#if 0
-#define shadow_lock_init(_d)                    \
-    spin_lock_init(&(_d)->arch.shadow_lock)
-#define shadow_lock_is_acquired(_d)             \
-    spin_is_locked(&(_d)->arch.shadow_lock)
-#define shadow_lock(_d)                         \
-do {                                            \
-    ASSERT(!shadow_lock_is_acquired(_d));       \
-    spin_lock(&(_d)->arch.shadow_lock);         \
-} while (0)
-#define shadow_unlock(_d)                       \
-do {                                            \
-    ASSERT(!shadow_lock_is_acquired(_d));       \
-    spin_unlock(&(_d)->arch.shadow_lock);       \
-} while (0)
-#else
-#define shadow_lock_init(_d)                    \
-    ((_d)->arch.shadow_nest = 0)
-#define shadow_lock_is_acquired(_d)             \
-    (spin_is_locked(&(_d)->big_lock) && ((_d)->arch.shadow_nest != 0))
-#define shadow_lock(_d)                         \
-do {                                            \
-    LOCK_BIGLOCK(_d);                           \
-    (_d)->arch.shadow_nest++;                   \
-} while (0)
-#define shadow_unlock(_d)                       \
-do {                                            \
-    ASSERT(shadow_lock_is_acquired(_d));        \
-    (_d)->arch.shadow_nest--;                   \
-    UNLOCK_BIGLOCK(_d);                         \
-} while (0)
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 3
-static inline u64 get_cr3_idxval(struct vcpu *v)
-{
-    u64 pae_cr3;
-
-    if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 &&
-            !shadow_mode_log_dirty(v->domain) )
-    {
-        pae_cr3 = hvm_get_guest_ctrl_reg(v, 3); /* get CR3 */
-        return (pae_cr3 >> PAE_CR3_ALIGN) & PAE_CR3_IDX_MASK;
-    }
-    else
-        return 0;
-}
-
-#define shadow_key_t u64
-#define index_to_key(x) ((x) << 32)
-#else
-#define get_cr3_idxval(v) (0)
-#define shadow_key_t unsigned long
-#define index_to_key(x)  (0)
-#endif
-
-
-#define SHADOW_ENCODE_MIN_MAX(_min, _max) ((((GUEST_L1_PAGETABLE_ENTRIES - 1) - (_max)) << 16) | (_min))
-#define SHADOW_MIN(_encoded) ((_encoded) & ((1u<<16) - 1))
-#define SHADOW_MAX(_encoded) ((GUEST_L1_PAGETABLE_ENTRIES - 1) - ((_encoded) >> 16))
-extern void shadow_direct_map_clean(struct domain *d);
-extern int shadow_direct_map_init(struct domain *d);
-extern int shadow_direct_map_fault(
-    unsigned long vpa, struct cpu_user_regs *regs);
-extern void shadow_mode_init(void);
-extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc);
-extern int shadow_fault(unsigned long va, struct cpu_user_regs *regs);
-extern int shadow_mode_enable(struct domain *p, unsigned int mode);
-extern void shadow_invlpg(struct vcpu *, unsigned long);
-extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync(
-    struct vcpu *v, unsigned long gpfn, unsigned long mfn);
-extern void free_monitor_pagetable(struct vcpu *v);
-extern void __shadow_sync_all(struct domain *d);
-extern int __shadow_out_of_sync(struct vcpu *v, unsigned long va);
-extern int set_p2m_entry(
-    struct domain *d, unsigned long pfn, unsigned long mfn,
-    struct domain_mmap_cache *l2cache,
-    struct domain_mmap_cache *l1cache);
-extern void remove_shadow(struct domain *d, unsigned long gpfn, u32 stype);
-
-extern void free_shadow_page(unsigned long smfn);
-
-extern void shadow_l1_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l1_pgentry_t l1e,
-                                       struct domain_mmap_cache *cache);
-extern void shadow_l2_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l2_pgentry_t l2e,
-                                       struct domain_mmap_cache *cache);
-#if CONFIG_PAGING_LEVELS >= 3
-#include <asm/page-guest32.h>
-/*
- * va_mask cannot be used because it's used by the shadow hash.
- * Use the score area for now.
- */
-#define is_xen_l2_slot(t,s)                                                    \
-    ( ((((t) & PGT_score_mask) >> PGT_score_shift) == 3) &&                    \
-      ((s) >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES - 1))) )
-
-extern unsigned long gva_to_gpa(unsigned long gva);
-extern void shadow_l3_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l3_pgentry_t l3e,
-                                       struct domain_mmap_cache *cache);
-#endif
-#if CONFIG_PAGING_LEVELS >= 4
-extern void shadow_l4_normal_pt_update(struct domain *d,
-                                       paddr_t pa, l4_pgentry_t l4e,
-                                       struct domain_mmap_cache *cache);
-#endif
-extern int shadow_do_update_va_mapping(unsigned long va,
-                                       l1_pgentry_t val,
-                                       struct vcpu *v);
-
-
-static inline unsigned long __shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long stype);
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void update_hl2e(struct vcpu *v, unsigned long va);
-#endif
-
-static inline int page_is_page_table(struct page_info *page)
-{
-    struct domain *owner = page_get_owner(page);
-    u32 type_info;
-
-    if ( owner && shadow_mode_refcounts(owner) )
-        return page->count_info & PGC_page_table;
-
-    type_info = page->u.inuse.type_info & PGT_type_mask;
-    return type_info && (type_info <= PGT_l4_page_table);
-}
-
-static inline int mfn_is_page_table(unsigned long mfn)
-{
-    if ( !mfn_valid(mfn) )
-        return 0;
-
-    return page_is_page_table(mfn_to_page(mfn));
-}
-
-static inline int page_out_of_sync(struct page_info *page)
-{
-    return page->count_info & PGC_out_of_sync;
-}
-
-static inline int mfn_out_of_sync(unsigned long mfn)
-{
-    if ( !mfn_valid(mfn) )
-        return 0;
-
-    return page_out_of_sync(mfn_to_page(mfn));
-}
-
-
-/************************************************************************/
-
-static void inline
-__shadow_sync_mfn(struct domain *d, unsigned long mfn)
-{
-    if ( d->arch.out_of_sync )
-    {
-        // XXX - could be smarter
-        //
-        __shadow_sync_all(d);
-    }
-}
-
-static void inline
-__shadow_sync_va(struct vcpu *v, unsigned long va)
-{
-    struct domain *d = v->domain;
-
-    if ( d->arch.out_of_sync && __shadow_out_of_sync(v, va) )
-    {
-        perfc_incrc(shadow_sync_va);
-
-        // XXX - could be smarter
-        //
-        __shadow_sync_all(v->domain);
-    }
-#if CONFIG_PAGING_LEVELS <= 2
-    // Also make sure the HL2 is up-to-date for this address.
-    //
-    if ( unlikely(shadow_mode_translate(v->domain)) )
-        update_hl2e(v, va);
-#endif
-}
-
-static void inline
-shadow_sync_all(struct domain *d)
-{
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        shadow_lock(d);
-
-        if ( d->arch.out_of_sync )
-            __shadow_sync_all(d);
-
-        ASSERT(d->arch.out_of_sync == NULL);
-
-        shadow_unlock(d);
-    }
-}
-
-// SMP BUG: This routine can't ever be used properly in an SMP context.
-//          It should be something like get_shadow_and_sync_va().
-//          This probably shouldn't exist.
-//
-static void inline
-shadow_sync_va(struct vcpu *v, unsigned long gva)
-{
-    struct domain *d = v->domain;
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        shadow_lock(d);
-        __shadow_sync_va(v, gva);
-        shadow_unlock(d);
-    }
-}
-
-extern void __shadow_mode_disable(struct domain *d);
-static inline void shadow_mode_disable(struct domain *d)
-{
-    if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        shadow_lock(d);
-        __shadow_mode_disable(d);
-        shadow_unlock(d);
-    }
-}
-
-/************************************************************************/
-
-#define mfn_to_gmfn(_d, mfn)                         \
-    ( (shadow_mode_translate(_d))                      \
-      ? get_gpfn_from_mfn(mfn)                          \
-      : (mfn) )
-
-#define gmfn_to_mfn(_d, gpfn)                        \
-    ({                                                 \
-        unlikely(shadow_mode_translate(_d))            \
-        ? (likely(current->domain == (_d))             \
-           ? get_mfn_from_gpfn(gpfn)                    \
-           : get_mfn_from_gpfn_foreign(_d, gpfn))       \
-        : (gpfn);                                      \
-    })
-
-extern unsigned long get_mfn_from_gpfn_foreign(
-    struct domain *d, unsigned long gpfn);
-
-/************************************************************************/
-
-struct shadow_status {
-    struct shadow_status *next;   /* Pull-to-front list per hash bucket. */
-    shadow_key_t  gpfn_and_flags; /* Guest pfn plus flags. */
-    unsigned long smfn;           /* Shadow mfn.           */
-};
-
-#define shadow_ht_extra_size 128
-#define shadow_ht_buckets    256
-
-struct out_of_sync_entry {
-    struct out_of_sync_entry *next;
-    struct vcpu   *v;
-    unsigned long gpfn;    /* why is this here? */
-    unsigned long gmfn;
-    unsigned long snapshot_mfn;
-    paddr_t writable_pl1e; /* NB: this is a machine address */
-    unsigned long va;
-};
-
-#define out_of_sync_extra_size 127
-
-#define SHADOW_SNAPSHOT_ELSEWHERE (-1L)
-
-/************************************************************************/
-#define SHADOW_DEBUG 0
-#define SHADOW_VERBOSE_DEBUG 0
-#define SHADOW_VVERBOSE_DEBUG 0
-#define SHADOW_VVVERBOSE_DEBUG 0
-#define SHADOW_HASH_DEBUG 0
-#define FULLSHADOW_DEBUG 0
-
-#if SHADOW_DEBUG
-extern int shadow_status_noswap;
-#define SHADOW_REFLECTS_SNAPSHOT _PAGE_AVAIL0
-#endif
-
-#if SHADOW_VERBOSE_DEBUG
-#define SH_LOG(_f, _a...)                                               \
-    printk("DOM%uP%u: SH_LOG(%d): " _f "\n",                            \
-       current->domain->domain_id , smp_processor_id(), __LINE__ , ## _a )
-#define SH_VLOG(_f, _a...)                                              \
-    printk("DOM%uP%u: SH_VLOG(%d): " _f "\n",                           \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_LOG(_f, _a...) ((void)0)
-#define SH_VLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVERBOSE_DEBUG
-#define SH_VVLOG(_f, _a...)                                             \
-    printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n",                          \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVLOG(_f, _a...) ((void)0)
-#endif
-
-#if SHADOW_VVVERBOSE_DEBUG
-#define SH_VVVLOG(_f, _a...)                                            \
-    printk("DOM%uP%u: SH_VVVLOG(%d): " _f "\n",                         \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define SH_VVVLOG(_f, _a...) ((void)0)
-#endif
-
-#if FULLSHADOW_DEBUG
-#define FSH_LOG(_f, _a...)                                              \
-    printk("DOM%uP%u: FSH_LOG(%d): " _f "\n",                           \
-           current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a )
-#else
-#define FSH_LOG(_f, _a...) ((void)0)
-#endif
-
-
-/************************************************************************/
-
-static inline int
-shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
-    l1_pgentry_t nl1e;
-    int res;
-    unsigned long mfn;
-    struct domain *owner;
-
-    ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT);
-
-    if ( !shadow_mode_refcounts(d) )
-        return 1;
-
-    nl1e = l1e;
-    l1e_remove_flags(nl1e, _PAGE_GLOBAL);
-
-    if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) )
-        return 0;
-
-    res = get_page_from_l1e(nl1e, d);
-
-    if ( unlikely(!res) && IS_PRIV(d) && !shadow_mode_translate(d) &&
-         !(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) &&
-         (mfn = l1e_get_pfn(nl1e)) &&
-         mfn_valid(mfn) &&
-         (owner = page_get_owner(mfn_to_page(mfn))) &&
-         (d != owner) )
-    {
-        res = get_page_from_l1e(nl1e, owner);
-        printk("tried to map mfn %lx from domain %d into shadow page tables "
-               "of domain %d; %s\n",
-               mfn, owner->domain_id, d->domain_id,
-               res ? "success" : "failed");
-    }
-
-    if ( unlikely(!res) )
-    {
-        perfc_incrc(shadow_get_page_fail);
-        FSH_LOG("%s failed to get ref l1e=%" PRIpte "\n",
-                __func__, l1e_get_intpte(l1e));
-    }
-
-    return res;
-}
-
-static inline void
-shadow_put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-
-    put_page_from_l1e(l1e, d);
-}
-
-static inline void
-shadow_put_page_type(struct domain *d, struct page_info *page)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-
-    put_page_type(page);
-}
-
-static inline int shadow_get_page(struct domain *d,
-                                  struct page_info *page,
-                                  struct domain *owner)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return 1;
-    return get_page(page, owner);
-}
-
-static inline void shadow_put_page(struct domain *d,
-                                   struct page_info *page)
-{
-    if ( !shadow_mode_refcounts(d) )
-        return;
-    put_page(page);
-}
-
-/************************************************************************/
-
-static inline void __mark_dirty(struct domain *d, unsigned long mfn)
-{
-    unsigned long pfn;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    if ( likely(!shadow_mode_log_dirty(d)) || !VALID_MFN(mfn) )
-        return;
-
-    ASSERT(d->arch.shadow_dirty_bitmap != NULL);
-
-    /* We /really/ mean PFN here, even for non-translated guests. */
-    pfn = get_gpfn_from_mfn(mfn);
-
-    /*
-     * Values with the MSB set denote MFNs that aren't really part of the 
-     * domain's pseudo-physical memory map (e.g., the shared info frame).
-     * Nothing to do here...
-     */
-    if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) )
-        return;
-
-    /* N.B. Can use non-atomic TAS because protected by shadow_lock. */
-    if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) &&
-         !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
-    {
-        d->arch.shadow_dirty_count++;
-    }
-#ifndef NDEBUG
-    else if ( mfn_valid(mfn) )
-    {
-        SH_VLOG("mark_dirty OOR! mfn=%lx pfn=%lx max=%x (dom %p)",
-               mfn, pfn, d->arch.shadow_dirty_bitmap_size, d);
-        SH_VLOG("dom=%p caf=%08x taf=%" PRtype_info, 
-                page_get_owner(mfn_to_page(mfn)),
-                mfn_to_page(mfn)->count_info, 
-                mfn_to_page(mfn)->u.inuse.type_info );
-    }
-#endif
-}
-
-
-static inline void mark_dirty(struct domain *d, unsigned int mfn)
-{
-    if ( unlikely(shadow_mode_log_dirty(d)) )
-    {
-        shadow_lock(d);
-        __mark_dirty(d, mfn);
-        shadow_unlock(d);
-    }
-}
-
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-static inline void
-__shadow_get_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    *psl2e = v->arch.shadow_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__shadow_set_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    v->arch.shadow_vtable[l2_table_offset(va)] = value;
-}
-
-static inline void
-__guest_get_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t *pl2e)
-{
-    *pl2e = v->arch.guest_vtable[l2_table_offset(va)];
-}
-
-static inline void
-__guest_set_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
-    struct domain *d = v->domain;
-
-    v->arch.guest_vtable[l2_table_offset(va)] = value;
-
-    if ( unlikely(shadow_mode_translate(d)) )
-        update_hl2e(v, va);
-
-    __mark_dirty(d, pagetable_get_pfn(v->arch.guest_table));
-}
-
-static inline void
-__direct_get_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e)
-{
-    l2_pgentry_t *phys_vtable;
-
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    phys_vtable = map_domain_page(
-        pagetable_get_pfn(v->domain->arch.phys_table));
-
-    *psl2e = phys_vtable[l2_table_offset(va)];
-
-    unmap_domain_page(phys_vtable);
-}
-
-static inline void
-__direct_set_l2e(
-    struct vcpu *v, unsigned long va, l2_pgentry_t value)
-{
-    l2_pgentry_t *phys_vtable;
-
-    ASSERT(shadow_mode_enabled(v->domain));
-
-    phys_vtable = map_domain_page(
-        pagetable_get_pfn(v->domain->arch.phys_table));
-
-    phys_vtable[l2_table_offset(va)] = value;
-
-    unmap_domain_page(phys_vtable);
-}
-
-static inline void
-update_hl2e(struct vcpu *v, unsigned long va)
-{
-    int index = l2_table_offset(va);
-    unsigned long mfn;
-    l2_pgentry_t gl2e = v->arch.guest_vtable[index];
-    l1_pgentry_t old_hl2e, new_hl2e;
-    int need_flush = 0;
-
-    ASSERT(shadow_mode_translate(v->domain));
-
-    old_hl2e = v->arch.hl2_vtable[index];
-
-    if ( (l2e_get_flags(gl2e) & _PAGE_PRESENT) &&
-         VALID_MFN(mfn = get_mfn_from_gpfn(l2e_get_pfn(gl2e))) )
-        new_hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
-    else
-        new_hl2e = l1e_empty();
-
-    // only do the ref counting if something has changed.
-    //
-    if ( (l1e_has_changed(old_hl2e, new_hl2e, PAGE_FLAG_MASK)) )
-    {
-        if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
-             !shadow_get_page(v->domain, mfn_to_page(l1e_get_pfn(new_hl2e)),
-                              v->domain) )
-            new_hl2e = l1e_empty();
-        if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
-        {
-            shadow_put_page(v->domain, mfn_to_page(l1e_get_pfn(old_hl2e)));
-            need_flush = 1;
-        }
-
-        v->arch.hl2_vtable[l2_table_offset(va)] = new_hl2e;
-
-        if ( need_flush )
-        {
-            perfc_incrc(update_hl2e_invlpg);
-            flush_tlb_one_mask(v->domain->domain_dirty_cpumask,
-                               &linear_pg_table[l1_linear_offset(va)]);
-        }
-    }
-}
-
-static inline void shadow_drop_references(
-    struct domain *d, struct page_info *page)
-{
-    if ( likely(!shadow_mode_refcounts(d)) ||
-         ((page->u.inuse.type_info & PGT_count_mask) == 0) )
-        return;
-
-    /* XXX This needs more thought... */
-    printk("%s: needing to call shadow_remove_all_access for mfn=%lx\n",
-           __func__, page_to_mfn(page));
-    printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
-           page->count_info, page->u.inuse.type_info);
-
-    shadow_lock(d);
-    shadow_remove_all_access(d, page_to_mfn(page));
-    shadow_unlock(d);
-
-    printk("After:  mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page),
-           page->count_info, page->u.inuse.type_info);
-}
-
-/* XXX Needs more thought. Neither pretty nor fast: a place holder. */
-static inline void shadow_sync_and_drop_references(
-    struct domain *d, struct page_info *page)
-{
-    if ( likely(!shadow_mode_refcounts(d)) )
-        return;
-
-    if ( page_out_of_sync(page) )
-        __shadow_sync_mfn(d, page_to_mfn(page));
-
-    shadow_remove_all_access(d, page_to_mfn(page));
-}
-#endif
-
-/************************************************************************/
-
-/*
- * Add another shadow reference to smfn.
- */
-static inline int
-get_shadow_ref(unsigned long smfn)
-{
-    u32 x, nx;
-
-    ASSERT(mfn_valid(smfn));
-
-    x = mfn_to_page(smfn)->count_info;
-    nx = x + 1;
-
-    if ( unlikely(nx == 0) )
-    {
-        printk("get_shadow_ref overflow, gmfn=%" PRtype_info  " smfn=%lx\n",
-               mfn_to_page(smfn)->u.inuse.type_info & PGT_mfn_mask,
-               smfn);
-        BUG();
-    }
-    
-    // Guarded by the shadow lock...
-    //
-    mfn_to_page(smfn)->count_info = nx;
-
-    return 1;
-}
-
-/*
- * Drop a shadow reference to smfn.
- */
-static inline void
-put_shadow_ref(unsigned long smfn)
-{
-    u32 x, nx;
-
-    ASSERT(mfn_valid(smfn));
-
-    x = mfn_to_page(smfn)->count_info;
-    nx = x - 1;
-
-    if ( unlikely(x == 0) )
-    {
-        printk("put_shadow_ref underflow, smfn=%lx oc=%08x t=%" 
-               PRtype_info "\n",
-               smfn,
-               mfn_to_page(smfn)->count_info,
-               mfn_to_page(smfn)->u.inuse.type_info);
-        BUG();
-    }
-
-    // Guarded by the shadow lock...
-    //
-    mfn_to_page(smfn)->count_info = nx;
-
-    if ( unlikely(nx == 0) )
-    {
-        free_shadow_page(smfn);
-    }
-}
-
-static inline void
-shadow_pin(unsigned long smfn)
-{
-    ASSERT( !(mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
-    mfn_to_page(smfn)->u.inuse.type_info |= PGT_pinned;
-    if ( unlikely(!get_shadow_ref(smfn)) )
-        BUG();
-}
-
-static inline void
-shadow_unpin(unsigned long smfn)
-{
-    ASSERT( (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) );
-
-    mfn_to_page(smfn)->u.inuse.type_info &= ~PGT_pinned;
-    put_shadow_ref(smfn);
-}
-
-/*
- * SMP issue. The following code assumes the shadow lock is held. Re-visit
- * when working on finer-gained locks for shadow.
- */
-static inline void set_guest_back_ptr(
-    struct domain *d, l1_pgentry_t spte,
-    unsigned long smfn, unsigned int index)
-{
-    struct page_info *gpage;
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    if ( !shadow_mode_external(d) || 
-         ((l1e_get_flags(spte) & (_PAGE_PRESENT|_PAGE_RW)) !=
-          (_PAGE_PRESENT|_PAGE_RW)) )
-        return;
-
-    gpage = l1e_get_page(spte);
-
-    ASSERT(smfn != 0);
-    ASSERT(page_to_mfn(gpage) != 0);
-
-    gpage->tlbflush_timestamp = smfn;
-    gpage->u.inuse.type_info &= ~PGT_va_mask;
-    gpage->u.inuse.type_info |= (unsigned long)index << PGT_va_shift;
-}
-
-/************************************************************************/
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_mark_va_out_of_sync(
-    struct vcpu *v, unsigned long gpfn, unsigned long mfn,
-    unsigned long va);
-
-static inline int l1pte_write_fault(
-    struct vcpu *v, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p,
-    unsigned long va)
-{
-    struct domain *d = v->domain;
-    l1_pgentry_t gpte = *gpte_p;
-    l1_pgentry_t spte;
-    unsigned long gpfn = l1e_get_pfn(gpte);
-    unsigned long gmfn = gmfn_to_mfn(d, gpfn);
-
-    //printk("l1pte_write_fault gmfn=%lx\n", gmfn);
-
-    if ( unlikely(!VALID_MFN(gmfn)) )
-    {
-        SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn);
-        *spte_p = l1e_empty();
-        return 0;
-    }
-
-    ASSERT(l1e_get_flags(gpte) & _PAGE_RW);
-    l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED);
-    spte = l1e_from_pfn(gmfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
-    SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
-             l1e_get_intpte(spte), l1e_get_intpte(gpte));
-
-    __mark_dirty(d, gmfn);
-
-    if ( mfn_is_page_table(gmfn) )
-        shadow_mark_va_out_of_sync(v, gpfn, gmfn, va);
-
-    *gpte_p = gpte;
-    *spte_p = spte;
-
-    return 1;
-}
-
-static inline int l1pte_read_fault(
-    struct domain *d, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p)
-{ 
-    l1_pgentry_t gpte = *gpte_p;
-    l1_pgentry_t spte = *spte_p;
-    unsigned long pfn = l1e_get_pfn(gpte);
-    unsigned long mfn = gmfn_to_mfn(d, pfn);
-
-    if ( unlikely(!VALID_MFN(mfn)) )
-    {
-        SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn);
-        *spte_p = l1e_empty();
-        return 0;
-    }
-
-    l1e_add_flags(gpte, _PAGE_ACCESSED);
-    spte = l1e_from_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL);
-
-    if ( shadow_mode_log_dirty(d) || !(l1e_get_flags(gpte) & _PAGE_DIRTY) ||
-         mfn_is_page_table(mfn) )
-    {
-        l1e_remove_flags(spte, _PAGE_RW);
-    }
-
-    SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte,
-             l1e_get_intpte(spte), l1e_get_intpte(gpte));
-    *gpte_p = gpte;
-    *spte_p = spte;
-
-    return 1;
-}
-#endif
-
-static inline void l1pte_propagate_from_guest(
-    struct domain *d, guest_l1_pgentry_t gpte, l1_pgentry_t *spte_p)
-{ 
-    unsigned long mfn;
-    l1_pgentry_t spte;
-
-    spte = l1e_empty();
-
-    if ( ((guest_l1e_get_flags(gpte) & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
-          (_PAGE_PRESENT|_PAGE_ACCESSED)) &&
-         VALID_MFN(mfn = gmfn_to_mfn(d, l1e_get_pfn(gpte))) )
-    {
-        spte = l1e_from_pfn(
-            mfn, guest_l1e_get_flags(gpte) & ~(_PAGE_GLOBAL | _PAGE_AVAIL));
-
-        if ( shadow_mode_log_dirty(d) ||
-             !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) ||
-             mfn_is_page_table(mfn) )
-        {
-            l1e_remove_flags(spte, _PAGE_RW);
-        }
-    }
-
-    if ( l1e_get_intpte(spte) || l1e_get_intpte(gpte) )
-        SH_VVVLOG("%s: gpte=%" PRIpte ", new spte=%" PRIpte,
-                  __func__, l1e_get_intpte(gpte), l1e_get_intpte(spte));
-
-    *spte_p = spte;
-}
-
-static inline void hl2e_propagate_from_guest(
-    struct domain *d, l2_pgentry_t gpde, l1_pgentry_t *hl2e_p)
-{
-    unsigned long pfn = l2e_get_pfn(gpde);
-    unsigned long mfn;
-    l1_pgentry_t hl2e;
-    
-    hl2e = l1e_empty();
-
-    if ( l2e_get_flags(gpde) & _PAGE_PRESENT )
-    {
-        mfn = gmfn_to_mfn(d, pfn);
-        if ( VALID_MFN(mfn) && mfn_valid(mfn) )
-            hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR);
-    }
-
-    if ( l1e_get_intpte(hl2e) || l2e_get_intpte(gpde) )
-        SH_VVLOG("%s: gpde=%" PRIpte " hl2e=%" PRIpte, __func__,
-                 l2e_get_intpte(gpde), l1e_get_intpte(hl2e));
-
-    *hl2e_p = hl2e;
-}
-
-static inline void l2pde_general(
-    struct domain *d,
-    guest_l2_pgentry_t *gpde_p,
-    l2_pgentry_t *spde_p,
-    unsigned long sl1mfn)
-{
-    guest_l2_pgentry_t gpde = *gpde_p;
-    l2_pgentry_t spde;
-
-    spde = l2e_empty();
-    if ( (guest_l2e_get_flags(gpde) & _PAGE_PRESENT) && (sl1mfn != 0) )
-    {
-        spde = l2e_from_pfn(
-            sl1mfn,
-            (guest_l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL);
-
-        /* N.B. PDEs do not have a dirty bit. */
-        guest_l2e_add_flags(gpde, _PAGE_ACCESSED);
-
-        *gpde_p = gpde;
-    } 
-
-    if ( l2e_get_intpte(spde) || l2e_get_intpte(gpde) )
-        SH_VVLOG("%s: gpde=%" PRIpte ", new spde=%" PRIpte, __func__,
-                 l2e_get_intpte(gpde), l2e_get_intpte(spde));
-
-    *spde_p = spde;
-}
-
-static inline void l2pde_propagate_from_guest(
-    struct domain *d, guest_l2_pgentry_t *gpde_p, l2_pgentry_t *spde_p)
-{
-    guest_l2_pgentry_t gpde = *gpde_p;
-    unsigned long sl1mfn = 0;
-
-    if ( guest_l2e_get_flags(gpde) & _PAGE_PRESENT )
-        sl1mfn =  __shadow_status(d, l2e_get_pfn(gpde), PGT_l1_shadow);
-    l2pde_general(d, gpde_p, spde_p, sl1mfn);
-}
-    
-/************************************************************************/
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pte_change(
-    struct domain *d,
-    guest_l1_pgentry_t new_pte,
-    l1_pgentry_t *shadow_pte_p)
-{
-    l1_pgentry_t old_spte, new_spte;
-    int need_flush = 0;
-
-    perfc_incrc(validate_pte_calls);
-
-    l1pte_propagate_from_guest(d, new_pte, &new_spte);
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        old_spte = *shadow_pte_p;
-
-        if ( l1e_get_intpte(old_spte) == l1e_get_intpte(new_spte) )
-        {
-            // No accounting required...
-            //
-            perfc_incrc(validate_pte_changes1);
-        }
-        else if ( l1e_get_intpte(old_spte) == (l1e_get_intpte(new_spte)|_PAGE_RW) )
-        {
-            // Fast path for PTEs that have merely been write-protected
-            // (e.g., during a Unix fork()). A strict reduction in privilege.
-            //
-            perfc_incrc(validate_pte_changes2);
-            if ( likely(l1e_get_flags(new_spte) & _PAGE_PRESENT) )
-                shadow_put_page_type(d, mfn_to_page(l1e_get_pfn(new_spte)));
-        }
-        else if ( ((l1e_get_flags(old_spte) | l1e_get_flags(new_spte)) &
-                   _PAGE_PRESENT ) &&
-                  l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
-        {
-            // only do the ref counting if something important changed.
-            //
-            perfc_incrc(validate_pte_changes3);
-
-            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-            {
-                shadow_put_page_from_l1e(old_spte, d);
-                need_flush = 1;
-            }
-            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
-                 !shadow_get_page_from_l1e(new_spte, d) ) {
-                new_spte = l1e_empty();
-                need_flush = -1; /* need to unshadow the page */
-            }
-        }
-        else
-        {
-            perfc_incrc(validate_pte_changes4);
-        }
-    }
-
-    *shadow_pte_p = new_spte;
-
-    return need_flush;
-}
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_hl2e_change(
-    struct domain *d,
-    l2_pgentry_t new_gpde,
-    l1_pgentry_t *shadow_hl2e_p)
-{
-    l1_pgentry_t old_hl2e, new_hl2e;
-    int need_flush = 0;
-
-    perfc_incrc(validate_hl2e_calls);
-
-    old_hl2e = *shadow_hl2e_p;
-    hl2e_propagate_from_guest(d, new_gpde, &new_hl2e);
-
-    // Only do the ref counting if something important changed.
-    //
-    if ( ((l1e_get_flags(old_hl2e) | l1e_get_flags(new_hl2e)) & _PAGE_PRESENT) &&
-         l1e_has_changed(old_hl2e, new_hl2e, _PAGE_PRESENT) )
-    {
-        perfc_incrc(validate_hl2e_changes);
-
-        if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) &&
-             !get_page(mfn_to_page(l1e_get_pfn(new_hl2e)), d) )
-            new_hl2e = l1e_empty();
-        if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT )
-        {
-            put_page(mfn_to_page(l1e_get_pfn(old_hl2e)));
-            need_flush = 1;
-        }
-    }
-
-    *shadow_hl2e_p = new_hl2e;
-
-    return need_flush;
-}
-
-// returns true if a tlb flush is needed
-//
-static int inline
-validate_pde_change(
-    struct domain *d,
-    guest_l2_pgentry_t new_gpde,
-    l2_pgentry_t *shadow_pde_p)
-{
-    l2_pgentry_t old_spde, new_spde;
-    int need_flush = 0;
-
-    perfc_incrc(validate_pde_calls);
-
-    old_spde = *shadow_pde_p;
-    l2pde_propagate_from_guest(d, &new_gpde, &new_spde);
-
-    // Only do the ref counting if something important changed.
-    //
-    if ( ((l2e_get_intpte(old_spde) | l2e_get_intpte(new_spde)) & _PAGE_PRESENT) &&
-         l2e_has_changed(old_spde, new_spde, _PAGE_PRESENT) )
-    {
-        perfc_incrc(validate_pde_changes);
-
-        if ( (l2e_get_flags(new_spde) & _PAGE_PRESENT) &&
-             !get_shadow_ref(l2e_get_pfn(new_spde)) )
-            BUG();
-        if ( l2e_get_flags(old_spde) & _PAGE_PRESENT )
-        {
-            put_shadow_ref(l2e_get_pfn(old_spde));
-            need_flush = 1;
-        }
-    }
-
-    *shadow_pde_p = new_spde;
-
-    return need_flush;
-}
-
-/*********************************************************************/
-
-#if SHADOW_HASH_DEBUG
-
-static void shadow_audit(struct domain *d, int print)
-{
-    int live = 0, free = 0, j = 0, abs;
-    struct shadow_status *a;
-
-    for ( j = 0; j < shadow_ht_buckets; j++ )
-    {
-        a = &d->arch.shadow_ht[j];        
-        if ( a->gpfn_and_flags )
-        {
-            live++;
-            ASSERT(a->smfn);
-        }
-        else
-            ASSERT(!a->next);
-
-        a = a->next;
-        while ( a && (live < 9999) )
-        { 
-            live++;
-            if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) )
-            {
-                printk("XXX live=%d gpfn+flags=%lx sp=%lx next=%p\n",
-                       live, a->gpfn_and_flags, a->smfn, a->next);
-                BUG();
-            }
-            ASSERT(a->smfn);
-            a = a->next;
-        }
-        ASSERT(live < 9999);
-    }
-
-    for ( a = d->arch.shadow_ht_free; a != NULL; a = a->next )
-        free++;
-
-    if ( print )
-        printk("Xlive=%d free=%d\n", live, free);
-
-    // BUG: this only works if there's only a single domain which is
-    //      using shadow tables.
-    //
-    abs = (
-        perfc_value(shadow_l1_pages) +
-        perfc_value(shadow_l2_pages) +
-        perfc_value(hl2_table_pages) +
-        perfc_value(snapshot_pages) +
-        perfc_value(writable_pte_predictions)
-        ) - live;
-#ifdef PERF_COUNTERS
-    if ( (abs < -1) || (abs > 1) )
-    {
-        printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d 
writable_ptes=%d\n",
-               live, free,
-               perfc_value(shadow_l1_pages),
-               perfc_value(shadow_l2_pages),
-               perfc_value(hl2_table_pages),
-               perfc_value(snapshot_pages),
-               perfc_value(writable_pte_predictions));
-        BUG();
-    }
-#endif
-
-    // XXX ought to add some code to audit the out-of-sync entries, too.
-    //
-}
-#else
-#define shadow_audit(p, print) ((void)0)
-#endif
-
-
-static inline struct shadow_status *hash_bucket(
-    struct domain *d, unsigned int gpfn)
-{
-    return &d->arch.shadow_ht[gpfn % shadow_ht_buckets];
-}
-
-
-/*
- * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace,
- *      which, depending on full shadow mode, may or may not equal
- *      its mfn).
- *      It returns the shadow's mfn, or zero if it doesn't exist.
- */
-static inline unsigned long __shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long stype)
-{
-    struct shadow_status *p, *x, *head;
-    shadow_key_t key;
-#if CONFIG_PAGING_LEVELS >= 3
-    if ( d->arch.ops->guest_paging_levels == PAGING_L3 && stype == PGT_l4_shadow )
-        key = gpfn | stype | index_to_key(get_cr3_idxval(current));
-    else
-#endif
-        key = gpfn | stype;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
-    ASSERT(stype && !(stype & ~PGT_type_mask));
-
-    perfc_incrc(shadow_status_calls);
-
-    x = head = hash_bucket(d, gpfn);
-    p = NULL;
-
-    shadow_audit(d, 0);
-
-    do
-    {
-        ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL)));
-
-        if ( x->gpfn_and_flags == key )
-        {
-#if SHADOW_DEBUG
-            if ( unlikely(shadow_status_noswap) )
-                return x->smfn;
-#endif
-            /* Pull-to-front if 'x' isn't already the head item. */
-            if ( unlikely(x != head) )
-            {
-                /* Delete 'x' from list and reinsert immediately after head. */
-                p->next = x->next;
-                x->next = head->next;
-                head->next = x;
-
-                /* Swap 'x' contents with head contents. */
-                SWAP(head->gpfn_and_flags, x->gpfn_and_flags);
-                SWAP(head->smfn, x->smfn);
-            }
-            else
-            {
-                perfc_incrc(shadow_status_hit_head);
-            }
-
-            return head->smfn;
-        }
-
-        p = x;
-        x = x->next;
-    }
-    while ( x != NULL );
-
-    perfc_incrc(shadow_status_miss);
-    return 0;
-}
-
-/*
- * Not clear if pull-to-front is worth while for this or not,
- * as it generally needs to scan the entire bucket anyway.
- * Much simpler without.
- *
- * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table.
- */
-static inline u32
-shadow_max_pgtable_type(struct domain *d, unsigned long gpfn,
-                        unsigned long *smfn)
-{
-    struct shadow_status *x;
-    u32 pttype = PGT_none, type;
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(gpfn == (gpfn & PGT_mfn_mask));
-
-    perfc_incrc(shadow_max_type);
-
-    x = hash_bucket(d, gpfn);
-
-    while ( x && x->gpfn_and_flags )
-    {
-        if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn )
-        {
-            type = x->gpfn_and_flags & PGT_type_mask;
-
-            switch ( type )
-            {
-            case PGT_hl2_shadow:
-                // Treat an HL2 as if it's an L1
-                //
-                type = PGT_l1_shadow;
-                break;
-            case PGT_snapshot:
-            case PGT_writable_pred:
-                // Ignore snapshots -- they don't in and of themselves constitute
-                // treating a page as a page table
-                //
-                goto next;
-            case PGT_base_page_table:
-                // Early exit if we found the max possible value
-                //
-                return type;
-            default:
-                break;
-            }
-
-            if ( type > pttype )
-            {
-                pttype = type;
-                if ( smfn )
-                    *smfn = x->smfn;
-            }
-        }
-    next:
-        x = x->next;
-    }
-
-    return pttype;
-}
-
-static inline void delete_shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned int stype, u64 index)
-{
-    struct shadow_status *p, *x, *n, *head;
-
-    shadow_key_t key = gpfn | stype | index_to_key(index);
-
-    ASSERT(shadow_lock_is_acquired(d));
-    ASSERT(!(gpfn & ~PGT_mfn_mask));
-    ASSERT(stype && !(stype & ~PGT_type_mask));
-
-    head = hash_bucket(d, gpfn);
-
-    SH_VLOG("delete gpfn=%lx t=%08x bucket=%p", gpfn, stype, head);
-    shadow_audit(d, 0);
-
-    /* Match on head item? */
-    if ( head->gpfn_and_flags == key )
-    {
-        if ( (n = head->next) != NULL )
-        {
-            /* Overwrite head with contents of following node. */
-            head->gpfn_and_flags = n->gpfn_and_flags;
-            head->smfn           = n->smfn;
-
-            /* Delete following node. */
-            head->next           = n->next;
-
-            /* Add deleted node to the free list. */
-            n->gpfn_and_flags = 0;
-            n->smfn           = 0;
-            n->next           = d->arch.shadow_ht_free;
-            d->arch.shadow_ht_free = n;
-        }
-        else
-        {
-            /* This bucket is now empty. Initialise the head node. */
-            head->gpfn_and_flags = 0;
-            head->smfn           = 0;
-        }
-
-        goto found;
-    }
-
-    p = head;
-    x = head->next;
-
-    do
-    {
-        if ( x->gpfn_and_flags == key )
-        {
-            /* Delete matching node. */
-            p->next = x->next;
-
-            /* Add deleted node to the free list. */
-            x->gpfn_and_flags = 0;
-            x->smfn           = 0;
-            x->next           = d->arch.shadow_ht_free;
-            d->arch.shadow_ht_free = x;
-
-            goto found;
-        }
-
-        p = x;
-        x = x->next;
-    }
-    while ( x != NULL );
-
-    /* If we got here, it wasn't in the list! */
-    BUG();
-
- found:
-    // release ref to page
-    if ( stype != PGT_writable_pred )
-        put_page(mfn_to_page(gmfn));
-
-    shadow_audit(d, 0);
-}
-
-static inline void set_shadow_status(
-    struct domain *d, unsigned long gpfn, unsigned long gmfn,
-    unsigned long smfn, unsigned long stype, u64 index)
-{
-    struct shadow_status *x, *head, *extra;
-    int i;
-
-    shadow_key_t key = gpfn | stype | index_to_key(index);
-
-    SH_VVLOG("set gpfn=%lx gmfn=%lx smfn=%lx t=%lx", gpfn, gmfn, smfn, stype);
-
-    ASSERT(shadow_lock_is_acquired(d));
-
-    ASSERT(shadow_mode_translate(d) || gpfn);
-    ASSERT(!(gpfn & ~PGT_mfn_mask));
-
-    // XXX - need to be more graceful.
-    ASSERT(VALID_MFN(gmfn));
-
-    ASSERT(stype && !(stype & ~PGT_type_mask));
-
-    x = head = hash_bucket(d, gpfn);
-
-    SH_VLOG("set gpfn=%lx smfn=%lx t=%lx bucket=%p(%p)",
-             gpfn, smfn, stype, x, x->next);
-    shadow_audit(d, 0);
-
-    // grab a reference to the guest page to represent the entry in the shadow
-    // hash table
-    //
-    // XXX - Should PGT_writable_pred grab a page ref?
-    //     - Who/how are these hash table entry refs flushed if/when a page
-    //       is given away by the domain?
-    //
-    if ( stype != PGT_writable_pred )
-        get_page(mfn_to_page(gmfn), d);
-
-    /*
-     * STEP 1. If page is already in the table, update it in place.
-     */
-    do
-    {
-        if ( unlikely(x->gpfn_and_flags == key) )
-        {
-            if ( stype != PGT_writable_pred )
-                BUG(); // we should never replace entries into the hash table
-            x->smfn = smfn;
-            if ( stype != PGT_writable_pred )
-                put_page(mfn_to_page(gmfn)); // already had a ref...
-            goto done;
-        }
-
-        x = x->next;
-    }
-    while ( x != NULL );
-
-    /*
-     * STEP 2. The page must be inserted into the table.
-     */
-
-    /* If the bucket is empty then insert the new page as the head item. */
-    if ( head->gpfn_and_flags == 0 )
-    {
-        head->gpfn_and_flags = key;
-        head->smfn           = smfn;
-        ASSERT(head->next == NULL);
-        goto done;
-    }
-
-    /* We need to allocate a new node. Ensure the quicklist is non-empty. */
-    if ( unlikely(d->arch.shadow_ht_free == NULL) )
-    {
-        SH_VLOG("Allocate more shadow hashtable blocks.");
-
-        extra = xmalloc_bytes(
-            sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
-        /* XXX Should be more graceful here. */
-        if ( extra == NULL )
-            BUG();
-
-        memset(extra, 0, sizeof(void *) + (shadow_ht_extra_size * sizeof(*x)));
-
-        /* Record the allocation block so it can be correctly freed later. */
-        d->arch.shadow_extras_count++;
-        *((struct shadow_status **)&extra[shadow_ht_extra_size]) = 
-            d->arch.shadow_ht_extras;
-        d->arch.shadow_ht_extras = &extra[0];
-
-        /* Thread a free chain through the newly-allocated nodes. */
-        for ( i = 0; i < (shadow_ht_extra_size - 1); i++ )
-            extra[i].next = &extra[i+1];
-        extra[i].next = NULL;
-
-        /* Add the new nodes to the free list. */
-        d->arch.shadow_ht_free = &extra[0];
-    }
-
-    /* Allocate a new node from the quicklist. */
-    x                      = d->arch.shadow_ht_free;
-    d->arch.shadow_ht_free = x->next;
-
-    /* Initialise the new node and insert directly after the head item. */
-    x->gpfn_and_flags = key;
-    x->smfn           = smfn;
-    x->next           = head->next;
-    head->next        = x;
-
- done:
-    shadow_audit(d, 0);
-
-    if ( stype <= PGT_l4_shadow )
-    {
-        // add to front of list of pages to check when removing write
-        // permissions for a page...
-        //
-    }
-}
-
-/************************************************************************/
-
-static inline void guest_physmap_add_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
-    struct domain_mmap_cache c1, c2;
-
-    if ( likely(!shadow_mode_translate(d)) )
-        return;
-
-    domain_mmap_cache_init(&c1);
-    domain_mmap_cache_init(&c2);
-    shadow_lock(d);
-    shadow_sync_and_drop_references(d, mfn_to_page(mfn));
-    set_p2m_entry(d, gpfn, mfn, &c1, &c2);
-    set_gpfn_from_mfn(mfn, gpfn);
-    shadow_unlock(d);
-    domain_mmap_cache_destroy(&c1);
-    domain_mmap_cache_destroy(&c2);
-}
-
-static inline void guest_physmap_remove_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn)
-{
-    struct domain_mmap_cache c1, c2;
-    unsigned long type;
-
-    if ( likely(!shadow_mode_translate(d)) )
-        return;
-
-    domain_mmap_cache_init(&c1);
-    domain_mmap_cache_init(&c2);
-    shadow_lock(d);
-    shadow_sync_and_drop_references(d, mfn_to_page(mfn));
-    while ( (type = shadow_max_pgtable_type(d, gpfn, NULL)) != PGT_none )
-        free_shadow_page(__shadow_status(d, gpfn, type));
-    set_p2m_entry(d, gpfn, -1, &c1, &c2);
-    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
-    shadow_unlock(d);
-    domain_mmap_cache_destroy(&c1);
-    domain_mmap_cache_destroy(&c2);
-}
-
-/************************************************************************/
-
-void static inline
-shadow_update_min_max(unsigned long smfn, int index)
-{
-    struct page_info *sl1page = mfn_to_page(smfn);
-    u32 min_max = sl1page->tlbflush_timestamp;
-    int min = SHADOW_MIN(min_max);
-    int max = SHADOW_MAX(min_max);
-    int update = 0;
-
-    if ( index < min )
-    {
-        min = index;
-        update = 1;
-    }
-    if ( index > max )
-    {
-        max = index;
-        update = 1;
-    }
-    if ( update )
-        sl1page->tlbflush_timestamp = SHADOW_ENCODE_MIN_MAX(min, max);
-}
-
-#if CONFIG_PAGING_LEVELS <= 2
-extern void shadow_map_l1_into_current_l2(unsigned long va);
-
-void static inline
-shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    l2_pgentry_t sl2e = {0};
-
-    __shadow_get_l2e(v, va, &sl2e);
-    if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) )
-    {
-        /*
-         * Either the L1 is not shadowed, or the shadow isn't linked into
-         * the current shadow L2.
-         */
-        if ( create_l1_shadow )
-        {
-            perfc_incrc(shadow_set_l1e_force_map);
-            shadow_map_l1_into_current_l2(va);
-        }
-        else /* check to see if it exists; if so, link it in */
-        {
-            l2_pgentry_t gpde = linear_l2_table(v)[l2_table_offset(va)];
-            unsigned long gl1pfn = l2e_get_pfn(gpde);
-            unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow);
-
-            ASSERT( l2e_get_flags(gpde) & _PAGE_PRESENT );
-
-            if ( sl1mfn )
-            {
-                perfc_incrc(shadow_set_l1e_unlinked);
-                if ( !get_shadow_ref(sl1mfn) )
-                    BUG();
-                l2pde_general(d, &gpde, &sl2e, sl1mfn);
-                __guest_set_l2e(v, va, gpde);
-                __shadow_set_l2e(v, va, sl2e);
-            }
-            else
-            {
-                // no shadow exists, so there's nothing to do.
-                perfc_incrc(shadow_set_l1e_fail);
-                return;
-            }
-        }
-    }
-
-    __shadow_get_l2e(v, va, &sl2e);
-
-    if ( shadow_mode_refcounts(d) )
-    {
-        l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)];
-
-        // only do the ref counting if something important changed.
-        //
-        if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) )
-        {
-            if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) &&
-                 !shadow_get_page_from_l1e(new_spte, d) )
-                new_spte = l1e_empty();
-            if ( l1e_get_flags(old_spte) & _PAGE_PRESENT )
-                shadow_put_page_from_l1e(old_spte, d);
-        }
-
-    }
-
-    set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va));
-    shadow_linear_pg_table[l1_linear_offset(va)] = new_spte;
-    shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va));
-}
-#endif
-/************************************************************************/
-
-static inline int
-shadow_mode_page_writable(unsigned long va, struct cpu_user_regs *regs, unsigned long gpfn)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    unsigned long mfn = gmfn_to_mfn(d, gpfn);
-    u32 type = mfn_to_page(mfn)->u.inuse.type_info & PGT_type_mask;
-
-    if ( shadow_mode_refcounts(d) &&
-         (type == PGT_writable_page) )
-        type = shadow_max_pgtable_type(d, gpfn, NULL);
-
-    // Strange but true: writable page tables allow kernel-mode access
-    // to L1 page table pages via write-protected PTEs...  Similarly, write 
-    // access to all page table pages is granted for shadow_mode_write_all
-    // clients.
-    //
-    if ( ((shadow_mode_write_l1(d) && (type == PGT_l1_page_table)) ||
-          (shadow_mode_write_all(d) && type && (type <= PGT_l4_page_table))) &&
-         ((va < HYPERVISOR_VIRT_START)
-#if defined(__x86_64__)
-          || (va >= HYPERVISOR_VIRT_END)
-#endif
-             ) &&
-         guest_kernel_mode(v, regs) )
-        return 1;
-
-    return 0;
-}
-
-#if CONFIG_PAGING_LEVELS <= 2
-static inline l1_pgentry_t gva_to_gpte(unsigned long gva)
-{
-    l2_pgentry_t gpde;
-    l1_pgentry_t gpte;
-    struct vcpu *v = current;
-
-    ASSERT( shadow_mode_translate(current->domain) );
-
-    __guest_get_l2e(v, gva, &gpde);
-    if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) )
-        return l1e_empty();
-
-    // This is actually overkill - we only need to make sure the hl2
-    // is in-sync.
-    //
-    shadow_sync_va(v, gva);
-
-    if ( unlikely(__copy_from_user(&gpte,
-                                   &linear_pg_table[gva >> PAGE_SHIFT],
-                                   sizeof(gpte))) )
-    {
-        FSH_LOG("gva_to_gpte got a fault on gva=%lx", gva);
-        return l1e_empty();
-    }
-
-    return gpte;
-}
-
-static inline unsigned long gva_to_gpa(unsigned long gva)
-{
-    l1_pgentry_t gpte;
-
-    gpte = gva_to_gpte(gva);
-    if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) )
-        return 0;
-
-    return l1e_get_paddr(gpte) + (gva & ~PAGE_MASK);
-}
-#endif
-
-static inline unsigned long gva_to_mfn(unsigned long gva)
-{
-    unsigned long gpa = gva_to_gpa(gva);
-    return get_mfn_from_gpfn(gpa >> PAGE_SHIFT);
-}
-
-/************************************************************************/
-
-extern void __update_pagetables(struct vcpu *v);
-static inline void update_pagetables(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-    int paging_enabled;
-
-    if ( hvm_guest(v) )
-        paging_enabled = hvm_paging_enabled(v);
-    else
-        // HACK ALERT: there's currently no easy way to figure out if a domU
-        // has set its arch.guest_table to zero, vs not yet initialized it.
-        //
-        paging_enabled = !!pagetable_get_paddr(v->arch.guest_table);
-
-    /*
-     * We don't call __update_pagetables() when hvm guest paging is
-     * disabled as we want the linear_pg_table to be inaccessible so that
-     * we bail out early of shadow_fault() if the hvm guest tries illegal
-     * accesses while it thinks paging is turned off.
-     */
-    if ( unlikely(shadow_mode_enabled(d)) && paging_enabled )
-    {
-        shadow_lock(d);
-        __update_pagetables(v);
-        shadow_unlock(d);
-    }
-
-    if ( likely(!shadow_mode_external(d)) )
-    {
-        if ( shadow_mode_enabled(d) )
-            v->arch.monitor_table = v->arch.shadow_table;
-        else
-#if CONFIG_PAGING_LEVELS == 4
-        if ( !(v->arch.flags & TF_kernel_mode) )
-            v->arch.monitor_table = v->arch.guest_table_user;
-        else
-#endif
-            v->arch.monitor_table = v->arch.guest_table;
-    }
-}
-
-void clear_all_shadow_status(struct domain *d);
-
-#if SHADOW_DEBUG
-extern int _check_pagetable(struct vcpu *v, char *s);
-extern int _check_all_pagetables(struct vcpu *v, char *s);
-
-#define check_pagetable(_v, _s) _check_pagetable(_v, _s)
-//#define check_pagetable(_v, _s) _check_all_pagetables(_v, _s)
-
-#else
-#define check_pagetable(_v, _s) ((void)0)
-#endif
-
-#endif /* XEN_SHADOW_H */
+#endif /* _XEN_SHADOW_H */
 
 /*
  * Local variables:
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/x86_32/page-2level.h
--- a/xen/include/asm-x86/x86_32/page-2level.h  Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/x86_32/page-2level.h  Wed Aug 16 17:11:56 2006 +0100
@@ -46,6 +46,7 @@ typedef l2_pgentry_t root_pgentry_t;
  *  12-bit flags = (pte[11:0])
  */
 
+#define _PAGE_NX_BIT            0U
 #define _PAGE_NX                0U
 
 /* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/x86_32/page-3level.h
--- a/xen/include/asm-x86/x86_32/page-3level.h  Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/x86_32/page-3level.h  Wed Aug 16 17:11:56 2006 +0100
@@ -59,7 +59,8 @@ typedef l3_pgentry_t root_pgentry_t;
  *  32-bit flags = (pte[63:44],pte[11:0])
  */
 
-#define _PAGE_NX (cpu_has_nx ? (1<<31) : 0)
+#define _PAGE_NX_BIT (1U<<31)
+#define _PAGE_NX     (cpu_has_nx ? _PAGE_NX_BIT : 0)
 
 /* Extract flags into 32-bit integer, or turn 32-bit flags into a pte mask. */
 #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
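
Splitting _PAGE_NX_BIT from _PAGE_NX lets code name the raw bit position even
when NX is unavailable at run time. A stand-alone sketch of that pattern (PAE
bit position as above; cpu_has_nx stubbed as a plain variable):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_NX_BIT (1u << 31)              /* fixed PAE bit position  */
    static int cpu_has_nx = 1;                  /* stub: detected at boot  */
    #define PAGE_NX (cpu_has_nx ? PAGE_NX_BIT : 0)

    int main(void)
    {
        uint32_t flags = 0x63 | PAGE_NX;  /* present|rw|accessed|dirty, plus NX if available */
        printf("flags=%#x nx=%d\n", flags, !!(flags & PAGE_NX_BIT));
        return 0;
    }
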
diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/x86_64/page.h
--- a/xen/include/asm-x86/x86_64/page.h Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/asm-x86/x86_64/page.h Wed Aug 16 17:11:56 2006 +0100
@@ -44,6 +44,8 @@ typedef l4_pgentry_t root_pgentry_t;
 /* Given a virtual address, get an entry offset into a linear page table. */
 #define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT)
 #define l2_linear_offset(_a) (((_a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT)
+#define l3_linear_offset(_a) (((_a) & VADDR_MASK) >> L3_PAGETABLE_SHIFT)
+#define l4_linear_offset(_a) (((_a) & VADDR_MASK) >> L4_PAGETABLE_SHIFT)
 
 #define is_guest_l1_slot(_s) (1)
 #define is_guest_l2_slot(_t, _s) (1)
@@ -70,7 +72,8 @@ typedef l4_pgentry_t root_pgentry_t;
 #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF))
 
 /* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/
-#define _PAGE_NX (cpu_has_nx ? (1U<<23) : 0U)
+#define _PAGE_NX_BIT (1U<<23)
+#define _PAGE_NX     (cpu_has_nx ? _PAGE_NX_BIT : 0U)
 
 #define L1_DISALLOW_MASK BASE_DISALLOW_MASK
 #define L2_DISALLOW_MASK BASE_DISALLOW_MASK
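
The two new helpers complete the per-level family: each masks off the
sign-extension bits and shifts by that level's page-table shift. A stand-alone
sketch assuming the usual x86-64 values (shifts 12/21/30/39, 48-bit virtual
addresses):

    #include <stdio.h>

    #define VADDR_MASK           ((1UL << 48) - 1)   /* 48-bit VA width */
    #define L1_PAGETABLE_SHIFT   12
    #define L2_PAGETABLE_SHIFT   21
    #define L3_PAGETABLE_SHIFT   30
    #define L4_PAGETABLE_SHIFT   39

    #define l1_linear_offset(a) (((a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT)
    #define l2_linear_offset(a) (((a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT)
    #define l3_linear_offset(a) (((a) & VADDR_MASK) >> L3_PAGETABLE_SHIFT)
    #define l4_linear_offset(a) (((a) & VADDR_MASK) >> L4_PAGETABLE_SHIFT)

    int main(void)
    {
        unsigned long va = 0xffff830012345678UL;
        printf("l1=%#lx l2=%#lx l3=%#lx l4=%#lx\n",
               l1_linear_offset(va), l2_linear_offset(va),
               l3_linear_offset(va), l4_linear_offset(va));
        return 0;
    }
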
diff -r f2151423f729 -r 01345b08d012 xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/public/dom0_ops.h     Wed Aug 16 17:11:56 2006 +0100
@@ -262,6 +262,18 @@ DEFINE_XEN_GUEST_HANDLE(dom0_sched_id_t)
 #define DOM0_SHADOW_CONTROL_OP_CLEAN       11
 #define DOM0_SHADOW_CONTROL_OP_PEEK        12
 
+/* Shadow2 operations */
+#define DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION   30
+#define DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION   31
+#define DOM0_SHADOW2_CONTROL_OP_ENABLE           32
+
+/* Mode flags for Shadow2 enable op */
+#define DOM0_SHADOW2_CONTROL_FLAG_ENABLE    (1 << 0)
+#define DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT  (1 << 1)
+#define DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY (1 << 2)
+#define DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE (1 << 3)
+#define DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL  (1 << 4)
+
 struct dom0_shadow_control_stats {
     uint32_t fault_count;
     uint32_t dirty_count;
@@ -277,7 +289,9 @@ struct dom0_shadow_control {
     uint32_t       op;
     XEN_GUEST_HANDLE(ulong) dirty_bitmap;
     /* IN/OUT variables. */
-    uint64_t       pages;        /* size of buffer, updated with actual size */
+    uint64_t       pages;    /* size of buffer, updated with actual size */
+    uint32_t       mb;       /* Shadow2 memory allocation in MB */
+    uint32_t       mode;     /* Shadow2 mode to enable */
     /* OUT variables. */
     struct dom0_shadow_control_stats stats;
 };
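
A sketch of how a toolstack might fill the extended control structure to
enable shadow2 with refcounting and log-dirty tracking; the guest handle, the
stats block and the actual dom0_op hypercall are omitted, and the struct here
is a reduced stand-in for dom0_shadow_control:

    #include <stdio.h>
    #include <stdint.h>

    #define DOM0_SHADOW2_CONTROL_OP_ENABLE           32
    #define DOM0_SHADOW2_CONTROL_FLAG_ENABLE    (1 << 0)
    #define DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT  (1 << 1)
    #define DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY (1 << 2)

    struct shadow2_enable_req {       /* reduced stand-in for dom0_shadow_control */
        uint32_t op;
        uint32_t mb;                  /* shadow2 memory allocation in MB */
        uint32_t mode;                /* mode flags to enable            */
    };

    int main(void)
    {
        struct shadow2_enable_req req = {
            .op   = DOM0_SHADOW2_CONTROL_OP_ENABLE,
            .mb   = 8,                /* illustrative allocation size */
            .mode = DOM0_SHADOW2_CONTROL_FLAG_ENABLE |
                    DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT |
                    DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY,
        };
        printf("op=%u mb=%u mode=%#x\n", req.op, req.mb, req.mode);
        return 0;
    }
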
diff -r f2151423f729 -r 01345b08d012 xen/include/xen/domain_page.h
--- a/xen/include/xen/domain_page.h     Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/xen/domain_page.h     Wed Aug 16 17:11:56 2006 +0100
@@ -25,6 +25,13 @@ extern void *map_domain_page(unsigned lo
  * currently-executing VCPU via a call to map_domain_pages().
  */
 extern void unmap_domain_page(void *va);
+
+/* 
+ * Convert a VA (within a page previously mapped in the context of the
+ * currently-executing VCPU via a call to map_domain_pages()) to a machine 
+ * address 
+ */
+extern paddr_t mapped_domain_page_to_maddr(void *va);
 
 /*
  * Similar to the above calls, except the mapping is accessible in all
@@ -98,6 +105,7 @@ domain_mmap_cache_destroy(struct domain_
 
 #define map_domain_page(pfn)                maddr_to_virt((pfn)<<PAGE_SHIFT)
 #define unmap_domain_page(va)               ((void)(va))
+#define mapped_domain_page_to_maddr(va)     (virt_to_maddr(va))
 
 #define map_domain_page_global(pfn)         maddr_to_virt((pfn)<<PAGE_SHIFT)
 #define unmap_domain_page_global(va)        ((void)(va))
@@ -112,4 +120,9 @@ struct domain_mmap_cache {
 
 #endif /* !CONFIG_DOMAIN_PAGE */
 
+#define HERE_I_AM \
+do { \
+    printk("HERE I AM: %s %s %d\n", __func__, __FILE__, __LINE__); \
+} while (0)
+
 #endif /* __XEN_DOMAIN_PAGE_H__ */
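
A stand-alone approximation of the HERE_I_AM tracing macro added above, with
printf standing in for the hypervisor's printk:

    #include <stdio.h>

    #define HERE_I_AM \
    do { \
        printf("HERE I AM: %s %s %d\n", __func__, __FILE__, __LINE__); \
    } while (0)

    static void frob(void)
    {
        HERE_I_AM;   /* prints function, file and line for quick tracing */
    }

    int main(void)
    {
        frob();
        return 0;
    }
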
diff -r f2151423f729 -r 01345b08d012 xen/include/xen/lib.h
--- a/xen/include/xen/lib.h     Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/xen/lib.h     Wed Aug 16 17:11:56 2006 +0100
@@ -18,7 +18,7 @@ extern void __bug(char *file, int line) 
 #ifndef NDEBUG
 #define ASSERT(_p)                                                      \
     do {                                                                \
-        if ( !(_p) )                                                    \
+        if ( unlikely(!(_p)) )                                          \
         {                                                               \
             printk("Assertion '%s' failed, line %d, file %s\n", #_p ,   \
                    __LINE__, __FILE__);                                 \
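
ASSERT() now wraps its predicate in unlikely(), i.e. __builtin_expect, so the
failure path is kept off the hot path. A stand-alone approximation of that
hint (the hypervisor defines the real macro in its compiler header):

    #include <stdio.h>

    #define unlikely(x) __builtin_expect(!!(x), 0)

    static int check(int v)
    {
        if ( unlikely(v < 0) )      /* cold path: assertion-style failure */
        {
            fprintf(stderr, "bad value %d\n", v);
            return -1;
        }
        return 0;                   /* hot path */
    }

    int main(void)
    {
        return check(42);
    }
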
@@ -41,7 +41,7 @@ void cmdline_parse(char *cmdline);
 void cmdline_parse(char *cmdline);
 
 #ifndef NDEBUG
-extern int debugtrace_send_to_console;
+extern void debugtrace_toggle(void);
 extern void debugtrace_dump(void);
 extern void debugtrace_printk(const char *fmt, ...);
 #else
diff -r f2151423f729 -r 01345b08d012 xen/include/xen/list.h
--- a/xen/include/xen/list.h    Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/xen/list.h    Wed Aug 16 17:11:56 2006 +0100
@@ -160,6 +160,16 @@ static __inline__ void list_splice(struc
 #define list_for_each_safe(pos, n, head) \
        for (pos = (head)->next, n = pos->next; pos != (head); \
                pos = n, n = pos->next)
+
+/**
+ * list_for_each_backwards_safe        -       iterate backwards over a list safe against removal of list entry
+ * @pos:       the &struct list_head to use as a loop counter.
+ * @n:         another &struct list_head to use as temporary storage
+ * @head:      the head for your list.
+ */
+#define list_for_each_backwards_safe(pos, n, head) \
+       for (pos = (head)->prev, n = pos->prev; pos != (head); \
+               pos = n, n = pos->prev)
 
 /**
  * list_for_each_entry -       iterate over list of given type
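
A stand-alone sketch of the new iterator tearing a list down tail-to-head; the
small list helpers below merely mirror the xen/list.h API so the example
compiles on its own:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    static void list_add_tail(struct list_head *new, struct list_head *head)
    {
        new->prev = head->prev; new->next = head;
        head->prev->next = new; head->prev = new;
    }

    static void list_del(struct list_head *entry)
    {
        entry->prev->next = entry->next;
        entry->next->prev = entry->prev;
    }

    #define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    #define list_for_each_backwards_safe(pos, n, head) \
        for (pos = (head)->prev, n = pos->prev; pos != (head); \
             pos = n, n = pos->prev)

    struct item { int id; struct list_head link; };

    int main(void)
    {
        struct list_head q = LIST_HEAD_INIT(q);
        struct list_head *pos, *n;
        for (int i = 0; i < 3; i++) {
            struct item *it = malloc(sizeof(*it));
            it->id = i;
            list_add_tail(&it->link, &q);
        }
        /* Tear down newest-first; safe because 'n' is saved before freeing. */
        list_for_each_backwards_safe(pos, n, &q) {
            struct item *it = list_entry(pos, struct item, link);
            printf("freeing %d\n", it->id);
            list_del(pos);
            free(it);
        }
        return 0;
    }
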
diff -r f2151423f729 -r 01345b08d012 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Wed Aug 16 16:48:45 2006 +0100
+++ b/xen/include/xen/sched.h   Wed Aug 16 17:11:56 2006 +0100
@@ -376,9 +376,12 @@ extern struct domain *domain_list;
  /* VCPU is paused by the hypervisor? */
 #define _VCPUF_paused          11
 #define VCPUF_paused           (1UL<<_VCPUF_paused)
- /* VCPU is blocked awaiting an event to be consumed by Xen. */
+/* VCPU is blocked awaiting an event to be consumed by Xen. */
 #define _VCPUF_blocked_in_xen  12
 #define VCPUF_blocked_in_xen   (1UL<<_VCPUF_blocked_in_xen)
+ /* HVM vcpu thinks CR0.PG == 0 */
+#define _VCPUF_shadow2_translate 13
+#define VCPUF_shadow2_translate  (1UL<<_VCPUF_shadow2_translate)
 
 /*
  * Per-domain flags (domain_flags).
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/shadow2-common.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/shadow2-common.c     Wed Aug 16 17:11:56 2006 +0100
@@ -0,0 +1,3394 @@
+/******************************************************************************
+ * arch/x86/shadow2-common.c
+ *
+ * Shadow2 code that does not need to be multiply compiled.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/irq.h>
+#include <xen/domain_page.h>
+#include <xen/guest_access.h>
+#include <asm/event.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/flushtlb.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+
+#if SHADOW2_AUDIT
+int shadow2_audit_enable = 0;
+#endif
+
+static void sh2_free_log_dirty_bitmap(struct domain *d);
+
+int _shadow2_mode_refcounts(struct domain *d)
+{
+    return shadow2_mode_refcounts(d);
+}
+
+
+/**************************************************************************/
+/* x86 emulator support for the shadow2 code
+ */
+
+static int
+sh2_x86_emulate_read_std(unsigned long addr,
+                         unsigned long *val,
+                         unsigned int bytes,
+                         struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+    if ( hvm_guest(v) )
+    {
+        *val = 0;
+        // XXX -- this is WRONG.
+        //        It entirely ignores the permissions in the page tables.
+        //        In this case, that is only a user vs supervisor access check.
+        //
+        if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) )
+        {
+#if 0
+            SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+                           v->domain->domain_id, v->vcpu_id, 
+                           addr, *val, bytes);
+#endif
+            return X86EMUL_CONTINUE;
+        }
+
+        /* If we got here, there was nothing mapped here, or a bad GFN
+         * was mapped here.  This should never happen: we're here because
+         * of a fault at the end of the instruction we're emulating. */
+        SHADOW2_PRINTK("read failed to va %#lx\n", addr);
+        return X86EMUL_PROPAGATE_FAULT;
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int
+sh2_x86_emulate_write_std(unsigned long addr,
+                          unsigned long val,
+                          unsigned int bytes,
+                          struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+    if ( hvm_guest(v) )
+    {
+        // XXX -- this is WRONG.
+        //        It entirely ignores the permissions in the page tables.
+        //        In this case, that includes user vs supervisor, and
+        //        write access.
+        //
+        if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) )
+            return X86EMUL_CONTINUE;
+
+        /* If we got here, there was nothing mapped here, or a bad GFN 
+         * was mapped here.  This should never happen: we're here because
+         * of a write fault at the end of the instruction we're emulating,
+         * which should be handled by sh2_x86_emulate_write_emulated. */ 
+        SHADOW2_PRINTK("write failed to va %#lx\n", addr);
+        return X86EMUL_PROPAGATE_FAULT;
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int
+sh2_x86_emulate_write_emulated(unsigned long addr,
+                               unsigned long val,
+                               unsigned int bytes,
+                               struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n",
+                  v->domain->domain_id, v->vcpu_id, addr, val, bytes);
+#endif
+    if ( hvm_guest(v) )
+    {
+        return v->arch.shadow2->x86_emulate_write(v, addr, &val, bytes, ctxt);
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int 
+sh2_x86_emulate_cmpxchg_emulated(unsigned long addr,
+                                 unsigned long old,
+                                 unsigned long new,
+                                 unsigned int bytes,
+                                 struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n",
+                   v->domain->domain_id, v->vcpu_id, addr, old, new, bytes);
+#endif
+    if ( hvm_guest(v) )
+    {
+        return v->arch.shadow2->x86_emulate_cmpxchg(v, addr, old, new, 
+                                                    bytes, ctxt);
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+static int 
+sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr,
+                                   unsigned long old_lo,
+                                   unsigned long old_hi,
+                                   unsigned long new_lo,
+                                   unsigned long new_hi,
+                                   struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *v = current;
+#if 0
+    SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n",
+                   v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo,
+                   new_hi, new_lo, ctxt);
+#endif
+    if ( hvm_guest(v) )
+    {
+        return v->arch.shadow2->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi,
+                                                      new_lo, new_hi, ctxt);
+    }
+    else 
+    {
+        SHADOW2_PRINTK("this operation is not emulated yet\n");
+        return X86EMUL_UNHANDLEABLE;
+    }
+}
+
+
+struct x86_emulate_ops shadow2_emulator_ops = {
+    .read_std           = sh2_x86_emulate_read_std,
+    .write_std          = sh2_x86_emulate_write_std,
+    .read_emulated      = sh2_x86_emulate_read_std,
+    .write_emulated     = sh2_x86_emulate_write_emulated,
+    .cmpxchg_emulated   = sh2_x86_emulate_cmpxchg_emulated,
+    .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated,
+};
+
+
+/**************************************************************************/
+/* Code for "promoting" a guest page to the point where the shadow code is
+ * willing to let it be treated as a guest page table.  This generally
+ * involves making sure there are no writable mappings available to the guest
+ * for this page.
+ */
+void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+    unsigned long type_info;
+
+    ASSERT(valid_mfn(gmfn));
+
+    /* We should never try to promote a gmfn that has writeable mappings */
+    ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0);
+
+    // Is the page already shadowed?
+    if ( !test_and_set_bit(_PGC_page_table, &page->count_info) )
+    {
+        // No prior shadow exists...
+
+        // Grab a type-ref.  We don't really care if we are racing with another
+        // vcpu or not, or even what kind of type we get; we just want the type
+        // count to be > 0.
+        //
+        do {
+            type_info =
+                page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask);
+        } while ( !get_page_type(page, type_info) );
+
+        // Now that the type ref is non-zero, we can safely use the
+        // shadow2_flags.
+        //
+        page->shadow2_flags = 0;
+    }
+
+    ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+    set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+}
+
+void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+
+    ASSERT(test_bit(_PGC_page_table, &page->count_info));
+    ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags));
+
+    clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags);
+
+    if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 )
+    {
+        // release the extra type ref
+        put_page_type(page);
+
+        // clear the is-a-page-table bit.
+        clear_bit(_PGC_page_table, &page->count_info);
+    }
+}
+
+/**************************************************************************/
+/* Validate a pagetable change from the guest and update the shadows.
+ * Returns a bitmask of SHADOW2_SET_* flags. */
+
+static int
+__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, 
+                               void *entry, u32 size)
+{
+    int result = 0;
+    struct page_info *page = mfn_to_page(gmfn);
+
+    sh2_mark_dirty(v->domain, gmfn);
+    
+    // Determine which types of shadows are affected, and update each.
+    //
+    // Always validate L1s before L2s to prevent another cpu with a linear
+    // mapping of this gmfn from seeing a walk that results from 
+    // using the new L2 value and the old L1 value.  (It is OK for such a
+    // guest to see a walk that uses the old L2 value with the new L1 value,
+    // as hardware could behave this way if one level of the pagewalk occurs
+    // before the store, and the next level of the pagewalk occurs after the
+    // store.)
+    //
+    // Ditto for L2s before L3s, etc.
+    //
+
+    if ( !(page->count_info & PGC_page_table) )
+        return 0;  /* Not shadowed at all */
+
+#if CONFIG_PAGING_LEVELS == 2
+    if ( page->shadow2_flags & SH2F_L1_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2)
+            (v, gmfn, entry, size);
+#else 
+    if ( page->shadow2_flags & SH2F_L1_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2)
+            (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS == 2
+    if ( page->shadow2_flags & SH2F_L2_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2)
+            (v, gmfn, entry, size);
+#else 
+    if ( page->shadow2_flags & SH2F_L2_32 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2)
+            (v, gmfn, entry, size);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3 
+    if ( page->shadow2_flags & SH2F_L1_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L2_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L2H_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L3_PAE ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3)
+            (v, gmfn, entry, size);
+#else /* 32-bit non-PAE hypervisor does not support PAE guests */
+    ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0);
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4 
+    if ( page->shadow2_flags & SH2F_L1_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L2_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L3_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4)
+            (v, gmfn, entry, size);
+    if ( page->shadow2_flags & SH2F_L4_64 ) 
+        result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4)
+            (v, gmfn, entry, size);
+#else /* 32-bit/PAE hypervisor does not support 64-bit guests */
+    ASSERT((page->shadow2_flags 
+            & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0);
+#endif
+
+    return result;
+}
+
+
+int
+shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry)
+/* This is the entry point from hypercalls. It returns a bitmask of all the 
+ * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */
+{
+    int rc;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+    rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t));
+    shadow2_audit_tables(v);
+    return rc;
+}
+
+void
+shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn,
+                                void *entry, u32 size)
+/* This is the entry point for emulated writes to pagetables in HVM guests */
+{
+    struct domain *d = v->domain;
+    int rc;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+    rc = __shadow2_validate_guest_entry(v, gmfn, entry, size);
+    if ( rc & SHADOW2_SET_FLUSH )
+    {
+        // Flush everyone except the local processor, which will flush when it
+        // re-enters the HVM guest.
+        //
+        cpumask_t mask = d->domain_dirty_cpumask;
+        cpu_clear(v->processor, mask);
+        flush_tlb_mask(mask);
+    }
+    if ( rc & SHADOW2_SET_ERROR ) 
+    {
+        /* This page is probably not a pagetable any more: tear it out of the 
+         * shadows, along with any tables that reference it */
+        shadow2_remove_all_shadows_and_parents(v, gmfn);
+    }
+    /* We ignore the other bits: since we are about to change CR3 on
+     * VMENTER we don't need to do any extra TLB flushes. */ 
+}
+
+
+/**************************************************************************/
+/* Memory management for shadow pages. */ 
+
+/* Meaning of the count_info field in shadow pages
+ * ----------------------------------------------
+ * 
+ * A count of all references to this page from other shadow pages and
+ * guest CR3s (a.k.a. v->arch.shadow_table).  
+ *
+ * The top bits hold the shadow type and the pinned bit.  Top-level
+ * shadows are pinned so that they don't disappear when not in a CR3
+ * somewhere.
+ *
+ * We don't need to use get|put_page for this as the updates are all
+ * protected by the shadow lock.  We can't use get|put_page for this
+ * as the size of the count on shadow pages is different from that on
+ * normal guest pages.
+ */
+
+/* Meaning of the type_info field in shadow pages
+ * ----------------------------------------------
+ * 
+ * type_info use depends on the shadow type (from count_info)
+ * 
+ * PGC_SH2_none : This page is in the shadow2 free pool.  type_info holds
+ *                the chunk order for our freelist allocator.
+ *
+ * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info 
+ *                     holds the mfn of the guest page being shadowed,
+ *
+ * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage.
+ *                        type_info holds the gfn being shattered.
+ *
+ * PGC_SH2_monitor_table : This page is part of a monitor table.
+ *                         type_info is not used.
+ */
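+
+/* Worked example (an illustrative sketch using only names defined in this
+ * patch; SH2_PFN_ORDER is defined further down this file).  Given a
+ * struct page_info *pg for a shadow page, the fields above are read as:
+ *
+ *     u32 type  = pg->count_info & PGC_SH2_type_mask;
+ *     u32 count = pg->count_info & PGC_SH2_count_mask;
+ *     if ( type == PGC_SH2_none )
+ *         // free page: type_info holds the freelist chunk order
+ *         order = SH2_PFN_ORDER(pg);
+ *     else if ( type == PGC_SH2_l1_32_shadow )
+ *         // in-use shadow: type_info is the backpointer to the guest page
+ *         gmfn = _mfn(pg->u.inuse.type_info);
+ */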
+
+/* Meaning of the _domain field in shadow pages
+ * --------------------------------------------
+ *
+ * In shadow pages, this field will always have its least significant bit
+ * set.  This ensures that all attempts to get_page() will fail (as all
+ * valid pickled domain pointers have a zero for their least significant bit).
+ * Instead, the remaining upper bits are used to record the shadow generation
+ * counter when the shadow was created.
+ */
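+
+/* Illustrative sketch of the encoding described above (the name
+ * shadow_generation is hypothetical; only the low-bit trick is taken from
+ * the comment):
+ *
+ *     pg->u.inuse._domain = (shadow_generation << 1) | 1;
+ *
+ * With the least significant bit always set, the pickled-pointer check in
+ * get_page() can never match a real domain, so get_page() on a shadow page
+ * always fails, as intended. */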
+
+/* Meaning of the shadow2_flags field
+ * ----------------------------------
+ * 
+ * In guest pages that are shadowed, one bit for each kind of shadow they have.
+ * 
+ * In shadow pages, will be used for holding a representation of the populated
+ * entries in this shadow (either a min/max, or a bitmap, or ...)
+ *
+ * In monitor-table pages, holds the level of the particular page (to save
+ * spilling the shadow types into an extra bit by having three types of monitor
+ * page).
+ */
+
+/* Meaning of the list_head struct in shadow pages
+ * -----------------------------------------------
+ *
+ * In free shadow pages, this is used to hold the free-lists of chunks.
+ *
+ * In top-level shadow tables, this holds a linked-list of all top-level
+ * shadows (used for recovering memory and destroying shadows). 
+ *
+ * In lower-level shadows, this holds the physical address of a higher-level
+ * shadow entry that holds a reference to this shadow (or zero).
+ */
+
+/* Allocating shadow pages
+ * -----------------------
+ *
+ * Most shadow pages are allocated singly, but there are two cases where we 
+ * need to allocate multiple pages together.
+ * 
+ * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows.
+ *    A 32-bit guest l1 table covers 4MB of virtual address space,
+ *    and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB
+ *    of virtual address space each).  Similarly, a 32-bit guest l2 table 
+ *    (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va 
+ *    each).  These multi-page shadows are contiguous and aligned; 
+ *    functions for handling offsets into them are defined in shadow2.c 
+ *    (shadow_l1_index() etc.)
+ *    
+ * 2: Shadowing PAE top-level pages.  Each guest page that contains
+ *    any PAE top-level pages requires two shadow pages to shadow it.
+ *    They contain alternating l3 tables and pae_l3_bookkeeping structs.
+ *
+ * This table shows the allocation behaviour of the different modes:
+ *
+ * Xen paging      32b  pae  pae  64b  64b  64b
+ * Guest paging    32b  32b  pae  32b  pae  64b
+ * PV or HVM        *   HVM   *   HVM  HVM   * 
+ * Shadow paging   32b  pae  pae  pae  pae  64b
+ *
+ * sl1 size         4k   8k   4k   8k   4k   4k
+ * sl2 size         4k  16k   4k  16k   4k   4k
+ * sl3 size         -    -    8k   -    8k   4k
+ * sl4 size         -    -    -    -    -    4k
+ *
+ * We allocate memory from xen in four-page units and break them down
+ * with a simple buddy allocator.  Can't use the xen allocator to handle
+ * this as it only works for contiguous zones, and a domain's shadow
+ * pool is made of fragments.
+ *
+ * In HVM guests, the p2m table is built out of shadow pages, and we provide 
+ * a function for the p2m management to steal pages, in max-order chunks, from 
+ * the free pool.  We don't provide for giving them back, yet.
+ */
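+
+/* Illustrative sketch of the buddy behaviour described above (sizes taken
+ * from the table; the code shape mirrors shadow2_alloc()/shadow2_free()
+ * below).  Shadowing a 32-bit guest l1 on a PAE hypervisor needs an 8k
+ * shadow, i.e. an order-1 (two-page) chunk:
+ *
+ *     an order-1 chunk is free      -> hand it out whole
+ *     only an order-2 chunk is free -> split it once; one order-1 half goes
+ *                                      back on the freelist, the other
+ *                                      satisfies the request
+ *
+ * On free, a chunk is merged with its equal-sized neighbour whenever that
+ * neighbour is also free, doubling the order each time, up to the four-page
+ * (1 << SHADOW2_MAX_ORDER) allocation unit mentioned above. */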
+
+/* Figure out the least acceptable quantity of shadow memory.
+ * The minimum memory requirement for always being able to free up a
+ * chunk of memory is very small -- only three max-order chunks per
+ * vcpu to hold the top level shadows and pages with Xen mappings in them.  
+ *
+ * But for a guest to be guaranteed to successfully execute a single
+ * instruction, we must be able to map a large number (about thirty) VAs
+ * at the same time, which means that to guarantee progress, we must
+ * allow for more than ninety allocated pages per vcpu.  We round that
+ * up to 128 pages, or half a megabyte per vcpu. */
+unsigned int shadow2_min_acceptable_pages(struct domain *d) 
+{
+    u32 vcpu_count = 0;
+    struct vcpu *v;
+
+    for_each_vcpu(d, v)
+        vcpu_count++;
+
+    return (vcpu_count * 128);
+}
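+
+/* Worked example: a two-vcpu domain gets a floor of 2 * 128 = 256 shadow
+ * pages from the function above, i.e. 1MB with 4kB pages -- half a
+ * megabyte per vcpu, as described in the comment. */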
+
+/* Using the type_info field to store freelist order */
+#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info)
+#define SH2_SET_PFN_ORDER(_p, _o)                       \
+ do { (_p)->u.inuse.type_info = (_o); } while (0)
+ 
+
+/* Figure out the order of allocation needed for a given shadow type */
+static inline u32
+shadow_order(u32 shadow_type) 
+{
+#if CONFIG_PAGING_LEVELS > 2
+    static const u32 type_to_order[16] = {
+        0, /* PGC_SH2_none           */
+        1, /* PGC_SH2_l1_32_shadow   */
+        1, /* PGC_SH2_fl1_32_shadow  */
+        2, /* PGC_SH2_l2_32_shadow   */
+        0, /* PGC_SH2_l1_pae_shadow  */
+        0, /* PGC_SH2_fl1_pae_shadow */
+        0, /* PGC_SH2_l2_pae_shadow  */
+        0, /* PGC_SH2_l2h_pae_shadow */
+        1, /* PGC_SH2_l3_pae_shadow  */
+        0, /* PGC_SH2_l1_64_shadow   */
+        0, /* PGC_SH2_fl1_64_shadow  */
+        0, /* PGC_SH2_l2_64_shadow   */
+        0, /* PGC_SH2_l3_64_shadow   */
+        0, /* PGC_SH2_l4_64_shadow   */
+        2, /* PGC_SH2_p2m_table      */
+        0  /* PGC_SH2_monitor_table  */
+        };
+    u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift;
+    return type_to_order[type];
+#else  /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */
+    return 0;
+#endif
+}
+
+
+/* Do we have a free chunk of at least this order? */
+static inline int chunk_is_available(struct domain *d, int order)
+{
+    int i;
+    
+    for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+        if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+            return 1;
+    return 0;
+}
+
+/* Dispatcher function: call the per-mode function that will unhook the
+ * non-Xen mappings in this top-level shadow mfn */
+void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift )
+    {
+    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn);
+#else
+        SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn);
+#endif
+        break;
+#if CONFIG_PAGING_LEVELS >= 3
+    case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn);
+        break;
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn);
+        break;
+#endif
+    default:
+        SHADOW2_PRINTK("top-level shadow has bad type %08lx\n", 
+                       (unsigned long)((pg->count_info & PGC_SH2_type_mask)
+                                       >> PGC_SH2_type_shift));
+        BUG();
+    }
+}
+
+
+/* Make sure there is at least one chunk of the required order available
+ * in the shadow page pool. This must be called before any calls to
+ * shadow2_alloc().  Since this will free existing shadows to make room,
+ * it must be called early enough to avoid freeing shadows that the
+ * caller is currently working on. */
+void shadow2_prealloc(struct domain *d, unsigned int order)
+{
+    /* Need a vcpu for calling unpins; for now, since we don't have
+     * per-vcpu shadows, any will do */
+    struct vcpu *v = d->vcpu[0];
+    struct list_head *l, *t;
+    struct page_info *pg;
+    mfn_t smfn;
+
+    if ( chunk_is_available(d, order) ) return; 
+    
+    /* Stage one: walk the list of top-level pages, unpinning them */
+    perfc_incrc(shadow2_prealloc_1);
+    list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+    {
+        pg = list_entry(l, struct page_info, list);
+        smfn = page_to_mfn(pg);
+
+#if CONFIG_PAGING_LEVELS >= 3
+        if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow )
+        {
+            /* For PAE, we need to unpin each subshadow on this shadow */
+            SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn);
+        } 
+        else 
+#endif /* 32-bit code always takes this branch */
+        {
+            /* Unpin this top-level shadow */
+            sh2_unpin(v, smfn);
+        }
+
+        /* See if that freed up a chunk of appropriate size */
+        if ( chunk_is_available(d, order) ) return;
+    }
+
+    /* Stage two: all shadow pages are in use in hierarchies that are
+     * loaded in cr3 on some vcpu.  Walk them, unhooking the non-Xen
+     * mappings. */
+    perfc_incrc(shadow2_prealloc_2);
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+    /* Walk the list from the tail: recently used toplevels have been pulled
+     * to the head */
+    list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+    {
+        pg = list_entry(l, struct page_info, list);
+        smfn = page_to_mfn(pg);
+        shadow2_unhook_mappings(v, smfn);
+
+        /* Need to flush TLB if we've altered our own tables */
+        if ( !shadow2_mode_external(d) 
+             && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) )
+            local_flush_tlb();
+        
+        /* See if that freed up a chunk of appropriate size */
+        if ( chunk_is_available(d, order) ) return;
+    }
+    
+    /* Nothing more we can do: all remaining shadows are of pages that
+     * hold Xen mappings for some vcpu.  This should never happen. */
+    SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n"
+                   "  shadow pages total = %u, free = %u, p2m=%u\n",
+                   1 << order, 
+                   d->arch.shadow2_total_pages, 
+                   d->arch.shadow2_free_pages, 
+                   d->arch.shadow2_p2m_pages);
+    BUG();
+}
+
+
+/* Allocate another shadow's worth of (contiguous, aligned) pages,
+ * and fill in the type and backpointer fields of their page_infos. 
+ * Never fails to allocate. */
+mfn_t shadow2_alloc(struct domain *d,  
+                    u32 shadow_type,
+                    unsigned long backpointer)
+{
+    struct page_info *pg = NULL;
+    unsigned int order = shadow_order(shadow_type);
+    cpumask_t mask;
+    void *p;
+    int i;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(order <= SHADOW2_MAX_ORDER);
+    ASSERT(shadow_type != PGC_SH2_none);
+    perfc_incrc(shadow2_alloc);
+
+    /* Find smallest order which can satisfy the request. */
+    for ( i = order; i <= SHADOW2_MAX_ORDER; i++ )
+        if ( !list_empty(&d->arch.shadow2_freelists[i]) )
+        {
+            pg = list_entry(d->arch.shadow2_freelists[i].next, 
+                            struct page_info, list);
+            list_del(&pg->list);
+            
+            /* We may have to halve the chunk a number of times. */
+            while ( i != order )
+            {
+                i--;
+                SH2_SET_PFN_ORDER(pg, i);
+                list_add_tail(&pg->list, &d->arch.shadow2_freelists[i]);
+                pg += 1 << i;
+            }
+            d->arch.shadow2_free_pages -= 1 << order;
+
+            /* Init page info fields and clear the pages */
+            for ( i = 0; i < 1<<order ; i++ ) 
+            {
+                pg[i].u.inuse.type_info = backpointer;
+                pg[i].count_info = shadow_type;
+                pg[i].shadow2_flags = 0;
+                INIT_LIST_HEAD(&pg[i].list);
+                /* Before we overwrite the old contents of this page, 
+                 * we need to be sure that no TLB holds a pointer to it. */
+                mask = d->domain_dirty_cpumask;
+                tlbflush_filter(mask, pg[i].tlbflush_timestamp);
+                if ( unlikely(!cpus_empty(mask)) )
+                {
+                    perfc_incrc(shadow2_alloc_tlbflush);
+                    flush_tlb_mask(mask);
+                }
+                /* Now safe to clear the page for reuse */
+                p = sh2_map_domain_page(page_to_mfn(pg+i));
+                ASSERT(p != NULL);
+                clear_page(p);
+                sh2_unmap_domain_page(p);
+                perfc_incr(shadow2_alloc_count);
+            }
+            return page_to_mfn(pg);
+        }
+    
+    /* If we get here, we failed to allocate. This should never happen.
+     * It means that we didn't call shadow2_prealloc() correctly before
+     * we allocated.  We can't recover by calling prealloc here, because
+     * we might free up higher-level pages that the caller is working on. */
+    SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order);
+    BUG();
+}
+
+
+/* Return some shadow pages to the pool. */
+void shadow2_free(struct domain *d, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn); 
+    u32 shadow_type;
+    unsigned long order;
+    unsigned long mask;
+    int i;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    perfc_incrc(shadow2_free);
+
+    shadow_type = pg->count_info & PGC_SH2_type_mask;
+    ASSERT(shadow_type != PGC_SH2_none);
+    ASSERT(shadow_type != PGC_SH2_p2m_table);
+    order = shadow_order(shadow_type);
+
+    d->arch.shadow2_free_pages += 1 << order;
+
+    for ( i = 0; i < 1<<order; i++ ) 
+    {
+        /* Strip out the type: this is now a free shadow page */
+        pg[i].count_info = 0;
+        /* Remember the TLB timestamp so we will know whether to flush 
+         * TLBs when we reuse the page.  Because the destructors leave the
+         * contents of the pages in place, we can delay TLB flushes until
+         * just before the allocator hands the page out again. */
+        pg[i].tlbflush_timestamp = tlbflush_current_time();
+        perfc_decr(shadow2_alloc_count);
+    }
+
+    /* Merge chunks as far as possible. */
+    while ( order < SHADOW2_MAX_ORDER )
+    {
+        mask = 1 << order;
+        if ( (mfn_x(page_to_mfn(pg)) & mask) ) {
+            /* Merge with predecessor block? */
+            if ( (((pg-mask)->count_info & PGC_SH2_type_mask) != PGC_SH2_none)
+                 || (SH2_PFN_ORDER(pg-mask) != order) )
+                break;
+            list_del(&(pg-mask)->list);
+            pg -= mask;
+        } else {
+            /* Merge with successor block? */
+            if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGC_SH2_none)
+                 || (SH2_PFN_ORDER(pg+mask) != order) )
+                break;
+            list_del(&(pg+mask)->list);
+        }
+        order++;
+    }
+
+    SH2_SET_PFN_ORDER(pg, order);
+    list_add_tail(&pg->list, &d->arch.shadow2_freelists[order]);
+}
+
+/* Divert some memory from the pool to be used by the p2m mapping.
+ * This action is irreversible: the p2m mapping only ever grows.
+ * That's OK because the p2m table only exists for external domains,
+ * and those domains can't ever turn off shadow mode.
+ * Also, we only ever allocate a max-order chunk, so as to preserve
+ * the invariant that shadow2_prealloc() always works.
+ * Returns 0 iff it can't get a chunk (the caller should then
+ * free up some pages in domheap and call set_sh2_allocation);
+ * returns non-zero on success.
+ */
+static int
+shadow2_alloc_p2m_pages(struct domain *d)
+{
+    struct page_info *pg;
+    u32 i;
+    ASSERT(shadow2_lock_is_acquired(d));
+    
+    if ( d->arch.shadow2_total_pages 
+         < (shadow2_min_acceptable_pages(d) + (1<<SHADOW2_MAX_ORDER)) )
+        return 0; /* Not enough shadow memory: need to increase it first */
+    
+    pg = mfn_to_page(shadow2_alloc(d, PGC_SH2_p2m_table, 0));
+    d->arch.shadow2_p2m_pages += (1<<SHADOW2_MAX_ORDER);
+    d->arch.shadow2_total_pages -= (1<<SHADOW2_MAX_ORDER);
+    for (i = 0; i < (1<<SHADOW2_MAX_ORDER); i++)
+    {
+        /* Unlike shadow pages, mark p2m pages as owned by the domain */
+        page_set_owner(&pg[i], d);
+        list_add_tail(&pg[i].list, &d->arch.shadow2_p2m_freelist);
+    }
+    return 1;
+}
+
+// Returns _mfn(0) if no memory is available...
+mfn_t
+shadow2_alloc_p2m_page(struct domain *d)
+{
+    struct list_head *entry;
+    mfn_t mfn;
+    void *p;
+
+    if ( list_empty(&d->arch.shadow2_p2m_freelist) &&
+         !shadow2_alloc_p2m_pages(d) )
+        return _mfn(0);
+    entry = d->arch.shadow2_p2m_freelist.next;
+    list_del(entry);
+    list_add_tail(entry, &d->arch.shadow2_p2m_inuse);
+    mfn = page_to_mfn(list_entry(entry, struct page_info, list));
+    sh2_get_ref(mfn, 0);
+    p = sh2_map_domain_page(mfn);
+    clear_page(p);
+    sh2_unmap_domain_page(p);
+
+    return mfn;
+}
+
+#if CONFIG_PAGING_LEVELS == 3
+static void p2m_install_entry_in_monitors(struct domain *d, 
+                                          l3_pgentry_t *l3e) 
+/* Special case, only used for external-mode domains on PAE hosts:
+ * update the mapping of the p2m table.  Once again, this is trivial in
+ * other paging modes (one top-level entry points to the top-level p2m,
+ * no maintenance needed), but PAE makes life difficult by needing a
+ * copy of the eight l3es of the p2m table in eight l2h slots in the
+ * monitor table.  This function makes fresh copies when a p2m l3e
+ * changes. */
+{
+    l2_pgentry_t *ml2e;
+    struct vcpu *v;
+    unsigned int index;
+
+    index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t);
+    ASSERT(index < MACHPHYS_MBYTES>>1);
+
+    for_each_vcpu(d, v) 
+    {
+        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) 
+            continue;
+        ASSERT(shadow2_mode_external(v->domain));
+
+        SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n",
+                      d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e));
+
+        if ( v == current ) /* OK to use linear map of monitor_table */
+            ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START);
+        else 
+        {
+            l3_pgentry_t *ml3e;
+            ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table));
+            ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT);
+            ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3])));
+            ml2e += l2_table_offset(RO_MPT_VIRT_START);
+            sh2_unmap_domain_page(ml3e);
+        }
+        ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR);
+        if ( v != current )
+            sh2_unmap_domain_page(ml2e);
+    }
+}
+#endif
+
+// Find the next level's P2M entry, checking for out-of-range gfn's...
+// Returns NULL on error.
+//
+static l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+                   unsigned long gfn, u32 shift, u32 max)
+{
+    u32 index;
+
+    index = *gfn_remainder >> shift;
+    if ( index >= max )
+    {
+        SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range "
+                      "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+                       gfn, *gfn_remainder, shift, index, max);
+        return NULL;
+    }
+    *gfn_remainder &= (1 << shift) - 1;
+    return (l1_pgentry_t *)table + index;
+}
+
+// Walk one level of the P2M table, allocating a new table if required.
+// Returns 0 on error.
+//
+static int
+p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, 
+               unsigned long *gfn_remainder, unsigned long gfn, u32 shift, 
+               u32 max, unsigned long type)
+{
+    l1_pgentry_t *p2m_entry;
+    void *next;
+
+    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+                                      shift, max)) )
+        return 0;
+
+    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+    {
+        mfn_t mfn = shadow2_alloc_p2m_page(d);
+        if ( mfn_x(mfn) == 0 )
+            return 0;
+        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+        mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated;
+        mfn_to_page(mfn)->count_info = 1;
+#if CONFIG_PAGING_LEVELS == 3
+        if (type == PGT_l2_page_table)
+        {
+            /* We have written to the p2m l3: need to sync the per-vcpu
+             * copies of it in the monitor tables */
+            p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry);
+        }
+#endif
+        /* The P2M can be shadowed: keep the shadows synced */
+        if ( d->vcpu[0] )
+            (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn,
+                                                 p2m_entry, sizeof *p2m_entry);
+    }
+    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+    next = sh2_map_domain_page(*table_mfn);
+    sh2_unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+// Returns 0 on error (out of memory)
+int
+shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
+{
+    // XXX -- this might be faster if current->domain == d
+    mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
+    void *table = sh2_map_domain_page(table_mfn);
+    unsigned long gfn_remainder = gfn;
+    l1_pgentry_t *p2m_entry;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+        return 0;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    // When using PAE Xen, we only allow 33 bits of pseudo-physical
+    // address in translated guests (i.e. 8 GBytes).  This restriction
+    // comes from wanting to map the P2M table into the 16MB RO_MPT hole
+    // in Xen's address space for translated PV guests.
+    //
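+    // Worked example of the limit above: each p2m entry is an 8-byte l1e,
+    // so the 16MB RO_MPT hole maps 16MB / 8 = 2M entries; at 4kB per frame
+    // that covers 2M * 4kB = 8GB (33 bits) of pseudo-physical address space.
+    //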
+    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                         L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         (CONFIG_PAGING_LEVELS == 3
+                          ? 8
+                          : L3_PAGETABLE_ENTRIES),
+                         PGT_l2_page_table) )
+        return 0;
+#endif
+    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+        return 0;
+
+    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                               0, L1_PAGETABLE_ENTRIES);
+    ASSERT(p2m_entry);
+    if ( valid_mfn(mfn) )
+        *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER);
+    else
+        *p2m_entry = l1e_empty();
+
+    /* The P2M can be shadowed: keep the shadows synced */
+    (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn, 
+                                          p2m_entry, sizeof *p2m_entry);
+
+    sh2_unmap_domain_page(table);
+
+    return 1;
+}
+
+// Allocate a new p2m table for a domain.
+//
+// The structure of the p2m table is that of a pagetable for xen (i.e. it is
+// controlled by CONFIG_PAGING_LEVELS).
+//
+// Returns 0 if p2m table could not be initialized
+//
+static int
+shadow2_alloc_p2m_table(struct domain *d)
+{
+    mfn_t p2m_top;
+    struct list_head *entry;
+    unsigned int page_count = 0;
+    
+    SHADOW2_PRINTK("allocating p2m table\n");
+    ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0);
+
+    p2m_top = shadow2_alloc_p2m_page(d);
+    if ( mfn_x(p2m_top) == 0 )
+        return 0;
+
+    mfn_to_page(p2m_top)->count_info = 1;
+    mfn_to_page(p2m_top)->u.inuse.type_info = 
+#if CONFIG_PAGING_LEVELS == 4
+        PGT_l4_page_table
+#elif CONFIG_PAGING_LEVELS == 3
+        PGT_l3_page_table
+#elif CONFIG_PAGING_LEVELS == 2
+        PGT_l2_page_table
+#endif
+        | 1 | PGT_validated;
+
+    d->arch.phys_table = pagetable_from_mfn(p2m_top);
+
+    SHADOW2_PRINTK("populating p2m table\n");
+ 
+    for ( entry = d->page_list.next;
+          entry != &d->page_list;
+          entry = entry->next )
+    {
+        struct page_info *page = list_entry(entry, struct page_info, list);
+        mfn_t mfn = page_to_mfn(page);
+        unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn));
+        page_count++;
+        if (
+#ifdef __x86_64__
+            (gfn != 0x5555555555555555L)
+#else
+            (gfn != 0x55555555L)
+#endif
+             && gfn != INVALID_M2P_ENTRY
+             && !shadow2_set_p2m_entry(d, gfn, mfn) )
+        {
+            SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" 
SH2_PRI_mfn "\n",
+                           gfn, mfn_x(mfn));
+            return 0;
+        }
+    }
+
+    SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count);
+    return 1;
+}
+
+mfn_t
+sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn)
+/* Read another domain's p2m entries */
+{
+    mfn_t mfn;
+    unsigned long addr = gpfn << PAGE_SHIFT;
+    l2_pgentry_t *l2e;
+    l1_pgentry_t *l1e;
+    
+    ASSERT(shadow2_mode_translate(d));
+    mfn = pagetable_get_mfn(d->arch.phys_table);
+
+
+#if CONFIG_PAGING_LEVELS > 2
+    if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) 
+        /* This pfn is higher than the p2m map can hold */
+        return _mfn(INVALID_MFN);
+#endif
+
+
+#if CONFIG_PAGING_LEVELS >= 4
+    { 
+        l4_pgentry_t *l4e = sh2_map_domain_page(mfn);
+        l4e += l4_table_offset(addr);
+        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+        {
+            sh2_unmap_domain_page(l4e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l4e_get_pfn(*l4e));
+        sh2_unmap_domain_page(l4e);
+    }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    {
+        l3_pgentry_t *l3e = sh2_map_domain_page(mfn);
+        l3e += l3_table_offset(addr);
+        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        {
+            sh2_unmap_domain_page(l3e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l3e_get_pfn(*l3e));
+        sh2_unmap_domain_page(l3e);
+    }
+#endif
+
+    l2e = sh2_map_domain_page(mfn);
+    l2e += l2_table_offset(addr);
+    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    {
+        sh2_unmap_domain_page(l2e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l2e_get_pfn(*l2e));
+    sh2_unmap_domain_page(l2e);
+
+    l1e = sh2_map_domain_page(mfn);
+    l1e += l1_table_offset(addr);
+    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+    {
+        sh2_unmap_domain_page(l1e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l1e_get_pfn(*l1e));
+    sh2_unmap_domain_page(l1e);
+
+    return mfn;
+}
+
+unsigned long
+shadow2_gfn_to_mfn_foreign(unsigned long gpfn)
+{
+    return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn));
+}
+
+
+static void shadow2_p2m_teardown(struct domain *d)
+/* Return all the p2m pages to Xen.
+ * We know we don't have any extra mappings to these pages */
+{
+    struct list_head *entry, *n;
+    struct page_info *pg;
+
+    d->arch.phys_table = pagetable_null();
+
+    list_for_each_safe(entry, n, &d->arch.shadow2_p2m_inuse)
+    {
+        pg = list_entry(entry, struct page_info, list);
+        list_del(entry);
+        /* Should have just the one ref we gave it in alloc_p2m_page() */
+        if ( (pg->count_info & PGC_SH2_count_mask) != 1 )
+        {
+            SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n",
+                           pg->count_info, pg->u.inuse.type_info);
+        }
+        ASSERT(page_get_owner(pg) == d);
+        /* Free should not decrement domain's total allocation, since 
+         * these pages were allocated without an owner. */
+        page_set_owner(pg, NULL); 
+        free_domheap_pages(pg, 0);
+        d->arch.shadow2_p2m_pages--;
+        perfc_decr(shadow2_alloc_count);
+    }
+    list_for_each_safe(entry, n, &d->arch.shadow2_p2m_freelist)
+    {
+        list_del(entry);
+        pg = list_entry(entry, struct page_info, list);
+        ASSERT(page_get_owner(pg) == d);
+        /* Free should not decrement domain's total allocation. */
+        page_set_owner(pg, NULL); 
+        free_domheap_pages(pg, 0);
+        d->arch.shadow2_p2m_pages--;
+        perfc_decr(shadow2_alloc_count);
+    }
+    ASSERT(d->arch.shadow2_p2m_pages == 0);
+}
+
+/* Set the pool of shadow pages to the required number of pages.
+ * Input will be rounded up to at least shadow2_min_acceptable_pages(),
+ * plus space for the p2m table.
+ * Returns 0 for success, non-zero for failure. */
+static unsigned int set_sh2_allocation(struct domain *d, 
+                                       unsigned int pages,
+                                       int *preempted)
+{
+    struct page_info *pg;
+    unsigned int lower_bound;
+    int j;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    
+    /* Don't allocate less than the minimum acceptable, plus one page per
+     * megabyte of RAM (for the p2m table) */
+    lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256);
+    if ( pages > 0 && pages < lower_bound )
+        pages = lower_bound;
+    /* Round up to largest block size */
+    pages = (pages + ((1<<SHADOW2_MAX_ORDER)-1)) & ~((1<<SHADOW2_MAX_ORDER)-1);
+
+    SHADOW2_PRINTK("current %i target %i\n", 
+                   d->arch.shadow2_total_pages, pages);
+
+    while ( d->arch.shadow2_total_pages != pages ) 
+    {
+        if ( d->arch.shadow2_total_pages < pages ) 
+        {
+            /* Need to allocate more memory from domheap */
+            pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0); 
+            if ( pg == NULL ) 
+            { 
+                SHADOW2_PRINTK("failed to allocate shadow pages.\n");
+                return -ENOMEM;
+            }
+            d->arch.shadow2_free_pages += 1<<SHADOW2_MAX_ORDER;
+            d->arch.shadow2_total_pages += 1<<SHADOW2_MAX_ORDER;
+            for ( j = 0; j < 1<<SHADOW2_MAX_ORDER; j++ ) 
+            {
+                pg[j].u.inuse.type_info = 0;  /* Free page */
+                pg[j].tlbflush_timestamp = 0; /* Not in any TLB */
+            }
+            SH2_SET_PFN_ORDER(pg, SHADOW2_MAX_ORDER);
+            list_add_tail(&pg->list, 
+                          &d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]);
+        } 
+        else if ( d->arch.shadow2_total_pages > pages ) 
+        {
+            /* Need to return memory to domheap */
+            shadow2_prealloc(d, SHADOW2_MAX_ORDER);
+            ASSERT(!list_empty(&d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]));
+            pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next, 
+                            struct page_info, list);
+            list_del(&pg->list);
+            d->arch.shadow2_free_pages -= 1<<SHADOW2_MAX_ORDER;
+            d->arch.shadow2_total_pages -= 1<<SHADOW2_MAX_ORDER;
+            free_domheap_pages(pg, SHADOW2_MAX_ORDER);
+        }
+
+        /* Check to see if we need to yield and try again */
+        if ( preempted && hypercall_preempt_check() )
+        {
+            *preempted = 1;
+            return 0;
+        }
+    }
+
+    return 0;
+}
+
+unsigned int shadow2_set_allocation(struct domain *d, 
+                                    unsigned int megabytes,
+                                    int *preempted)
+/* Hypercall interface to set the shadow memory allocation */
+{
+    unsigned int rv;
+    shadow2_lock(d);
+    rv = set_sh2_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted); 
+    SHADOW2_PRINTK("dom %u allocation now %u pages (%u MB)\n",
+                   d->domain_id,
+                   d->arch.shadow2_total_pages,
+                   shadow2_get_allocation(d));
+    shadow2_unlock(d);
+    return rv;
+}
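+
+/* Worked example for the hypercall interface above, assuming 4kB pages
+ * (PAGE_SHIFT == 12): a request for 16MB becomes 16 << (20 - 12) = 4096
+ * shadow pages.  set_sh2_allocation() then raises this to at least
+ * shadow2_min_acceptable_pages() plus tot_pages/256 (one page per MB of
+ * guest RAM, for the p2m table) and rounds up to a multiple of the
+ * four-page (1 << SHADOW2_MAX_ORDER) allocation unit. */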
+
+/**************************************************************************/
+/* Hash table for storing the guest->shadow mappings */
+
+/* Hash function that takes a gfn or mfn, plus another byte of type info */
+typedef u32 key_t;
+static inline key_t sh2_hash(unsigned long n, u8 t) 
+{
+    unsigned char *p = (unsigned char *)&n;
+    key_t k = t;
+    int i;
+    for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k;
+    return k;
+}
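+
+/* This is the classic sdbm-style string hash, folded over the bytes of n
+ * and seeded with the type byte t.  The lookup/insert/delete functions
+ * below reduce the key modulo the table size to pick a bucket:
+ *
+ *     key_t key = sh2_hash(n, t);
+ *     head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+ */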
+
+#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL)
+
+/* Before we get to the mechanism, define a pair of audit functions
+ * that sanity-check the contents of the hash table. */
+static void sh2_hash_audit_bucket(struct domain *d, int bucket)
+/* Audit one bucket of the hash table */
+{
+    struct shadow2_hash_entry *e, *x;
+    struct page_info *pg;
+
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+
+    e = &d->arch.shadow2_hash_table[bucket];
+    if ( e->t == 0 ) return; /* Bucket is empty */ 
+    while ( e )
+    {
+        /* Empty link? */
+        BUG_ON( e->t == 0 ); 
+        /* Bogus type? */
+        BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) );
+        /* Wrong bucket? */
+        BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket ); 
+        /* Duplicate entry? */
+        for ( x = e->next; x; x = x->next )
+            BUG_ON( x->n == e->n && x->t == e->t );
+        /* Bogus MFN? */
+        BUG_ON( !valid_mfn(e->smfn) );
+        pg = mfn_to_page(e->smfn);
+        /* Not a shadow? */
+        BUG_ON( page_get_owner(pg) != 0 );
+        /* Wrong kind of shadow? */
+        BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift 
+                != e->t ); 
+        /* Bad backlink? */
+        BUG_ON( pg->u.inuse.type_info != e->n );
+        if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+             && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+             && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) )
+        {
+            /* Bad shadow flags on guest page? */
+            BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<<e->t)) );
+        }
+        /* That entry was OK; on we go */
+        e = e->next;
+    }
+}
+
+#else
+#define sh2_hash_audit_bucket(_d, _b)
+#endif /* Hashtable bucket audit */
+
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL
+
+static void sh2_hash_audit(struct domain *d)
+/* Full audit: audit every bucket in the table */
+{
+    int i;
+
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+
+    for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) 
+    {
+        sh2_hash_audit_bucket(d, i);
+    }
+}
+
+#else
+#define sh2_hash_audit(_d)
+#endif /* Hashtable bucket audit */
+
+/* Memory management interface for bucket allocation.
+ * These ought to come out of shadow memory, but at least on 32-bit
+ * machines we are forced to allocate them from xenheap so that we can
+ * address them. */
+static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d)
+{
+    struct shadow2_hash_entry *extra, *x;
+    int i;
+
+    /* We need to allocate a new node. Ensure the free list is not empty. 
+     * Allocate new entries in units the same size as the original table. */
+    if ( unlikely(d->arch.shadow2_hash_freelist == NULL) )
+    {
+        size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x));
+        extra = xmalloc_bytes(sz);
+
+        if ( extra == NULL )
+        {
+            /* No memory left! */
+            SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n");
+            domain_crash_synchronous();
+        }
+        memset(extra, 0, sz);
+
+        /* Record the allocation block so it can be correctly freed later. */
+        *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) = 
+            d->arch.shadow2_hash_allocations;
+        d->arch.shadow2_hash_allocations = &extra[0];
+
+        /* Thread a free chain through the newly-allocated nodes. */
+        for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ )
+            extra[i].next = &extra[i+1];
+        extra[i].next = NULL;
+
+        /* Add the new nodes to the free list. */
+        d->arch.shadow2_hash_freelist = &extra[0];
+    }
+
+    /* Allocate a new node from the free list. */
+    x = d->arch.shadow2_hash_freelist;
+    d->arch.shadow2_hash_freelist = x->next;
+    return x;
+}
+
+static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e)
+{
+    /* Mark the bucket as empty and return it to the free list */
+    e->t = 0; 
+    e->next = d->arch.shadow2_hash_freelist;
+    d->arch.shadow2_hash_freelist = e;
+}
+
+
+/* Allocate and initialise the table itself.  
+ * Returns 0 for success, 1 for error. */
+static int shadow2_hash_alloc(struct domain *d)
+{
+    struct shadow2_hash_entry *table;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(!d->arch.shadow2_hash_table);
+
+    table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS);
+    if ( !table ) return 1;
+    memset(table, 0, 
+           SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry));
+    d->arch.shadow2_hash_table = table;
+    return 0;
+}
+
+/* Tear down the hash table and return all memory to Xen.
+ * This function does not care whether the table is populated. */
+static void shadow2_hash_teardown(struct domain *d)
+{
+    struct shadow2_hash_entry *a, *n;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+
+    /* Return the table itself */
+    xfree(d->arch.shadow2_hash_table);
+    d->arch.shadow2_hash_table = NULL;
+
+    /* Return any extra allocations */
+    a = d->arch.shadow2_hash_allocations;
+    while ( a ) 
+    {
+        /* We stored a linked-list pointer at the end of each allocation */
+        n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS]));
+        xfree(a);
+        a = n;
+    }
+    d->arch.shadow2_hash_allocations = NULL;
+    d->arch.shadow2_hash_freelist = NULL;
+}
+
+
+mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t)
+/* Find an entry in the hash table.  Returns the MFN of the shadow,
+ * or INVALID_MFN if it doesn't exist */
+{
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *p, *x, *head;
+    key_t key;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+    ASSERT(t);
+
+    sh2_hash_audit(d);
+
+    perfc_incrc(shadow2_hash_lookups);
+    key = sh2_hash(n, t);
+
+    x = head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+    p = NULL;
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+    do
+    {
+        ASSERT(x->t || ((x == head) && (x->next == NULL)));
+
+        if ( x->n == n && x->t == t )
+        {
+            /* Pull-to-front if 'x' isn't already the head item */
+            if ( unlikely(x != head) )
+            {
+                if ( unlikely(d->arch.shadow2_hash_walking != 0) )
+                    /* Can't reorder: someone is walking the hash chains */
+                    return x->smfn;
+                else 
+                {
+                    /* Delete 'x' from list and reinsert after head. */
+                    p->next = x->next;
+                    x->next = head->next;
+                    head->next = x;
+                    
+                    /* Swap 'x' contents with head contents. */
+                    SWAP(head->n, x->n);
+                    SWAP(head->t, x->t);
+                    SWAP(head->smfn, x->smfn);
+                }
+            }
+            else
+            {
+                perfc_incrc(shadow2_hash_lookup_head);
+            }
+            return head->smfn;
+        }
+
+        p = x;
+        x = x->next;
+    }
+    while ( x != NULL );
+
+    perfc_incrc(shadow2_hash_lookup_miss);
+    return _mfn(INVALID_MFN);
+}
+
+void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Put a mapping (n,t)->smfn into the hash table */
+{
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *x, *head;
+    key_t key;
+    
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+    ASSERT(t);
+
+    sh2_hash_audit(d);
+
+    perfc_incrc(shadow2_hash_inserts);
+    key = sh2_hash(n, t);
+
+    head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+    /* If the bucket is empty then insert the new page as the head item. */
+    if ( head->t == 0 )
+    {
+        head->n = n;
+        head->t = t;
+        head->smfn = smfn;
+        ASSERT(head->next == NULL);
+    }
+    else 
+    {
+        /* Insert a new entry directly after the head item. */
+        x = sh2_alloc_hash_entry(d);
+        x->n = n; 
+        x->t = t;
+        x->smfn = smfn;
+        x->next = head->next;
+        head->next = x;
+    }
+    
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
+
+void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn)
+/* Excise the mapping (n,t)->smfn from the hash table */
+{
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *p, *x, *head;
+    key_t key;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_table);
+    ASSERT(t);
+
+    sh2_hash_audit(d);
+
+    perfc_incrc(shadow2_hash_deletes);
+    key = sh2_hash(n, t);
+
+    head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS];
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+
+    /* Match on head item? */
+    if ( head->n == n && head->t == t )
+    {
+        if ( (x = head->next) != NULL )
+        {
+            /* Overwrite head with contents of following node. */
+            head->n = x->n;
+            head->t = x->t;
+            head->smfn = x->smfn;
+
+            /* Delete following node. */
+            head->next = x->next;
+            sh2_free_hash_entry(d, x);
+        }
+        else
+        {
+            /* This bucket is now empty. Initialise the head node. */
+            head->t = 0;
+        }
+    }
+    else 
+    {
+        /* Not at the head; need to walk the chain */
+        p = head;
+        x = head->next; 
+        
+        while(1)
+        {
+            ASSERT(x); /* We can't have hit the end, since our target is
+                        * still in the chain somewhere... */
+            if ( x->n == n && x->t == t )
+            {
+                /* Delete matching node. */
+                p->next = x->next;
+                sh2_free_hash_entry(d, x);
+                break;
+            }
+            p = x;
+            x = x->next;
+        }
+    }
+
+    sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS);
+}
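
/* A standalone sketch of the delete-at-head case in shadow2_hash_delete()
 * above: because the head entry is embedded in the bucket array rather than
 * heap-allocated, removing it means copying the second node's contents into
 * the head slot and freeing that node.  Simplified stand-in types, not the
 * Xen structures. */
#include <assert.h>
#include <stdlib.h>

struct ent { unsigned long n; unsigned char t; struct ent *next; };

static void delete_head(struct ent *head)
{
    struct ent *x = head->next;

    if ( x != NULL )
    {
        *head = *x;    /* head takes over the second node's contents+links */
        free(x);       /* ...and the second node's storage is released */
    }
    else
        head->t = 0;   /* bucket is now empty */
}

int main(void)
{
    struct ent *second = malloc(sizeof(*second));
    struct ent head;

    assert(second != NULL);
    *second = (struct ent){ .n = 2, .t = 2, .next = NULL };
    head = (struct ent){ .n = 1, .t = 1, .next = second };

    delete_head(&head);
    assert(head.n == 2 && head.next == NULL);
    return 0;
}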
+
+typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn);
+
+static void hash_foreach(struct vcpu *v, 
+                         unsigned int callback_mask, 
+                         hash_callback_t callbacks[], 
+                         mfn_t callback_mfn)
+/* Walk the hash table looking at the types of the entries and 
+ * calling the appropriate callback function for each entry. 
+ * The mask determines which shadow types we call back for, and the array
+ * of callbacks tells us which function to call.
+ * Any callback may return non-zero to let us skip the rest of the scan. 
+ *
+ * WARNING: Callbacks MUST NOT add or remove hash entries unless they 
+ * then return non-zero to terminate the scan. */
+{
+    int i, done = 0;
+    struct domain *d = v->domain;
+    struct shadow2_hash_entry *x;
+
+    /* Say we're here, to stop hash-lookups reordering the chains */
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d->arch.shadow2_hash_walking == 0);
+    d->arch.shadow2_hash_walking = 1;
+
+    callback_mask &= ~1; /* Never attempt to call back on empty buckets */
+    for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) 
+    {
+        /* WARNING: This is not safe against changes to the hash table.
+         * The callback *must* return non-zero if it has inserted or
+         * deleted anything from the hash (lookups are OK, though). */
+        for ( x = &d->arch.shadow2_hash_table[i]; x; x = x->next )
+        {
+            if ( callback_mask & (1 << x->t) ) 
+            {
+                ASSERT(x->t <= 15);
+                ASSERT(callbacks[x->t] != NULL);
+                if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 )
+                    break;
+            }
+        }
+        if ( done ) break; 
+    }
+    d->arch.shadow2_hash_walking = 0; 
+}
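
/* A rough standalone model of the hash_foreach() dispatch above: a bitmask
 * selects which entry types to visit and a 16-slot function-pointer table
 * supplies the per-type callback; a non-zero callback return ends the scan.
 * Types and names here are simplified stand-ins, not the Xen ones. */
#include <stdio.h>
#include <stddef.h>

struct ent { unsigned char t; unsigned long smfn; struct ent *next; };
typedef int (*cb_t)(unsigned long smfn);

static int print_cb(unsigned long smfn)
{
    printf("visiting smfn %#lx\n", smfn);
    return 0;                        /* 0: keep scanning */
}

static void foreach(struct ent *buckets[], int nbuckets,
                    unsigned int mask, cb_t cbs[16])
{
    int done = 0;

    mask &= ~1u;                     /* never call back for empty (t == 0) */
    for ( int i = 0; i < nbuckets && !done; i++ )
        for ( struct ent *x = buckets[i]; x != NULL; x = x->next )
            if ( (mask & (1u << x->t)) && cbs[x->t] != NULL
                 && (done = cbs[x->t](x->smfn)) != 0 )
                break;               /* callback asked us to stop */
}

int main(void)
{
    struct ent e2 = { 2, 0xbeef, NULL }, e1 = { 1, 0xcafe, &e2 };
    struct ent *buckets[1] = { &e1 };
    cb_t cbs[16] = { NULL };

    cbs[1] = cbs[2] = print_cb;
    foreach(buckets, 1, (1u << 1) | (1u << 2), cbs);
    return 0;
}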
+
+
+/**************************************************************************/
+/* Destroy a shadow page: simple dispatcher to call the per-type destructor
+ * which will decrement refcounts appropriately and return memory to the 
+ * free pool. */
+
+void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn)
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    u32 t = pg->count_info & PGC_SH2_type_mask;
+
+
+    SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn));
+
+    /* Double-check, if we can, that the shadowed page belongs to this
+     * domain, (by following the back-pointer). */
+    ASSERT(t == PGC_SH2_fl1_32_shadow  ||  
+           t == PGC_SH2_fl1_pae_shadow ||  
+           t == PGC_SH2_fl1_64_shadow  || 
+           t == PGC_SH2_monitor_table  || 
+           (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) 
+            == v->domain)); 
+
+    /* The down-shifts here are so that the switch statement is on nice
+     * small numbers that the compiler will enjoy */
+    switch ( t >> PGC_SH2_type_shift )
+    {
+#if CONFIG_PAGING_LEVELS == 2
+    case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn); 
+        break;
+    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn);
+        break;
+#else /* PAE or 64bit */
+    case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn);
+        break;
+    case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn);
+        break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+    case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn);
+        break;
+    case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn);
+        break;
+    case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn);
+        break;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift:
+    case PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn);
+        break;
+    case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift:
+        SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn);
+        break;
+#endif
+    default:
+        SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n", 
+                       (unsigned long)t);
+        BUG();
+    }    
+}
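
/* A small standalone illustration of the down-shift trick in
 * sh2_destroy_shadow() above: the type field lives in the high bits of
 * count_info, so shifting it down first gives the switch small, dense case
 * values.  The mask and shift values here are made up, not the PGC_SH2_*
 * definitions. */
#include <assert.h>

#define TYPE_SHIFT 28
#define TYPE_MASK  (0xfUL << TYPE_SHIFT)
#define TYPE_L1    (0x1UL << TYPE_SHIFT)
#define TYPE_L2    (0x2UL << TYPE_SHIFT)

static int shadow_level(unsigned long count_info)
{
    switch ( (count_info & TYPE_MASK) >> TYPE_SHIFT )
    {
    case TYPE_L1 >> TYPE_SHIFT: return 1;  /* small labels: the compiler    */
    case TYPE_L2 >> TYPE_SHIFT: return 2;  /* can emit a compact jump table */
    default: return -1;
    }
}

int main(void)
{
    assert(shadow_level(TYPE_L2 | 0x7) == 2);  /* low bits don't disturb it */
    assert(shadow_level(0x7) == -1);
    return 0;
}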
+
+/**************************************************************************/
+/* Remove all writeable mappings of a guest frame from the shadow tables 
+ * Returns non-zero if we need to flush TLBs. 
+ * level and fault_addr describe how we found this to be a pagetable;
+ * level==0 means we have some other reason for revoking write access.*/
+
+int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn, 
+                                unsigned int level,
+                                unsigned long fault_addr)
+{
+    /* Dispatch table for getting per-type functions */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32  */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32  */
+#endif
+        NULL, /* l2_32   */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */
+#else 
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#endif
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64  */
+#else
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#endif
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    static unsigned int callback_mask = 
+          1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+        ;
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    /* Only remove writable mappings if we are doing shadow refcounts.
+     * In guest refcounting, we trust Xen to already be restricting
+     * all the writes to the guest page tables, so we do not need to
+     * do more. */
+    if ( !shadow2_mode_refcounts(v->domain) )
+        return 0;
+
+    /* Early exit if it's already a pagetable, or otherwise not writeable */
+    if ( sh2_mfn_is_a_page_table(gmfn) 
+         || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
+        return 0;
+
+    perfc_incrc(shadow2_writeable);
+
+    /* If this isn't a "normal" writeable page, the domain is trying to 
+     * put pagetables in special memory of some kind.  We can't allow that. */
+    if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page )
+    {
+        SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %" 
+                      PRtype_info "\n",
+                      mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info);
+        domain_crash(v->domain);
+    }
+
+#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC
+    if ( v == current && level != 0 )
+    {
+        unsigned long gfn;
+        /* Heuristic: there is likely to be only one writeable mapping,
+         * and that mapping is likely to be in the current pagetable,
+         * either in the guest's linear map (linux, windows) or in a
+         * magic slot used to map high memory regions (linux HIGHPTE) */
+
+#define GUESS(_a, _h) do {                                              \
+            if ( v->arch.shadow2->guess_wrmap(v, (_a), gmfn) )          \
+                perfc_incrc(shadow2_writeable_h_ ## _h);                \
+            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )        \
+                return 1;                                               \
+        } while (0)
+
+        
+        /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */
+        if ( v == current 
+             && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 )
+            GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4);
+
+        if ( v->arch.shadow2->guest_levels == 2 )
+        {
+            if ( level == 1 )
+                /* 32bit non-PAE w2k3: linear map at 0xC0000000 */
+                GUESS(0xC0000000UL + (fault_addr >> 10), 1);
+        }
+#if CONFIG_PAGING_LEVELS >= 3
+        else if ( v->arch.shadow2->guest_levels == 3 )
+        {
+            /* 32bit PAE w2k3: linear map at 0xC0000000 */
+            switch ( level ) 
+            {
+            case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break;
+            case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break;
+            }
+        }
+#if CONFIG_PAGING_LEVELS >= 4
+        else if ( v->arch.shadow2->guest_levels == 4 )
+        {
+            /* 64bit w2k3: linear map at 0x0000070000000000 */
+            switch ( level ) 
+            {
+            case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break;
+            case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break;
+            case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break;
+            }
+        }
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS >= 3 */
+
+#undef GUESS
+
+    }
+#endif
+    
+    /* Brute-force search of all the shadows, by walking the hash */
+    perfc_incrc(shadow2_writeable_bf);
+    hash_foreach(v, callback_mask, callbacks, gmfn);
+
+    /* If that didn't catch the mapping, something is very wrong */
+    if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
+    {
+        SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: "
+                      "%lu left\n", mfn_x(gmfn),
+                      (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
+        domain_crash(v->domain);
+    }
+    
+    /* We killed at least one writeable mapping, so must flush TLBs. */
+    return 1;
+}
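
/* A standalone sketch of the linear-map arithmetic behind the GUESS()
 * heuristic above, for the 32-bit 2-level case: each 4kB of virtual space
 * is described by one 4-byte PTE, so the PTE mapping 'va' sits at
 * linear_base + (va >> 12) * 4, which is the (va >> 10) term used above.
 * The 0xC0000000 base matches the guess in the code; the helper name is
 * purely illustrative. */
#include <assert.h>
#include <stdint.h>

static uint32_t guess_pte_vaddr_2level(uint32_t linear_base, uint32_t va)
{
    return linear_base + (va >> 10);   /* == base + (va >> 12) * sizeof(pte) */
}

int main(void)
{
    /* Guess where the PTE for virtual address 0x00401000 would live. */
    assert(guess_pte_vaddr_2level(0xC0000000u, 0x00401000u)
           == 0xC0000000u + 0x401u * 4u);
    return 0;
}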
+
+
+
+/**************************************************************************/
+/* Remove all mappings of a guest frame from the shadow tables.
+ * Returns non-zero if we need to flush TLBs. */
+
+int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn)
+{
+    struct page_info *page = mfn_to_page(gmfn);
+    int expected_count;
+
+    /* Dispatch table for getting per-type functions */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32  */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32  */
+#endif
+        NULL, /* l2_32   */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */
+#else 
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#endif
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64  */
+#else
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#endif
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    static unsigned int callback_mask = 
+          1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift)
+        | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift)
+        ;
+
+    perfc_incrc(shadow2_mappings);
+    if ( (page->count_info & PGC_count_mask) == 0 )
+        return 0;
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    /* XXX TODO: 
+     * Heuristics for finding the (probably) single mapping of this gmfn */
+    
+    /* Brute-force search of all the shadows, by walking the hash */
+    perfc_incrc(shadow2_mappings_bf);
+    hash_foreach(v, callback_mask, callbacks, gmfn);
+
+    /* If that didn't catch the mapping, something is very wrong */
+    expected_count = (page->count_info & PGC_allocated) ? 1 : 0;
+    if ( (page->count_info & PGC_count_mask) != expected_count )
+    {
+        /* Don't complain if we're in HVM and there's one extra mapping: 
+         * The qemu helper process has an untyped mapping of this dom's RAM */
+        if ( !(shadow2_mode_external(v->domain)
+               && (page->count_info & PGC_count_mask) <= 2
+               && (page->u.inuse.type_info & PGT_count_mask) == 0) )
+        {
+            SHADOW2_ERROR("can't find all mappings of mfn %lx: "
+                          "c=%08x t=%08lx\n", mfn_x(gmfn), 
+                          page->count_info, page->u.inuse.type_info);
+        }
+    }
+
+    /* We killed at least one mapping, so must flush TLBs. */
+    return 1;
+}
+
+
+/**************************************************************************/
+/* Remove all shadows of a guest frame from the shadow tables */
+
+static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn)
+/* Follow this shadow's up-pointer, if it has one, and remove the reference
+ * found there.  Returns 1 if that was the only reference to this shadow */
+{
+    struct page_info *pg = mfn_to_page(smfn);
+    mfn_t pmfn;
+    void *vaddr;
+    int rc;
+
+    ASSERT((pg->count_info & PGC_SH2_type_mask) > 0);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow);
+    ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow);
+    
+    if (pg->up == 0) return 0;
+    pmfn = _mfn(pg->up >> PAGE_SHIFT);
+    ASSERT(valid_mfn(pmfn));
+    vaddr = sh2_map_domain_page(pmfn);
+    ASSERT(vaddr);
+    vaddr += pg->up & (PAGE_SIZE-1);
+    ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn));
+    
+    /* Is this the only reference to this shadow? */
+    rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0;
+
+    /* Blank the offending entry */
+    switch ((pg->count_info & PGC_SH2_type_mask)) 
+    {
+    case PGC_SH2_l1_32_shadow:
+    case PGC_SH2_l2_32_shadow:
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn);
+#else
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn);
+#endif
+        break;
+#if CONFIG_PAGING_LEVELS >=3
+    case PGC_SH2_l1_pae_shadow:
+    case PGC_SH2_l2_pae_shadow:
+    case PGC_SH2_l2h_pae_shadow:
+    case PGC_SH2_l3_pae_shadow:
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn);
+        break;
+#if CONFIG_PAGING_LEVELS >= 4
+    case PGC_SH2_l1_64_shadow:
+    case PGC_SH2_l2_64_shadow:
+    case PGC_SH2_l3_64_shadow:
+    case PGC_SH2_l4_64_shadow:
+        SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn);
+        break;
+#endif
+#endif
+    default: BUG(); /* Some weird unknown shadow type */
+    }
+    
+    sh2_unmap_domain_page(vaddr);
+    if ( rc )
+        perfc_incrc(shadow2_up_pointer);
+    else
+        perfc_incrc(shadow2_unshadow_bf);
+
+    return rc;
+}
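
/* A standalone sketch of the up-pointer encoding consumed by
 * sh2_remove_shadow_via_pointer() above: one word packs the parent shadow's
 * frame number in the upper bits and the byte offset of the referencing
 * entry in the low PAGE_SHIFT bits.  Simplified helpers, not the Xen code. */
#include <assert.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long pack_up(unsigned long mfn, unsigned long offset)
{
    return (mfn << PAGE_SHIFT) | (offset & (PAGE_SIZE - 1));
}

int main(void)
{
    unsigned long up = pack_up(0x1234, 0x7f8);

    assert((up >> PAGE_SHIFT) == 0x1234);      /* parent shadow frame */
    assert((up & (PAGE_SIZE - 1)) == 0x7f8);   /* entry offset within it */
    return 0;
}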
+
+void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all)
+/* Remove the shadows of this guest page.  
+ * If all != 0, find all shadows, if necessary by walking the tables.
+ * Otherwise, just try the (much faster) heuristics, which will remove 
+ * at most one reference to each shadow of the page. */
+{
+    struct page_info *pg;
+    mfn_t smfn;
+    u32 sh_flags;
+    unsigned char t;
+
+    /* Dispatch table for getting per-type functions: each level must
+     * be called with the function to remove a lower-level shadow. */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+        NULL, /* l1_32   */
+        NULL, /* fl1_32  */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32   */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32   */
+#endif
+        NULL, /* l1_pae  */
+        NULL, /* fl1_pae */
+#if CONFIG_PAGING_LEVELS >= 3
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae  */
+#else 
+        NULL, /* l2_pae  */
+        NULL, /* l2h_pae */
+        NULL, /* l3_pae  */
+#endif
+        NULL, /* l1_64   */
+        NULL, /* fl1_64  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64   */
+        SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64   */
+#else
+        NULL, /* l2_64   */
+        NULL, /* l3_64   */
+        NULL, /* l4_64   */
+#endif
+        NULL, /* p2m     */
+        NULL  /* unused  */
+    };
+
+    /* Another lookup table, for choosing which mask to use */
+    static unsigned int masks[16] = {
+        0, /* none    */
+        1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32   */
+        0, /* fl1_32  */
+        0, /* l2_32   */
+        ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift))
+         | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae  */
+        0, /* fl1_pae */
+        1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae  */
+        1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae  */
+        0, /* l3_pae  */
+        1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64   */
+        0, /* fl1_64  */
+        1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64   */
+        1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64   */
+        0, /* l4_64   */
+        0, /* p2m     */
+        0  /* unused  */
+    };
+
+    SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+                   v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+    ASSERT(shadow2_lock_is_acquired(v->domain));
+
+    pg = mfn_to_page(gmfn);
+
+    /* Bail out now if the page is not shadowed */
+    if ( (pg->count_info & PGC_page_table) == 0 )
+        return;
+
+    /* Search for this shadow in all appropriate shadows */
+    perfc_incrc(shadow2_unshadow);
+    sh_flags = pg->shadow2_flags;
+
+    /* Lower-level shadows need to be excised from upper-level shadows.
+     * This call to hash_foreach() looks dangerous but is in fact OK: each
+     * call will remove at most one shadow, and terminate immediately when
+     * it does remove it, so we never walk the hash after doing a deletion.  */
+#define DO_UNSHADOW(_type) do {                                 \
+    t = (_type) >> PGC_SH2_type_shift;                          \
+    smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t);              \
+    if ( !sh2_remove_shadow_via_pointer(v, smfn) && all )       \
+        hash_foreach(v, masks[t], callbacks, smfn);             \
+} while (0)
+
+    /* Top-level shadows need to be unpinned */
+#define DO_UNPIN(_type) do {                                             \
+    t = (_type) >> PGC_SH2_type_shift;                                   \
+    smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t);                       \
+    if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned )                \
+        sh2_unpin(v, smfn);                                              \
+    if ( (_type) == PGC_SH2_l3_pae_shadow )                              \
+        SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \
+} while (0)
+
+    if ( sh_flags & SH2F_L1_32 )   DO_UNSHADOW(PGC_SH2_l1_32_shadow);
+    if ( sh_flags & SH2F_L2_32 )   DO_UNPIN(PGC_SH2_l2_32_shadow);
+#if CONFIG_PAGING_LEVELS >= 3
+    if ( sh_flags & SH2F_L1_PAE )  DO_UNSHADOW(PGC_SH2_l1_pae_shadow);
+    if ( sh_flags & SH2F_L2_PAE )  DO_UNSHADOW(PGC_SH2_l2_pae_shadow);
+    if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow);
+    if ( sh_flags & SH2F_L3_PAE )  DO_UNPIN(PGC_SH2_l3_pae_shadow);
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( sh_flags & SH2F_L1_64 )   DO_UNSHADOW(PGC_SH2_l1_64_shadow);
+    if ( sh_flags & SH2F_L2_64 )   DO_UNSHADOW(PGC_SH2_l2_64_shadow);
+    if ( sh_flags & SH2F_L3_64 )   DO_UNSHADOW(PGC_SH2_l3_64_shadow);
+    if ( sh_flags & SH2F_L4_64 )   DO_UNPIN(PGC_SH2_l4_64_shadow);
+#endif
+#endif
+
+#undef DO_UNSHADOW
+#undef DO_UNPIN
+
+
+#if CONFIG_PAGING_LEVELS > 2
+    /* We may have caused some PAE l3 entries to change: need to 
+     * fix up the copies of them in various places */
+    if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) )
+        sh2_pae_recopy(v->domain);
+#endif
+
+    /* If that didn't catch the shadows, something is wrong */
+    if ( all && (pg->count_info & PGC_page_table) )
+    {
+        SHADOW2_ERROR("can't find all shadows of mfn %05lx 
(shadow2_flags=%08x)\n",
+                      mfn_x(gmfn), pg->shadow2_flags);
+        domain_crash(v->domain);
+    }
+}
+
+void
+shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn)
+/* Even harsher: this is an HVM page that we think is no longer a pagetable.
+ * Unshadow it, and recursively unshadow pages that reference it. */
+{
+    shadow2_remove_all_shadows(v, gmfn);
+    /* XXX TODO:
+     * Rework this hashtable walker to return a linked-list of all 
+     * the shadows it modified, then do breadth-first recursion 
+     * to find the way up to higher-level tables and unshadow them too. 
+     *
+     * The current code (just tearing down each page's shadows as we
+     * detect that it is not a pagetable) is correct, but very slow. 
+     * It means extra emulated writes and slows down removal of mappings. */
+}
+
+/**************************************************************************/
+
+void sh2_update_paging_modes(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct shadow2_entry_points *old_entries = v->arch.shadow2;
+    mfn_t old_guest_table;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+
+    // Valid transitions handled by this function:
+    // - For PV guests:
+    //     - after a shadow mode has been changed
+    // - For HVM guests:
+    //     - after a shadow mode has been changed
+    //     - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE
+    //
+
+    // Avoid determining the current shadow2 mode for uninitialized CPUs, as
+    // we can not yet determine whether it is an HVM or PV domain.
+    //
+    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+    {
+        printk("%s: postponing determination of shadow2 mode\n", __func__);
+        return;
+    }
+
+    // First, tear down any old shadow tables held by this vcpu.
+    //
+    if ( v->arch.shadow2 )
+        shadow2_detach_old_tables(v);
+
+    if ( !hvm_guest(v) )
+    {
+        ///
+        /// PV guest
+        ///
+#if CONFIG_PAGING_LEVELS == 4
+        if ( pv_32bit_guest(v) )
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 3);
+        else
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+#elif CONFIG_PAGING_LEVELS == 3
+        v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#elif CONFIG_PAGING_LEVELS == 2
+        v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#else
+#error unexpected paging mode
+#endif
+    }
+    else
+    {
+        ///
+        /// HVM guest
+        ///
+        ASSERT(shadow2_mode_translate(d));
+        ASSERT(shadow2_mode_external(d));
+
+        if ( !hvm_paging_enabled(v) )
+        {
+            // paging disabled...
+            clear_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+            
+            /* Set v->arch.guest_table to use the p2m map, and choose
+             * the appropriate shadow mode */
+            old_guest_table = pagetable_get_mfn(v->arch.guest_table);
+#if CONFIG_PAGING_LEVELS == 2
+            v->arch.guest_table =
+                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,2,2);
+#elif CONFIG_PAGING_LEVELS == 3 
+            v->arch.guest_table =
+                pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table));
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#else /* CONFIG_PAGING_LEVELS == 4 */
+            { 
+                l4_pgentry_t *l4e; 
+                /* Use the start of the first l3 table as a PAE l3 */
+                ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0);
+                l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+                ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT);
+                v->arch.guest_table =
+                    pagetable_from_pfn(l4e_get_pfn(l4e[0]));
+                sh2_unmap_domain_page(l4e);
+            }
+            v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3);
+#endif
+            /* Fix up refcounts on guest_table */
+            get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d);
+            if ( mfn_x(old_guest_table) != 0 )
+                put_page(mfn_to_page(old_guest_table));
+        }
+        else
+        {
+            set_bit(_VCPUF_shadow2_translate, &v->vcpu_flags);
+
+#ifdef __x86_64__
+            if ( hvm_long_mode_enabled(v) )
+            {
+                // long mode guest...
+                v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4);
+            }
+            else
+#endif
+                if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE )
+                {
+#if CONFIG_PAGING_LEVELS >= 3
+                    // 32-bit PAE mode guest...
+                    v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3);
+#else
+                    SHADOW2_ERROR("PAE not supported in 32-bit Xen\n");
+                    domain_crash(d);
+                    return;
+#endif
+                }
+                else
+                {
+                    // 32-bit 2 level guest...
+#if CONFIG_PAGING_LEVELS >= 3
+                    v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 2);
+#else
+                    v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2);
+#endif
+                }
+        }
+        
+        if ( pagetable_get_pfn(v->arch.monitor_table) == 0 )
+        {
+            mfn_t mmfn = shadow2_make_monitor_table(v);
+            v->arch.monitor_table = pagetable_from_mfn(mmfn);
+            v->arch.monitor_vtable = sh2_map_domain_page(mmfn);
+        } 
+
+        if ( v->arch.shadow2 != old_entries )
+        {
+            SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u "
+                           "(was g=%u s=%u)\n",
+                           d->domain_id, v->vcpu_id, 
+                           v->arch.shadow2->guest_levels,
+                           v->arch.shadow2->shadow_levels,
+                           old_entries ? old_entries->guest_levels : 0,
+                           old_entries ? old_entries->shadow_levels : 0);
+            if ( old_entries &&
+                 (v->arch.shadow2->shadow_levels !=
+                  old_entries->shadow_levels) )
+            {
+                /* Need to make a new monitor table for the new mode */
+                mfn_t new_mfn, old_mfn;
+
+                if ( v != current ) 
+                {
+                    SHADOW2_ERROR("Some third party (d=%u v=%u) is changing "
+                                  "this HVM vcpu's (d=%u v=%u) paging mode!\n",
+                                  current->domain->domain_id, current->vcpu_id,
+                                  v->domain->domain_id, v->vcpu_id);
+                    domain_crash(v->domain);
+                    return;
+                }
+
+                sh2_unmap_domain_page(v->arch.monitor_vtable);
+                old_mfn = pagetable_get_mfn(v->arch.monitor_table);
+                v->arch.monitor_table = pagetable_null();
+                new_mfn = v->arch.shadow2->make_monitor_table(v);            
+                v->arch.monitor_table = pagetable_from_mfn(new_mfn);
+                v->arch.monitor_vtable = sh2_map_domain_page(new_mfn);
+                SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n",
+                               mfn_x(new_mfn));
+
+                /* Don't be running on the old monitor table when we 
+                 * pull it down!  Switch CR3, and warn the HVM code that
+                 * its host cr3 has changed. */
+                make_cr3(v, mfn_x(new_mfn));
+                write_ptbase(v);
+                hvm_update_host_cr3(v);
+                old_entries->destroy_monitor_table(v, old_mfn);
+            }
+        }
+
+        // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE.
+        //        These are HARD: think about the case where two CPU's have
+        //        different values for CR4.PSE and CR4.PGE at the same time.
+        //        This *does* happen, at least for CR4.PGE...
+    }
+
+    v->arch.shadow2->update_cr3(v);
+}
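
/* A rough standalone model of the mode choice in sh2_update_paging_modes()
 * above for a paging-enabled HVM guest, reduced to a pure function over the
 * relevant state bits.  The function and parameter names are illustrative,
 * not the Xen predicates. */
#include <assert.h>
#include <stdbool.h>

static int guest_paging_levels(bool long_mode, bool cr4_pae)
{
    if ( long_mode ) return 4;   /* 64-bit guest: 4-level guest pagetables */
    if ( cr4_pae )   return 3;   /* 32-bit PAE guest: 3-level pagetables   */
    return 2;                    /* 32-bit non-PAE guest: 2-level          */
}

int main(void)
{
    assert(guest_paging_levels(true,  true)  == 4);
    assert(guest_paging_levels(false, true)  == 3);
    assert(guest_paging_levels(false, false) == 2);
    return 0;
}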
+
+/**************************************************************************/
+/* Turning on and off shadow2 features */
+
+static void sh2_new_mode(struct domain *d, u32 new_mode)
+/* Inform all the vcpus that the shadow mode has been changed */
+{
+    struct vcpu *v;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(d != current->domain);
+    d->arch.shadow2_mode = new_mode;
+    if ( new_mode & SHM2_translate ) 
+        shadow2_audit_p2m(d);
+    for_each_vcpu(d, v)
+        sh2_update_paging_modes(v);
+}
+
+static int shadow2_enable(struct domain *d, u32 mode)
+/* Turn on "permanent" shadow features: external, translate, refcount.
+ * Can only be called once on a domain, and these features cannot be
+ * disabled. 
+ * Returns 0 for success, -errno for failure. */
+{    
+    unsigned int old_pages;
+    int rv = 0;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    /* Sanity check the arguments */
+    if ( d == current->domain 
+         || shadow2_mode_enabled(d)
+         || !(mode & SHM2_enable)
+         || ((mode & SHM2_external) && !(mode & SHM2_translate)) )
+    {
+        rv = -EINVAL;
+        goto out;
+    }
+
+    // XXX -- eventually would like to require that all memory be allocated
+    // *after* shadow2_enabled() is called...  So here, we would test to make
+    // sure that d->page_list is empty.
+#if 0
+    spin_lock(&d->page_alloc_lock);
+    if ( !list_empty(&d->page_list) )
+    {
+        spin_unlock(&d->page_alloc_lock);
+        rv = -EINVAL;
+        goto out;
+    }
+    spin_unlock(&d->page_alloc_lock);
+#endif
+
+    /* Init the shadow memory allocation if the user hasn't done so */
+    old_pages = d->arch.shadow2_total_pages;
+    if ( old_pages == 0 )
+        if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */
+        {
+            set_sh2_allocation(d, 0, NULL);
+            rv = -ENOMEM;
+            goto out;
+        }
+
+    /* Init the hash table */
+    if ( shadow2_hash_alloc(d) != 0 )
+    {
+        set_sh2_allocation(d, old_pages, NULL);            
+        rv = -ENOMEM;
+        goto out;
+    }
+
+    /* Init the P2M table */
+    if ( mode & SHM2_translate )
+        if ( !shadow2_alloc_p2m_table(d) )
+        {
+            shadow2_hash_teardown(d);
+            set_sh2_allocation(d, old_pages, NULL);
+            shadow2_p2m_teardown(d);
+            rv = -ENOMEM;
+            goto out;
+        }
+
+    /* Update the bits */
+    sh2_new_mode(d, mode);
+    shadow2_audit_p2m(d);
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+    return rv;
+}
+
+void shadow2_teardown(struct domain *d)
+/* Destroy the shadow pagetables of this domain and free its shadow memory.
+ * Should only be called for dying domains. */
+{
+    struct vcpu *v;
+    mfn_t mfn;
+
+    ASSERT(test_bit(_DOMF_dying, &d->domain_flags));
+    ASSERT(d != current->domain);
+
+    if ( !shadow2_lock_is_acquired(d) )
+        shadow2_lock(d); /* Keep various asserts happy */
+
+    if ( shadow2_mode_enabled(d) )
+    {
+        /* Release the shadow and monitor tables held by each vcpu */
+        for_each_vcpu(d, v)
+        {
+            if ( v->arch.shadow2 )
+                shadow2_detach_old_tables(v);
+            if ( shadow2_mode_external(d) )
+            {
+                mfn = pagetable_get_mfn(v->arch.monitor_table);
+                if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) )
+                    shadow2_destroy_monitor_table(v, mfn);
+                v->arch.monitor_table = pagetable_null();
+            }
+        }
+    }
+
+    if ( d->arch.shadow2_total_pages != 0 )
+    {
+        SHADOW2_PRINTK("teardown of domain %u starts."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+        /* Destroy all the shadows and release memory to domheap */
+        set_sh2_allocation(d, 0, NULL);
+        /* Release the hash table back to xenheap */
+        if (d->arch.shadow2_hash_table) 
+            shadow2_hash_teardown(d);
+        /* Release the log-dirty bitmap of dirtied pages */
+        sh2_free_log_dirty_bitmap(d);
+        /* Should not have any more memory held */
+        SHADOW2_PRINTK("teardown done."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+        ASSERT(d->arch.shadow2_total_pages == 0);
+    }
+
+    /* We leave the "permanent" shadow modes enabled, but clear the
+     * log-dirty mode bit.  We don't want any more mark_dirty()
+     * calls now that we've torn down the bitmap */
+    d->arch.shadow2_mode &= ~SHM2_log_dirty;
+
+    shadow2_unlock(d);
+}
+
+void shadow2_final_teardown(struct domain *d)
+/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */
+{
+
+    SHADOW2_PRINTK("dom %u final teardown starts."
+                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                   d->domain_id,
+                   d->arch.shadow2_total_pages, 
+                   d->arch.shadow2_free_pages, 
+                   d->arch.shadow2_p2m_pages);
+
+    /* Double-check that the domain didn't have any shadow memory.  
+     * It is possible for a domain that never got domain_kill()ed
+     * to get here with its shadow allocation intact. */
+    if ( d->arch.shadow2_total_pages != 0 )
+        shadow2_teardown(d);
+
+    /* It is now safe to pull down the p2m map. */
+    if ( d->arch.shadow2_p2m_pages != 0 )
+        shadow2_p2m_teardown(d);
+
+    SHADOW2_PRINTK("dom %u final teardown done."
+                   "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                   d->domain_id,
+                   d->arch.shadow2_total_pages, 
+                   d->arch.shadow2_free_pages, 
+                   d->arch.shadow2_p2m_pages);
+}
+
+static int shadow2_one_bit_enable(struct domain *d, u32 mode)
+/* Turn on a single shadow mode feature */
+{
+    ASSERT(shadow2_lock_is_acquired(d));
+
+    /* Sanity check the call */
+    if ( d == current->domain || (d->arch.shadow2_mode & mode) )
+    {
+        return -EINVAL;
+    }
+
+    if ( d->arch.shadow2_mode == 0 )
+    {
+        /* Init the shadow memory allocation and the hash table */
+        if ( set_sh2_allocation(d, 1, NULL) != 0 
+             || shadow2_hash_alloc(d) != 0 )
+        {
+            set_sh2_allocation(d, 0, NULL);
+            return -ENOMEM;
+        }
+    }
+
+    /* Update the bits */
+    sh2_new_mode(d, d->arch.shadow2_mode | mode);
+
+    return 0;
+}
+
+static int shadow2_one_bit_disable(struct domain *d, u32 mode) 
+/* Turn off a single shadow mode feature */
+{
+    struct vcpu *v;
+    ASSERT(shadow2_lock_is_acquired(d));
+
+    /* Sanity check the call */
+    if ( d == current->domain || !(d->arch.shadow2_mode & mode) )
+    {
+        return -EINVAL;
+    }
+
+    /* Update the bits */
+    sh2_new_mode(d, d->arch.shadow2_mode & ~mode);
+    if ( d->arch.shadow2_mode == 0 )
+    {
+        /* Get this domain off shadows */
+        SHADOW2_PRINTK("un-shadowing of domain %u starts."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+        for_each_vcpu(d, v)
+        {
+            if ( v->arch.shadow2 )
+                shadow2_detach_old_tables(v);
+#if CONFIG_PAGING_LEVELS == 4
+            if ( !(v->arch.flags & TF_kernel_mode) )
+                make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user));
+            else
+#endif
+                make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
+
+        }
+
+        /* Pull down the memory allocation */
+        if ( set_sh2_allocation(d, 0, NULL) != 0 )
+        {
+            // XXX - How can this occur?
+            //       Seems like a bug to return an error now that we've
+            //       disabled the relevant shadow mode.
+            //
+            return -ENOMEM;
+        }
+        shadow2_hash_teardown(d);
+        SHADOW2_PRINTK("un-shadowing of domain %u done."
+                       "  Shadow pages total = %u, free = %u, p2m=%u\n",
+                       d->domain_id,
+                       d->arch.shadow2_total_pages, 
+                       d->arch.shadow2_free_pages, 
+                       d->arch.shadow2_p2m_pages);
+    }
+
+    return 0;
+}
+
+/* Enable/disable ops for the "test" and "log-dirty" modes */
+int shadow2_test_enable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    if ( shadow2_mode_enabled(d) )
+    {
+        SHADOW2_ERROR("Don't support enabling test mode"
+                      "on already shadowed doms\n");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = shadow2_one_bit_enable(d, SHM2_enable);
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+
+    return ret;
+}
+
+int shadow2_test_disable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow2_lock(d);
+    ret = shadow2_one_bit_disable(d, SHM2_enable);
+    shadow2_unlock(d);
+    domain_unpause(d);
+
+    return ret;
+}
+
+static int
+sh2_alloc_log_dirty_bitmap(struct domain *d)
+{
+    ASSERT(d->arch.shadow_dirty_bitmap == NULL);
+    d->arch.shadow_dirty_bitmap_size =
+        (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) &
+        ~(BITS_PER_LONG - 1);
+    d->arch.shadow_dirty_bitmap =
+        xmalloc_array(unsigned long,
+                      d->arch.shadow_dirty_bitmap_size / BITS_PER_LONG);
+    if ( d->arch.shadow_dirty_bitmap == NULL )
+    {
+        d->arch.shadow_dirty_bitmap_size = 0;
+        return -ENOMEM;
+    }
+    memset(d->arch.shadow_dirty_bitmap, 0, d->arch.shadow_dirty_bitmap_size/8);
+
+    return 0;
+}
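
/* A standalone sketch of the sizing rule in sh2_alloc_log_dirty_bitmap()
 * above: round the page count up to a whole number of longs so the bitmap
 * can be indexed as an array of unsigned long.  calloc stands in for
 * xmalloc_array plus memset. */
#include <assert.h>
#include <limits.h>
#include <stdlib.h>

#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

static unsigned long *alloc_dirty_bitmap(unsigned long max_pfn,
                                         unsigned long *size_bits)
{
    unsigned long bits = (max_pfn + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
    unsigned long *bm = calloc(bits / BITS_PER_LONG, sizeof(unsigned long));

    *size_bits = (bm != NULL) ? bits : 0;
    return bm;
}

int main(void)
{
    unsigned long bits;
    unsigned long *bm = alloc_dirty_bitmap(1000, &bits);

    assert(bm != NULL && bits >= 1000 && (bits % BITS_PER_LONG) == 0);
    free(bm);
    return 0;
}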
+
+static void
+sh2_free_log_dirty_bitmap(struct domain *d)
+{
+    d->arch.shadow_dirty_bitmap_size = 0;
+    if ( d->arch.shadow_dirty_bitmap )
+    {
+        xfree(d->arch.shadow_dirty_bitmap);
+        d->arch.shadow_dirty_bitmap = NULL;
+    }
+}
+
+static int shadow2_log_dirty_enable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    if ( shadow2_mode_log_dirty(d) )
+    {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if ( shadow2_mode_enabled(d) )
+    {
+        SHADOW2_ERROR("Don't (yet) support enabling log-dirty"
+                      "on already shadowed doms\n");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = sh2_alloc_log_dirty_bitmap(d);
+    if ( ret != 0 )
+    {
+        sh2_free_log_dirty_bitmap(d);
+        goto out;
+    }
+
+    ret = shadow2_one_bit_enable(d, SHM2_log_dirty);
+    if ( ret != 0 )
+        sh2_free_log_dirty_bitmap(d);
+
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+    return ret;
+}
+
+static int shadow2_log_dirty_disable(struct domain *d)
+{
+    int ret;
+
+    domain_pause(d);
+    shadow2_lock(d);
+    ret = shadow2_one_bit_disable(d, SHM2_log_dirty);
+    if ( !shadow2_mode_log_dirty(d) )
+        sh2_free_log_dirty_bitmap(d);
+    shadow2_unlock(d);
+    domain_unpause(d);
+
+    return ret;
+}
+
+/**************************************************************************/
+/* P2M map manipulations */
+
+static void
+sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+{
+    struct vcpu *v;
+
+    if ( !shadow2_mode_translate(d) )
+        return;
+
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+
+
+    SHADOW2_PRINTK("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+    ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn);
+    //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn);
+
+    shadow2_remove_all_shadows_and_parents(v, _mfn(mfn));
+    if ( shadow2_remove_all_mappings(v, _mfn(mfn)) )
+        flush_tlb_mask(d->domain_dirty_cpumask);
+    shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN));
+    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+}
+
+void
+shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn,
+                                  unsigned long mfn)
+{
+    shadow2_lock(d);
+    shadow2_audit_p2m(d);
+    sh2_p2m_remove_page(d, gfn, mfn);
+    shadow2_audit_p2m(d);
+    shadow2_unlock(d);    
+}
+
+void
+shadow2_guest_physmap_add_page(struct domain *d, unsigned long gfn,
+                               unsigned long mfn)
+{
+    struct vcpu *v;
+    unsigned long ogfn;
+    mfn_t omfn;
+
+    if ( !shadow2_mode_translate(d) )
+        return;
+
+    v = current;
+    if ( v->domain != d )
+        v = d->vcpu[0];
+
+    shadow2_lock(d);
+    shadow2_audit_p2m(d);
+
+    SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
+
+    omfn = sh2_gfn_to_mfn(d, gfn);
+    if ( valid_mfn(omfn) )
+    {
+        /* Get rid of the old mapping, especially any shadows */
+        shadow2_remove_all_shadows_and_parents(v, omfn);
+        if ( shadow2_remove_all_mappings(v, omfn) )
+            flush_tlb_mask(d->domain_dirty_cpumask);
+        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+    }        
+
+    ogfn = sh2_mfn_to_gfn(d, _mfn(mfn));
+    if (
+#ifdef __x86_64__
+        (ogfn != 0x5555555555555555L)
+#else
+        (ogfn != 0x55555555L)
+#endif
+        && (ogfn != INVALID_M2P_ENTRY)
+        && (ogfn != gfn) )
+    {
+        /* This machine frame is already mapped at another physical address */
+        SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
+                       mfn, ogfn, gfn);
+        if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) ) 
+        {
+            SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", 
+                           ogfn , mfn_x(omfn));
+            if ( mfn_x(omfn) == mfn ) 
+                sh2_p2m_remove_page(d, ogfn, mfn);
+        }
+    }
+
+    shadow2_set_p2m_entry(d, gfn, _mfn(mfn));
+    set_gpfn_from_mfn(mfn, gfn);
+    shadow2_audit_p2m(d);
+    shadow2_unlock(d);
+}
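
/* A standalone sketch of the invariant the two functions above maintain:
 * the p2m map (gfn -> mfn) and the m2p map (mfn -> gfn) must stay in step,
 * so adding a page updates both and removing it invalidates both.  Plain
 * arrays stand in for the real tables. */
#include <assert.h>

#define INVALID (~0UL)
#define NPAGES  16

static unsigned long p2m[NPAGES], m2p[NPAGES];

static void physmap_add(unsigned long gfn, unsigned long mfn)
{
    p2m[gfn] = mfn;
    m2p[mfn] = gfn;
}

static void physmap_remove(unsigned long gfn, unsigned long mfn)
{
    p2m[gfn] = INVALID;
    m2p[mfn] = INVALID;
}

int main(void)
{
    for ( int i = 0; i < NPAGES; i++ )
        p2m[i] = m2p[i] = INVALID;

    physmap_add(3, 7);
    assert(p2m[3] == 7 && m2p[7] == 3);                 /* round-trips */

    physmap_remove(3, 7);
    assert(p2m[3] == INVALID && m2p[7] == INVALID);     /* both gone   */
    return 0;
}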
+
+/**************************************************************************/
+/* Log-dirty mode support */
+
+/* Convert a shadow to log-dirty mode. */
+void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn)
+{
+    BUG();
+}
+
+
+/* Read a domain's log-dirty bitmap and stats.  
+ * If the operation is a CLEAN, clear the bitmap and stats as well. */
+static int shadow2_log_dirty_op(struct domain *d, dom0_shadow_control_t *sc)
+{    
+    int i, rv = 0, clean = 0;
+
+    domain_pause(d);
+    shadow2_lock(d);
+
+    if ( sc->op == DOM0_SHADOW_CONTROL_OP_CLEAN
+         || sc->op == DOM0_SHADOW_CONTROL_OP_FLUSH ) 
+        clean = 1;
+    else 
+        ASSERT(sc->op == DOM0_SHADOW_CONTROL_OP_PEEK); 
+
+    SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", 
+                  (clean) ? "clean" : "peek",
+                  d->domain_id,
+                  d->arch.shadow_fault_count, 
+                  d->arch.shadow_dirty_count);
+
+    sc->stats.fault_count = d->arch.shadow_fault_count;
+    sc->stats.dirty_count = d->arch.shadow_dirty_count;    
+        
+    if ( clean ) 
+    {
+        struct list_head *l, *t;
+        struct page_info *pg;
+
+        /* Need to revoke write access to the domain's pages again. 
+         * In future, we'll have a less heavy-handed approach to this, 
+         * but for now, we just unshadow everything except Xen. */
+        list_for_each_safe(l, t, &d->arch.shadow2_toplevel_shadows)
+        {
+            pg = list_entry(l, struct page_info, list);
+            shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg));
+        }
+
+        d->arch.shadow_fault_count = 0;
+        d->arch.shadow_dirty_count = 0;
+    }
+
+    if ( guest_handle_is_null(sc->dirty_bitmap) ||
+         (d->arch.shadow_dirty_bitmap == NULL) )
+    {
+        rv = -EINVAL;
+        goto out;
+    }
+ 
+    if ( sc->pages > d->arch.shadow_dirty_bitmap_size )
+        sc->pages = d->arch.shadow_dirty_bitmap_size; 
+
+#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
+    for ( i = 0; i < sc->pages; i += CHUNK )
+    {
+        int bytes = ((((sc->pages - i) > CHUNK) 
+                      ? CHUNK 
+                      : (sc->pages - i)) + 7) / 8;
+     
+        if ( copy_to_guest_offset(
+                 sc->dirty_bitmap, 
+                 i/(8*sizeof(unsigned long)),
+                 d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                 (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) )
+        {
+            rv = -EINVAL;
+            goto out;
+        }
+
+        if ( clean )
+            memset(d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                   0, bytes);
+    }
+#undef CHUNK
+
+ out:
+    shadow2_unlock(d);
+    domain_unpause(d);
+    return rv;
+}
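
/* A standalone sketch of the CHUNK loop in shadow2_log_dirty_op() above:
 * copy (and optionally clear) the dirty bitmap a cache-friendly chunk of
 * bits at a time.  memcpy stands in for copy_to_guest_offset(). */
#include <assert.h>
#include <limits.h>
#include <string.h>

#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))
#define CHUNK (8 * 1024)                 /* bits per pass, as in the code */

static void copy_dirty(unsigned long *dst, unsigned long *src,
                       unsigned long pages, int clean)
{
    for ( unsigned long i = 0; i < pages; i += CHUNK )
    {
        unsigned long bits  = (pages - i > CHUNK) ? CHUNK : (pages - i);
        unsigned long bytes = (bits + 7) / 8;

        memcpy(dst + i / BITS_PER_LONG, src + i / BITS_PER_LONG, bytes);
        if ( clean )
            memset(src + i / BITS_PER_LONG, 0, bytes);
    }
}

int main(void)
{
    unsigned long src[2 * CHUNK / BITS_PER_LONG] = { [0] = 0x5, [200] = 0xff };
    unsigned long dst[2 * CHUNK / BITS_PER_LONG] = { 0 };

    copy_dirty(dst, src, 2 * CHUNK, 1);
    assert(dst[200] == 0xff && src[200] == 0);   /* copied, then cleared */
    return 0;
}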
+
+
+/* Mark a page as dirty */
+void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn)
+{
+    unsigned long pfn;
+
+    ASSERT(shadow2_lock_is_acquired(d));
+    ASSERT(shadow2_mode_log_dirty(d));
+
+    if ( !valid_mfn(gmfn) )
+        return;
+
+    ASSERT(d->arch.shadow_dirty_bitmap != NULL);
+
+    /* We /really/ mean PFN here, even for non-translated guests. */
+    pfn = get_gpfn_from_mfn(mfn_x(gmfn));
+
+    /*
+     * Values with the MSB set denote MFNs that aren't really part of the 
+     * domain's pseudo-physical memory map (e.g., the shared info frame).
+     * Nothing to do here...
+     */
+    if ( unlikely(!VALID_M2P(pfn)) )
+        return;
+
+    /* N.B. Can use non-atomic TAS because protected by shadow2_lock. */
+    if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) ) 
+    { 
+        if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) )
+        {
+            SHADOW2_DEBUG(LOGDIRTY, 
+                          "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n",
+                          mfn_x(gmfn), pfn, d->domain_id);
+            d->arch.shadow_dirty_count++;
+        }
+    }
+    else
+    {
+        SHADOW2_PRINTK("mark_dirty OOR! "
+                       "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n"
+                       "owner=%d c=%08x t=%" PRtype_info "\n",
+                       mfn_x(gmfn), 
+                       pfn, 
+                       d->arch.shadow_dirty_bitmap_size,
+                       d->domain_id,
+                       (page_get_owner(mfn_to_page(gmfn))
+                        ? page_get_owner(mfn_to_page(gmfn))->domain_id
+                        : -1),
+                       mfn_to_page(gmfn)->count_info, 
+                       mfn_to_page(gmfn)->u.inuse.type_info);
+    }
+}
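
/* A standalone sketch of the bookkeeping in sh2_do_mark_dirty() above: set
 * the page's bit and bump the dirty count only on the first marking.  The
 * helper below stands in for Xen's __test_and_set_bit(); it is not atomic,
 * which is fine here for the same reason as above (the caller serialises). */
#include <assert.h>
#include <limits.h>

#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

static int test_and_set(unsigned long pfn, unsigned long *bm)
{
    unsigned long idx = pfn / BITS_PER_LONG;
    unsigned long bit = 1UL << (pfn % BITS_PER_LONG);
    int was_set = (bm[idx] & bit) != 0;

    bm[idx] |= bit;
    return was_set;
}

int main(void)
{
    unsigned long bm[4] = { 0 };
    unsigned long dirty_count = 0;

    if ( !test_and_set(70, bm) ) dirty_count++;   /* first mark counts     */
    if ( !test_and_set(70, bm) ) dirty_count++;   /* repeat mark does not  */
    assert(dirty_count == 1);
    return 0;
}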
+
+
+/**************************************************************************/
+/* Shadow-control DOM0_OP dispatcher */
+
+int shadow2_control_op(struct domain *d, 
+                       dom0_shadow_control_t *sc,
+                       XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op)
+{
+    int rc, preempted = 0;
+
+    if ( unlikely(d == current->domain) )
+    {
+        DPRINTK("Don't try to do a shadow op on yourself!\n");
+        return -EINVAL;
+    }
+
+    switch ( sc->op )
+    {
+    case DOM0_SHADOW_CONTROL_OP_OFF:
+        if ( shadow2_mode_log_dirty(d) )
+            if ( (rc = shadow2_log_dirty_disable(d)) != 0 ) 
+                return rc;
+        if ( d->arch.shadow2_mode & SHM2_enable )
+            if ( (rc = shadow2_test_disable(d)) != 0 ) 
+                return rc;
+        return 0;
+
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST:
+        return shadow2_test_enable(d);
+        
+    case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY:
+        return shadow2_log_dirty_enable(d);
+        
+    case DOM0_SHADOW_CONTROL_OP_FLUSH:
+    case DOM0_SHADOW_CONTROL_OP_CLEAN:
+    case DOM0_SHADOW_CONTROL_OP_PEEK:
+        return shadow2_log_dirty_op(d, sc);
+
+
+
+    case DOM0_SHADOW2_CONTROL_OP_ENABLE:
+        return shadow2_enable(d, sc->mode << SHM2_shift);        
+
+    case DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION:
+        sc->mb = shadow2_get_allocation(d);
+        return 0;
+        
+    case DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION:
+        rc = shadow2_set_allocation(d, sc->mb, &preempted);
+        if ( preempted )
+            /* Not finished.  Set up to re-run the call. */
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_dom0_op, "h", u_dom0_op);
+        else 
+            /* Finished.  Return the new allocation */
+            sc->mb = shadow2_get_allocation(d);
+        return rc;
+        
+        
+    default:
+        SHADOW2_ERROR("Bad shadow op %u\n", sc->op);
+        return -EINVAL;
+    }
+}
+
+
+/**************************************************************************/
+/* Auditing shadow tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL
+
+void shadow2_audit_tables(struct vcpu *v) 
+{
+    /* Dispatch table for getting per-type functions */
+    static hash_callback_t callbacks[16] = {
+        NULL, /* none    */
+#if CONFIG_PAGING_LEVELS == 2
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2),  /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2),  /* l2_32   */
+#else 
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2),  /* l1_32   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2),  /* l2_32   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3),  /* l1_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3),  /* l2_pae  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3),  /* l2h_pae */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3),  /* l3_pae  */
+#if CONFIG_PAGING_LEVELS >= 4
+        SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4),  /* l1_64   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64  */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4),  /* l2_64   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4),  /* l3_64   */
+        SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4),  /* l4_64   */
+#endif /* CONFIG_PAGING_LEVELS >= 4 */
+#endif /* CONFIG_PAGING_LEVELS > 2 */
+        NULL  /* All the rest */
+    };
+    unsigned int mask; 
+
+    if ( !(SHADOW2_AUDIT_ENABLE) )
+        return;
+    
+    if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL )
+        mask = ~1; /* Audit every table in the system */
+    else 
+    {
+        /* Audit only the current mode's tables */
+        switch (v->arch.shadow2->guest_levels)
+        {
+        case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break;
+        case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE
+                        |SH2F_L2H_PAE|SH2F_L3_PAE); break;
+        case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64  
+                        |SH2F_L3_64|SH2F_L4_64); break;
+        default: BUG();
+        }
+    }
+
+    hash_foreach(v, mask, callbacks, _mfn(INVALID_MFN));
+}
+
+#endif /* Shadow audit */
+
+
+/**************************************************************************/
+/* Auditing p2m tables */
+
+#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M
+
+void shadow2_audit_p2m(struct domain *d)
+{
+    struct list_head *entry;
+    struct page_info *page;
+    struct domain *od;
+    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+    mfn_t p2mfn;
+    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+    int test_linear;
+    
+    if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) )
+        return;
+
+    //SHADOW2_PRINTK("p2m audit starts\n");
+
+    test_linear = ( (d == current->domain) && current->arch.monitor_vtable );
+    if ( test_linear )
+        local_flush_tlb(); 
+
+    /* Audit part one: walk the domain's page allocation list, checking 
+     * the m2p entries. */
+    for ( entry = d->page_list.next;
+          entry != &d->page_list;
+          entry = entry->next )
+    {
+        page = list_entry(entry, struct page_info, list);
+        mfn = mfn_x(page_to_mfn(page));
+
+        // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn); 
+
+        od = page_get_owner(page);
+
+        if ( od != d ) 
+        {
+            SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+                           mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+            continue;
+        }
+
+        gfn = get_gpfn_from_mfn(mfn);
+        if ( gfn == INVALID_M2P_ENTRY ) 
+        {
+            orphans_i++;
+            //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+            //               mfn); 
+            continue;
+        }
+
+        if ( gfn == 0x55555555 ) 
+        {
+            orphans_d++;
+            //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", 
+            //               mfn); 
+            continue;
+        }
+
+        p2mfn = sh2_gfn_to_mfn_foreign(d, gfn);
+        if ( mfn_x(p2mfn) != mfn )
+        {
+            mpbad++;
+            SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+                           " (-> gfn %#lx)\n",
+                           mfn, gfn, mfn_x(p2mfn),
+                           (mfn_valid(p2mfn)
+                            ? get_gpfn_from_mfn(mfn_x(p2mfn))
+                            : -1u));
+            /* This m2p entry is stale: the domain has another frame in
+             * this physical slot.  No great disaster, but for neatness,
+             * blow away the m2p entry. */ 
+            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+        }
+
+        if ( test_linear )
+        {
+            lp2mfn = get_mfn_from_gpfn(gfn);
+            if ( lp2mfn != mfn_x(p2mfn) )
+            {
+                SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                               "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
+            }
+        }
+
+        // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", 
+        //                mfn, gfn, p2mfn, lp2mfn); 
+    }   
+
+    /* Audit part two: walk the domain's p2m table, checking the entries. */
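+    /* For every present p2m entry gfn -> mfn, the m2p entry for that mfn
+     * must point back at the same gfn; a mismatch here is fatal (BUG()). */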
+    if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
+    {
+        l2_pgentry_t *l2e;
+        l1_pgentry_t *l1e;
+        int i1, i2;
+        
+#if CONFIG_PAGING_LEVELS == 4
+        l4_pgentry_t *l4e;
+        l3_pgentry_t *l3e;
+        int i3, i4;
+        l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#elif CONFIG_PAGING_LEVELS == 3
+        l3_pgentry_t *l3e;
+        int i3;
+        l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#endif
+
+        gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 3
+#if CONFIG_PAGING_LEVELS >= 4
+        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+        {
+            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+            {
+                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+                continue;
+            }
+            l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
+#endif /* now at levels 3 or 4... */
+            for ( i3 = 0; 
+                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); 
+                  i3++ )
+            {
+                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+                {
+                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                    continue;
+                }
+                l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
+#endif /* all levels... */
+                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+                {
+                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+                    {
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+                    l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
+                    
+                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+                    {
+                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                            continue;
+                        mfn = l1e_get_pfn(l1e[i1]);
+                        ASSERT(valid_mfn(_mfn(mfn)));
+                        m2pfn = get_gpfn_from_mfn(mfn);
+                        if ( m2pfn != gfn )
+                        {
+                            pmbad++;
+                            SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn, mfn, m2pfn);
+                            BUG();
+                        }
+                    }
+                    sh2_unmap_domain_page(l1e);
+                }
+#if CONFIG_PAGING_LEVELS >= 3
+                sh2_unmap_domain_page(l2e);
+            }
+#if CONFIG_PAGING_LEVELS >= 4
+            sh2_unmap_domain_page(l3e);
+        }
+#endif
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+        sh2_unmap_domain_page(l4e);
+#elif CONFIG_PAGING_LEVELS == 3
+        sh2_unmap_domain_page(l3e);
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        sh2_unmap_domain_page(l2e);
+#endif
+
+    }
+
+    //SHADOW2_PRINTK("p2m audit complete\n");
+    //if ( orphans_i | orphans_d | mpbad | pmbad ) 
+    //    SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+    //                   orphans_i + orphans_d, orphans_i, orphans_d);
+    if ( mpbad | pmbad ) 
+        SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+                       pmbad, mpbad);
+}
+
+#endif /* p2m audit */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End: 
+ */
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/shadow2.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/shadow2.c    Wed Aug 16 17:11:56 2006 +0100
@@ -0,0 +1,4469 @@
+/******************************************************************************
+ * arch/x86/shadow2.c
+ *
+ * Simple, mostly-synchronous shadow page tables. 
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+// DESIGN QUESTIONS:
+// Why use subshadows for PAE guests?
+// - reduces pressure in the hash table
+// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
+// - would need to find space in the page_info to store 7 more bits of
+//   backpointer
+// - independent shadows of 32 byte chunks makes it non-obvious how to quickly
+//   figure out when to demote the guest page from l3 status
+//
+// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
+// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
+//   space for both PV and HVM guests.
+//
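+// (Arithmetic behind the size note above: a PAE guest L3 is 4 entries x 8
+//  bytes = 32 bytes, so a 64-byte subshadow costs 1/64th of the 4096-byte
+//  page a full page-sized shadow would take.)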
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+#include <asm/shadow2-types.h>
+#include <asm/flushtlb.h>
+#include <asm/hvm/hvm.h>
+
+/* The first cut: an absolutely synchronous, trap-and-emulate version,
+ * supporting only HVM guests (and so only "external" shadow mode). 
+ *
+ * THINGS TO DO LATER:
+ * 
+ * FIX GVA_TO_GPA
+ * The current interface returns an unsigned long, which is not big enough
+ * to hold a physical address in PAE.  Should return a gfn instead.
+ * 
+ * TEARDOWN HEURISTICS
+ * Also: have a heuristic for when to destroy a previous paging-mode's 
+ * shadows.  When a guest is done with its start-of-day 32-bit tables
+ * and reuses the memory we want to drop those shadows.  Start with 
+ * shadows in a page in two modes as a hint, but beware of clever tricks 
+ * like reusing a pagetable for both PAE and 64-bit during boot...
+ *
+ * PAE LINEAR MAPS
+ * Rework shadow_get_l*e() to have the option of using map_domain_page()
+ * instead of linear maps.  Add appropriate unmap_l*e calls in the users. 
+ * Then we can test the speed difference made by linear maps.  If the 
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight 
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them 
+ * to share l2h pages again. 
+ *
+ * PAE L3 COPYING
+ * In this code, we copy all 32 bytes of a PAE L3 every time we change an 
+ * entry in it, and every time we change CR3.  We copy it for the linear 
+ * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
+ * buffer so it fits in CR3.  Maybe we can avoid some of this recopying 
+ * by using the shadow directly in some places. 
+ * Also, for SMP, need to actually respond to seeing shadow2_pae_flip_pending.
+ *
+ * GUEST_WALK_TABLES TLB FLUSH COALESCE
+ * guest_walk_tables can do up to three remote TLB flushes as it walks to
+ * the first l1 of a new pagetable.  Should coalesce the flushes to the end, 
+ * and if we do flush, re-do the walk.  If anything has changed, then 
+ * pause all the other vcpus and do the walk *again*.
+ *
+ * WP DISABLED
+ * Consider how to implement having the WP bit of CR0 set to 0.  
+ * Since we need to be able to cause write faults to pagetables, this might
+ * end up looking like not having the (guest) pagetables present at all in 
+ * HVM guests...
+ *
+ * PSE disabled / PSE36
+ * We don't support any modes other than PSE enabled, PSE36 disabled.
+ * Neither of those would be hard to change, but we'd need to be able to 
+ * deal with shadows made in one mode and used in another.
+ */
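+/* (Background for the PAE items above: a PAE L3 is only 4 entries x 8 bytes
+ *  = 32 bytes, and PAE CR3 holds a 32-bit, 32-byte-aligned value, which is
+ *  why the code copies each L3 into a low-memory buffer before loading CR3
+ *  rather than pointing CR3 at the shadow L3 wherever it happens to live.) */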
+
+#define FETCH_TYPE_PREFETCH 1
+#define FETCH_TYPE_DEMAND   2
+#define FETCH_TYPE_WRITE    4
+typedef enum {
+    ft_prefetch     = FETCH_TYPE_PREFETCH,
+    ft_demand_read  = FETCH_TYPE_DEMAND,
+    ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
+} fetch_type_t;
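+/* The fetch types are bit flags: ft_demand_write carries both the DEMAND
+ * and WRITE bits, so a single mask test separates demand faults from
+ * prefetches, and writes from reads. */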
+
+#ifndef NDEBUG
+static const char *fetch_type_names[] = {
+    [ft_prefetch]     = "prefetch",
+    [ft_demand_read]  = "demand read",
+    [ft_demand_write] = "demand write",
+};
+#endif
+
+/* XXX forward declarations */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab,
+                                       int clear_res);
+#endif
+static inline void sh2_update_linear_entries(struct vcpu *v);
+
+/**************************************************************************/
+/* Hash table mapping from guest pagetables to shadows
+ *
+ * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
+ * FL1's:       maps the *gfn* of the start of a superpage to the mfn of a
+ *              shadow L1 which maps its "splinters".
+ * PAE CR3s:    maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
+ *              PAE L3 info page for that CR3 value.
+ */
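+/* The lookup helpers below also convert any shadow they find into log-dirty
+ * form when the domain is in log-dirty mode, so callers always see a shadow
+ * that matches the domain's current dirty-logging state. */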
+
+static inline mfn_t 
+get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+/* Look for FL1 shadows in the hash table */
+{
+    mfn_t smfn = shadow2_hash_lookup(v, gfn_x(gfn),
+                                     PGC_SH2_fl1_shadow >> PGC_SH2_type_shift);
+
+    if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+    {
+        struct page_info *page = mfn_to_page(smfn);
+        if ( !(page->count_info & PGC_SH2_log_dirty) )
+            shadow2_convert_to_log_dirty(v, smfn);
+    }
+
+    return smfn;
+}
+
+static inline mfn_t 
+get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+/* Look for shadows in the hash table */
+{
+    mfn_t smfn = shadow2_hash_lookup(v, mfn_x(gmfn),
+                                     shadow_type >> PGC_SH2_type_shift);
+    perfc_incrc(shadow2_get_shadow_status);
+
+    if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog