[Xen-changelog] [xen-unstable] merge
# HG changeset patch # User kfraser@xxxxxxxxxxxxxxxxxxxxx # Node ID 01345b08d0122830f17624bf8d19b0ba48744de0 # Parent f2151423f729a49281c719308be4cd76f6c934c4 # Parent e66352312acb9a8beacb7a8faae9a68442e9fb31 merge --- xen/arch/x86/audit.c | 984 ------ xen/arch/x86/shadow.c | 4150 ---------------------------- xen/arch/x86/shadow32.c | 3782 -------------------------- xen/arch/x86/shadow_guest32.c | 16 xen/arch/x86/shadow_guest32pae.c | 16 xen/arch/x86/shadow_public.c | 2143 -------------- xen/include/asm-x86/shadow_64.h | 587 ---- xen/include/asm-x86/shadow_ops.h | 138 xen/include/asm-x86/shadow_public.h | 61 .hgtags | 10 tools/examples/xmexample.hvm | 4 tools/libxc/xc_domain.c | 13 tools/libxc/xc_hvm_build.c | 13 tools/libxc/xc_linux_build.c | 2 tools/libxc/xc_linux_save.c | 18 tools/libxc/xenctrl.h | 2 tools/misc/xc_shadow.c | 2 tools/python/xen/lowlevel/xc/xc.c | 69 tools/python/xen/xend/XendDomain.py | 24 tools/python/xen/xend/XendDomainInfo.py | 47 tools/python/xen/xend/image.py | 17 tools/python/xen/xm/create.py | 9 xen/arch/x86/Makefile | 16 xen/arch/x86/dom0_ops.c | 2 xen/arch/x86/domain.c | 106 xen/arch/x86/domain_build.c | 13 xen/arch/x86/hvm/hvm.c | 25 xen/arch/x86/hvm/platform.c | 9 xen/arch/x86/hvm/svm/svm.c | 265 - xen/arch/x86/hvm/svm/vmcb.c | 4 xen/arch/x86/hvm/vlapic.c | 3 xen/arch/x86/hvm/vmx/vmcs.c | 15 xen/arch/x86/hvm/vmx/vmx.c | 228 - xen/arch/x86/mm.c | 485 +-- xen/arch/x86/setup.c | 2 xen/arch/x86/shadow2-common.c | 3394 +++++++++++++++++++++++ xen/arch/x86/shadow2.c | 4469 +++++++++++++++++++++++++++++++ xen/arch/x86/smpboot.c | 2 xen/arch/x86/traps.c | 32 xen/arch/x86/x86_32/domain_page.c | 33 xen/arch/x86/x86_32/mm.c | 3 xen/arch/x86/x86_64/mm.c | 3 xen/arch/x86/x86_64/traps.c | 14 xen/common/acm_ops.c | 1 xen/common/grant_table.c | 4 xen/common/keyhandler.c | 19 xen/common/memory.c | 11 xen/drivers/char/console.c | 50 xen/include/asm-x86/bitops.h | 18 xen/include/asm-x86/config.h | 22 xen/include/asm-x86/domain.h | 99 xen/include/asm-x86/grant_table.h | 2 xen/include/asm-x86/hvm/hvm.h | 25 xen/include/asm-x86/hvm/support.h | 11 xen/include/asm-x86/hvm/vcpu.h | 6 xen/include/asm-x86/hvm/vmx/vmcs.h | 1 xen/include/asm-x86/hvm/vmx/vmx.h | 49 xen/include/asm-x86/mm.h | 140 xen/include/asm-x86/msr.h | 4 xen/include/asm-x86/page-guest32.h | 7 xen/include/asm-x86/page.h | 37 xen/include/asm-x86/perfc_defn.h | 53 xen/include/asm-x86/processor.h | 1 xen/include/asm-x86/shadow.h | 1791 ------------ xen/include/asm-x86/shadow2-multi.h | 116 xen/include/asm-x86/shadow2-private.h | 612 ++++ xen/include/asm-x86/shadow2-types.h | 705 ++++ xen/include/asm-x86/shadow2.h | 627 ++++ xen/include/asm-x86/x86_32/page-2level.h | 1 xen/include/asm-x86/x86_32/page-3level.h | 3 xen/include/asm-x86/x86_64/page.h | 5 xen/include/public/dom0_ops.h | 16 xen/include/xen/domain_page.h | 13 xen/include/xen/lib.h | 4 xen/include/xen/list.h | 10 xen/include/xen/sched.h | 5 76 files changed, 11149 insertions(+), 14549 deletions(-) diff -r f2151423f729 -r 01345b08d012 .hgtags --- a/.hgtags Wed Aug 16 16:48:45 2006 +0100 +++ b/.hgtags Wed Aug 16 17:11:56 2006 +0100 @@ -15,3 +15,13 @@ c8fdb0caa77b429cf47f9707926e83947778cb48 c8fdb0caa77b429cf47f9707926e83947778cb48 RELEASE-3.0.0 af0573e9e5258db0a9d28aa954dd302ddd2c2d23 3.0.2-rc d0d3fef37685be264a7f52201f8ef44c030daad3 3.0.2-branched +6e864d7de9db066f92bea505d256bfe286200fed last-code-review +a898a6510c5db4e3d1f69d40fcacb540643b0f22 mainline +bfa6f4a0c594bc0ebd896437d69857b58dab0988 last-code-review +fc6cbf31bd883bc76ceb97f4b817ac88078d696a latest 
patch to unstable +8e55c5c1147589b7a6a1875384d4317aec7ccf84 mainline +2d2ed4d9b1c14aeee29dfdd77acd6017d31290cd mainline +0e32095a7b4611d18a82052a9d5b23e474f91af9 mainline +88e6bd5e2b5439f97e1d50a8724103c619aeaadf mainline +5233c4b076b9aa073eff63508461b7bfa597737c mainline +fda70200da01b89d5339342df6c0db372369a16d mainline diff -r f2151423f729 -r 01345b08d012 tools/examples/xmexample.hvm --- a/tools/examples/xmexample.hvm Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/examples/xmexample.hvm Wed Aug 16 17:11:56 2006 +0100 @@ -26,6 +26,10 @@ builder='hvm' # memory errors. The domain needs enough memory to boot kernel # and modules. Allocating less than 32MBs is not recommended. memory = 128 + +# Shadow pagetable memory for the domain, in MB. +# Should be at least 2KB per MB of domain memory, plus a few MB per vcpu. +shadow_memory = 8 # A name for your domain. All domains must have different names. name = "ExampleHVMDomain" diff -r f2151423f729 -r 01345b08d012 tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/libxc/xc_domain.c Wed Aug 16 17:11:56 2006 +0100 @@ -213,21 +213,28 @@ int xc_shadow_control(int xc_handle, unsigned int sop, unsigned long *dirty_bitmap, unsigned long pages, - xc_shadow_control_stats_t *stats ) + unsigned long *mb, + uint32_t mode, + xc_shadow_control_stats_t *stats) { int rc; DECLARE_DOM0_OP; op.cmd = DOM0_SHADOW_CONTROL; op.u.shadow_control.domain = (domid_t)domid; op.u.shadow_control.op = sop; + op.u.shadow_control.pages = pages; + op.u.shadow_control.mb = mb ? *mb : 0; + op.u.shadow_control.mode = mode; set_xen_guest_handle(op.u.shadow_control.dirty_bitmap, dirty_bitmap); - op.u.shadow_control.pages = pages; rc = do_dom0_op(xc_handle, &op); if ( stats ) memcpy(stats, &op.u.shadow_control.stats, sizeof(xc_shadow_control_stats_t)); + + if ( mb ) + *mb = op.u.shadow_control.mb; return (rc == 0) ? 
op.u.shadow_control.pages : rc; } @@ -391,7 +398,7 @@ int xc_domain_memory_populate_physmap(in if ( err > 0 ) { - DPRINTF("Failed deallocation for dom %d: %ld pages order %d\n", + DPRINTF("Failed allocation for dom %d: %ld pages order %d\n", domid, nr_extents, extent_order); errno = EBUSY; err = -1; diff -r f2151423f729 -r 01345b08d012 tools/libxc/xc_hvm_build.c --- a/tools/libxc/xc_hvm_build.c Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/libxc/xc_hvm_build.c Wed Aug 16 17:11:56 2006 +0100 @@ -395,6 +395,19 @@ static int xc_hvm_build_internal(int xc_ PERROR("Could not get info on domain"); goto error_out; } + + /* HVM domains must be put into shadow2 mode at the start of day */ + if ( xc_shadow_control(xc_handle, domid, DOM0_SHADOW2_CONTROL_OP_ENABLE, + NULL, 0, NULL, + DOM0_SHADOW2_CONTROL_FLAG_ENABLE + | DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT + | DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE + | DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL, + NULL) ) + { + PERROR("Could not enable shadow paging for domain.\n"); + goto error_out; + } memset(ctxt, 0, sizeof(*ctxt)); diff -r f2151423f729 -r 01345b08d012 tools/libxc/xc_linux_build.c --- a/tools/libxc/xc_linux_build.c Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/libxc/xc_linux_build.c Wed Aug 16 17:11:56 2006 +0100 @@ -972,7 +972,7 @@ static int setup_guest(int xc_handle, /* Enable shadow translate mode */ if ( xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE, - NULL, 0, NULL) < 0 ) + NULL, 0, NULL, 0, NULL) < 0 ) { PERROR("Could not enable translation mode"); goto error_out; diff -r f2151423f729 -r 01345b08d012 tools/libxc/xc_linux_save.c --- a/tools/libxc/xc_linux_save.c Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/libxc/xc_linux_save.c Wed Aug 16 17:11:56 2006 +0100 @@ -338,13 +338,13 @@ static int analysis_phase(int xc_handle, int i; xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN, - arr, max_pfn, NULL); + arr, max_pfn, NULL, 0, NULL); DPRINTF("#Flush\n"); for ( i = 0; i < 40; i++ ) { usleep(50000); now = llgettimeofday(); xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_PEEK, - NULL, 0, &stats); + NULL, 0, NULL, 0, &stats); DPRINTF("now= %lld faults= %" PRId32 " dirty= %" PRId32 " dirty_net= %" PRId32 " dirty_block= %" PRId32"\n", @@ -727,7 +727,7 @@ int xc_linux_save(int xc_handle, int io_ if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY, - NULL, 0, NULL ) < 0) { + NULL, 0, NULL, 0, NULL) < 0) { ERR("Couldn't enable shadow mode"); goto out; } @@ -879,7 +879,7 @@ int xc_linux_save(int xc_handle, int io_ but this is fast enough for the moment. 
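
The hunks above extend the libxc shadow interface: xc_shadow_control() now takes an in/out "mb" argument (the shadow allocation in MB) and a "mode" word, and xc_hvm_build enables shadow2 refcount/translate/external mode at domain-build time. Below is a minimal caller-side sketch of the new eight-argument signature; it assumes a tree with this patch applied, the usual xenctrl.h and dom0 privileges, and keeps error handling to a minimum. It is illustrative only and not part of the changeset.

/* Sketch: exercise the extended xc_shadow_control() (extra mb/mode args).
 * Assumes the patched libxc headers; invented program, not from the tree. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xenctrl.h>

int main(int argc, char *argv[])
{
    int xc_handle;
    uint32_t domid;
    unsigned long mb;

    if ( argc != 3 )
    {
        fprintf(stderr, "usage: %s <domid> <shadow-MB>\n", argv[0]);
        return 1;
    }
    domid = strtoul(argv[1], NULL, 0);
    mb    = strtoul(argv[2], NULL, 0);

    if ( (xc_handle = xc_interface_open()) < 0 )
    {
        perror("xc_interface_open");
        return 1;
    }

    /* Set the domain's shadow allocation; mb is both input and output. */
    if ( xc_shadow_control(xc_handle, domid,
                           DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION,
                           NULL, 0, &mb, 0, NULL) < 0 )
        perror("set shadow allocation");
    else
        printf("dom %u now has %lu MB of shadow memory\n", domid, mb);

    /* Read the allocation back without changing it. */
    mb = 0;
    if ( xc_shadow_control(xc_handle, domid,
                           DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION,
                           NULL, 0, &mb, 0, NULL) < 0 )
        perror("get shadow allocation");
    else
        printf("reported allocation: %lu MB\n", mb);

    xc_interface_close(xc_handle);
    return 0;
}

The DOM0_SHADOW2_CONTROL_OP_{SET,GET}_ALLOCATION operations used here are the same ones wired up by the new pyxc_shadow_mem_control() wrapper further down in this patch.
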
*/ if (!last_iter && xc_shadow_control( xc_handle, dom, DOM0_SHADOW_CONTROL_OP_PEEK, - to_skip, max_pfn, NULL) != max_pfn) { + to_skip, max_pfn, NULL, 0, NULL) != max_pfn) { ERR("Error peeking shadow bitmap"); goto out; } @@ -1084,8 +1084,9 @@ int xc_linux_save(int xc_handle, int io_ (unsigned long)ctxt.user_regs.edx); } - if (xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_CLEAN, - to_send, max_pfn, &stats ) != max_pfn) { + if (xc_shadow_control(xc_handle, dom, + DOM0_SHADOW_CONTROL_OP_CLEAN, to_send, + max_pfn, NULL, 0, &stats) != max_pfn) { ERR("Error flushing shadow PT"); goto out; } @@ -1174,8 +1175,9 @@ int xc_linux_save(int xc_handle, int io_ out: if (live) { - if(xc_shadow_control(xc_handle, dom, DOM0_SHADOW_CONTROL_OP_OFF, - NULL, 0, NULL ) < 0) { + if(xc_shadow_control(xc_handle, dom, + DOM0_SHADOW_CONTROL_OP_OFF, + NULL, 0, NULL, 0, NULL) < 0) { DPRINTF("Warning - couldn't disable shadow mode"); } } diff -r f2151423f729 -r 01345b08d012 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/libxc/xenctrl.h Wed Aug 16 17:11:56 2006 +0100 @@ -323,6 +323,8 @@ int xc_shadow_control(int xc_handle, unsigned int sop, unsigned long *dirty_bitmap, unsigned long pages, + unsigned long *mb, + uint32_t mode, xc_shadow_control_stats_t *stats); int xc_bvtsched_global_set(int xc_handle, diff -r f2151423f729 -r 01345b08d012 tools/misc/xc_shadow.c --- a/tools/misc/xc_shadow.c Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/misc/xc_shadow.c Wed Aug 16 17:11:56 2006 +0100 @@ -60,6 +60,8 @@ int main(int argc, char *argv[]) mode, NULL, 0, + NULL, + 0, NULL) < 0 ) { fprintf(stderr, "Error reseting performance counters: %d (%s)\n", diff -r f2151423f729 -r 01345b08d012 tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/python/xen/lowlevel/xc/xc.c Wed Aug 16 17:11:56 2006 +0100 @@ -669,6 +669,59 @@ static PyObject *pyxc_sedf_domain_get(Xc "weight", weight); } +static PyObject *pyxc_shadow_control(PyObject *self, + PyObject *args, + PyObject *kwds) +{ + XcObject *xc = (XcObject *)self; + + uint32_t dom; + int op=0; + + static char *kwd_list[] = { "dom", "op", NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, + &dom, &op) ) + return NULL; + + if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, NULL, 0, NULL) + < 0 ) + return PyErr_SetFromErrno(xc_error); + + Py_INCREF(zero); + return zero; +} + +static PyObject *pyxc_shadow_mem_control(PyObject *self, + PyObject *args, + PyObject *kwds) +{ + XcObject *xc = (XcObject *)self; + int op; + uint32_t dom; + int mbarg = -1; + unsigned long mb; + + static char *kwd_list[] = { "dom", "mb", NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "i|i", kwd_list, + &dom, &mbarg) ) + return NULL; + + if ( mbarg < 0 ) + op = DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION; + else + { + mb = mbarg; + op = DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION; + } + if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0, &mb, 0, NULL) < 0 ) + return PyErr_SetFromErrno(xc_error); + + mbarg = mb; + return Py_BuildValue("i", mbarg); +} + static PyObject *pyxc_sched_credit_domain_set(XcObject *self, PyObject *args, PyObject *kwds) @@ -1118,6 +1171,22 @@ static PyMethodDef pyxc_methods[] = { "Get information about the Xen host\n" "Returns [dict]: information about Xen" " [None]: on failure.\n" }, + + { "shadow_control", + (PyCFunction)pyxc_shadow_control, + METH_VARARGS | METH_KEYWORDS, "\n" + "Set parameter for shadow pagetable interface\n" + " dom [int]: 
Identifier of domain.\n" + " op [int, 0]: operation\n\n" + "Returns: [int] 0 on success; -1 on error.\n" }, + + { "shadow_mem_control", + (PyCFunction)pyxc_shadow_mem_control, + METH_VARARGS | METH_KEYWORDS, "\n" + "Set or read shadow pagetable memory use\n" + " dom [int]: Identifier of domain.\n" + " mb [int, -1]: MB of shadow memory this domain should have.\n\n" + "Returns: [int] MB of shadow memory in use by this domain.\n" }, { "domain_setmaxmem", (PyCFunction)pyxc_domain_setmaxmem, diff -r f2151423f729 -r 01345b08d012 tools/python/xen/xend/XendDomain.py --- a/tools/python/xen/xend/XendDomain.py Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/python/xen/xend/XendDomain.py Wed Aug 16 17:11:56 2006 +0100 @@ -532,6 +532,30 @@ class XendDomain: except Exception, ex: raise XendError(str(ex)) + def domain_shadow_control(self, domid, op): + """Shadow page control.""" + dominfo = self.domain_lookup(domid) + try: + return xc.shadow_control(dominfo.getDomid(), op) + except Exception, ex: + raise XendError(str(ex)) + + def domain_shadow_mem_get(self, domid): + """Get shadow pagetable memory allocation.""" + dominfo = self.domain_lookup(domid) + try: + return xc.shadow_mem_control(dominfo.getDomid()) + except Exception, ex: + raise XendError(str(ex)) + + def domain_shadow_mem_set(self, domid, mb): + """Set shadow pagetable memory allocation.""" + dominfo = self.domain_lookup(domid) + try: + return xc.shadow_mem_control(dominfo.getDomid(), mb=mb) + except Exception, ex: + raise XendError(str(ex)) + def domain_sched_credit_get(self, domid): """Get credit scheduler parameters for a domain. """ diff -r f2151423f729 -r 01345b08d012 tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/python/xen/xend/XendDomainInfo.py Wed Aug 16 17:11:56 2006 +0100 @@ -30,6 +30,7 @@ import time import time import threading import os +import math import xen.lowlevel.xc from xen.util import asserts @@ -126,16 +127,17 @@ VM_CONFIG_PARAMS = [ # don't come out of xc in the same form as they are specified in the config # file, so those are handled separately. ROUNDTRIPPING_CONFIG_ENTRIES = [ - ('uuid', str), - ('vcpus', int), - ('vcpu_avail', int), - ('cpu_weight', float), - ('memory', int), - ('maxmem', int), - ('bootloader', str), + ('uuid', str), + ('vcpus', int), + ('vcpu_avail', int), + ('cpu_weight', float), + ('memory', int), + ('shadow_memory', int), + ('maxmem', int), + ('bootloader', str), ('bootloader_args', str), - ('features', str), - ('localtime', int), + ('features', str), + ('localtime', int), ] ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFIG_PARAMS @@ -146,12 +148,13 @@ ROUNDTRIPPING_CONFIG_ENTRIES += VM_CONFI # entries written to the store that cannot be reconfigured on-the-fly. 
# VM_STORE_ENTRIES = [ - ('uuid', str), - ('vcpus', int), - ('vcpu_avail', int), - ('memory', int), - ('maxmem', int), - ('start_time', float), + ('uuid', str), + ('vcpus', int), + ('vcpu_avail', int), + ('memory', int), + ('shadow_memory', int), + ('maxmem', int), + ('start_time', float), ] VM_STORE_ENTRIES += VM_CONFIG_PARAMS @@ -572,6 +575,7 @@ class XendDomainInfo: defaultInfo('vcpu_avail', lambda: (1 << self.info['vcpus']) - 1) defaultInfo('memory', lambda: 0) + defaultInfo('shadow_memory', lambda: 0) defaultInfo('maxmem', lambda: 0) defaultInfo('bootloader', lambda: None) defaultInfo('bootloader_args', lambda: None) @@ -1280,7 +1284,18 @@ class XendDomainInfo: xc.domain_setmaxmem(self.domid, self.info['maxmem'] * 1024) m = self.image.getDomainMemory(self.info['memory'] * 1024) - balloon.free(m) + + # get the domain's shadow memory requirement + sm = int(math.ceil(self.image.getDomainShadowMemory(m) / 1024.0)) + if self.info['shadow_memory'] > sm: + sm = self.info['shadow_memory'] + + # Make sure there's enough RAM available for the domain + balloon.free(m + sm * 1024) + + # Set up the shadow memory + sm = xc.shadow_mem_control(self.domid, mb=sm) + self.info['shadow_memory'] = sm init_reservation = self.info['memory'] * 1024 if os.uname()[4] in ('ia64', 'ppc64'): diff -r f2151423f729 -r 01345b08d012 tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/python/xen/xend/image.py Wed Aug 16 17:11:56 2006 +0100 @@ -152,6 +152,12 @@ class ImageHandler: if 'hvm' in xc.xeninfo()['xen_caps']: mem_kb += 4*1024; return mem_kb + + def getDomainShadowMemory(self, mem_kb): + """@return The minimum shadow memory required, in KiB, for a domain + with mem_kb KiB of RAM.""" + # PV domains don't need any shadow memory + return 0 def buildDomain(self): """Build the domain. Define in subclass.""" @@ -364,6 +370,17 @@ class HVMImageHandler(ImageHandler): extra_pages = int( math.ceil( extra_mb*1024 / page_kb )) return mem_kb + extra_pages * page_kb + def getDomainShadowMemory(self, mem_kb): + """@return The minimum shadow memory required, in KiB, for a domain + with mem_kb KiB of RAM.""" + if os.uname()[4] in ('ia64', 'ppc64'): + # Explicit shadow memory is not a concept + return 0 + else: + # 1MB per vcpu plus 4Kib/Mib of RAM. This is higher than + # the minimum that Xen would allocate if no value were given. 
+ return 1024 * self.vm.getVCpuCount() + mem_kb / 256 + def register_shutdown_watch(self): """ add xen store watch on control/shutdown """ self.shutdownWatch = xswatch(self.vm.dompath + "/control/shutdown", \ diff -r f2151423f729 -r 01345b08d012 tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Wed Aug 16 16:48:45 2006 +0100 +++ b/tools/python/xen/xm/create.py Wed Aug 16 17:11:56 2006 +0100 @@ -157,6 +157,10 @@ gopts.var('maxmem', val='MEMORY', gopts.var('maxmem', val='MEMORY', fn=set_int, default=None, use="Maximum domain memory in MB.") + +gopts.var('shadow_memory', val='MEMORY', + fn=set_int, default=0, + use="Domain shadow memory in MB.") gopts.var('cpu', val='CPU', fn=set_int, default=None, @@ -666,8 +670,9 @@ def make_config(vals): if v: config.append([n, v]) - map(add_conf, ['name', 'memory', 'maxmem', 'restart', 'on_poweroff', - 'on_reboot', 'on_crash', 'vcpus', 'features']) + map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory', + 'restart', 'on_poweroff', 'on_reboot', 'on_crash', + 'vcpus', 'features']) if vals.uuid is not None: config.append(['uuid', vals.uuid]) diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/Makefile --- a/xen/arch/x86/Makefile Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/Makefile Wed Aug 16 17:11:56 2006 +0100 @@ -8,7 +8,6 @@ subdir-$(x86_64) += x86_64 subdir-$(x86_64) += x86_64 obj-y += apic.o -obj-y += audit.o obj-y += bitops.o obj-y += compat.o obj-y += delay.o @@ -41,12 +40,21 @@ obj-y += x86_emulate.o obj-y += x86_emulate.o ifneq ($(pae),n) -obj-$(x86_32) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o +obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o else -obj-$(x86_32) += shadow32.o +obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o endif -obj-$(x86_64) += shadow.o shadow_public.o shadow_guest32.o shadow_guest32pae.o +obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \ + shadow2_g2_on_s3.o + +guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1)))))) +shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1)))))) +shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \ + -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1)) + +shadow2_%.o: shadow2.c $(HDRS) Makefile + $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@ obj-$(crash_debug) += gdbstub.o diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/dom0_ops.c --- a/xen/arch/x86/dom0_ops.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/dom0_ops.c Wed Aug 16 17:11:56 2006 +0100 @@ -89,7 +89,7 @@ long arch_do_dom0_op(struct dom0_op *op, d = find_domain_by_id(op->u.shadow_control.domain); if ( d != NULL ) { - ret = shadow_mode_control(d, &op->u.shadow_control); + ret = shadow2_control_op(d, &op->u.shadow_control, u_dom0_op); put_domain(d); copy_to_guest(u_dom0_op, op, 1); } diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/domain.c Wed Aug 16 17:11:56 2006 +0100 @@ -134,13 +134,6 @@ struct vcpu *alloc_vcpu_struct(struct do v->arch.perdomain_ptes = d->arch.mm_perdomain_pt + (vcpu_id << GDT_LDT_VCPU_SHIFT); - v->arch.guest_vtable = __linear_l2_table; - v->arch.shadow_vtable = __shadow_linear_l2_table; -#if defined(__x86_64__) - v->arch.guest_vl3table = __linear_l3_table; - v->arch.guest_vl4table = __linear_l4_table; -#endif - pae_l3_cache_init(&v->arch.pae_l3_cache); return v; @@ -155,9 +148,7 @@ int arch_domain_create(struct domain *d) { l1_pgentry_t gdt_l1e; 
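
The getDomainShadowMemory() heuristic just above (1 MB per vcpu plus 4 KiB per MiB of guest RAM, i.e. mem_kb/256) is deliberately higher than the "at least 2KB per MB of domain memory, plus a few MB per vcpu" minimum documented in the xmexample.hvm hunk. A self-contained sketch of the same arithmetic, including the KiB-to-MB round-up that XendDomainInfo.py performs with math.ceil, is below; the guest size and vcpu count are example values only.

/* Standalone sketch of the HVM shadow-memory sizing rule used above;
 * mirrors image.py's getDomainShadowMemory() and the MB round-up done
 * in XendDomainInfo.py.  Illustrative only. */
#include <stdio.h>

/* 1 MB (1024 KiB) per vcpu plus 4 KiB per MiB of RAM (mem_kb / 256). */
static unsigned long hvm_shadow_kb(unsigned long mem_kb, unsigned int vcpus)
{
    return 1024UL * vcpus + mem_kb / 256;
}

/* Round KiB up to whole MB, as the math.ceil(... / 1024.0) call does. */
static unsigned long kb_to_mb_roundup(unsigned long kb)
{
    return (kb + 1023) / 1024;
}

int main(void)
{
    unsigned long mem_kb = 512 * 1024;   /* a 512 MB guest ...            */
    unsigned int  vcpus  = 2;            /* ... with two vcpus (examples) */
    unsigned long sm_kb  = hvm_shadow_kb(mem_kb, vcpus);

    printf("suggested shadow memory: %lu KiB (~%lu MB)\n",
           sm_kb, kb_to_mb_roundup(sm_kb));
    return 0;
}
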
int vcpuid, pdpt_order; -#ifdef __x86_64__ int i; -#endif pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)); d->arch.mm_perdomain_pt = alloc_xenheap_pages(pdpt_order); @@ -202,8 +193,12 @@ int arch_domain_create(struct domain *d) #endif /* __x86_64__ */ - shadow_lock_init(d); - INIT_LIST_HEAD(&d->arch.free_shadow_frames); + shadow2_lock_init(d); + for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ ) + INIT_LIST_HEAD(&d->arch.shadow2_freelists[i]); + INIT_LIST_HEAD(&d->arch.shadow2_p2m_freelist); + INIT_LIST_HEAD(&d->arch.shadow2_p2m_inuse); + INIT_LIST_HEAD(&d->arch.shadow2_toplevel_shadows); if ( !is_idle_domain(d) ) { @@ -234,6 +229,8 @@ int arch_domain_create(struct domain *d) void arch_domain_destroy(struct domain *d) { + shadow2_final_teardown(d); + free_xenheap_pages( d->arch.mm_perdomain_pt, get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t))); @@ -328,31 +325,35 @@ int arch_set_info_guest( if ( !hvm_initialize_guest_resources(v) ) return -EINVAL; } - else if ( shadow_mode_refcounts(d) ) - { - if ( !get_page(mfn_to_page(cr3_pfn), d) ) + else + { + if ( !get_page_and_type(mfn_to_page(cr3_pfn), d, + PGT_base_page_table) ) { destroy_gdt(v); return -EINVAL; } - } - else - { - if ( !get_page_and_type(mfn_to_page(cr3_pfn), d, - PGT_base_page_table) ) - { - destroy_gdt(v); - return -EINVAL; - } - } - - update_pagetables(v); + } + + /* Shadow2: make sure the domain has enough shadow memory to + * boot another vcpu */ + if ( shadow2_mode_enabled(d) + && d->arch.shadow2_total_pages < shadow2_min_acceptable_pages(d) ) + { + destroy_gdt(v); + return -ENOMEM; + } if ( v->vcpu_id == 0 ) update_domain_wallclock_time(d); /* Don't redo final setup */ set_bit(_VCPUF_initialised, &v->vcpu_flags); + + if ( shadow2_mode_enabled(d) ) + shadow2_update_paging_modes(v); + + update_cr3(v); return 0; } @@ -669,7 +670,6 @@ static void __context_switch(void) loaddebug(&n->arch.guest_context, 6); loaddebug(&n->arch.guest_context, 7); } - n->arch.ctxt_switch_to(n); } @@ -927,29 +927,34 @@ void domain_relinquish_resources(struct /* Drop the in-use references to page-table bases. */ for_each_vcpu ( d, v ) { - if ( (pfn = pagetable_get_pfn(v->arch.guest_table)) != 0 ) - { - if ( !shadow_mode_refcounts(d) ) - put_page_type(mfn_to_page(pfn)); - put_page(mfn_to_page(pfn)); - + /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling, + * or sh2_update_paging_modes()) */ + pfn = pagetable_get_pfn(v->arch.guest_table); + if ( pfn != 0 ) + { + if ( shadow2_mode_refcounts(d) ) + put_page(mfn_to_page(pfn)); + else + put_page_and_type(mfn_to_page(pfn)); v->arch.guest_table = pagetable_null(); } - if ( (pfn = pagetable_get_pfn(v->arch.guest_table_user)) != 0 ) - { - if ( !shadow_mode_refcounts(d) ) - put_page_type(mfn_to_page(pfn)); - put_page(mfn_to_page(pfn)); - +#ifdef __x86_64__ + /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */ + pfn = pagetable_get_pfn(v->arch.guest_table_user); + if ( pfn != 0 ) + { + put_page_and_type(mfn_to_page(pfn)); v->arch.guest_table_user = pagetable_null(); } +#endif } if ( d->vcpu[0] && hvm_guest(d->vcpu[0]) ) hvm_relinquish_guest_resources(d); - shadow_mode_disable(d); + /* Tear down shadow mode stuff. */ + shadow2_teardown(d); /* * Relinquish GDT mappings. 
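
Returning to the xen/arch/x86/Makefile hunk earlier in this patch: the new pattern rule builds shadow2.c once per (guest, shadow) paging-level pair, deriving -DGUEST_PAGING_LEVELS and -DSHADOW_PAGING_LEVELS from the object name (shadow2_g3_on_s3.o and friends). The toy translation unit below, with invented file and function names, sketches how one source file yields distinct per-level objects under that scheme; it is not derived from the real shadow2.c.

/* shadow2_levels_demo.c -- toy illustration only, not from the Xen tree.
 * Shows how a single source file can be instantiated for several
 * (GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS) pairs via -D flags,
 * in the way the shadow2_%.o Makefile rule above does for shadow2.c. */
#include <stdio.h>

#ifndef GUEST_PAGING_LEVELS
#define GUEST_PAGING_LEVELS 2    /* defaults so a plain build still compiles */
#endif
#ifndef SHADOW_PAGING_LEVELS
#define SHADOW_PAGING_LEVELS 2
#endif

/* Token pasting gives each instantiation a unique external name,
 * in the spirit of the shadow2_g<N>_on_s<M> object names. */
#define SH2_PASTE_(a, b, c, d) a##b##c##d
#define SH2_PASTE(a, b, c, d)  SH2_PASTE_(a, b, c, d)
#define SH2_DEMO_ENTRY SH2_PASTE(sh2_demo_g, GUEST_PAGING_LEVELS, \
                                 _on_s, SHADOW_PAGING_LEVELS)

void SH2_DEMO_ENTRY(void)
{
    printf("built for %d guest levels on %d shadow levels\n",
           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS);
}

int main(void)
{
    SH2_DEMO_ENTRY();
    return 0;
}

Compiling it as "cc -DGUEST_PAGING_LEVELS=3 -DSHADOW_PAGING_LEVELS=3 -c shadow2_levels_demo.c -o demo_g3_on_s3.o" mirrors what the shadow2_%.o rule does for each shadow2 configuration.
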
No need for explicit unmapping of the LDT as @@ -964,26 +969,23 @@ void domain_relinquish_resources(struct /* Free page used by xen oprofile buffer */ free_xenoprof_pages(d); - } void arch_dump_domain_info(struct domain *d) { - if ( shadow_mode_enabled(d) ) - { - printk(" shadow mode: "); - if ( shadow_mode_refcounts(d) ) + if ( shadow2_mode_enabled(d) ) + { + printk(" shadow2 mode: "); + if ( d->arch.shadow2_mode & SHM2_enable ) + printk("enabled "); + if ( shadow2_mode_refcounts(d) ) printk("refcounts "); - if ( shadow_mode_write_all(d) ) - printk("write_all "); - if ( shadow_mode_log_dirty(d) ) + if ( shadow2_mode_log_dirty(d) ) printk("log_dirty "); - if ( shadow_mode_translate(d) ) + if ( shadow2_mode_translate(d) ) printk("translate "); - if ( shadow_mode_external(d) ) + if ( shadow2_mode_external(d) ) printk("external "); - if ( shadow_mode_wr_pt_pte(d) ) - printk("wr_pt_pte "); printk("\n"); } } diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/domain_build.c Wed Aug 16 17:11:56 2006 +0100 @@ -683,8 +683,11 @@ int construct_dom0(struct domain *d, for ( i = 1; i < opt_dom0_max_vcpus; i++ ) (void)alloc_vcpu(d, i, i); - /* Set up monitor table */ - update_pagetables(v); + /* Set up CR3 value for write_ptbase */ + if ( shadow2_mode_enabled(v->domain) ) + shadow2_update_paging_modes(v); + else + update_cr3(v); /* Install the new page tables. */ local_irq_disable(); @@ -796,10 +799,8 @@ int construct_dom0(struct domain *d, new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start); if ( opt_dom0_shadow ) - { - shadow_mode_enable(d, SHM_enable); - update_pagetables(v); - } + if ( shadow2_test_enable(d) == 0 ) + shadow2_update_paging_modes(v); if ( supervisor_mode_kernel ) { diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/hvm/hvm.c Wed Aug 16 17:11:56 2006 +0100 @@ -30,6 +30,7 @@ #include <xen/hypercall.h> #include <xen/guest_access.h> #include <xen/event.h> +#include <xen/shadow.h> #include <asm/current.h> #include <asm/e820.h> #include <asm/io.h> @@ -42,10 +43,6 @@ #include <asm/spinlock.h> #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> -#include <asm/shadow.h> -#if CONFIG_PAGING_LEVELS >= 3 -#include <asm/shadow_64.h> -#endif #include <public/sched.h> #include <public/hvm/ioreq.h> #include <public/version.h> @@ -61,7 +58,7 @@ static void hvm_zap_mmio_range( static void hvm_zap_mmio_range( struct domain *d, unsigned long pfn, unsigned long nr_pfn) { - unsigned long i, val = INVALID_MFN; + unsigned long i; ASSERT(d == current->domain); @@ -70,7 +67,8 @@ static void hvm_zap_mmio_range( if ( pfn + i >= 0xfffff ) break; - __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val)); + if ( VALID_MFN(gmfn_to_mfn(d, pfn + i)) ) + guest_remove_page(d, pfn + i); } } @@ -262,11 +260,13 @@ void hvm_setup_platform(struct domain* d if ( !hvm_guest(v) || (v->vcpu_id != 0) ) return; +#if 0 /* SHADOW2 does not have this */ if ( shadow_direct_map_init(d) == 0 ) { printk("Can not allocate shadow direct map for HVM domain.\n"); domain_crash_synchronous(); } +#endif hvm_zap_iommu_pages(d); @@ -351,6 +351,7 @@ void hvm_hlt(unsigned long rflags) struct periodic_time *pt = &v->domain->arch.hvm_domain.pl_time.periodic_tm; s_time_t next_pit = -1, next_wakeup; +#if 0 /* This seems to fire at unwelcome times in Linux */ /* * Detect machine shutdown. 
Only do this for vcpu 0, to avoid potentially * shutting down the domain early. If we halt with interrupts disabled, @@ -364,6 +365,7 @@ void hvm_hlt(unsigned long rflags) domain_shutdown(current->domain, SHUTDOWN_poweroff); return; } +#endif /* 0 */ if ( !v->vcpu_id ) next_pit = get_scheduled(v, pt->irq, pt); @@ -380,6 +382,8 @@ void hvm_hlt(unsigned long rflags) */ int hvm_copy(void *buf, unsigned long vaddr, int size, int dir) { + struct vcpu *v = current; + unsigned long gfn; unsigned long mfn; char *addr; int count; @@ -389,10 +393,9 @@ int hvm_copy(void *buf, unsigned long va if (count > size) count = size; - if (hvm_paging_enabled(current)) - mfn = gva_to_mfn(vaddr); - else - mfn = get_mfn_from_gpfn(vaddr >> PAGE_SHIFT); + gfn = shadow2_gva_to_gfn(v, vaddr); + mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn)); + if (mfn == INVALID_MFN) return 0; @@ -545,7 +548,7 @@ void hvm_do_hypercall(struct cpu_user_re return; } - if ( current->domain->arch.ops->guest_paging_levels == PAGING_L4 ) + if ( current->arch.shadow2->guest_levels == 4 ) { pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi, pregs->rsi, diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/platform.c --- a/xen/arch/x86/hvm/platform.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/hvm/platform.c Wed Aug 16 17:11:56 2006 +0100 @@ -21,7 +21,7 @@ #include <xen/config.h> #include <xen/types.h> #include <xen/mm.h> -#include <asm/shadow.h> +#include <xen/shadow.h> #include <xen/domain_page.h> #include <asm/page.h> #include <xen/event.h> @@ -35,9 +35,6 @@ #include <xen/lib.h> #include <xen/sched.h> #include <asm/current.h> -#if CONFIG_PAGING_LEVELS >= 3 -#include <asm/shadow_64.h> -#endif #define DECODE_success 1 #define DECODE_failure 0 @@ -724,7 +721,7 @@ void send_pio_req(struct cpu_user_regs * if (pvalid) { if (hvm_paging_enabled(current)) - p->u.pdata = (void *) gva_to_gpa(value); + p->u.data = shadow2_gva_to_gpa(current, value); else p->u.pdata = (void *) value; /* guest VA == guest PA */ } else @@ -774,7 +771,7 @@ void send_mmio_req( if (pvalid) { if (hvm_paging_enabled(v)) - p->u.pdata = (void *) gva_to_gpa(value); + p->u.data = shadow2_gva_to_gpa(v, value); else p->u.pdata = (void *) value; /* guest VA == guest PA */ } else diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/svm/svm.c --- a/xen/arch/x86/hvm/svm/svm.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/hvm/svm/svm.c Wed Aug 16 17:11:56 2006 +0100 @@ -26,9 +26,10 @@ #include <xen/irq.h> #include <xen/softirq.h> #include <xen/hypercall.h> +#include <xen/domain_page.h> #include <asm/current.h> #include <asm/io.h> -#include <asm/shadow.h> +#include <asm/shadow2.h> #include <asm/regs.h> #include <asm/cpufeature.h> #include <asm/processor.h> @@ -43,10 +44,6 @@ #include <asm/hvm/svm/emulate.h> #include <asm/hvm/svm/vmmcall.h> #include <asm/hvm/svm/intr.h> -#include <asm/shadow.h> -#if CONFIG_PAGING_LEVELS >= 3 -#include <asm/shadow_64.h> -#endif #include <public/sched.h> #define SVM_EXTRA_DEBUG @@ -414,7 +411,7 @@ static int svm_realmode(struct vcpu *v) return (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE); } -static int svm_instruction_length(struct vcpu *v) +int svm_guest_x86_mode(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode; @@ -423,10 +420,20 @@ static int svm_instruction_length(struct mode = vmcb->cs.attributes.fields.l ? 8 : 4; else mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 
2 : 4; - return svm_instrlen(guest_cpu_user_regs(), mode); -} - -static unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num) + return mode; +} + +int svm_instruction_length(struct vcpu *v) +{ + return svm_instrlen(guest_cpu_user_regs(), svm_guest_x86_mode(v)); +} + +void svm_update_host_cr3(struct vcpu *v) +{ + /* SVM doesn't have a HOST_CR3 equivalent to update. */ +} + +unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num) { switch ( num ) { @@ -436,6 +443,8 @@ static unsigned long svm_get_ctrl_reg(st return v->arch.hvm_svm.cpu_cr2; case 3: return v->arch.hvm_svm.cpu_cr3; + case 4: + return v->arch.hvm_svm.cpu_shadow_cr4; default: BUG(); } @@ -524,8 +533,6 @@ static void svm_init_hypercall_page(stru /* Don't support HYPERVISOR_iret at the moment */ *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */ } - - int svm_dbg_on = 0; @@ -647,6 +654,11 @@ static void svm_load_cpu_guest_regs( svm_load_cpu_user_regs(v, regs); } +int svm_long_mode_enabled(struct vcpu *v) +{ + return SVM_LONG_GUEST(v); +} + static void arch_svm_do_launch(struct vcpu *v) @@ -726,7 +738,6 @@ static void svm_final_setup_guest(struct static void svm_final_setup_guest(struct vcpu *v) { struct domain *d = v->domain; - struct vcpu *vc; v->arch.schedule_tail = arch_svm_do_launch; v->arch.ctxt_switch_from = svm_ctxt_switch_from; @@ -735,9 +746,12 @@ static void svm_final_setup_guest(struct if ( v != d->vcpu[0] ) return; - /* Initialize monitor page table */ - for_each_vcpu( d, vc ) - vc->arch.monitor_table = pagetable_null(); + if ( !shadow2_mode_external(d) ) + { + DPRINTK("Can't init HVM for dom %u vcpu %u: " + "not in shadow2 external mode\n", d->domain_id, v->vcpu_id); + domain_crash(d); + } /* * Required to do this once per domain @@ -745,13 +759,6 @@ static void svm_final_setup_guest(struct */ memset(&d->shared_info->evtchn_mask[0], 0xff, sizeof(d->shared_info->evtchn_mask)); - - /* - * Put the domain in shadow mode even though we're going to be using - * the shared 1:1 page table initially. 
It shouldn't hurt - */ - shadow_mode_enable(d, SHM_enable|SHM_refcounts| - SHM_translate|SHM_external|SHM_wr_pt_pte); } @@ -809,9 +816,13 @@ int start_svm(void) hvm_funcs.realmode = svm_realmode; hvm_funcs.paging_enabled = svm_paging_enabled; + hvm_funcs.long_mode_enabled = svm_long_mode_enabled; + hvm_funcs.guest_x86_mode = svm_guest_x86_mode; hvm_funcs.instruction_length = svm_instruction_length; hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg; + hvm_funcs.update_host_cr3 = svm_update_host_cr3; + hvm_funcs.stts = svm_stts; hvm_funcs.set_tsc_offset = svm_set_tsc_offset; @@ -834,7 +845,6 @@ static void svm_relinquish_guest_resourc continue; destroy_vmcb(&v->arch.hvm_svm); - free_monitor_pagetable(v); kill_timer(&v->arch.hvm_vcpu.hlt_timer); if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) { @@ -851,8 +861,6 @@ static void svm_relinquish_guest_resourc if ( d->arch.hvm_domain.buffered_io_va ) unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va); - - shadow_direct_map_clean(d); } @@ -894,7 +902,6 @@ static int svm_do_page_fault(unsigned lo { struct vcpu *v = current; unsigned long eip; - unsigned long gpa; /* FIXME: PAE */ int result; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; @@ -907,43 +914,7 @@ static int svm_do_page_fault(unsigned lo va, eip, (unsigned long)regs->error_code); //#endif - if ( !svm_paging_enabled(v) ) - { - if ( shadow_direct_map_fault(va, regs) ) - return 1; - - handle_mmio(va, va); - return 1; - } - - - gpa = gva_to_gpa(va); - - /* Use 1:1 page table to identify MMIO address space */ - if (mmio_space(gpa)) - { - /* No support for APIC */ - if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) - { - int inst_len; - inst_len = svm_instruction_length(v); - if (inst_len == -1) - { - printf("%s: INST_LEN - Unable to decode properly\n", __func__); - domain_crash_synchronous(); - } - - __update_guest_eip(vmcb, inst_len); - - return 1; - } - - handle_mmio(va, gpa); - - return 1; - } - - result = shadow_fault(va, regs); + result = shadow2_fault(va, regs); if( result ) { /* Let's make sure that the Guest TLB is flushed */ @@ -1035,19 +1006,12 @@ static void svm_vmexit_do_cpuid(struct v clear_bit(X86_FEATURE_APIC, &edx); } -#if CONFIG_PAGING_LEVELS < 3 - clear_bit(X86_FEATURE_PAE, &edx); - clear_bit(X86_FEATURE_PSE, &edx); +#if CONFIG_PAGING_LEVELS >= 3 + if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) +#endif + clear_bit(X86_FEATURE_PAE, &edx); clear_bit(X86_FEATURE_PSE36, &edx); -#else - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 ) - { - if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) - clear_bit(X86_FEATURE_PAE, &edx); - clear_bit(X86_FEATURE_PSE, &edx); - clear_bit(X86_FEATURE_PSE36, &edx); - } -#endif + /* Clear out reserved bits. 
*/ ecx &= ~SVM_VCPU_CPUID_L1_ECX_RESERVED; edx &= ~SVM_VCPU_CPUID_L1_EDX_RESERVED; @@ -1097,23 +1061,12 @@ static void svm_vmexit_do_cpuid(struct v clear_bit(X86_FEATURE_SYSCALL & 31, &edx); #endif -#if CONFIG_PAGING_LEVELS < 3 - clear_bit(X86_FEATURE_NX & 31, &edx); - clear_bit(X86_FEATURE_PAE, &edx); - clear_bit(X86_FEATURE_PSE, &edx); + +#if CONFIG_PAGING_LEVELS >= 3 + if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) +#endif + clear_bit(X86_FEATURE_PAE, &edx); clear_bit(X86_FEATURE_PSE36, &edx); -#else - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 ) - { - if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) - { - clear_bit(X86_FEATURE_NX & 31, &edx); - clear_bit(X86_FEATURE_PAE, &edx); - } - clear_bit(X86_FEATURE_PSE, &edx); - clear_bit(X86_FEATURE_PSE36, &edx); - } -#endif /* Make SVM feature invisible to the guest. */ clear_bit(X86_FEATURE_SVME & 31, &ecx); @@ -1555,6 +1508,7 @@ static int svm_set_cr0(unsigned long val unsigned long mfn; int paging_enabled; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + unsigned long old_base_mfn; ASSERT(vmcb); @@ -1600,54 +1554,21 @@ static int svm_set_cr0(unsigned long val set_bit(SVM_CPU_STATE_LMA_ENABLED, &v->arch.hvm_svm.cpu_state); vmcb->efer |= (EFER_LMA | EFER_LME); - if (!shadow_set_guest_paging_levels(v->domain, PAGING_L4) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } - } - else + } #endif /* __x86_64__ */ - { -#if CONFIG_PAGING_LEVELS >= 3 - /* seems it's a 32-bit or 32-bit PAE guest */ - if ( test_bit(SVM_CPU_STATE_PAE_ENABLED, - &v->arch.hvm_svm.cpu_state) ) - { - /* The guest enables PAE first and then it enables PG, it is - * really a PAE guest */ - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); - } - } - else - { - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } - } -#endif - } /* Now arch.guest_table points to machine physical. 
*/ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); + if ( old_base_mfn ) + put_page(mfn_to_page(old_base_mfn)); + shadow2_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table); - - /* arch->shadow_table should hold the next CR3 for shadow */ - HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx\n", - v->arch.hvm_svm.cpu_cr3, mfn); - - return 1; } if ( !((value & X86_CR0_PE) && (value & X86_CR0_PG)) && paging_enabled ) @@ -1667,17 +1588,16 @@ static int svm_set_cr0(unsigned long val svm_inject_exception(v, TRAP_gp_fault, 1, 0); return 0; } - - clear_all_shadow_status( v->domain ); + shadow2_update_paging_modes(v); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table); } else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE ) { /* we should take care of this kind of situation */ - clear_all_shadow_status(v->domain); + shadow2_update_paging_modes(v); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table); } return 1; @@ -1786,7 +1706,7 @@ static int mov_to_cr(int gpreg, int cr, mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); if (mfn != pagetable_get_pfn(v->arch.guest_table)) __hvm_bug(regs); - shadow_sync_all(v->domain); + shadow2_update_cr3(v); } else { @@ -1812,14 +1732,10 @@ static int mov_to_cr(int gpreg, int cr, /* * arch.shadow_table should now hold the next CR3 for shadow */ -#if CONFIG_PAGING_LEVELS >= 3 - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 ) - shadow_sync_all(v->domain); -#endif v->arch.hvm_svm.cpu_cr3 = value; - update_pagetables(v); + update_cr3(v); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); - vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table); } break; } @@ -1838,12 +1754,6 @@ static int mov_to_cr(int gpreg, int cr, /* The guest is a 32-bit PAE guest. */ #if CONFIG_PAGING_LEVELS >= 3 unsigned long mfn, old_base_mfn; - - if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } if ( !VALID_MFN(mfn = get_mfn_from_gpfn( v->arch.hvm_svm.cpu_cr3 >> PAGE_SHIFT)) || @@ -1853,21 +1763,20 @@ static int mov_to_cr(int gpreg, int cr, domain_crash_synchronous(); /* need to take a clean path */ } - old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - if ( old_base_mfn ) - put_page(mfn_to_page(old_base_mfn)); - /* * Now arch.guest_table points to machine physical. 
*/ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); + if ( old_base_mfn ) + put_page(mfn_to_page(old_base_mfn)); + shadow2_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); - vmcb->cr3 = pagetable_get_paddr(v->arch.shadow_table); + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; /* * arch->shadow_table should hold the next CR3 for shadow @@ -1876,33 +1785,6 @@ static int mov_to_cr(int gpreg, int cr, HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx", v->arch.hvm_svm.cpu_cr3, mfn); -#endif - } - else - { - /* The guest is a 64 bit or 32-bit PAE guest. */ -#if CONFIG_PAGING_LEVELS >= 3 - if ( (v->domain->arch.ops != NULL) && - v->domain->arch.ops->guest_paging_levels == PAGING_L2) - { - /* Seems the guest first enables PAE without enabling PG, - * it must enable PG after that, and it is a 32-bit PAE - * guest */ - - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3)) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); - } - } - else - { - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4)) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); - } - } #endif } } @@ -1926,7 +1808,7 @@ static int mov_to_cr(int gpreg, int cr, if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) { set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - shadow_sync_all(v->domain); + shadow2_update_paging_modes(v); } break; } @@ -2267,7 +2149,7 @@ void svm_handle_invlpg(const short invlp /* Overkill, we may not this */ set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - shadow_invlpg(v, g_vaddr); + shadow2_invlpg(v, g_vaddr); } @@ -2638,7 +2520,7 @@ void walk_shadow_and_guest_pt(unsigned l struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned long gpa; - gpa = gva_to_gpa( gva ); + gpa = shadow2_gva_to_gpa(current, gva); printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 ); if( !svm_paging_enabled(v) || mmio_space(gpa) ) return; @@ -2662,8 +2544,12 @@ void walk_shadow_and_guest_pt(unsigned l __copy_from_user(&gpte, &linear_pg_table[ l1_linear_offset(gva) ], sizeof(gpte) ); printk( "G-PTE = %x, flags=%x\n", gpte.l1, l1e_get_flags(gpte) ); - __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ], + + BUG(); // need to think about this, and convert usage of + // phys_to_machine_mapping to use pagetable format... 
+ __copy_from_user( &spte, &phys_to_machine_mapping[ l1e_get_pfn( gpte ) ], sizeof(spte) ); + printk( "S-PTE = %x, flags=%x\n", spte.l1, l1e_get_flags(spte)); } #endif /* SVM_WALK_GUEST_PAGES */ @@ -2704,7 +2590,8 @@ asmlinkage void svm_vmexit_handler(struc if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) { - if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2))) + if (svm_paging_enabled(v) && + !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2))) { printk("I%08ld,ExC=%s(%d),IP=%x:%llx,I1=%llx,I2=%llx,INT=%llx, " "gpa=%llx\n", intercepts_counter, @@ -2713,7 +2600,7 @@ asmlinkage void svm_vmexit_handler(struc (unsigned long long) vmcb->exitinfo1, (unsigned long long) vmcb->exitinfo2, (unsigned long long) vmcb->exitintinfo.bytes, - (unsigned long long) gva_to_gpa( vmcb->exitinfo2 ) ); + (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2)); } else { @@ -2757,7 +2644,7 @@ asmlinkage void svm_vmexit_handler(struc && ( ( vmcb->exitinfo2 == vmcb->rip ) || vmcb->exitintinfo.bytes) ) { - if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2))) + if (svm_paging_enabled(v) && !mmio_space(gva_to_gpa(vmcb->exitinfo2))) walk_shadow_and_guest_pt( vmcb->exitinfo2 ); } #endif diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/svm/vmcb.c --- a/xen/arch/x86/hvm/svm/vmcb.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/hvm/svm/vmcb.c Wed Aug 16 17:11:56 2006 +0100 @@ -380,8 +380,8 @@ void svm_do_launch(struct vcpu *v) printk("%s: phys_table = %lx\n", __func__, pt); } - /* At launch we always use the phys_table */ - vmcb->cr3 = pagetable_get_paddr(v->domain->arch.phys_table); + /* Set cr3 from hw_cr3 even when guest-visible paging is not enabled */ + vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; if (svm_dbg_on) { diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/vlapic.c --- a/xen/arch/x86/hvm/vlapic.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/hvm/vlapic.c Wed Aug 16 17:11:56 2006 +0100 @@ -21,7 +21,8 @@ #include <xen/types.h> #include <xen/mm.h> #include <xen/xmalloc.h> -#include <asm/shadow.h> +#include <xen/shadow.h> +#include <xen/domain_page.h> #include <asm/page.h> #include <xen/event.h> #include <xen/trace.h> diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/vmx/vmcs.c --- a/xen/arch/x86/hvm/vmx/vmcs.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/hvm/vmx/vmcs.c Wed Aug 16 17:11:56 2006 +0100 @@ -34,12 +34,8 @@ #include <asm/flushtlb.h> #include <xen/event.h> #include <xen/kernel.h> -#include <asm/shadow.h> #include <xen/keyhandler.h> - -#if CONFIG_PAGING_LEVELS >= 3 -#include <asm/shadow_64.h> -#endif +#include <asm/shadow2.h> static int vmcs_size; static int vmcs_order; @@ -238,7 +234,7 @@ static void vmx_set_host_env(struct vcpu static void vmx_do_launch(struct vcpu *v) { -/* Update CR3, GDT, LDT, TR */ +/* Update CR3, CR0, CR4, GDT, LDT, TR */ unsigned int error = 0; unsigned long cr0, cr4; @@ -276,8 +272,11 @@ static void vmx_do_launch(struct vcpu *v error |= __vmwrite(GUEST_TR_BASE, 0); error |= __vmwrite(GUEST_TR_LIMIT, 0xff); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table)); - __vmwrite(HOST_CR3, pagetable_get_paddr(v->arch.monitor_table)); + shadow2_update_paging_modes(v); + printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n", + __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); + __vmwrite(HOST_CR3, v->arch.cr3); v->arch.schedule_tail = arch_vmx_do_resume; diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/hvm/vmx/vmx.c --- a/xen/arch/x86/hvm/vmx/vmx.c Wed 
Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Aug 16 17:11:56 2006 +0100 @@ -26,9 +26,9 @@ #include <xen/softirq.h> #include <xen/domain_page.h> #include <xen/hypercall.h> +#include <xen/perfc.h> #include <asm/current.h> #include <asm/io.h> -#include <asm/shadow.h> #include <asm/regs.h> #include <asm/cpufeature.h> #include <asm/processor.h> @@ -40,10 +40,7 @@ #include <asm/hvm/vmx/vmx.h> #include <asm/hvm/vmx/vmcs.h> #include <asm/hvm/vmx/cpu.h> -#include <asm/shadow.h> -#if CONFIG_PAGING_LEVELS >= 3 -#include <asm/shadow_64.h> -#endif +#include <asm/shadow2.h> #include <public/sched.h> #include <public/hvm/ioreq.h> #include <asm/hvm/vpic.h> @@ -69,11 +66,16 @@ static int vmx_initialize_guest_resource if ( v->vcpu_id != 0 ) return 1; + if ( !shadow2_mode_external(d) ) + { + DPRINTK("Can't init HVM for dom %u vcpu %u: " + "not in shadow2 external mode\n", + d->domain_id, v->vcpu_id); + domain_crash(d); + } + for_each_vcpu ( d, vc ) { - /* Initialize monitor page table */ - vc->arch.monitor_table = pagetable_null(); - memset(&vc->arch.hvm_vmx, 0, sizeof(struct arch_vmx_struct)); if ( (rc = vmx_create_vmcs(vc)) != 0 ) @@ -107,6 +109,7 @@ static int vmx_initialize_guest_resource vc->arch.hvm_vmx.io_bitmap_a = io_bitmap_a; vc->arch.hvm_vmx.io_bitmap_b = io_bitmap_b; + } /* @@ -116,11 +119,6 @@ static int vmx_initialize_guest_resource memset(&d->shared_info->evtchn_mask[0], 0xff, sizeof(d->shared_info->evtchn_mask)); - /* Put the domain in shadow mode even though we're going to be using - * the shared 1:1 page table initially. It shouldn't hurt */ - shadow_mode_enable( - d, SHM_enable|SHM_refcounts|SHM_translate|SHM_external|SHM_wr_pt_pte); - return 1; } @@ -133,7 +131,6 @@ static void vmx_relinquish_guest_resourc vmx_destroy_vmcs(v); if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) continue; - free_monitor_pagetable(v); kill_timer(&v->arch.hvm_vcpu.hlt_timer); if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) { @@ -153,8 +150,6 @@ static void vmx_relinquish_guest_resourc if ( d->arch.hvm_domain.buffered_io_va ) unmap_domain_page_global((void *)d->arch.hvm_domain.buffered_io_va); - - shadow_direct_map_clean(d); } #ifdef __x86_64__ @@ -595,14 +590,6 @@ static void vmx_load_cpu_guest_regs(stru vmx_vmcs_exit(v); } -static int vmx_realmode(struct vcpu *v) -{ - unsigned long rflags; - - __vmread(GUEST_RFLAGS, &rflags); - return rflags & X86_EFLAGS_VM; -} - static int vmx_instruction_length(struct vcpu *v) { unsigned long inst_len; @@ -622,6 +609,8 @@ static unsigned long vmx_get_ctrl_reg(st return v->arch.hvm_vmx.cpu_cr2; case 3: return v->arch.hvm_vmx.cpu_cr3; + case 4: + return v->arch.hvm_vmx.cpu_shadow_cr4; default: BUG(); } @@ -753,8 +742,12 @@ static void vmx_setup_hvm_funcs(void) hvm_funcs.realmode = vmx_realmode; hvm_funcs.paging_enabled = vmx_paging_enabled; + hvm_funcs.long_mode_enabled = vmx_long_mode_enabled; + hvm_funcs.guest_x86_mode = vmx_guest_x86_mode; hvm_funcs.instruction_length = vmx_instruction_length; hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg; + + hvm_funcs.update_host_cr3 = vmx_update_host_cr3; hvm_funcs.stts = vmx_stts; hvm_funcs.set_tsc_offset = vmx_set_tsc_offset; @@ -855,53 +848,25 @@ static void inline __update_guest_eip(un __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0); } - static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs) { - unsigned long gpa; /* FIXME: PAE */ int result; #if 0 /* keep for debugging */ { - unsigned long eip; - + unsigned long eip, cs; + + __vmread(GUEST_CS_BASE, &cs); __vmread(GUEST_RIP, &eip); 
HVM_DBG_LOG(DBG_LEVEL_VMMU, - "vmx_do_page_fault = 0x%lx, eip = %lx, error_code = %lx", - va, eip, (unsigned long)regs->error_code); + "vmx_do_page_fault = 0x%lx, cs_base=%lx, " + "eip = %lx, error_code = %lx\n", + va, cs, eip, (unsigned long)regs->error_code); } #endif - if ( !vmx_paging_enabled(current) ) - { - /* construct 1-to-1 direct mapping */ - if ( shadow_direct_map_fault(va, regs) ) - return 1; - - handle_mmio(va, va); - TRACE_VMEXIT (2,2); - return 1; - } - gpa = gva_to_gpa(va); - - /* Use 1:1 page table to identify MMIO address space */ - if ( mmio_space(gpa) ){ - struct vcpu *v = current; - /* No support for APIC */ - if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) { - u32 inst_len; - __vmread(VM_EXIT_INSTRUCTION_LEN, &(inst_len)); - __update_guest_eip(inst_len); - return 1; - } - TRACE_VMEXIT (2,2); - /* in the case of MMIO, we are more interested in gpa than in va */ - TRACE_VMEXIT (4,gpa); - handle_mmio(va, gpa); - return 1; - } - - result = shadow_fault(va, regs); + result = shadow2_fault(va, regs); + TRACE_VMEXIT (2,result); #if 0 if ( !result ) @@ -972,23 +937,11 @@ static void vmx_vmexit_do_cpuid(struct c clear_bit(X86_FEATURE_APIC, &edx); } -#if CONFIG_PAGING_LEVELS < 3 - edx &= ~(bitmaskof(X86_FEATURE_PAE) | - bitmaskof(X86_FEATURE_PSE) | - bitmaskof(X86_FEATURE_PSE36)); -#else - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 ) - { - if ( v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) - clear_bit(X86_FEATURE_PSE36, &edx); - else - { - clear_bit(X86_FEATURE_PAE, &edx); - clear_bit(X86_FEATURE_PSE, &edx); - clear_bit(X86_FEATURE_PSE36, &edx); - } - } +#if CONFIG_PAGING_LEVELS >= 3 + if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] ) #endif + clear_bit(X86_FEATURE_PAE, &edx); + clear_bit(X86_FEATURE_PSE36, &edx); ebx &= NUM_THREADS_RESET_MASK; @@ -1086,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigne * We do the safest things first, then try to update the shadow * copying from guest */ - shadow_invlpg(v, va); + shadow2_invlpg(v, va); } @@ -1307,11 +1260,8 @@ vmx_world_restore(struct vcpu *v, struct error |= __vmwrite(CR0_READ_SHADOW, c->cr0); - if (!vmx_paging_enabled(v)) { - HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table"); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table)); + if (!vmx_paging_enabled(v)) goto skip_cr3; - } if (c->cr3 == v->arch.hvm_vmx.cpu_cr3) { /* @@ -1325,7 +1275,6 @@ vmx_world_restore(struct vcpu *v, struct domain_crash_synchronous(); return 0; } - shadow_sync_all(v->domain); } else { /* * If different, make a shadow. Check if the PDBR is valid @@ -1348,12 +1297,16 @@ vmx_world_restore(struct vcpu *v, struct * arch.shadow_table should now hold the next CR3 for shadow */ v->arch.hvm_vmx.cpu_cr3 = c->cr3; - update_pagetables(v); + } + + skip_cr3: + + shadow2_update_paging_modes(v); + if (!vmx_paging_enabled(v)) + HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table"); + else HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %x", c->cr3); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table)); - } - - skip_cr3: + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); error |= __vmread(CR4_READ_SHADOW, &old_cr4); error |= __vmwrite(GUEST_CR4, (c->cr4 | VMX_CR4_HOST_MASK)); @@ -1485,6 +1438,7 @@ static int vmx_set_cr0(unsigned long val int paging_enabled; unsigned long vm_entry_value; unsigned long old_cr0; + unsigned long old_base_mfn; /* * CR0: We don't want to lose PE and PG. 
@@ -1514,7 +1468,8 @@ static int vmx_set_cr0(unsigned long val v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) || !get_page(mfn_to_page(mfn), v->domain) ) { - printk("Invalid CR3 value = %lx", v->arch.hvm_vmx.cpu_cr3); + printk("Invalid CR3 value = %lx (mfn=%lx)\n", + v->arch.hvm_vmx.cpu_cr3, mfn); domain_crash_synchronous(); /* need to take a clean path */ } @@ -1539,51 +1494,22 @@ static int vmx_set_cr0(unsigned long val __vmread(VM_ENTRY_CONTROLS, &vm_entry_value); vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE; __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value); - - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L4) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } - } - else -#endif /* __x86_64__ */ - { -#if CONFIG_PAGING_LEVELS >= 3 - /* seems it's a 32-bit or 32-bit PAE guest */ - - if ( test_bit(VMX_CPU_STATE_PAE_ENABLED, - &v->arch.hvm_vmx.cpu_state) ) - { - /* The guest enables PAE first and then it enables PG, it is - * really a PAE guest */ - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); - } - } - else - { - if ( !shadow_set_guest_paging_levels(v->domain, PAGING_L2) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } - } + } #endif - } /* * Now arch.guest_table points to machine physical. */ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); + if (old_base_mfn) + put_page(mfn_to_page(old_base_mfn)); + shadow2_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table)); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); /* * arch->shadow_table should hold the next CR3 for shadow */ @@ -1625,7 +1551,6 @@ static int vmx_set_cr0(unsigned long val } } - clear_all_shadow_status(v->domain); if ( vmx_assist(v, VMX_ASSIST_INVOKE) ) { set_bit(VMX_CPU_STATE_ASSIST_ENABLED, &v->arch.hvm_vmx.cpu_state); __vmread(GUEST_RIP, &eip); @@ -1651,9 +1576,8 @@ static int vmx_set_cr0(unsigned long val } else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE ) { - /* we should take care of this kind of situation */ - clear_all_shadow_status(v->domain); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->domain->arch.phys_table)); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); + shadow2_update_paging_modes(v); } return 1; @@ -1738,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, str mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); if (mfn != pagetable_get_pfn(v->arch.guest_table)) __hvm_bug(regs); - shadow_sync_all(v->domain); + shadow2_update_cr3(v); } else { /* * If different, make a shadow. Check if the PDBR is valid @@ -1759,16 +1683,11 @@ static int mov_to_cr(int gp, int cr, str /* * arch.shadow_table should now hold the next CR3 for shadow */ -#if CONFIG_PAGING_LEVELS >= 3 - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 ) - shadow_sync_all(v->domain); -#endif - v->arch.hvm_vmx.cpu_cr3 = value; - update_pagetables(v); + update_cr3(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table)); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); } break; } @@ -1785,12 +1704,6 @@ static int mov_to_cr(int gp, int cr, str /* The guest is a 32-bit PAE guest. 
*/ #if CONFIG_PAGING_LEVELS >= 3 unsigned long mfn, old_base_mfn; - - if( !shadow_set_guest_paging_levels(v->domain, PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - domain_crash_synchronous(); /* need to take a clean path */ - } if ( !VALID_MFN(mfn = get_mfn_from_gpfn( v->arch.hvm_vmx.cpu_cr3 >> PAGE_SHIFT)) || @@ -1800,21 +1713,20 @@ static int mov_to_cr(int gp, int cr, str domain_crash_synchronous(); /* need to take a clean path */ } - old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - if ( old_base_mfn ) - put_page(mfn_to_page(old_base_mfn)); /* * Now arch.guest_table points to machine physical. */ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); + if ( old_base_mfn ) + put_page(mfn_to_page(old_base_mfn)); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); - __vmwrite(GUEST_CR3, pagetable_get_paddr(v->arch.shadow_table)); + __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); /* * arch->shadow_table should hold the next CR3 for shadow @@ -1822,27 +1734,6 @@ static int mov_to_cr(int gp, int cr, str HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx", v->arch.hvm_vmx.cpu_cr3, mfn); -#endif - } - else - { - /* The guest is a 64 bit or 32-bit PAE guest. */ -#if CONFIG_PAGING_LEVELS >= 3 - if ( (v->domain->arch.ops != NULL) && - v->domain->arch.ops->guest_paging_levels == PAGING_L2) - { - /* Seems the guest first enables PAE without enabling PG, - * it must enable PG after that, and it is a 32-bit PAE - * guest */ - - if ( !shadow_set_guest_paging_levels(v->domain, - PAGING_L3) ) - { - printk("Unsupported guest paging levels\n"); - /* need to take a clean path */ - domain_crash_synchronous(); - } - } #endif } } @@ -1864,8 +1755,7 @@ static int mov_to_cr(int gp, int cr, str * all TLB entries except global entries. */ if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) - shadow_sync_all(v->domain); - + shadow2_update_paging_modes(v); break; } default: diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/mm.c Wed Aug 16 17:11:56 2006 +0100 @@ -137,7 +137,7 @@ static void free_l1_table(struct page_in static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long, unsigned long type); -static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); +static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t, unsigned long gl1mfn); /* Used to defer flushing of memory structures. */ struct percpu_mm_info { @@ -274,9 +274,9 @@ void share_xen_page_with_privileged_gues #else /* * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths. - * We cannot safely shadow the idle page table, nor shadow-mode page tables + * We cannot safely shadow the idle page table, nor shadow (v1) page tables * (detected by lack of an owning domain). As required for correctness, we - * always shadow PDPTs aboive 4GB. + * always shadow PDPTs above 4GB. */ #define l3tab_needs_shadow(mfn) \ (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \ @@ -297,17 +297,21 @@ static int __init cache_pae_fixmap_addre } __initcall(cache_pae_fixmap_address); -static void __write_ptbase(unsigned long mfn) +static DEFINE_PER_CPU(u32, make_cr3_timestamp); + +void make_cr3(struct vcpu *v, unsigned long mfn) +/* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if + * necessary, and sets v->arch.cr3 to the value to load in CR3. 
*/ { l3_pgentry_t *highmem_l3tab, *lowmem_l3tab; - struct pae_l3_cache *cache = ¤t->arch.pae_l3_cache; + struct pae_l3_cache *cache = &v->arch.pae_l3_cache; unsigned int cpu = smp_processor_id(); - /* Fast path 1: does this mfn need a shadow at all? */ + /* Fast path: does this mfn need a shadow at all? */ if ( !l3tab_needs_shadow(mfn) ) { - write_cr3(mfn << PAGE_SHIFT); - /* Cache is no longer in use or valid (/after/ write to %cr3). */ + v->arch.cr3 = mfn << PAGE_SHIFT; + /* Cache is no longer in use or valid */ cache->high_mfn = 0; return; } @@ -315,13 +319,6 @@ static void __write_ptbase(unsigned long /* Caching logic is not interrupt safe. */ ASSERT(!in_irq()); - /* Fast path 2: is this mfn already cached? */ - if ( cache->high_mfn == mfn ) - { - write_cr3(__pa(cache->table[cache->inuse_idx])); - return; - } - /* Protects against pae_flush_pgd(). */ spin_lock(&cache->lock); @@ -330,29 +327,33 @@ static void __write_ptbase(unsigned long /* Map the guest L3 table and copy to the chosen low-memory cache. */ *(fix_pae_highmem_pl1e - cpu) = l1e_from_pfn(mfn, __PAGE_HYPERVISOR); + /* First check the previous high mapping can't be in the TLB. + * (i.e. have we loaded CR3 since we last did this?) */ + if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) ) + local_flush_tlb_one(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu)); highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu); lowmem_l3tab = cache->table[cache->inuse_idx]; memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0])); *(fix_pae_highmem_pl1e - cpu) = l1e_empty(); - - /* Install the low-memory L3 table in CR3. */ - write_cr3(__pa(lowmem_l3tab)); + this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time); + + v->arch.cr3 = __pa(lowmem_l3tab); spin_unlock(&cache->lock); } #else /* !CONFIG_X86_PAE */ -static void __write_ptbase(unsigned long mfn) -{ - write_cr3(mfn << PAGE_SHIFT); +void make_cr3(struct vcpu *v, unsigned long mfn) +{ + v->arch.cr3 = mfn << PAGE_SHIFT; } #endif /* !CONFIG_X86_PAE */ void write_ptbase(struct vcpu *v) { - __write_ptbase(pagetable_get_pfn(v->arch.monitor_table)); + write_cr3(v->arch.cr3); } void invalidate_shadow_ldt(struct vcpu *v) @@ -423,8 +424,6 @@ int map_ldt_shadow_page(unsigned int off BUG_ON(unlikely(in_irq())); - shadow_sync_va(v, gva); - TOGGLE_MODE(); __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)], sizeof(l1e)); @@ -440,12 +439,12 @@ int map_ldt_shadow_page(unsigned int off res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); - if ( !res && unlikely(shadow_mode_refcounts(d)) ) - { - shadow_lock(d); - shadow_remove_all_write_access(d, gmfn, mfn); + if ( !res && unlikely(shadow2_mode_refcounts(d)) ) + { + shadow2_lock(d); + shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0); res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); - shadow_unlock(d); + shadow2_unlock(d); } if ( unlikely(!res) ) @@ -513,7 +512,7 @@ get_linear_pagetable( struct page_info *page; unsigned long pfn; - ASSERT( !shadow_mode_refcounts(d) ); + ASSERT( !shadow2_mode_refcounts(d) ); if ( (root_get_flags(re) & _PAGE_RW) ) { @@ -576,7 +575,8 @@ get_page_from_l1e( if ( !iomem_access_permitted(d, mfn, mfn) ) { - MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn); + MEM_LOG("Non-privileged (%u) attempt to map I/O space %08lx", + d->domain_id, mfn); return 0; } @@ -587,9 +587,14 @@ get_page_from_l1e( d = dom_io; } - okay = ((l1e_get_flags(l1e) & _PAGE_RW) ? 
- get_page_and_type(page, d, PGT_writable_page) : - get_page(page, d)); + /* Foreign mappings into guests in shadow2 external mode don't + * contribute to writeable mapping refcounts. (This allows the + * qemu-dm helper process in dom0 to map the domain's memory without + * messing up the count of "real" writable mappings.) */ + okay = (((l1e_get_flags(l1e) & _PAGE_RW) && + !(unlikely(shadow2_mode_external(d) && (d != current->domain)))) + ? get_page_and_type(page, d, PGT_writable_page) + : get_page(page, d)); if ( !okay ) { MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte @@ -609,8 +614,6 @@ get_page_from_l2e( struct domain *d, unsigned long vaddr) { int rc; - - ASSERT(!shadow_mode_refcounts(d)); if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) return 1; @@ -641,8 +644,6 @@ get_page_from_l3e( { int rc; - ASSERT(!shadow_mode_refcounts(d)); - if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) return 1; @@ -668,8 +669,6 @@ get_page_from_l4e( struct domain *d, unsigned long vaddr) { int rc; - - ASSERT( !shadow_mode_refcounts(d) ); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return 1; @@ -727,7 +726,10 @@ void put_page_from_l1e(l1_pgentry_t l1e, domain_crash(d); } - if ( l1e_get_flags(l1e) & _PAGE_RW ) + /* Remember we didn't take a type-count of foreign writable mappings + * to shadow2 external domains */ + if ( (l1e_get_flags(l1e) & _PAGE_RW) && + !(unlikely((e != d) && shadow2_mode_external(e))) ) { put_page_and_type(page); } @@ -784,7 +786,7 @@ static int alloc_l1_table(struct page_in l1_pgentry_t *pl1e; int i; - ASSERT(!shadow_mode_refcounts(d)); + ASSERT(!shadow2_mode_refcounts(d)); pl1e = map_domain_page(pfn); @@ -832,6 +834,8 @@ static int create_pae_xen_mappings(l3_pg * 2. Cannot appear in another page table's L3: * a. alloc_l3_table() calls this function and this check will fail * b. mod_l3_entry() disallows updates to slot 3 in an existing table + * + * XXX -- this needs revisiting for shadow2_mode_refcount()==true... */ page = l3e_get_page(l3e3); BUG_ON(page->u.inuse.type_info & PGT_pinned); @@ -955,11 +959,7 @@ static int alloc_l2_table(struct page_in l2_pgentry_t *pl2e; int i; - /* See the code in shadow_promote() to understand why this is here. */ - if ( (PGT_base_page_table == PGT_l2_page_table) && - unlikely(shadow_mode_refcounts(d)) ) - return 1; - ASSERT(!shadow_mode_refcounts(d)); + ASSERT(!shadow2_mode_refcounts(d)); pl2e = map_domain_page(pfn); @@ -1009,11 +1009,7 @@ static int alloc_l3_table(struct page_in l3_pgentry_t *pl3e; int i; - /* See the code in shadow_promote() to understand why this is here. */ - if ( (PGT_base_page_table == PGT_l3_page_table) && - shadow_mode_refcounts(d) ) - return 1; - ASSERT(!shadow_mode_refcounts(d)); + ASSERT(!shadow2_mode_refcounts(d)); #ifdef CONFIG_X86_PAE /* @@ -1072,11 +1068,7 @@ static int alloc_l4_table(struct page_in unsigned long vaddr; int i; - /* See the code in shadow_promote() to understand why this is here. 
*/ - if ( (PGT_base_page_table == PGT_l4_page_table) && - shadow_mode_refcounts(d) ) - return 1; - ASSERT(!shadow_mode_refcounts(d)); + ASSERT(!shadow2_mode_refcounts(d)); for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) { @@ -1183,51 +1175,61 @@ static void free_l4_table(struct page_in static inline int update_l1e(l1_pgentry_t *pl1e, l1_pgentry_t ol1e, - l1_pgentry_t nl1e) -{ + l1_pgentry_t nl1e, + unsigned long gl1mfn, + struct vcpu *v) +{ + int rv = 1; + if ( unlikely(shadow2_mode_enabled(v->domain)) ) + shadow2_lock(v->domain); #ifndef PTE_UPDATE_WITH_CMPXCHG - return !__copy_to_user(pl1e, &nl1e, sizeof(nl1e)); + rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e))); #else - intpte_t o = l1e_get_intpte(ol1e); - intpte_t n = l1e_get_intpte(nl1e); - - for ( ; ; ) - { - if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ) - { - MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte - ": saw %" PRIpte, - l1e_get_intpte(ol1e), - l1e_get_intpte(nl1e), - o); - return 0; - } - - if ( o == l1e_get_intpte(ol1e) ) - break; - - /* Allowed to change in Accessed/Dirty flags only. */ - BUG_ON((o ^ l1e_get_intpte(ol1e)) & - ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY)); - ol1e = l1e_from_intpte(o); - } - - return 1; + { + intpte_t o = l1e_get_intpte(ol1e); + intpte_t n = l1e_get_intpte(nl1e); + + for ( ; ; ) + { + if ( unlikely(cmpxchg_user(pl1e, o, n) != 0) ) + { + MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte + ": saw %" PRIpte, + l1e_get_intpte(ol1e), + l1e_get_intpte(nl1e), + o); + rv = 0; + break; + } + + if ( o == l1e_get_intpte(ol1e) ) + break; + + /* Allowed to change in Accessed/Dirty flags only. */ + BUG_ON((o ^ l1e_get_intpte(ol1e)) & + ~(int)(_PAGE_ACCESSED|_PAGE_DIRTY)); + ol1e = l1e_from_intpte(o); + } + } #endif + if ( unlikely(shadow2_mode_enabled(v->domain)) ) + { + shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e); + shadow2_unlock(v->domain); + } + return rv; } /* Update the L1 entry at pl1e to new value nl1e. */ -static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e) +static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e, + unsigned long gl1mfn) { l1_pgentry_t ol1e; struct domain *d = current->domain; if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ) return 0; - - if ( unlikely(shadow_mode_refcounts(d)) ) - return update_l1e(pl1e, ol1e, nl1e); if ( l1e_get_flags(nl1e) & _PAGE_PRESENT ) { @@ -1239,13 +1241,13 @@ static int mod_l1_entry(l1_pgentry_t *pl } /* Fast path for identical mapping, r/w and presence. 
*/ - if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT)) - return update_l1e(pl1e, ol1e, nl1e); + if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT) ) + return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current); if ( unlikely(!get_page_from_l1e(nl1e, FOREIGNDOM)) ) return 0; - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) ) { put_page_from_l1e(nl1e, d); return 0; @@ -1253,7 +1255,7 @@ static int mod_l1_entry(l1_pgentry_t *pl } else { - if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) ) + if ( unlikely(!update_l1e(pl1e, ol1e, nl1e, gl1mfn, current)) ) return 0; } @@ -1262,9 +1264,9 @@ static int mod_l1_entry(l1_pgentry_t *pl } #ifndef PTE_UPDATE_WITH_CMPXCHG -#define UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; }) +#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ (*(_p) = (_n)); 1; }) #else -#define UPDATE_ENTRY(_t,_p,_o,_n) ({ \ +#define _UPDATE_ENTRY(_t,_p,_o,_n) ({ \ for ( ; ; ) \ { \ intpte_t __o = cmpxchg((intpte_t *)(_p), \ @@ -1279,6 +1281,18 @@ static int mod_l1_entry(l1_pgentry_t *pl } \ 1; }) #endif +#define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \ + int rv; \ + if ( unlikely(shadow2_mode_enabled(current->domain)) ) \ + shadow2_lock(current->domain); \ + rv = _UPDATE_ENTRY(_t, _p, _o, _n); \ + if ( unlikely(shadow2_mode_enabled(current->domain)) ) \ + { \ + shadow2_validate_guest_entry(current, _mfn(_m), (_p)); \ + shadow2_unlock(current->domain); \ + } \ + rv; \ +}) /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */ static int mod_l2_entry(l2_pgentry_t *pl2e, @@ -1309,19 +1323,19 @@ static int mod_l2_entry(l2_pgentry_t *pl /* Fast path for identical mapping and presence. */ if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT)) - return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e); + return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn); if ( unlikely(!l1_backptr(&vaddr, pgentry_ptr_to_slot(pl2e), type)) || unlikely(!get_page_from_l2e(nl2e, pfn, current->domain, vaddr)) ) return 0; - if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) ) + if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) ) { put_page_from_l2e(nl2e, pfn); return 0; } } - else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) ) + else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn)) ) { return 0; } @@ -1329,7 +1343,6 @@ static int mod_l2_entry(l2_pgentry_t *pl put_page_from_l2e(ol2e, pfn); return 1; } - #if CONFIG_PAGING_LEVELS >= 3 @@ -1356,7 +1369,7 @@ static int mod_l3_entry(l3_pgentry_t *pl */ if ( pgentry_ptr_to_slot(pl3e) >= 3 ) return 0; -#endif +#endif if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) ) return 0; @@ -1372,7 +1385,7 @@ static int mod_l3_entry(l3_pgentry_t *pl /* Fast path for identical mapping and presence. 
*/ if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT)) - return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e); + return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn); #if CONFIG_PAGING_LEVELS >= 4 if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) || @@ -1383,15 +1396,15 @@ static int mod_l3_entry(l3_pgentry_t *pl << L3_PAGETABLE_SHIFT; if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) ) return 0; -#endif - - if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) ) +#endif + + if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) ) { put_page_from_l3e(nl3e, pfn); return 0; } } - else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) ) + else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn)) ) { return 0; } @@ -1438,19 +1451,19 @@ static int mod_l4_entry(l4_pgentry_t *pl /* Fast path for identical mapping and presence. */ if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT)) - return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e); + return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn); if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) || unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) ) return 0; - if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) ) + if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) ) { put_page_from_l4e(nl4e, pfn); return 0; } } - else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) ) + else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn)) ) { return 0; } @@ -1506,18 +1519,21 @@ void free_page_type(struct page_info *pa */ this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS; - if ( unlikely(shadow_mode_enabled(owner)) ) + if ( unlikely(shadow2_mode_enabled(owner) + && !shadow2_lock_is_acquired(owner)) ) { /* Raw page tables are rewritten during save/restore. */ - if ( !shadow_mode_translate(owner) ) + if ( !shadow2_mode_translate(owner) ) mark_dirty(owner, page_to_mfn(page)); - if ( shadow_mode_refcounts(owner) ) + if ( shadow2_mode_refcounts(owner) ) return; gmfn = mfn_to_gmfn(owner, page_to_mfn(page)); ASSERT(VALID_M2P(gmfn)); - remove_shadow(owner, gmfn, type & PGT_type_mask); + shadow2_lock(owner); + shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn)); + shadow2_unlock(owner); } } @@ -1573,9 +1589,6 @@ void put_page_type(struct page_info *pag if ( unlikely((nx & PGT_count_mask) == 0) ) { - /* Record TLB information for flush later. Races are harmless. */ - page->tlbflush_timestamp = tlbflush_current_time(); - if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && likely(nx & PGT_validated) ) { @@ -1593,6 +1606,9 @@ void put_page_type(struct page_info *pag x &= ~PGT_validated; nx &= ~PGT_validated; } + + /* Record TLB information for flush later. */ + page->tlbflush_timestamp = tlbflush_current_time(); } else if ( unlikely((nx & (PGT_pinned|PGT_type_mask|PGT_count_mask)) == (PGT_pinned|PGT_l1_page_table|1)) ) @@ -1682,7 +1698,7 @@ int get_page_type(struct page_info *page #endif /* Fixme: add code to propagate va_unknown to subtables. */ if ( ((type & PGT_type_mask) >= PGT_l2_page_table) && - !shadow_mode_refcounts(page_get_owner(page)) ) + !shadow2_mode_refcounts(page_get_owner(page)) ) return 0; /* This table is possibly mapped at multiple locations. 
*/ nx &= ~PGT_va_mask; @@ -1729,7 +1745,10 @@ int new_guest_cr3(unsigned long mfn) int okay; unsigned long old_base_mfn; - if ( shadow_mode_refcounts(d) ) + if ( hvm_guest(v) && !hvm_paging_enabled(v) ) + domain_crash_synchronous(); + + if ( shadow2_mode_refcounts(d) ) { okay = get_page_from_pagenr(mfn, d); if ( unlikely(!okay) ) @@ -1747,7 +1766,7 @@ int new_guest_cr3(unsigned long mfn) MEM_LOG("New baseptr %lx: slow path via idle pagetables", mfn); old_base_mfn = pagetable_get_pfn(v->arch.guest_table); v->arch.guest_table = pagetable_null(); - update_pagetables(v); + update_cr3(v); write_cr3(__pa(idle_pg_table)); if ( old_base_mfn != 0 ) put_page_and_type(mfn_to_page(old_base_mfn)); @@ -1769,30 +1788,20 @@ int new_guest_cr3(unsigned long mfn) invalidate_shadow_ldt(v); old_base_mfn = pagetable_get_pfn(v->arch.guest_table); + v->arch.guest_table = pagetable_from_pfn(mfn); - update_pagetables(v); /* update shadow_table and monitor_table */ + update_cr3(v); /* update shadow_table and cr3 fields of vcpu struct */ write_ptbase(v); if ( likely(old_base_mfn != 0) ) { - if ( shadow_mode_refcounts(d) ) + if ( shadow2_mode_refcounts(d) ) put_page(mfn_to_page(old_base_mfn)); else put_page_and_type(mfn_to_page(old_base_mfn)); } - /* CR3 also holds a ref to its shadow... */ - if ( shadow_mode_enabled(d) ) - { - if ( v->arch.monitor_shadow_ref ) - put_shadow_ref(v->arch.monitor_shadow_ref); - v->arch.monitor_shadow_ref = - pagetable_get_pfn(v->arch.monitor_table); - ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref))); - get_shadow_ref(v->arch.monitor_shadow_ref); - } - return 1; } @@ -1807,8 +1816,6 @@ static void process_deferred_ops(void) if ( deferred_ops & (DOP_FLUSH_ALL_TLBS|DOP_FLUSH_TLB) ) { - if ( shadow_mode_enabled(d) ) - shadow_sync_all(d); if ( deferred_ops & DOP_FLUSH_ALL_TLBS ) flush_tlb_mask(d->domain_dirty_cpumask); else @@ -1974,7 +1981,7 @@ int do_mmuext_op( type = PGT_root_page_table; pin_page: - if ( shadow_mode_refcounts(FOREIGNDOM) ) + if ( shadow2_mode_refcounts(FOREIGNDOM) ) break; okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM); @@ -1996,7 +2003,7 @@ int do_mmuext_op( break; case MMUEXT_UNPIN_TABLE: - if ( shadow_mode_refcounts(d) ) + if ( shadow2_mode_refcounts(d) ) break; if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) ) @@ -2009,6 +2016,12 @@ int do_mmuext_op( { put_page_and_type(page); put_page(page); + if ( shadow2_mode_enabled(d) ) + { + shadow2_lock(d); + shadow2_remove_all_shadows(v, _mfn(mfn)); + shadow2_unlock(d); + } } else { @@ -2050,9 +2063,9 @@ int do_mmuext_op( break; case MMUEXT_INVLPG_LOCAL: - if ( shadow_mode_enabled(d) ) - shadow_invlpg(v, op.arg1.linear_addr); - local_flush_tlb_one(op.arg1.linear_addr); + if ( !shadow2_mode_enabled(d) + || shadow2_invlpg(v, op.arg1.linear_addr) != 0 ) + local_flush_tlb_one(op.arg1.linear_addr); break; case MMUEXT_TLB_FLUSH_MULTI: @@ -2098,7 +2111,7 @@ int do_mmuext_op( unsigned long ptr = op.arg1.linear_addr; unsigned long ents = op.arg2.nr_ents; - if ( shadow_mode_external(d) ) + if ( shadow2_mode_external(d) ) { MEM_LOG("ignoring SET_LDT hypercall from external " "domain %u", d->domain_id); @@ -2171,9 +2184,6 @@ int do_mmu_update( LOCK_BIGLOCK(d); - if ( unlikely(shadow_mode_enabled(d)) ) - check_pagetable(v, "pre-mmu"); /* debug */ - if ( unlikely(count & MMU_UPDATE_PREEMPTED) ) { count &= ~MMU_UPDATE_PREEMPTED; @@ -2248,7 +2258,12 @@ int do_mmu_update( case PGT_l3_page_table: case PGT_l4_page_table: { - ASSERT(!shadow_mode_refcounts(d)); + if ( shadow2_mode_refcounts(d) ) + { + DPRINTK("mmu 
update on shadow-refcounted domain!"); + break; + } + if ( unlikely(!get_page_type( page, type_info & (PGT_type_mask|PGT_va_mask))) ) goto not_a_pt; @@ -2258,10 +2273,7 @@ int do_mmu_update( case PGT_l1_page_table: { l1_pgentry_t l1e = l1e_from_intpte(req.val); - okay = mod_l1_entry(va, l1e); - if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l1_normal_pt_update( - d, req.ptr, l1e, &sh_mapcache); + okay = mod_l1_entry(va, l1e, mfn); } break; case PGT_l2_page_table: @@ -2269,9 +2281,6 @@ int do_mmu_update( l2_pgentry_t l2e = l2e_from_intpte(req.val); okay = mod_l2_entry( (l2_pgentry_t *)va, l2e, mfn, type_info); - if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l2_normal_pt_update( - d, req.ptr, l2e, &sh_mapcache); } break; #if CONFIG_PAGING_LEVELS >= 3 @@ -2279,9 +2288,6 @@ int do_mmu_update( { l3_pgentry_t l3e = l3e_from_intpte(req.val); okay = mod_l3_entry(va, l3e, mfn, type_info); - if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l3_normal_pt_update( - d, req.ptr, l3e, &sh_mapcache); } break; #endif @@ -2290,9 +2296,6 @@ int do_mmu_update( { l4_pgentry_t l4e = l4e_from_intpte(req.val); okay = mod_l4_entry(va, l4e, mfn, type_info); - if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l4_normal_pt_update( - d, req.ptr, l4e, &sh_mapcache); } break; #endif @@ -2308,19 +2311,17 @@ int do_mmu_update( if ( unlikely(!get_page_type(page, PGT_writable_page)) ) break; - if ( shadow_mode_enabled(d) ) - { - shadow_lock(d); - __mark_dirty(d, mfn); - if ( page_is_page_table(page) && !page_out_of_sync(page) ) - shadow_mark_mfn_out_of_sync(v, gmfn, mfn); - } + if ( unlikely(shadow2_mode_enabled(d)) ) + shadow2_lock(d); *(intpte_t *)va = req.val; okay = 1; - if ( shadow_mode_enabled(d) ) - shadow_unlock(d); + if ( unlikely(shadow2_mode_enabled(d)) ) + { + shadow2_validate_guest_entry(v, _mfn(mfn), va); + shadow2_unlock(d); + } put_page_type(page); } @@ -2333,12 +2334,6 @@ int do_mmu_update( break; case MMU_MACHPHYS_UPDATE: - - if ( shadow_mode_translate(FOREIGNDOM) ) - { - MEM_LOG("can't mutate m2p table of translate mode guest"); - break; - } mfn = req.ptr >> PAGE_SHIFT; gpfn = req.val; @@ -2349,9 +2344,13 @@ int do_mmu_update( break; } - set_gpfn_from_mfn(mfn, gpfn); + if ( shadow2_mode_translate(FOREIGNDOM) ) + shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn); + else + set_gpfn_from_mfn(mfn, gpfn); okay = 1; + // Mark the new gfn dirty... 
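/*
 * A minimal sketch, not part of the changeset itself: the MMU_MACHPHYS_UPDATE
 * case above now serves both kinds of guests.  A translated (shadow2) guest
 * keeps its own gpfn->mfn p2m table, so the update goes through
 * shadow2_guest_physmap_add_page(); any other guest only needs the global
 * mfn->gpfn (m2p) array refreshed via set_gpfn_from_mfn().  Written out as a
 * hypothetical helper (update_guest_physmap is an illustrative name, not an
 * API in this tree):
 */
static inline void update_guest_physmap(struct domain *d,
                                        unsigned long gpfn,
                                        unsigned long mfn)
{
    if ( shadow2_mode_translate(d) )
        shadow2_guest_physmap_add_page(d, gpfn, mfn); /* maintain the p2m */
    else
        set_gpfn_from_mfn(mfn, gpfn);                 /* m2p entry only   */
}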
mark_dirty(FOREIGNDOM, mfn); put_page(mfn_to_page(mfn)); @@ -2381,9 +2380,6 @@ int do_mmu_update( done += i; if ( unlikely(!guest_handle_is_null(pdone)) ) copy_to_guest(pdone, &done, 1); - - if ( unlikely(shadow_mode_enabled(d)) ) - check_pagetable(v, "post-mmu"); /* debug */ UNLOCK_BIGLOCK(d); return rc; @@ -2402,7 +2398,6 @@ static int create_grant_pte_mapping( struct domain *d = v->domain; ASSERT(spin_is_locked(&d->big_lock)); - ASSERT(!shadow_mode_refcounts(d)); gmfn = pte_addr >> PAGE_SHIFT; mfn = gmfn_to_mfn(d, gmfn); @@ -2418,7 +2413,7 @@ static int create_grant_pte_mapping( page = mfn_to_page(mfn); type_info = page->u.inuse.type_info; - if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) || + if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) || !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) ) { MEM_LOG("Grant map attempted to update a non-L1 page"); @@ -2427,28 +2422,22 @@ static int create_grant_pte_mapping( } ol1e = *(l1_pgentry_t *)va; - if ( !update_l1e(va, ol1e, _nl1e) ) + if ( !update_l1e(va, ol1e, _nl1e, mfn, v) ) { put_page_type(page); rc = GNTST_general_error; goto failed; } - put_page_from_l1e(ol1e, d); - - if ( unlikely(shadow_mode_enabled(d)) ) - { - struct domain_mmap_cache sh_mapcache; - domain_mmap_cache_init(&sh_mapcache); - shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache); - domain_mmap_cache_destroy(&sh_mapcache); - } + if ( !shadow2_mode_refcounts(d) ) + put_page_from_l1e(ol1e, d); put_page_type(page); failed: unmap_domain_page(va); put_page(page); + return rc; } @@ -2462,8 +2451,6 @@ static int destroy_grant_pte_mapping( u32 type_info; l1_pgentry_t ol1e; - ASSERT(!shadow_mode_refcounts(d)); - gmfn = addr >> PAGE_SHIFT; mfn = gmfn_to_mfn(d, gmfn); @@ -2504,7 +2491,9 @@ static int destroy_grant_pte_mapping( } /* Delete pagetable entry. */ - if ( unlikely(!update_l1e((l1_pgentry_t *)va, ol1e, l1e_empty())) ) + if ( unlikely(!update_l1e( + (l1_pgentry_t *)va, ol1e, l1e_empty(), mfn, + d->vcpu[0] /* Change if we go to per-vcpu shadows. */)) ) { MEM_LOG("Cannot delete PTE entry at %p", va); put_page_type(page); @@ -2512,14 +2501,6 @@ static int destroy_grant_pte_mapping( goto failed; } - if ( unlikely(shadow_mode_enabled(d)) ) - { - struct domain_mmap_cache sh_mapcache; - domain_mmap_cache_init(&sh_mapcache); - shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache); - domain_mmap_cache_destroy(&sh_mapcache); - } - put_page_type(page); failed: @@ -2536,31 +2517,22 @@ static int create_grant_va_mapping( struct domain *d = v->domain; ASSERT(spin_is_locked(&d->big_lock)); - ASSERT(!shadow_mode_refcounts(d)); - - /* - * This is actually overkill - we don't need to sync the L1 itself, - * just everything involved in getting to this L1 (i.e. we need - * linear_pg_table[l1_linear_offset(va)] to be in sync)... 
- */ - __shadow_sync_va(v, va); pl1e = &linear_pg_table[l1_linear_offset(va)]; if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) || - !update_l1e(pl1e, ol1e, _nl1e) ) + !update_l1e(pl1e, ol1e, _nl1e, + l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) ) return GNTST_general_error; - put_page_from_l1e(ol1e, d); - - if ( unlikely(shadow_mode_enabled(d)) ) - shadow_do_update_va_mapping(va, _nl1e, v); + if ( !shadow2_mode_refcounts(d) ) + put_page_from_l1e(ol1e, d); return GNTST_okay; } static int destroy_grant_va_mapping( - unsigned long addr, unsigned long frame) + unsigned long addr, unsigned long frame, struct domain *d) { l1_pgentry_t *pl1e, ol1e; @@ -2584,12 +2556,14 @@ static int destroy_grant_va_mapping( } /* Delete pagetable entry. */ - if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty())) ) + if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), + l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]), + d->vcpu[0] /* Change for per-vcpu shadows */)) ) { MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e); return GNTST_general_error; } - + return 0; } @@ -2597,7 +2571,7 @@ int create_grant_host_mapping( unsigned long addr, unsigned long frame, unsigned int flags) { l1_pgentry_t pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS); - + if ( (flags & GNTMAP_application_map) ) l1e_add_flags(pte,_PAGE_USER); if ( !(flags & GNTMAP_readonly) ) @@ -2613,7 +2587,7 @@ int destroy_grant_host_mapping( { if ( flags & GNTMAP_contains_pte ) return destroy_grant_pte_mapping(addr, frame, current->domain); - return destroy_grant_va_mapping(addr, frame); + return destroy_grant_va_mapping(addr, frame, current->domain); } int steal_page( @@ -2675,46 +2649,44 @@ int do_update_va_mapping(unsigned long v perfc_incrc(calls_to_update_va); - if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) ) + if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) ) return -EINVAL; + if ( unlikely(shadow2_mode_refcounts(d)) ) + { + DPRINTK("Grant op on a shadow-refcounted domain\n"); + return -EINVAL; + } + LOCK_BIGLOCK(d); - if ( unlikely(shadow_mode_enabled(d)) ) - check_pagetable(v, "pre-va"); /* debug */ - - if ( unlikely(!mod_l1_entry(&linear_pg_table[l1_linear_offset(va)], - val)) ) - rc = -EINVAL; - - if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) ) + if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) ) { if ( unlikely(this_cpu(percpu_mm_info).foreign && - (shadow_mode_translate(d) || - shadow_mode_translate( + (shadow2_mode_translate(d) || + shadow2_mode_translate( this_cpu(percpu_mm_info).foreign))) ) { /* * The foreign domain's pfn's are in a different namespace. There's - * not enough information in just a gpte to figure out how to + * not enough information in just a gpte to figure out how to * (re-)shadow this entry. 
*/ domain_crash(d); } + } + + if ( unlikely(!mod_l1_entry( + &linear_pg_table[l1_linear_offset(va)], val, + l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) ) + rc = -EINVAL; - rc = shadow_do_update_va_mapping(va, val, v); - - check_pagetable(v, "post-va"); /* debug */ - } - switch ( flags & UVMF_FLUSHTYPE_MASK ) { case UVMF_TLB_FLUSH: switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: - if ( unlikely(shadow_mode_enabled(d)) ) - shadow_sync_all(d); local_flush_tlb(); break; case UVMF_ALL: @@ -2733,9 +2705,9 @@ int do_update_va_mapping(unsigned long v switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: - if ( unlikely(shadow_mode_enabled(d)) ) - shadow_invlpg(current, va); - local_flush_tlb_one(va); + if ( !shadow2_mode_enabled(d) + || (shadow2_invlpg(current, va) != 0) ) + local_flush_tlb_one(va); break; case UVMF_ALL: flush_tlb_one_mask(d->domain_dirty_cpumask, va); @@ -2807,8 +2779,6 @@ long set_gdt(struct vcpu *v, if ( entries > FIRST_RESERVED_GDT_ENTRY ) return -EINVAL; - - shadow_sync_all(d); /* Check the pages in the new GDT. */ for ( i = 0; i < nr_pages; i++ ) { @@ -2912,24 +2882,13 @@ long do_update_descriptor(u64 pa, u64 de break; } - if ( shadow_mode_enabled(dom) ) - { - shadow_lock(dom); - - __mark_dirty(dom, mfn); - - if ( page_is_page_table(page) && !page_out_of_sync(page) ) - shadow_mark_mfn_out_of_sync(current, gmfn, mfn); - } + mark_dirty(dom, mfn); /* All is good so make the update. */ gdt_pent = map_domain_page(mfn); memcpy(&gdt_pent[offset], &d, 8); unmap_domain_page(gdt_pent); - if ( shadow_mode_enabled(dom) ) - shadow_unlock(dom); - put_page_type(page); ret = 0; /* success */ @@ -2981,8 +2940,8 @@ long arch_memory_op(int op, XEN_GUEST_HA default: break; } - - if ( !shadow_mode_translate(d) || (mfn == 0) ) + + if ( !shadow2_mode_translate(d) || (mfn == 0) ) { put_domain(d); return -EINVAL; @@ -3011,7 +2970,7 @@ long arch_memory_op(int op, XEN_GUEST_HA guest_physmap_add_page(d, xatp.gpfn, mfn); UNLOCK_BIGLOCK(d); - + put_domain(d); break; @@ -3136,7 +3095,8 @@ static int ptwr_emulated_update( unsigned long pfn; struct page_info *page; l1_pgentry_t pte, ol1e, nl1e, *pl1e; - struct domain *d = current->domain; + struct vcpu *v = current; + struct domain *d = v->domain; /* Aligned access only, thank you. */ if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) ) @@ -3196,25 +3156,36 @@ static int ptwr_emulated_update( return X86EMUL_UNHANDLEABLE; } + /* Checked successfully: do the update (write or cmpxchg). */ pl1e = map_domain_page(page_to_mfn(page)); pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK)); if ( do_cmpxchg ) { + if ( shadow2_mode_enabled(d) ) + shadow2_lock(d); ol1e = l1e_from_intpte(old); if ( cmpxchg((intpte_t *)pl1e, old, val) != old ) { + if ( shadow2_mode_enabled(d) ) + shadow2_unlock(d); unmap_domain_page(pl1e); put_page_from_l1e(nl1e, d); return X86EMUL_CMPXCHG_FAILED; } + if ( unlikely(shadow2_mode_enabled(v->domain)) ) + { + shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e); + shadow2_unlock(v->domain); + } } else { ol1e = *pl1e; - if ( !update_l1e(pl1e, ol1e, nl1e) ) + if ( !update_l1e(pl1e, ol1e, nl1e, page_to_mfn(page), v) ) BUG(); } + unmap_domain_page(pl1e); /* Finally, drop the old PTE. 
*/ diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/setup.c Wed Aug 16 17:11:56 2006 +0100 @@ -532,8 +532,6 @@ void __init __start_xen(multiboot_info_t if ( opt_watchdog ) watchdog_enable(); - shadow_mode_init(); - /* initialize access control security module */ acm_init(&initrdidx, mbi, initial_images_start); diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/smpboot.c --- a/xen/arch/x86/smpboot.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/smpboot.c Wed Aug 16 17:11:56 2006 +0100 @@ -896,7 +896,7 @@ static int __devinit do_boot_cpu(int api v = alloc_idle_vcpu(cpu); BUG_ON(v == NULL); - v->arch.monitor_table = pagetable_from_paddr(__pa(idle_pg_table)); + v->arch.cr3 = __pa(idle_pg_table); /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/traps.c Wed Aug 16 17:11:56 2006 +0100 @@ -277,6 +277,21 @@ void show_stack(struct cpu_user_regs *re show_trace(regs); } +void show_xen_trace() +{ + struct cpu_user_regs regs; +#ifdef __x86_64 + __asm__("movq %%rsp,%0" : "=m" (regs.rsp)); + __asm__("movq %%rbp,%0" : "=m" (regs.rbp)); + __asm__("leaq 0(%%rip),%0" : "=a" (regs.rip)); +#else + __asm__("movl %%esp,%0" : "=m" (regs.esp)); + __asm__("movl %%ebp,%0" : "=m" (regs.ebp)); + __asm__("call 1f; 1: popl %0" : "=a" (regs.eip)); +#endif + show_trace(®s); +} + void show_stack_overflow(unsigned long esp) { #ifdef MEMORY_GUARD @@ -861,8 +876,8 @@ static int fixup_page_fault(unsigned lon if ( unlikely(IN_HYPERVISOR_RANGE(addr)) ) { - if ( shadow_mode_external(d) && guest_mode(regs) ) - return shadow_fault(addr, regs); + if ( shadow2_mode_external(d) && guest_mode(regs) ) + return shadow2_fault(addr, regs); if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) return handle_gdt_ldt_mapping_fault( addr - GDT_LDT_VIRT_START, regs); @@ -873,14 +888,14 @@ static int fixup_page_fault(unsigned lon return (spurious_page_fault(addr, regs) ? EXCRET_not_a_fault : 0); } - if ( unlikely(shadow_mode_enabled(d)) ) - return shadow_fault(addr, regs); - if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) && guest_kernel_mode(v, regs) && ((regs->error_code & (PGERR_write_access|PGERR_page_present)) == (PGERR_write_access|PGERR_page_present)) ) return ptwr_do_page_fault(d, addr, regs) ? 
EXCRET_fault_fixed : 0; + + if ( shadow2_mode_enabled(d) ) + return shadow2_fault(addr, regs); return 0; } @@ -905,6 +920,13 @@ asmlinkage int do_page_fault(struct cpu_ DEBUGGER_trap_entry(TRAP_page_fault, regs); perfc_incrc(page_faults); + + if ( shadow2_mode_enabled(current->domain) ) + debugtrace_printk("%s %s %d dom=%d eip=%p cr2=%p code=%d cs=%x\n", + __func__, __FILE__, __LINE__, + current->domain->domain_id, + (void *)regs->eip, (void *)addr, regs->error_code, + regs->cs); if ( unlikely((rc = fixup_page_fault(addr, regs)) != 0) ) return rc; diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/x86_32/domain_page.c --- a/xen/arch/x86/x86_32/domain_page.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/x86_32/domain_page.c Wed Aug 16 17:11:56 2006 +0100 @@ -15,6 +15,7 @@ #include <asm/current.h> #include <asm/flushtlb.h> #include <asm/hardirq.h> +#include <asm/hvm/support.h> static inline struct vcpu *mapcache_current_vcpu(void) { @@ -58,10 +59,10 @@ void *map_domain_page(unsigned long pfn) cache = &v->domain->arch.mapcache; hashent = &cache->vcpu_maphash[vcpu].hash[MAPHASH_HASHFN(pfn)]; - if ( hashent->pfn == pfn ) - { - idx = hashent->idx; + if ( hashent->pfn == pfn && (idx = hashent->idx) != MAPHASHENT_NOTINUSE ) + { hashent->refcnt++; + ASSERT(idx < MAPCACHE_ENTRIES); ASSERT(hashent->refcnt != 0); ASSERT(l1e_get_pfn(cache->l1tab[idx]) == pfn); goto out; @@ -178,6 +179,30 @@ void mapcache_init(struct domain *d) MAPHASHENT_NOTINUSE; } +paddr_t mapped_domain_page_to_maddr(void *va) +/* Convert a pointer in a mapped domain page to a machine address. + * Takes any pointer that's valid for use in unmap_domain_page() */ +{ + unsigned int idx; + struct vcpu *v; + struct mapcache *cache; + unsigned long pfn; + + ASSERT(!in_irq()); + + ASSERT((void *)MAPCACHE_VIRT_START <= va); + ASSERT(va < (void *)MAPCACHE_VIRT_END); + + v = mapcache_current_vcpu(); + + cache = &v->domain->arch.mapcache; + + idx = ((unsigned long)va - MAPCACHE_VIRT_START) >> PAGE_SHIFT; + pfn = l1e_get_pfn(cache->l1tab[idx]); + return ((paddr_t) pfn << PAGE_SHIFT + | ((unsigned long) va & ~PAGE_MASK)); +} + #define GLOBALMAP_BITS (IOREMAP_MBYTES << (20 - PAGE_SHIFT)) static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)]; static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)]; @@ -233,6 +258,8 @@ void unmap_domain_page_global(void *va) l1_pgentry_t *pl1e; unsigned int idx; + ASSERT((__va >= IOREMAP_VIRT_START) && (__va <= (IOREMAP_VIRT_END - 1))); + /* /First/, we zap the PTE. */ pl2e = virt_to_xen_l2e(__va); pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(__va); diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/x86_32/mm.c --- a/xen/arch/x86/x86_32/mm.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/x86_32/mm.c Wed Aug 16 17:11:56 2006 +0100 @@ -75,8 +75,7 @@ void __init paging_init(void) printk("PAE disabled.\n"); #endif - idle_vcpu[0]->arch.monitor_table = - pagetable_from_paddr(__pa(idle_pg_table)); + idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table); if ( cpu_has_pge ) { diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/x86_64/mm.c --- a/xen/arch/x86/x86_64/mm.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/x86_64/mm.c Wed Aug 16 17:11:56 2006 +0100 @@ -81,8 +81,7 @@ void __init paging_init(void) l2_pgentry_t *l2_ro_mpt; struct page_info *pg; - idle_vcpu[0]->arch.monitor_table = - pagetable_from_paddr(__pa(idle_pg_table)); + idle_vcpu[0]->arch.cr3 = __pa(idle_pg_table); /* Create user-accessible L2 directory to map the MPT for guests. 
*/ l3_ro_mpt = alloc_xenheap_page(); diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/x86_64/traps.c --- a/xen/arch/x86/x86_64/traps.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/arch/x86/x86_64/traps.c Wed Aug 16 17:11:56 2006 +0100 @@ -84,7 +84,8 @@ void show_page_walk(unsigned long addr) l4e = l4t[l4_table_offset(addr)]; mfn = l4e_get_pfn(l4e); pfn = get_gpfn_from_mfn(mfn); - printk(" L4 = %"PRIpte" %016lx\n", l4e_get_intpte(l4e), pfn); + printk(" L4[0x%lx] = %"PRIpte" %016lx\n", + l4_table_offset(addr), l4e_get_intpte(l4e), pfn); if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) ) return; @@ -92,7 +93,8 @@ void show_page_walk(unsigned long addr) l3e = l3t[l3_table_offset(addr)]; mfn = l3e_get_pfn(l3e); pfn = get_gpfn_from_mfn(mfn); - printk(" L3 = %"PRIpte" %016lx\n", l3e_get_intpte(l3e), pfn); + printk(" L3[0x%lx] = %"PRIpte" %016lx\n", + l3_table_offset(addr), l3e_get_intpte(l3e), pfn); if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) return; @@ -100,7 +102,8 @@ void show_page_walk(unsigned long addr) l2e = l2t[l2_table_offset(addr)]; mfn = l2e_get_pfn(l2e); pfn = get_gpfn_from_mfn(mfn); - printk(" L2 = %"PRIpte" %016lx %s\n", l2e_get_intpte(l2e), pfn, + printk(" L2[0x%lx] = %"PRIpte" %016lx %s\n", + l2_table_offset(addr), l2e_get_intpte(l2e), pfn, (l2e_get_flags(l2e) & _PAGE_PSE) ? "(PSE)" : ""); if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_flags(l2e) & _PAGE_PSE) ) @@ -110,7 +113,8 @@ void show_page_walk(unsigned long addr) l1e = l1t[l1_table_offset(addr)]; mfn = l1e_get_pfn(l1e); pfn = get_gpfn_from_mfn(mfn); - printk(" L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn); + printk(" L1[0x%lx] = %"PRIpte" %016lx\n", + l1_table_offset(addr), l1e_get_intpte(l1e), pfn); } asmlinkage void double_fault(void); @@ -162,7 +166,7 @@ void toggle_guest_mode(struct vcpu *v) { v->arch.flags ^= TF_kernel_mode; __asm__ __volatile__ ( "swapgs" ); - update_pagetables(v); + update_cr3(v); write_ptbase(v); } diff -r f2151423f729 -r 01345b08d012 xen/common/acm_ops.c --- a/xen/common/acm_ops.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/common/acm_ops.c Wed Aug 16 17:11:56 2006 +0100 @@ -26,7 +26,6 @@ #include <xen/trace.h> #include <xen/console.h> #include <xen/guest_access.h> -#include <asm/shadow.h> #include <public/sched_ctl.h> #include <acm/acm_hooks.h> diff -r f2151423f729 -r 01345b08d012 xen/common/grant_table.c --- a/xen/common/grant_table.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/common/grant_table.c Wed Aug 16 17:11:56 2006 +0100 @@ -434,7 +434,7 @@ __gnttab_unmap_grant_ref( /* If just unmapped a writable mapping, mark as dirtied */ if ( !(flags & GNTMAP_readonly) ) - gnttab_log_dirty(rd, frame); + gnttab_mark_dirty(rd, frame); if ( ((act->pin & (GNTPIN_devw_mask|GNTPIN_hstw_mask)) == 0) && !(flags & GNTMAP_readonly) ) @@ -731,7 +731,7 @@ __release_grant_for_copy( const unsigned long r_frame = act->frame; if ( !readonly ) - gnttab_log_dirty(rd, r_frame); + gnttab_mark_dirty(rd, r_frame); spin_lock(&rd->grant_table->lock); if ( readonly ) diff -r f2151423f729 -r 01345b08d012 xen/common/keyhandler.c --- a/xen/common/keyhandler.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/common/keyhandler.c Wed Aug 16 17:11:56 2006 +0100 @@ -241,9 +241,6 @@ static void read_clocks(unsigned char ke } extern void dump_runq(unsigned char key); -#ifndef NDEBUG -extern void audit_domains_key(unsigned char key); -#endif #ifdef PERF_COUNTERS extern void perfc_printall(unsigned char key); @@ -261,10 +258,16 @@ static void do_debug_key(unsigned char k #ifndef NDEBUG static void debugtrace_key(unsigned char key) { - 
debugtrace_send_to_console = !debugtrace_send_to_console; - debugtrace_dump(); - printk("debugtrace_printk now writing to %s.\n", - debugtrace_send_to_console ? "console" : "buffer"); + debugtrace_toggle(); +} + +static void shadow2_audit_key(unsigned char key) +{ + extern int shadow2_audit_enable; + + shadow2_audit_enable = !shadow2_audit_enable; + printk("%s shadow2_audit_enable=%d\n", + __func__, shadow2_audit_enable); } #endif @@ -288,7 +291,7 @@ void initialize_keytable(void) #ifndef NDEBUG register_keyhandler( - 'o', audit_domains_key, "audit domains >0 EXPERIMENTAL"); + 'O', shadow2_audit_key, "toggle shadow2 audits"); register_keyhandler( 'T', debugtrace_key, "toggle debugtrace to console/buffer"); #endif diff -r f2151423f729 -r 01345b08d012 xen/common/memory.c --- a/xen/common/memory.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/common/memory.c Wed Aug 16 17:11:56 2006 +0100 @@ -126,6 +126,11 @@ populate_physmap( for ( j = 0; j < (1 << extent_order); j++ ) guest_physmap_add_page(d, gpfn + j, mfn + j); } + else if ( unlikely(shadow2_mode_translate(d)) ) + { + for ( j = 0; j < (1 << extent_order); j++ ) + shadow2_guest_physmap_add_page(d, gpfn + j, mfn + j); + } else { for ( j = 0; j < (1 << extent_order); j++ ) @@ -153,7 +158,7 @@ guest_remove_page( if ( unlikely(!mfn_valid(mfn)) ) { DPRINTK("Domain %u page number %lx invalid\n", - d->domain_id, mfn); + d->domain_id, gmfn); return 0; } @@ -179,7 +184,7 @@ guest_remove_page( (unsigned long)page->count_info, page->u.inuse.type_info); } - guest_physmap_remove_page(d, gmfn, mfn); + shadow2_guest_physmap_remove_page(d, gmfn, mfn); put_page(page); @@ -250,7 +255,7 @@ translate_gpfn_list( if ( (d = find_domain_by_id(op.domid)) == NULL ) return -ESRCH; - if ( !shadow_mode_translate(d) ) + if ( !(shadow_mode_translate(d) || shadow2_mode_translate(d)) ) { put_domain(d); return -EINVAL; diff -r f2151423f729 -r 01345b08d012 xen/drivers/char/console.c --- a/xen/drivers/char/console.c Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/drivers/char/console.c Wed Aug 16 17:11:56 2006 +0100 @@ -569,7 +569,7 @@ int console_getc(void) #ifndef NDEBUG /* Send output direct to console, or buffer it? */ -int debugtrace_send_to_console; +static volatile int debugtrace_send_to_console; static char *debugtrace_buf; /* Debug-trace buffer */ static unsigned int debugtrace_prd; /* Producer index */ @@ -578,16 +578,10 @@ static DEFINE_SPINLOCK(debugtrace_lock); static DEFINE_SPINLOCK(debugtrace_lock); integer_param("debugtrace", debugtrace_kilobytes); -void debugtrace_dump(void) -{ - unsigned long flags; - +static void debugtrace_dump_worker(void) +{ if ( (debugtrace_bytes == 0) || !debugtrace_used ) return; - - watchdog_disable(); - - spin_lock_irqsave(&debugtrace_lock, flags); printk("debugtrace_dump() starting\n"); @@ -602,15 +596,47 @@ void debugtrace_dump(void) memset(debugtrace_buf, '\0', debugtrace_bytes); printk("debugtrace_dump() finished\n"); +} + +void debugtrace_toggle(void) +{ + unsigned long flags; + + watchdog_disable(); + spin_lock_irqsave(&debugtrace_lock, flags); + + // dump the buffer *before* toggling, in case the act of dumping the + // buffer itself causes more printk's... + // + printk("debugtrace_printk now writing to %s.\n", + !debugtrace_send_to_console ? 
"console": "buffer"); + if ( !debugtrace_send_to_console ) + debugtrace_dump_worker(); + + debugtrace_send_to_console = !debugtrace_send_to_console; spin_unlock_irqrestore(&debugtrace_lock, flags); - watchdog_enable(); + +} + +void debugtrace_dump(void) +{ + unsigned long flags; + + watchdog_disable(); + spin_lock_irqsave(&debugtrace_lock, flags); + + debugtrace_dump_worker(); + + spin_unlock_irqrestore(&debugtrace_lock, flags); + watchdog_enable(); } void debugtrace_printk(const char *fmt, ...) { static char buf[1024]; + static u32 count; va_list args; char *p; @@ -625,8 +651,10 @@ void debugtrace_printk(const char *fmt, ASSERT(debugtrace_buf[debugtrace_bytes - 1] == 0); + sprintf(buf, "%u ", ++count); + va_start(args, fmt); - (void)vsnprintf(buf, sizeof(buf), fmt, args); + (void)vsnprintf(buf + strlen(buf), sizeof(buf), fmt, args); va_end(args); if ( debugtrace_send_to_console ) diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/bitops.h --- a/xen/include/asm-x86/bitops.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/bitops.h Wed Aug 16 17:11:56 2006 +0100 @@ -75,6 +75,24 @@ static __inline__ void clear_bit(int nr, :"=m" (ADDR) :"dIr" (nr)); } + +/** + * __clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * Unlike clear_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static __inline__ void __clear_bit(int nr, volatile void * addr) +{ + __asm__( + "btrl %1,%0" + :"=m" (ADDR) + :"dIr" (nr)); +} + #define smp_mb__before_clear_bit() barrier() #define smp_mb__after_clear_bit() barrier() diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/config.h --- a/xen/include/asm-x86/config.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/config.h Wed Aug 16 17:11:56 2006 +0100 @@ -79,9 +79,14 @@ #ifndef __ASSEMBLY__ extern unsigned long _end; /* standard ELF symbol */ + +static inline void FORCE_CRASH(void) __attribute__((noreturn,always_inline)); +static inline void FORCE_CRASH(void) +{ + __asm__ __volatile__ ( "ud2" ); + while(1); +} #endif /* __ASSEMBLY__ */ - -#define FORCE_CRASH() __asm__ __volatile__ ( "ud2" ) #if defined(__x86_64__) @@ -149,9 +154,14 @@ extern unsigned long _end; /* standard E /* Slot 256: read-only guest-accessible machine-to-phys translation table. */ #define RO_MPT_VIRT_START (PML4_ADDR(256)) #define RO_MPT_VIRT_END (RO_MPT_VIRT_START + PML4_ENTRY_BYTES/2) + +// current unused? +#if 0 /* Slot 257: read-only guest-accessible linear page table. */ #define RO_LINEAR_PT_VIRT_START (PML4_ADDR(257)) #define RO_LINEAR_PT_VIRT_END (RO_LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) +#endif + /* Slot 258: linear page table (guest table). 
*/ #define LINEAR_PT_VIRT_START (PML4_ADDR(258)) #define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES) @@ -175,7 +185,7 @@ extern unsigned long _end; /* standard E #define DIRECTMAP_VIRT_START (PML4_ADDR(262)) #define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + PML4_ENTRY_BYTES*2) -#define PGT_base_page_table PGT_l4_page_table +#define PGT_base_page_table PGT_l4_page_table #define __HYPERVISOR_CS64 0xe010 #define __HYPERVISOR_CS32 0xe008 @@ -274,9 +284,9 @@ extern unsigned long _end; /* standard E (L2_PAGETABLE_LAST_XEN_SLOT - L2_PAGETABLE_FIRST_XEN_SLOT + 1) #ifdef CONFIG_X86_PAE -# define PGT_base_page_table PGT_l3_page_table -#else -# define PGT_base_page_table PGT_l2_page_table +# define PGT_base_page_table PGT_l3_page_table +#else +# define PGT_base_page_table PGT_l2_page_table #endif #define __HYPERVISOR_CS 0xe008 diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/domain.h Wed Aug 16 17:11:56 2006 +0100 @@ -73,42 +73,42 @@ struct arch_domain /* I/O-port admin-specified access capabilities. */ struct rangeset *ioport_caps; - /* Shadow mode status and controls. */ - struct shadow_ops *ops; - unsigned int shadow_mode; /* flags to control shadow table operation */ - unsigned int shadow_nest; /* Recursive depth of shadow_lock() nesting */ - - /* shadow hashtable */ - struct shadow_status *shadow_ht; - struct shadow_status *shadow_ht_free; - struct shadow_status *shadow_ht_extras; /* extra allocation units */ - unsigned int shadow_extras_count; - - /* shadow dirty bitmap */ + /* HVM stuff */ + struct hvm_domain hvm_domain; + + /* Shadow-translated guest: Pseudophys base address of reserved area. */ + unsigned long first_reserved_pfn; + + /* Shadow2 stuff */ + u32 shadow2_mode; /* flags to control shadow operation */ + spinlock_t shadow2_lock; /* shadow2 domain lock */ + int shadow2_locker; /* processor which holds the lock */ + const char *shadow2_locker_function; /* Func that took it */ + struct list_head shadow2_freelists[SHADOW2_MAX_ORDER + 1]; + struct list_head shadow2_p2m_freelist; + struct list_head shadow2_p2m_inuse; + struct list_head shadow2_toplevel_shadows; + unsigned int shadow2_total_pages; /* number of pages allocated */ + unsigned int shadow2_free_pages; /* number of pages on freelists */ + unsigned int shadow2_p2m_pages; /* number of pages in p2m map */ + + /* Shadow2 hashtable */ + struct shadow2_hash_entry *shadow2_hash_table; + struct shadow2_hash_entry *shadow2_hash_freelist; + struct shadow2_hash_entry *shadow2_hash_allocations; + int shadow2_hash_walking; /* Some function is walking the hash table */ + + /* Shadow log-dirty bitmap */ unsigned long *shadow_dirty_bitmap; unsigned int shadow_dirty_bitmap_size; /* in pages, bit per page */ - /* shadow mode stats */ - unsigned int shadow_page_count; - unsigned int hl2_page_count; - unsigned int snapshot_page_count; - + /* Shadow log-dirty mode stats */ unsigned int shadow_fault_count; unsigned int shadow_dirty_count; - /* full shadow mode */ - struct out_of_sync_entry *out_of_sync; /* list of out-of-sync pages */ - struct out_of_sync_entry *out_of_sync_free; - struct out_of_sync_entry *out_of_sync_extras; - unsigned int out_of_sync_extras_count; - - struct list_head free_shadow_frames; - - pagetable_t phys_table; /* guest 1:1 pagetable */ - struct hvm_domain hvm_domain; - - /* Shadow-translated guest: Pseudophys base address of reserved area. 
*/ - unsigned long first_reserved_pfn; + /* Shadow translated domain: P2M mapping */ + pagetable_t phys_table; + } __cacheline_aligned; #ifdef CONFIG_X86_PAE @@ -166,25 +166,34 @@ struct arch_vcpu */ l1_pgentry_t *perdomain_ptes; - pagetable_t guest_table_user; /* x86/64: user-space pagetable. */ - pagetable_t guest_table; /* (MA) guest notion of cr3 */ - pagetable_t shadow_table; /* (MA) shadow of guest */ - pagetable_t monitor_table; /* (MA) used in hypervisor */ - - l2_pgentry_t *guest_vtable; /* virtual address of pagetable */ - l2_pgentry_t *shadow_vtable; /* virtual address of shadow_table */ - l2_pgentry_t *monitor_vtable; /* virtual address of monitor_table */ - l1_pgentry_t *hl2_vtable; /* virtual address of hl2_table */ - #ifdef CONFIG_X86_64 - l3_pgentry_t *guest_vl3table; - l4_pgentry_t *guest_vl4table; -#endif - - unsigned long monitor_shadow_ref; + pagetable_t guest_table_user; /* (MFN) x86/64 user-space pagetable */ +#endif + pagetable_t guest_table; /* (MFN) guest notion of cr3 */ + /* guest_table holds a ref to the page, and also a type-count unless + * shadow refcounts are in use */ + pagetable_t shadow_table; /* (MFN) shadow of guest */ + pagetable_t monitor_table; /* (MFN) hypervisor PT (for HVM) */ + unsigned long cr3; /* (MA) value to install in HW CR3 */ + + void *guest_vtable; /* virtual address of pagetable */ + void *shadow_vtable; /* virtual address of shadow_table */ + root_pgentry_t *monitor_vtable; /* virtual address of monitor_table */ /* Current LDT details. */ unsigned long shadow_ldt_mapcnt; + + /* Shadow2 stuff */ + /* -- pointers to mode-specific entry points */ + struct shadow2_entry_points *shadow2; + unsigned long last_emulated_mfn; /* last mfn we emulated a write to */ + u8 shadow2_propagate_fault; /* emulated fault needs to be */ + /* propagated to guest */ +#if CONFIG_PAGING_LEVELS >= 3 + u8 shadow2_pae_flip_pending; /* shadow update requires this PAE cpu + * to recopy/install its L3 table. 
+ */ +#endif } __cacheline_aligned; /* shorthands to improve code legibility */ diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/grant_table.h --- a/xen/include/asm-x86/grant_table.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/grant_table.h Wed Aug 16 17:11:56 2006 +0100 @@ -31,7 +31,7 @@ int destroy_grant_host_mapping( #define gnttab_shared_gmfn(d, t, i) \ (mfn_to_gmfn(d, gnttab_shared_mfn(d, t, i))) -#define gnttab_log_dirty(d, f) mark_dirty((d), (f)) +#define gnttab_mark_dirty(d, f) mark_dirty((d), (f)) static inline void gnttab_clear_flag(unsigned long nr, uint16_t *addr) { diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/hvm.h --- a/xen/include/asm-x86/hvm/hvm.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/hvm/hvm.h Wed Aug 16 17:11:56 2006 +0100 @@ -56,8 +56,15 @@ struct hvm_function_table { */ int (*realmode)(struct vcpu *v); int (*paging_enabled)(struct vcpu *v); + int (*long_mode_enabled)(struct vcpu *v); + int (*guest_x86_mode)(struct vcpu *v); int (*instruction_length)(struct vcpu *v); unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num); + + /* + * Re-set the value of CR3 that Xen runs on when handling VM exits + */ + void (*update_host_cr3)(struct vcpu *v); /* * Update specifics of the guest state: @@ -134,9 +141,27 @@ hvm_paging_enabled(struct vcpu *v) } static inline int +hvm_long_mode_enabled(struct vcpu *v) +{ + return hvm_funcs.long_mode_enabled(v); +} + +static inline int +hvm_guest_x86_mode(struct vcpu *v) +{ + return hvm_funcs.guest_x86_mode(v); +} + +static inline int hvm_instruction_length(struct vcpu *v) { return hvm_funcs.instruction_length(v); +} + +static inline void +hvm_update_host_cr3(struct vcpu *v) +{ + hvm_funcs.update_host_cr3(v); } void hvm_hypercall_page_initialise(struct domain *d, diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/support.h --- a/xen/include/asm-x86/hvm/support.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/hvm/support.h Wed Aug 16 17:11:56 2006 +0100 @@ -116,10 +116,13 @@ enum hval_bitmaps { #define DBG_LEVEL_IOAPIC (1 << 9) extern unsigned int opt_hvm_debug_level; -#define HVM_DBG_LOG(level, _f, _a...) \ - if ( (level) & opt_hvm_debug_level ) \ - printk("[HVM:%d.%d] <%s> " _f "\n", \ - current->domain->domain_id, current->vcpu_id, __func__, ## _a) +#define HVM_DBG_LOG(level, _f, _a...) \ + do { \ + if ( (level) & opt_hvm_debug_level ) \ + printk("[HVM:%d.%d] <%s> " _f "\n", \ + current->domain->domain_id, current->vcpu_id, __func__, \ + ## _a); \ + } while (0) #else #define HVM_DBG_LOG(level, _f, _a...) 
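/*
 * A minimal usage sketch, illustration only (call sites assumed, modelled on
 * the DBG_LEVEL_VMMU uses elsewhere in this patch).  The debug-build
 * HVM_DBG_LOG() above is now wrapped in do { ... } while (0) so that it
 * expands to a single statement; with the old bare "if (...) printk(...)"
 * expansion, the else below would have bound to the macro's hidden if rather
 * than to the outer one.
 */
#if 0 /* usage illustration only */
    if ( vmx_paging_enabled(current) )
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "paging enabled");
    else
        HVM_DBG_LOG(DBG_LEVEL_VMMU, "paging disabled");
#endif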
#endif diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/vcpu.h --- a/xen/include/asm-x86/hvm/vcpu.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/hvm/vcpu.h Wed Aug 16 17:11:56 2006 +0100 @@ -29,6 +29,7 @@ #define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI 1 struct hvm_vcpu { + unsigned long hw_cr3; /* value we give to HW to use */ unsigned long ioflags; struct hvm_io_op io_op; struct vlapic *vlapic; @@ -39,6 +40,11 @@ struct hvm_vcpu { unsigned long init_sipi_sipi_state; int xen_port; + +#if CONFIG_PAGING_LEVELS >= 3 + l3_pgentry_t hvm_lowmem_l3tab[4] + __attribute__((__aligned__(32))); +#endif /* Flags */ int flag_dr_dirty; diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/vmx/vmcs.h --- a/xen/include/asm-x86/hvm/vmx/vmcs.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Wed Aug 16 17:11:56 2006 +0100 @@ -87,6 +87,7 @@ struct arch_vmx_struct { unsigned long cpu_cr0; /* copy of guest CR0 */ unsigned long cpu_shadow_cr0; /* copy of guest read shadow CR0 */ + unsigned long cpu_shadow_cr4; /* copy of guest read shadow CR4 */ unsigned long cpu_cr2; /* save CR2 */ unsigned long cpu_cr3; unsigned long cpu_state; diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/hvm/vmx/vmx.h --- a/xen/include/asm-x86/hvm/vmx/vmx.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/hvm/vmx/vmx.h Wed Aug 16 17:11:56 2006 +0100 @@ -298,6 +298,9 @@ static always_inline void __vmwrite_vcpu case GUEST_CR0: v->arch.hvm_vmx.cpu_cr0 = value; break; + case CR4_READ_SHADOW: + v->arch.hvm_vmx.cpu_shadow_cr4 = value; + break; case CPU_BASED_VM_EXEC_CONTROL: v->arch.hvm_vmx.cpu_based_exec_control = value; break; @@ -317,11 +320,14 @@ static always_inline void __vmread_vcpu( case GUEST_CR0: *value = v->arch.hvm_vmx.cpu_cr0; break; + case CR4_READ_SHADOW: + *value = v->arch.hvm_vmx.cpu_shadow_cr4; + break; case CPU_BASED_VM_EXEC_CONTROL: *value = v->arch.hvm_vmx.cpu_based_exec_control; break; default: - printk("__vmread_cpu: invalid field %lx\n", field); + printk("__vmread_vcpu: invalid field %lx\n", field); break; } } @@ -342,6 +348,7 @@ static inline int __vmwrite(unsigned lon switch ( field ) { case CR0_READ_SHADOW: case GUEST_CR0: + case CR4_READ_SHADOW: case CPU_BASED_VM_EXEC_CONTROL: __vmwrite_vcpu(v, field, value); break; @@ -402,6 +409,46 @@ static inline int vmx_paging_enabled(str __vmread_vcpu(v, CR0_READ_SHADOW, &cr0); return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG); +} + +/* Works only for vcpu == current */ +static inline int vmx_long_mode_enabled(struct vcpu *v) +{ + ASSERT(v == current); + return VMX_LONG_GUEST(current); +} + +/* Works only for vcpu == current */ +static inline int vmx_realmode(struct vcpu *v) +{ + unsigned long rflags; + ASSERT(v == current); + + __vmread(GUEST_RFLAGS, &rflags); + return rflags & X86_EFLAGS_VM; +} + +/* Works only for vcpu == current */ +static inline void vmx_update_host_cr3(struct vcpu *v) +{ + ASSERT(v == current); + __vmwrite(HOST_CR3, v->arch.cr3); +} + +static inline int vmx_guest_x86_mode(struct vcpu *v) +{ + unsigned long cs_ar_bytes; + ASSERT(v == current); + + if ( vmx_long_mode_enabled(v) ) + { + __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes); + return (cs_ar_bytes & (1u<<13)) ? 8 : 4; + } + if ( vmx_realmode(v) ) + return 2; + __vmread(GUEST_CS_AR_BYTES, &cs_ar_bytes); + return (cs_ar_bytes & (1u<<14)) ? 
4 : 2; } static inline int vmx_pgbit_test(struct vcpu *v) diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/mm.h Wed Aug 16 17:11:56 2006 +0100 @@ -20,7 +20,11 @@ struct page_info struct page_info { /* Each frame can be threaded onto a doubly-linked list. */ - struct list_head list; + union { + struct list_head list; + /* Shadow2 uses this field as an up-pointer in lower-level shadows */ + paddr_t up; + }; /* Reference count and various PGC_xxx flags and fields. */ u32 count_info; @@ -46,8 +50,20 @@ struct page_info } u; - /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */ - u32 tlbflush_timestamp; + union { + /* Timestamp from 'TLB clock', used to reduce need for safety + * flushes. Only valid on a) free pages, and b) guest pages with a + * zero type count. */ + u32 tlbflush_timestamp; + + /* Only used on guest pages with a shadow. + * Guest pages with a shadow must have a non-zero type count, so this + * does not conflict with the tlbflush timestamp. */ + u32 shadow2_flags; + + // XXX -- we expect to add another field here, to be used for min/max + // purposes, which is only used for shadow pages. + }; }; /* The following page types are MUTUALLY EXCLUSIVE. */ @@ -60,6 +76,7 @@ struct page_info #define PGT_ldt_page (6U<<29) /* using this page in an LDT? */ #define PGT_writable_page (7U<<29) /* has writable mappings of this page? */ +#ifndef SHADOW2 #define PGT_l1_shadow PGT_l1_page_table #define PGT_l2_shadow PGT_l2_page_table #define PGT_l3_shadow PGT_l3_page_table @@ -69,14 +86,16 @@ struct page_info #define PGT_writable_pred (7U<<29) /* predicted gpfn with writable ref */ #define PGT_fl1_shadow (5U<<29) +#endif + #define PGT_type_mask (7U<<29) /* Bits 29-31. */ + /* Owning guest has pinned this page to its current type? */ +#define _PGT_pinned 28 +#define PGT_pinned (1U<<_PGT_pinned) /* Has this page been validated for use as its current type? */ -#define _PGT_validated 28 +#define _PGT_validated 27 #define PGT_validated (1U<<_PGT_validated) - /* Owning guest has pinned this page to its current type? */ -#define _PGT_pinned 27 -#define PGT_pinned (1U<<_PGT_pinned) #if defined(__i386__) /* The 11 most significant bits of virt address if this is a page table. */ #define PGT_va_shift 16 @@ -98,6 +117,7 @@ struct page_info /* 16-bit count of uses of this frame as its current type. */ #define PGT_count_mask ((1U<<16)-1) +#ifndef SHADOW2 #ifdef __x86_64__ #define PGT_high_mfn_shift 52 #define PGT_high_mfn_mask (0xfffUL << PGT_high_mfn_shift) @@ -112,19 +132,53 @@ struct page_info #define PGT_score_shift 23 #define PGT_score_mask (((1U<<4)-1)<<PGT_score_shift) #endif +#endif /* SHADOW2 */ /* Cleared when the owning guest 'frees' this page. */ #define _PGC_allocated 31 #define PGC_allocated (1U<<_PGC_allocated) - /* Set when fullshadow mode marks a page out-of-sync */ + /* Set on a *guest* page to mark it out-of-sync with its shadow */ #define _PGC_out_of_sync 30 #define PGC_out_of_sync (1U<<_PGC_out_of_sync) - /* Set when fullshadow mode is using a page as a page table */ + /* Set when is using a page as a page table */ #define _PGC_page_table 29 #define PGC_page_table (1U<<_PGC_page_table) /* 29-bit count of references to this frame. 
*/ #define PGC_count_mask ((1U<<29)-1) +/* shadow2 uses the count_info on shadow pages somewhat differently */ +/* NB: please coordinate any changes here with the SH2F's in shadow2.h */ +#define PGC_SH2_none (0U<<28) /* on the shadow2 free list */ +#define PGC_SH2_min_shadow (1U<<28) +#define PGC_SH2_l1_32_shadow (1U<<28) /* shadowing a 32-bit L1 guest page */ +#define PGC_SH2_fl1_32_shadow (2U<<28) /* L1 shadow for a 32b 4M superpage */ +#define PGC_SH2_l2_32_shadow (3U<<28) /* shadowing a 32-bit L2 guest page */ +#define PGC_SH2_l1_pae_shadow (4U<<28) /* shadowing a pae L1 page */ +#define PGC_SH2_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */ +#define PGC_SH2_l2_pae_shadow (6U<<28) /* shadowing a pae L2-low page */ +#define PGC_SH2_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */ +#define PGC_SH2_l3_pae_shadow (8U<<28) /* shadowing a pae L3 page */ +#define PGC_SH2_l1_64_shadow (9U<<28) /* shadowing a 64-bit L1 page */ +#define PGC_SH2_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */ +#define PGC_SH2_l2_64_shadow (11U<<28) /* shadowing a 64-bit L2 page */ +#define PGC_SH2_l3_64_shadow (12U<<28) /* shadowing a 64-bit L3 page */ +#define PGC_SH2_l4_64_shadow (13U<<28) /* shadowing a 64-bit L4 page */ +#define PGC_SH2_max_shadow (13U<<28) +#define PGC_SH2_p2m_table (14U<<28) /* in use as the p2m table */ +#define PGC_SH2_monitor_table (15U<<28) /* in use as a monitor table */ +#define PGC_SH2_unused (15U<<28) + +#define PGC_SH2_type_mask (15U<<28) +#define PGC_SH2_type_shift 28 + +#define PGC_SH2_pinned (1U<<27) + +#define _PGC_SH2_log_dirty 26 +#define PGC_SH2_log_dirty (1U<<26) + +/* 26 bit ref count for shadow pages */ +#define PGC_SH2_count_mask ((1U<<26) - 1) + /* We trust the slab allocator in slab.c, and our use of it. */ #define PageSlab(page) (1) #define PageSetSlab(page) ((void)0) @@ -134,14 +188,22 @@ struct page_info #if defined(__i386__) #define pickle_domptr(_d) ((u32)(unsigned long)(_d)) -#define unpickle_domptr(_d) ((struct domain *)(unsigned long)(_d)) +static inline struct domain *unpickle_domptr(u32 _domain) +{ return (_domain & 1) ? NULL : (void *)_domain; } #define PRtype_info "08lx" /* should only be used for printk's */ #elif defined(__x86_64__) static inline struct domain *unpickle_domptr(u32 _domain) -{ return (_domain == 0) ? NULL : __va(_domain); } +{ return ((_domain == 0) || (_domain & 1)) ? NULL : __va(_domain); } static inline u32 pickle_domptr(struct domain *domain) { return (domain == NULL) ? 0 : (u32)__pa(domain); } #define PRtype_info "016lx"/* should only be used for printk's */ +#endif + +/* The order of the largest allocation unit we use for shadow pages */ +#if CONFIG_PAGING_LEVELS == 2 +#define SHADOW2_MAX_ORDER 0 /* Only ever need 4k allocations */ +#else +#define SHADOW2_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */ #endif #define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain)) @@ -165,7 +227,7 @@ extern int shadow_remove_all_write_acces extern int shadow_remove_all_write_access( struct domain *d, unsigned long gmfn, unsigned long mfn); extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn); -extern int _shadow_mode_refcounts(struct domain *d); +extern int _shadow2_mode_refcounts(struct domain *d); static inline void put_page(struct page_info *page) { @@ -197,8 +259,8 @@ static inline int get_page(struct page_i unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ unlikely(d != _domain) ) /* Wrong owner? 
*/ { - if ( !_shadow_mode_refcounts(domain) ) - DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" + if ( !_shadow2_mode_refcounts(domain) ) + DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" PRtype_info "\n", page_to_mfn(page), domain, unpickle_domptr(d), x, page->u.inuse.type_info); @@ -254,6 +316,16 @@ static inline int page_is_removable(stru ASSERT(((_p)->count_info & PGC_count_mask) != 0); \ ASSERT(page_get_owner(_p) == (_d)) +// Quick test for whether a given page can be represented directly in CR3. +// +#if CONFIG_PAGING_LEVELS == 3 +#define MFN_FITS_IN_CR3(_MFN) !(mfn_x(_MFN) >> 20) + +/* returns a lowmem machine address of the copied L3 root table */ +unsigned long +pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab); +#endif /* CONFIG_PAGING_LEVELS == 3 */ + int check_descriptor(struct desc_struct *d); /* @@ -271,29 +343,44 @@ int check_descriptor(struct desc_struct #define set_gpfn_from_mfn(mfn, pfn) (machine_to_phys_mapping[(mfn)] = (pfn)) #define get_gpfn_from_mfn(mfn) (machine_to_phys_mapping[(mfn)]) + +#define mfn_to_gmfn(_d, mfn) \ + ( (shadow2_mode_translate(_d)) \ + ? get_gpfn_from_mfn(mfn) \ + : (mfn) ) + +#define gmfn_to_mfn(_d, gpfn) mfn_x(sh2_gfn_to_mfn(_d, gpfn)) + + /* * The phys_to_machine_mapping is the reversed mapping of MPT for full * virtualization. It is only used by shadow_mode_translate()==true * guests, so we steal the address space that would have normally * been used by the read-only MPT map. */ -#define phys_to_machine_mapping ((unsigned long *)RO_MPT_VIRT_START) -#define NR_P2M_TABLE_ENTRIES ((unsigned long *)RO_MPT_VIRT_END \ - - phys_to_machine_mapping) +#define phys_to_machine_mapping ((l1_pgentry_t *)RO_MPT_VIRT_START) #define INVALID_MFN (~0UL) #define VALID_MFN(_mfn) (!((_mfn) & (1U<<31))) -#define set_mfn_from_gpfn(pfn, mfn) (phys_to_machine_mapping[(pfn)] = (mfn)) static inline unsigned long get_mfn_from_gpfn(unsigned long pfn) { - unsigned long mfn; - - if ( unlikely(pfn >= NR_P2M_TABLE_ENTRIES) || - unlikely(__copy_from_user(&mfn, &phys_to_machine_mapping[pfn], - sizeof(mfn))) ) - mfn = INVALID_MFN; - - return mfn; + l1_pgentry_t l1e = l1e_empty(); + int ret; + +#if CONFIG_PAGING_LEVELS > 2 + if ( pfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof (l1_pgentry_t) ) + /* This pfn is higher than the p2m map can hold */ + return INVALID_MFN; +#endif + + ret = __copy_from_user(&l1e, + &phys_to_machine_mapping[pfn], + sizeof(l1e)); + + if ( (ret == 0) && (l1e_get_flags(l1e) & _PAGE_PRESENT) ) + return l1e_get_pfn(l1e); + + return INVALID_MFN; } #ifdef MEMORY_GUARD @@ -333,6 +420,7 @@ void audit_domains(void); #endif int new_guest_cr3(unsigned long pfn); +void make_cr3(struct vcpu *v, unsigned long mfn); void propagate_page_fault(unsigned long addr, u16 error_code); diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/msr.h --- a/xen/include/asm-x86/msr.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/msr.h Wed Aug 16 17:11:56 2006 +0100 @@ -112,6 +112,10 @@ static inline void wrmsrl(unsigned int m #define MSR_IA32_VMX_EXIT_CTLS_MSR 0x483 #define MSR_IA32_VMX_ENTRY_CTLS_MSR 0x484 #define MSR_IA32_VMX_MISC_MSR 0x485 +#define MSR_IA32_VMX_CR0_FIXED0 0x486 +#define MSR_IA32_VMX_CR0_FIXED1 0x487 +#define MSR_IA32_VMX_CR4_FIXED0 0x488 +#define MSR_IA32_VMX_CR4_FIXED1 0x489 #define IA32_FEATURE_CONTROL_MSR 0x3a #define IA32_FEATURE_CONTROL_MSR_LOCK 0x1 #define IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON 0x4 diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/page-guest32.h --- a/xen/include/asm-x86/page-guest32.h Wed Aug 16 
16:48:45 2006 +0100 +++ b/xen/include/asm-x86/page-guest32.h Wed Aug 16 17:11:56 2006 +0100 @@ -89,15 +89,8 @@ static inline l2_pgentry_32_t l2e_from_p #define linear_l1_table_32 \ ((l1_pgentry_32_t *)(LINEAR_PT_VIRT_START)) -#define __linear_l2_table_32 \ - ((l2_pgentry_32_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0)))) #define linear_pg_table_32 linear_l1_table_32 -#define linear_l2_table_32(_ed) ((_ed)->arch.guest_vtable) - -#define va_to_l1mfn_32(_ed, _va) \ - (l2e_get_pfn(linear_l2_table(_ed)[_va>>L2_PAGETABLE_SHIFT])) #endif /* __X86_PAGE_GUEST_H__ */ diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/page.h --- a/xen/include/asm-x86/page.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/page.h Wed Aug 16 17:11:56 2006 +0100 @@ -233,26 +233,18 @@ typedef struct { u64 pfn; } pagetable_t; + DOMAIN_ENTRIES_PER_L4_PAGETABLE) #endif -#define LINEAR_PT_OFFSET (LINEAR_PT_VIRT_START & VADDR_MASK) -#define linear_l1_table \ - ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) -#define __linear_l2_table \ - ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)))) -#define __linear_l3_table \ - ((l3_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)))) -#define __linear_l4_table \ - ((l4_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)) + \ - (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<2)))) - +/* Where to find each level of the linear mapping */ +#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) +#define __linear_l2_table \ + ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l3_table \ + ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START))) +#define __linear_l4_table \ + ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START))) + +#define linear_l1_table __linear_l1_table #define linear_pg_table linear_l1_table -#define linear_l2_table(v) ((v)->arch.guest_vtable) -#define linear_l3_table(v) ((v)->arch.guest_vl3table) -#define linear_l4_table(v) ((v)->arch.guest_vl4table) +#define linear_l2_table(v) ((l2_pgentry_t *)(v)->arch.guest_vtable) #ifndef __ASSEMBLY__ #if CONFIG_PAGING_LEVELS == 3 @@ -294,6 +286,7 @@ extern void paging_init(void); #define _PAGE_AVAIL1 0x400U #define _PAGE_AVAIL2 0x800U #define _PAGE_AVAIL 0xE00U +#define _PAGE_PSE_PAT 0x1000U /* * Debug option: Ensure that granted mappings are not implicitly unmapped. @@ -307,9 +300,9 @@ extern void paging_init(void); #endif /* - * Disallow unused flag bits plus PAT, PSE and GLOBAL. Also disallow GNTTAB - * if we are using it for grant-table debugging. Permit the NX bit if the - * hardware supports it. + * Disallow unused flag bits plus PAT, PSE and GLOBAL. + * Also disallow GNTTAB if we are using it for grant-table debugging. + * Permit the NX bit if the hardware supports it. 
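For readers following the page.h hunk above: the linear-map macros are now built by composition, each level's table starting at the previous level's base plus the offset of LINEAR_PT_VIRT_START at that level, instead of the old open-coded shifts. A sketch of how the recursive mapping is then consumed (dump_mapping() is a hypothetical helper, not in the patch; it assumes the 2-level layout for brevity and checks the intermediate level before touching the L1 slot so it does not fault on a non-present table):

    /* Illustrative sketch only. */
    static void dump_mapping(unsigned long va)
    {
        l2_pgentry_t l2e = __linear_l2_table[l2_linear_offset(va)];

        if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
        {
            /* linear_pg_table aliases __linear_l1_table (see above). */
            l1_pgentry_t l1e = linear_pg_table[l1_linear_offset(va)];
            printk("va %lx: l2e %" PRIpte " l1e %" PRIpte "\n",
                   va, l2e_get_intpte(l2e), l1e_get_intpte(l1e));
        }
    }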
*/ #define BASE_DISALLOW_MASK ((0xFFFFF180U | _PAGE_GNTTAB) & ~_PAGE_NX) diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/perfc_defn.h --- a/xen/include/asm-x86/perfc_defn.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/perfc_defn.h Wed Aug 16 17:11:56 2006 +0100 @@ -144,4 +144,57 @@ PERFCOUNTER_CPU(remove_write_bad_predict PERFCOUNTER_CPU(remove_write_bad_prediction, "remove_write bad prediction") PERFCOUNTER_CPU(update_hl2e_invlpg, "update_hl2e calls invlpg") +/* Shadow2 counters */ +PERFCOUNTER_CPU(shadow2_alloc, "calls to shadow2_alloc") +PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs") +PERFSTATUS(shadow2_alloc_count, "number of shadow pages in use") +PERFCOUNTER_CPU(shadow2_free, "calls to shadow2_free") +PERFCOUNTER_CPU(shadow2_prealloc_1, "shadow2 recycles old shadows") +PERFCOUNTER_CPU(shadow2_prealloc_2, "shadow2 recycles in-use shadows") +PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map") +PERFCOUNTER_CPU(shadow2_a_update, "shadow2 A bit update") +PERFCOUNTER_CPU(shadow2_ad_update, "shadow2 A&D bit update") +PERFCOUNTER_CPU(shadow2_fault, "calls to shadow2_fault") +PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn") +PERFCOUNTER_CPU(shadow2_fault_bail_not_present, + "shadow2_fault guest not-present") +PERFCOUNTER_CPU(shadow2_fault_bail_nx, "shadow2_fault guest NX fault") +PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault") +PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor, + "shadow2_fault guest U/S fault") +PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read") +PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write") +PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails") +PERFCOUNTER_CPU(shadow2_fault_mmio, "shadow2_fault handled as mmio") +PERFCOUNTER_CPU(shadow2_fault_fixed, "shadow2_fault fixed fault") +PERFCOUNTER_CPU(shadow2_ptwr_emulate, "shadow2 causes ptwr to emulate") +PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e") +PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e") +PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e") +PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e") +PERFCOUNTER_CPU(shadow2_hash_lookups, "calls to shadow2_hash_lookup") +PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head") +PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses") +PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status") +PERFCOUNTER_CPU(shadow2_hash_inserts, "calls to shadow2_hash_insert") +PERFCOUNTER_CPU(shadow2_hash_deletes, "calls to shadow2_hash_delete") +PERFCOUNTER_CPU(shadow2_writeable, "shadow2 removes write access") +PERFCOUNTER_CPU(shadow2_writeable_h_1, "shadow2 writeable: 32b w2k3") +PERFCOUNTER_CPU(shadow2_writeable_h_2, "shadow2 writeable: 32pae w2k3") +PERFCOUNTER_CPU(shadow2_writeable_h_3, "shadow2 writeable: 64b w2k3") +PERFCOUNTER_CPU(shadow2_writeable_h_4, "shadow2 writeable: 32b linux low") +PERFCOUNTER_CPU(shadow2_writeable_bf, "shadow2 writeable brute-force") +PERFCOUNTER_CPU(shadow2_mappings, "shadow2 removes all mappings") +PERFCOUNTER_CPU(shadow2_mappings_bf, "shadow2 rm-mappings brute-force") +PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit") +PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit") +PERFCOUNTER_CPU(shadow2_unshadow, "shadow2 unshadows a page") 
+PERFCOUNTER_CPU(shadow2_up_pointer, "shadow2 unshadow by up-pointer") +PERFCOUNTER_CPU(shadow2_unshadow_bf, "shadow2 unshadow brute-force") +PERFCOUNTER_CPU(shadow2_get_page_fail, "shadow2_get_page_from_l1e failed") +PERFCOUNTER_CPU(shadow2_guest_walk, "shadow2 walks guest tables") +PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits") +PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses") + + /*#endif*/ /* __XEN_PERFC_DEFN_H__ */ diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/processor.h --- a/xen/include/asm-x86/processor.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/processor.h Wed Aug 16 17:11:56 2006 +0100 @@ -545,6 +545,7 @@ extern always_inline void prefetchw(cons #endif void show_stack(struct cpu_user_regs *regs); +void show_xen_trace(void); void show_stack_overflow(unsigned long esp); void show_registers(struct cpu_user_regs *regs); void show_execution_state(struct cpu_user_regs *regs); diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/shadow.h --- a/xen/include/asm-x86/shadow.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/shadow.h Wed Aug 16 17:11:56 2006 +0100 @@ -1,8 +1,7 @@ /****************************************************************************** * include/asm-x86/shadow.h * - * Copyright (c) 2005 Michael A Fetterman - * Based on an earlier implementation by Ian Pratt et al + * Copyright (c) 2006 by XenSource Inc. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,1782 +21,28 @@ #ifndef _XEN_SHADOW_H #define _XEN_SHADOW_H -#include <xen/config.h> -#include <xen/types.h> -#include <xen/perfc.h> -#include <xen/sched.h> -#include <xen/mm.h> -#include <xen/domain_page.h> -#include <asm/current.h> -#include <asm/flushtlb.h> -#include <asm/processor.h> -#include <asm/hvm/hvm.h> -#include <asm/hvm/support.h> -#include <asm/regs.h> -#include <public/dom0_ops.h> -#include <asm/shadow_public.h> -#include <asm/page-guest32.h> -#include <asm/shadow_ops.h> +/* This file is just a wrapper around the new Shadow2 header, + * providing names that must be defined in any shadow implementation. */ -/* Shadow PT operation mode : shadow-mode variable in arch_domain. */ +#include <asm/shadow2.h> -#define SHM_enable (1<<0) /* we're in one of the shadow modes */ -#define SHM_refcounts (1<<1) /* refcounts based on shadow tables instead of - guest tables */ -#define SHM_write_all (1<<2) /* allow write access to all guest pt pages, - regardless of pte write permissions */ -#define SHM_log_dirty (1<<3) /* enable log dirty mode */ -#define SHM_translate (1<<4) /* Xen does p2m translation, not guest */ -#define SHM_external (1<<5) /* Xen does not steal address space from the - domain for its own booking; requires VT or - similar mechanisms */ -#define SHM_wr_pt_pte (1<<6) /* guest allowed to set PAGE_RW bit in PTEs which - point to page table pages. 
*/ +/* How to make sure a page is not referred to in a shadow PT */ +/* This will need to be a for_each_vcpu if we go to per-vcpu shadows */ +#define shadow_drop_references(_d, _p) \ + shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) +#define shadow_sync_and_drop_references(_d, _p) \ + shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) -#define shadow_mode_enabled(_d) ((_d)->arch.shadow_mode) -#define shadow_mode_refcounts(_d) ((_d)->arch.shadow_mode & SHM_refcounts) -#define shadow_mode_write_l1(_d) (VM_ASSIST(_d, VMASST_TYPE_writable_pagetables)) -#define shadow_mode_write_all(_d) ((_d)->arch.shadow_mode & SHM_write_all) -#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow_mode & SHM_log_dirty) -#define shadow_mode_translate(_d) ((_d)->arch.shadow_mode & SHM_translate) -#define shadow_mode_external(_d) ((_d)->arch.shadow_mode & SHM_external) -#define shadow_mode_wr_pt_pte(_d) ((_d)->arch.shadow_mode & SHM_wr_pt_pte) +/* Whether we are translating the domain's frame numbers for it */ +#define shadow_mode_translate(d) shadow2_mode_translate(d) -#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START) -#define __shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \ - (SH_LINEAR_PT_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))) -#define shadow_linear_l2_table(_v) ((_v)->arch.shadow_vtable) +/* ...and if so, how to add and remove entries in the mapping */ +#define guest_physmap_add_page(_d, _p, _m) \ + shadow2_guest_physmap_add_page((_d), (_p), (_m)) +#define guest_physmap_remove_page(_d, _p, _m ) \ + shadow2_guest_physmap_remove_page((_d), (_p), (_m)) -// easy access to the hl2 table (for translated but not external modes only) -#define __linear_hl2_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (PERDOMAIN_VIRT_START >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT)))) - -/* - * For now we use the per-domain BIGLOCK rather than a shadow-specific lock. - * We usually have the BIGLOCK already acquired anyway, so this is unlikely - * to cause much unnecessary extra serialisation. Also it's a recursive - * lock, and there are some code paths containing nested shadow_lock(). - * The #if0'ed code below is therefore broken until such nesting is removed. 
- */ -#if 0 -#define shadow_lock_init(_d) \ - spin_lock_init(&(_d)->arch.shadow_lock) -#define shadow_lock_is_acquired(_d) \ - spin_is_locked(&(_d)->arch.shadow_lock) -#define shadow_lock(_d) \ -do { \ - ASSERT(!shadow_lock_is_acquired(_d)); \ - spin_lock(&(_d)->arch.shadow_lock); \ -} while (0) -#define shadow_unlock(_d) \ -do { \ - ASSERT(!shadow_lock_is_acquired(_d)); \ - spin_unlock(&(_d)->arch.shadow_lock); \ -} while (0) -#else -#define shadow_lock_init(_d) \ - ((_d)->arch.shadow_nest = 0) -#define shadow_lock_is_acquired(_d) \ - (spin_is_locked(&(_d)->big_lock) && ((_d)->arch.shadow_nest != 0)) -#define shadow_lock(_d) \ -do { \ - LOCK_BIGLOCK(_d); \ - (_d)->arch.shadow_nest++; \ -} while (0) -#define shadow_unlock(_d) \ -do { \ - ASSERT(shadow_lock_is_acquired(_d)); \ - (_d)->arch.shadow_nest--; \ - UNLOCK_BIGLOCK(_d); \ -} while (0) -#endif - -#if CONFIG_PAGING_LEVELS >= 3 -static inline u64 get_cr3_idxval(struct vcpu *v) -{ - u64 pae_cr3; - - if ( v->domain->arch.ops->guest_paging_levels == PAGING_L3 && - !shadow_mode_log_dirty(v->domain) ) - { - pae_cr3 = hvm_get_guest_ctrl_reg(v, 3); /* get CR3 */ - return (pae_cr3 >> PAE_CR3_ALIGN) & PAE_CR3_IDX_MASK; - } - else - return 0; -} - -#define shadow_key_t u64 -#define index_to_key(x) ((x) << 32) -#else -#define get_cr3_idxval(v) (0) -#define shadow_key_t unsigned long -#define index_to_key(x) (0) -#endif - - -#define SHADOW_ENCODE_MIN_MAX(_min, _max) ((((GUEST_L1_PAGETABLE_ENTRIES - 1) - (_max)) << 16) | (_min)) -#define SHADOW_MIN(_encoded) ((_encoded) & ((1u<<16) - 1)) -#define SHADOW_MAX(_encoded) ((GUEST_L1_PAGETABLE_ENTRIES - 1) - ((_encoded) >> 16)) -extern void shadow_direct_map_clean(struct domain *d); -extern int shadow_direct_map_init(struct domain *d); -extern int shadow_direct_map_fault( - unsigned long vpa, struct cpu_user_regs *regs); -extern void shadow_mode_init(void); -extern int shadow_mode_control(struct domain *p, dom0_shadow_control_t *sc); -extern int shadow_fault(unsigned long va, struct cpu_user_regs *regs); -extern int shadow_mode_enable(struct domain *p, unsigned int mode); -extern void shadow_invlpg(struct vcpu *, unsigned long); -extern struct out_of_sync_entry *shadow_mark_mfn_out_of_sync( - struct vcpu *v, unsigned long gpfn, unsigned long mfn); -extern void free_monitor_pagetable(struct vcpu *v); -extern void __shadow_sync_all(struct domain *d); -extern int __shadow_out_of_sync(struct vcpu *v, unsigned long va); -extern int set_p2m_entry( - struct domain *d, unsigned long pfn, unsigned long mfn, - struct domain_mmap_cache *l2cache, - struct domain_mmap_cache *l1cache); -extern void remove_shadow(struct domain *d, unsigned long gpfn, u32 stype); - -extern void free_shadow_page(unsigned long smfn); - -extern void shadow_l1_normal_pt_update(struct domain *d, - paddr_t pa, l1_pgentry_t l1e, - struct domain_mmap_cache *cache); -extern void shadow_l2_normal_pt_update(struct domain *d, - paddr_t pa, l2_pgentry_t l2e, - struct domain_mmap_cache *cache); -#if CONFIG_PAGING_LEVELS >= 3 -#include <asm/page-guest32.h> -/* - * va_mask cannot be used because it's used by the shadow hash. - * Use the score area for for now. 
- */ -#define is_xen_l2_slot(t,s) \ - ( ((((t) & PGT_score_mask) >> PGT_score_shift) == 3) && \ - ((s) >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES - 1))) ) - -extern unsigned long gva_to_gpa(unsigned long gva); -extern void shadow_l3_normal_pt_update(struct domain *d, - paddr_t pa, l3_pgentry_t l3e, - struct domain_mmap_cache *cache); -#endif -#if CONFIG_PAGING_LEVELS >= 4 -extern void shadow_l4_normal_pt_update(struct domain *d, - paddr_t pa, l4_pgentry_t l4e, - struct domain_mmap_cache *cache); -#endif -extern int shadow_do_update_va_mapping(unsigned long va, - l1_pgentry_t val, - struct vcpu *v); - - -static inline unsigned long __shadow_status( - struct domain *d, unsigned long gpfn, unsigned long stype); - -#if CONFIG_PAGING_LEVELS <= 2 -static inline void update_hl2e(struct vcpu *v, unsigned long va); -#endif - -static inline int page_is_page_table(struct page_info *page) -{ - struct domain *owner = page_get_owner(page); - u32 type_info; - - if ( owner && shadow_mode_refcounts(owner) ) - return page->count_info & PGC_page_table; - - type_info = page->u.inuse.type_info & PGT_type_mask; - return type_info && (type_info <= PGT_l4_page_table); -} - -static inline int mfn_is_page_table(unsigned long mfn) -{ - if ( !mfn_valid(mfn) ) - return 0; - - return page_is_page_table(mfn_to_page(mfn)); -} - -static inline int page_out_of_sync(struct page_info *page) -{ - return page->count_info & PGC_out_of_sync; -} - -static inline int mfn_out_of_sync(unsigned long mfn) -{ - if ( !mfn_valid(mfn) ) - return 0; - - return page_out_of_sync(mfn_to_page(mfn)); -} - - -/************************************************************************/ - -static void inline -__shadow_sync_mfn(struct domain *d, unsigned long mfn) -{ - if ( d->arch.out_of_sync ) - { - // XXX - could be smarter - // - __shadow_sync_all(d); - } -} - -static void inline -__shadow_sync_va(struct vcpu *v, unsigned long va) -{ - struct domain *d = v->domain; - - if ( d->arch.out_of_sync && __shadow_out_of_sync(v, va) ) - { - perfc_incrc(shadow_sync_va); - - // XXX - could be smarter - // - __shadow_sync_all(v->domain); - } -#if CONFIG_PAGING_LEVELS <= 2 - // Also make sure the HL2 is up-to-date for this address. - // - if ( unlikely(shadow_mode_translate(v->domain)) ) - update_hl2e(v, va); -#endif -} - -static void inline -shadow_sync_all(struct domain *d) -{ - if ( unlikely(shadow_mode_enabled(d)) ) - { - shadow_lock(d); - - if ( d->arch.out_of_sync ) - __shadow_sync_all(d); - - ASSERT(d->arch.out_of_sync == NULL); - - shadow_unlock(d); - } -} - -// SMP BUG: This routine can't ever be used properly in an SMP context. -// It should be something like get_shadow_and_sync_va(). -// This probably shouldn't exist. -// -static void inline -shadow_sync_va(struct vcpu *v, unsigned long gva) -{ - struct domain *d = v->domain; - if ( unlikely(shadow_mode_enabled(d)) ) - { - shadow_lock(d); - __shadow_sync_va(v, gva); - shadow_unlock(d); - } -} - -extern void __shadow_mode_disable(struct domain *d); -static inline void shadow_mode_disable(struct domain *d) -{ - if ( unlikely(shadow_mode_enabled(d)) ) - { - shadow_lock(d); - __shadow_mode_disable(d); - shadow_unlock(d); - } -} - -/************************************************************************/ - -#define mfn_to_gmfn(_d, mfn) \ - ( (shadow_mode_translate(_d)) \ - ? get_gpfn_from_mfn(mfn) \ - : (mfn) ) - -#define gmfn_to_mfn(_d, gpfn) \ - ({ \ - unlikely(shadow_mode_translate(_d)) \ - ? (likely(current->domain == (_d)) \ - ? 
get_mfn_from_gpfn(gpfn) \ - : get_mfn_from_gpfn_foreign(_d, gpfn)) \ - : (gpfn); \ - }) - -extern unsigned long get_mfn_from_gpfn_foreign( - struct domain *d, unsigned long gpfn); - -/************************************************************************/ - -struct shadow_status { - struct shadow_status *next; /* Pull-to-front list per hash bucket. */ - shadow_key_t gpfn_and_flags; /* Guest pfn plus flags. */ - unsigned long smfn; /* Shadow mfn. */ -}; - -#define shadow_ht_extra_size 128 -#define shadow_ht_buckets 256 - -struct out_of_sync_entry { - struct out_of_sync_entry *next; - struct vcpu *v; - unsigned long gpfn; /* why is this here? */ - unsigned long gmfn; - unsigned long snapshot_mfn; - paddr_t writable_pl1e; /* NB: this is a machine address */ - unsigned long va; -}; - -#define out_of_sync_extra_size 127 - -#define SHADOW_SNAPSHOT_ELSEWHERE (-1L) - -/************************************************************************/ -#define SHADOW_DEBUG 0 -#define SHADOW_VERBOSE_DEBUG 0 -#define SHADOW_VVERBOSE_DEBUG 0 -#define SHADOW_VVVERBOSE_DEBUG 0 -#define SHADOW_HASH_DEBUG 0 -#define FULLSHADOW_DEBUG 0 - -#if SHADOW_DEBUG -extern int shadow_status_noswap; -#define SHADOW_REFLECTS_SNAPSHOT _PAGE_AVAIL0 -#endif - -#if SHADOW_VERBOSE_DEBUG -#define SH_LOG(_f, _a...) \ - printk("DOM%uP%u: SH_LOG(%d): " _f "\n", \ - current->domain->domain_id , smp_processor_id(), __LINE__ , ## _a ) -#define SH_VLOG(_f, _a...) \ - printk("DOM%uP%u: SH_VLOG(%d): " _f "\n", \ - current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) -#else -#define SH_LOG(_f, _a...) ((void)0) -#define SH_VLOG(_f, _a...) ((void)0) -#endif - -#if SHADOW_VVERBOSE_DEBUG -#define SH_VVLOG(_f, _a...) \ - printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n", \ - current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) -#else -#define SH_VVLOG(_f, _a...) ((void)0) -#endif - -#if SHADOW_VVVERBOSE_DEBUG -#define SH_VVVLOG(_f, _a...) \ - printk("DOM%uP%u: SH_VVVLOG(%d): " _f "\n", \ - current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) -#else -#define SH_VVVLOG(_f, _a...) ((void)0) -#endif - -#if FULLSHADOW_DEBUG -#define FSH_LOG(_f, _a...) \ - printk("DOM%uP%u: FSH_LOG(%d): " _f "\n", \ - current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) -#else -#define FSH_LOG(_f, _a...) ((void)0) -#endif - - -/************************************************************************/ - -static inline int -shadow_get_page_from_l1e(l1_pgentry_t l1e, struct domain *d) -{ - l1_pgentry_t nl1e; - int res; - unsigned long mfn; - struct domain *owner; - - ASSERT(l1e_get_flags(l1e) & _PAGE_PRESENT); - - if ( !shadow_mode_refcounts(d) ) - return 1; - - nl1e = l1e; - l1e_remove_flags(nl1e, _PAGE_GLOBAL); - - if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) ) - return 0; - - res = get_page_from_l1e(nl1e, d); - - if ( unlikely(!res) && IS_PRIV(d) && !shadow_mode_translate(d) && - !(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) && - (mfn = l1e_get_pfn(nl1e)) && - mfn_valid(mfn) && - (owner = page_get_owner(mfn_to_page(mfn))) && - (d != owner) ) - { - res = get_page_from_l1e(nl1e, owner); - printk("tried to map mfn %lx from domain %d into shadow page tables " - "of domain %d; %s\n", - mfn, owner->domain_id, d->domain_id, - res ? 
"success" : "failed"); - } - - if ( unlikely(!res) ) - { - perfc_incrc(shadow_get_page_fail); - FSH_LOG("%s failed to get ref l1e=%" PRIpte "\n", - __func__, l1e_get_intpte(l1e)); - } - - return res; -} - -static inline void -shadow_put_page_from_l1e(l1_pgentry_t l1e, struct domain *d) -{ - if ( !shadow_mode_refcounts(d) ) - return; - - put_page_from_l1e(l1e, d); -} - -static inline void -shadow_put_page_type(struct domain *d, struct page_info *page) -{ - if ( !shadow_mode_refcounts(d) ) - return; - - put_page_type(page); -} - -static inline int shadow_get_page(struct domain *d, - struct page_info *page, - struct domain *owner) -{ - if ( !shadow_mode_refcounts(d) ) - return 1; - return get_page(page, owner); -} - -static inline void shadow_put_page(struct domain *d, - struct page_info *page) -{ - if ( !shadow_mode_refcounts(d) ) - return; - put_page(page); -} - -/************************************************************************/ - -static inline void __mark_dirty(struct domain *d, unsigned long mfn) -{ - unsigned long pfn; - - ASSERT(shadow_lock_is_acquired(d)); - - if ( likely(!shadow_mode_log_dirty(d)) || !VALID_MFN(mfn) ) - return; - - ASSERT(d->arch.shadow_dirty_bitmap != NULL); - - /* We /really/ mean PFN here, even for non-translated guests. */ - pfn = get_gpfn_from_mfn(mfn); - - /* - * Values with the MSB set denote MFNs that aren't really part of the - * domain's pseudo-physical memory map (e.g., the shared info frame). - * Nothing to do here... - */ - if ( unlikely(IS_INVALID_M2P_ENTRY(pfn)) ) - return; - - /* N.B. Can use non-atomic TAS because protected by shadow_lock. */ - if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) && - !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) ) - { - d->arch.shadow_dirty_count++; - } -#ifndef NDEBUG - else if ( mfn_valid(mfn) ) - { - SH_VLOG("mark_dirty OOR! 
mfn=%lx pfn=%lx max=%x (dom %p)", - mfn, pfn, d->arch.shadow_dirty_bitmap_size, d); - SH_VLOG("dom=%p caf=%08x taf=%" PRtype_info, - page_get_owner(mfn_to_page(mfn)), - mfn_to_page(mfn)->count_info, - mfn_to_page(mfn)->u.inuse.type_info ); - } -#endif -} - - -static inline void mark_dirty(struct domain *d, unsigned int mfn) -{ - if ( unlikely(shadow_mode_log_dirty(d)) ) - { - shadow_lock(d); - __mark_dirty(d, mfn); - shadow_unlock(d); - } -} - - -/************************************************************************/ -#if CONFIG_PAGING_LEVELS <= 2 -static inline void -__shadow_get_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e) -{ - ASSERT(shadow_mode_enabled(v->domain)); - - *psl2e = v->arch.shadow_vtable[l2_table_offset(va)]; -} - -static inline void -__shadow_set_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t value) -{ - ASSERT(shadow_mode_enabled(v->domain)); - - v->arch.shadow_vtable[l2_table_offset(va)] = value; -} - -static inline void -__guest_get_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t *pl2e) -{ - *pl2e = v->arch.guest_vtable[l2_table_offset(va)]; -} - -static inline void -__guest_set_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t value) -{ - struct domain *d = v->domain; - - v->arch.guest_vtable[l2_table_offset(va)] = value; - - if ( unlikely(shadow_mode_translate(d)) ) - update_hl2e(v, va); - - __mark_dirty(d, pagetable_get_pfn(v->arch.guest_table)); -} - -static inline void -__direct_get_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t *psl2e) -{ - l2_pgentry_t *phys_vtable; - - ASSERT(shadow_mode_enabled(v->domain)); - - phys_vtable = map_domain_page( - pagetable_get_pfn(v->domain->arch.phys_table)); - - *psl2e = phys_vtable[l2_table_offset(va)]; - - unmap_domain_page(phys_vtable); -} - -static inline void -__direct_set_l2e( - struct vcpu *v, unsigned long va, l2_pgentry_t value) -{ - l2_pgentry_t *phys_vtable; - - ASSERT(shadow_mode_enabled(v->domain)); - - phys_vtable = map_domain_page( - pagetable_get_pfn(v->domain->arch.phys_table)); - - phys_vtable[l2_table_offset(va)] = value; - - unmap_domain_page(phys_vtable); -} - -static inline void -update_hl2e(struct vcpu *v, unsigned long va) -{ - int index = l2_table_offset(va); - unsigned long mfn; - l2_pgentry_t gl2e = v->arch.guest_vtable[index]; - l1_pgentry_t old_hl2e, new_hl2e; - int need_flush = 0; - - ASSERT(shadow_mode_translate(v->domain)); - - old_hl2e = v->arch.hl2_vtable[index]; - - if ( (l2e_get_flags(gl2e) & _PAGE_PRESENT) && - VALID_MFN(mfn = get_mfn_from_gpfn(l2e_get_pfn(gl2e))) ) - new_hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR); - else - new_hl2e = l1e_empty(); - - // only do the ref counting if something has changed. - // - if ( (l1e_has_changed(old_hl2e, new_hl2e, PAGE_FLAG_MASK)) ) - { - if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) && - !shadow_get_page(v->domain, mfn_to_page(l1e_get_pfn(new_hl2e)), - v->domain) ) - new_hl2e = l1e_empty(); - if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT ) - { - shadow_put_page(v->domain, mfn_to_page(l1e_get_pfn(old_hl2e))); - need_flush = 1; - } - - v->arch.hl2_vtable[l2_table_offset(va)] = new_hl2e; - - if ( need_flush ) - { - perfc_incrc(update_hl2e_invlpg); - flush_tlb_one_mask(v->domain->domain_dirty_cpumask, - &linear_pg_table[l1_linear_offset(va)]); - } - } -} - -static inline void shadow_drop_references( - struct domain *d, struct page_info *page) -{ - if ( likely(!shadow_mode_refcounts(d)) || - ((page->u.inuse.type_info & PGT_count_mask) == 0) ) - return; - - /* XXX This needs more thought... 
*/ - printk("%s: needing to call shadow_remove_all_access for mfn=%lx\n", - __func__, page_to_mfn(page)); - printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page), - page->count_info, page->u.inuse.type_info); - - shadow_lock(d); - shadow_remove_all_access(d, page_to_mfn(page)); - shadow_unlock(d); - - printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_mfn(page), - page->count_info, page->u.inuse.type_info); -} - -/* XXX Needs more thought. Neither pretty nor fast: a place holder. */ -static inline void shadow_sync_and_drop_references( - struct domain *d, struct page_info *page) -{ - if ( likely(!shadow_mode_refcounts(d)) ) - return; - - if ( page_out_of_sync(page) ) - __shadow_sync_mfn(d, page_to_mfn(page)); - - shadow_remove_all_access(d, page_to_mfn(page)); -} -#endif - -/************************************************************************/ - -/* - * Add another shadow reference to smfn. - */ -static inline int -get_shadow_ref(unsigned long smfn) -{ - u32 x, nx; - - ASSERT(mfn_valid(smfn)); - - x = mfn_to_page(smfn)->count_info; - nx = x + 1; - - if ( unlikely(nx == 0) ) - { - printk("get_shadow_ref overflow, gmfn=%" PRtype_info " smfn=%lx\n", - mfn_to_page(smfn)->u.inuse.type_info & PGT_mfn_mask, - smfn); - BUG(); - } - - // Guarded by the shadow lock... - // - mfn_to_page(smfn)->count_info = nx; - - return 1; -} - -/* - * Drop a shadow reference to smfn. - */ -static inline void -put_shadow_ref(unsigned long smfn) -{ - u32 x, nx; - - ASSERT(mfn_valid(smfn)); - - x = mfn_to_page(smfn)->count_info; - nx = x - 1; - - if ( unlikely(x == 0) ) - { - printk("put_shadow_ref underflow, smfn=%lx oc=%08x t=%" - PRtype_info "\n", - smfn, - mfn_to_page(smfn)->count_info, - mfn_to_page(smfn)->u.inuse.type_info); - BUG(); - } - - // Guarded by the shadow lock... - // - mfn_to_page(smfn)->count_info = nx; - - if ( unlikely(nx == 0) ) - { - free_shadow_page(smfn); - } -} - -static inline void -shadow_pin(unsigned long smfn) -{ - ASSERT( !(mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) ); - - mfn_to_page(smfn)->u.inuse.type_info |= PGT_pinned; - if ( unlikely(!get_shadow_ref(smfn)) ) - BUG(); -} - -static inline void -shadow_unpin(unsigned long smfn) -{ - ASSERT( (mfn_to_page(smfn)->u.inuse.type_info & PGT_pinned) ); - - mfn_to_page(smfn)->u.inuse.type_info &= ~PGT_pinned; - put_shadow_ref(smfn); -} - -/* - * SMP issue. The following code assumes the shadow lock is held. Re-visit - * when working on finer-gained locks for shadow. 
- */ -static inline void set_guest_back_ptr( - struct domain *d, l1_pgentry_t spte, - unsigned long smfn, unsigned int index) -{ - struct page_info *gpage; - - ASSERT(shadow_lock_is_acquired(d)); - - if ( !shadow_mode_external(d) || - ((l1e_get_flags(spte) & (_PAGE_PRESENT|_PAGE_RW)) != - (_PAGE_PRESENT|_PAGE_RW)) ) - return; - - gpage = l1e_get_page(spte); - - ASSERT(smfn != 0); - ASSERT(page_to_mfn(gpage) != 0); - - gpage->tlbflush_timestamp = smfn; - gpage->u.inuse.type_info &= ~PGT_va_mask; - gpage->u.inuse.type_info |= (unsigned long)index << PGT_va_shift; -} - -/************************************************************************/ -#if CONFIG_PAGING_LEVELS <= 2 -extern void shadow_mark_va_out_of_sync( - struct vcpu *v, unsigned long gpfn, unsigned long mfn, - unsigned long va); - -static inline int l1pte_write_fault( - struct vcpu *v, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p, - unsigned long va) -{ - struct domain *d = v->domain; - l1_pgentry_t gpte = *gpte_p; - l1_pgentry_t spte; - unsigned long gpfn = l1e_get_pfn(gpte); - unsigned long gmfn = gmfn_to_mfn(d, gpfn); - - //printk("l1pte_write_fault gmfn=%lx\n", gmfn); - - if ( unlikely(!VALID_MFN(gmfn)) ) - { - SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn); - *spte_p = l1e_empty(); - return 0; - } - - ASSERT(l1e_get_flags(gpte) & _PAGE_RW); - l1e_add_flags(gpte, _PAGE_DIRTY | _PAGE_ACCESSED); - spte = l1e_from_pfn(gmfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL); - - SH_VVLOG("l1pte_write_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte, - l1e_get_intpte(spte), l1e_get_intpte(gpte)); - - __mark_dirty(d, gmfn); - - if ( mfn_is_page_table(gmfn) ) - shadow_mark_va_out_of_sync(v, gpfn, gmfn, va); - - *gpte_p = gpte; - *spte_p = spte; - - return 1; -} - -static inline int l1pte_read_fault( - struct domain *d, l1_pgentry_t *gpte_p, l1_pgentry_t *spte_p) -{ - l1_pgentry_t gpte = *gpte_p; - l1_pgentry_t spte = *spte_p; - unsigned long pfn = l1e_get_pfn(gpte); - unsigned long mfn = gmfn_to_mfn(d, pfn); - - if ( unlikely(!VALID_MFN(mfn)) ) - { - SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn); - *spte_p = l1e_empty(); - return 0; - } - - l1e_add_flags(gpte, _PAGE_ACCESSED); - spte = l1e_from_pfn(mfn, l1e_get_flags(gpte) & ~_PAGE_GLOBAL); - - if ( shadow_mode_log_dirty(d) || !(l1e_get_flags(gpte) & _PAGE_DIRTY) || - mfn_is_page_table(mfn) ) - { - l1e_remove_flags(spte, _PAGE_RW); - } - - SH_VVLOG("l1pte_read_fault: updating spte=0x%" PRIpte " gpte=0x%" PRIpte, - l1e_get_intpte(spte), l1e_get_intpte(gpte)); - *gpte_p = gpte; - *spte_p = spte; - - return 1; -} -#endif - -static inline void l1pte_propagate_from_guest( - struct domain *d, guest_l1_pgentry_t gpte, l1_pgentry_t *spte_p) -{ - unsigned long mfn; - l1_pgentry_t spte; - - spte = l1e_empty(); - - if ( ((guest_l1e_get_flags(gpte) & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == - (_PAGE_PRESENT|_PAGE_ACCESSED)) && - VALID_MFN(mfn = gmfn_to_mfn(d, l1e_get_pfn(gpte))) ) - { - spte = l1e_from_pfn( - mfn, guest_l1e_get_flags(gpte) & ~(_PAGE_GLOBAL | _PAGE_AVAIL)); - - if ( shadow_mode_log_dirty(d) || - !(guest_l1e_get_flags(gpte) & _PAGE_DIRTY) || - mfn_is_page_table(mfn) ) - { - l1e_remove_flags(spte, _PAGE_RW); - } - } - - if ( l1e_get_intpte(spte) || l1e_get_intpte(gpte) ) - SH_VVVLOG("%s: gpte=%" PRIpte ", new spte=%" PRIpte, - __func__, l1e_get_intpte(gpte), l1e_get_intpte(spte)); - - *spte_p = spte; -} - -static inline void hl2e_propagate_from_guest( - struct domain *d, l2_pgentry_t gpde, l1_pgentry_t *hl2e_p) -{ - unsigned long pfn = l2e_get_pfn(gpde); - unsigned long mfn; - 
l1_pgentry_t hl2e; - - hl2e = l1e_empty(); - - if ( l2e_get_flags(gpde) & _PAGE_PRESENT ) - { - mfn = gmfn_to_mfn(d, pfn); - if ( VALID_MFN(mfn) && mfn_valid(mfn) ) - hl2e = l1e_from_pfn(mfn, __PAGE_HYPERVISOR); - } - - if ( l1e_get_intpte(hl2e) || l2e_get_intpte(gpde) ) - SH_VVLOG("%s: gpde=%" PRIpte " hl2e=%" PRIpte, __func__, - l2e_get_intpte(gpde), l1e_get_intpte(hl2e)); - - *hl2e_p = hl2e; -} - -static inline void l2pde_general( - struct domain *d, - guest_l2_pgentry_t *gpde_p, - l2_pgentry_t *spde_p, - unsigned long sl1mfn) -{ - guest_l2_pgentry_t gpde = *gpde_p; - l2_pgentry_t spde; - - spde = l2e_empty(); - if ( (guest_l2e_get_flags(gpde) & _PAGE_PRESENT) && (sl1mfn != 0) ) - { - spde = l2e_from_pfn( - sl1mfn, - (guest_l2e_get_flags(gpde) | _PAGE_RW | _PAGE_ACCESSED) & ~_PAGE_AVAIL); - - /* N.B. PDEs do not have a dirty bit. */ - guest_l2e_add_flags(gpde, _PAGE_ACCESSED); - - *gpde_p = gpde; - } - - if ( l2e_get_intpte(spde) || l2e_get_intpte(gpde) ) - SH_VVLOG("%s: gpde=%" PRIpte ", new spde=%" PRIpte, __func__, - l2e_get_intpte(gpde), l2e_get_intpte(spde)); - - *spde_p = spde; -} - -static inline void l2pde_propagate_from_guest( - struct domain *d, guest_l2_pgentry_t *gpde_p, l2_pgentry_t *spde_p) -{ - guest_l2_pgentry_t gpde = *gpde_p; - unsigned long sl1mfn = 0; - - if ( guest_l2e_get_flags(gpde) & _PAGE_PRESENT ) - sl1mfn = __shadow_status(d, l2e_get_pfn(gpde), PGT_l1_shadow); - l2pde_general(d, gpde_p, spde_p, sl1mfn); -} - -/************************************************************************/ - -// returns true if a tlb flush is needed -// -static int inline -validate_pte_change( - struct domain *d, - guest_l1_pgentry_t new_pte, - l1_pgentry_t *shadow_pte_p) -{ - l1_pgentry_t old_spte, new_spte; - int need_flush = 0; - - perfc_incrc(validate_pte_calls); - - l1pte_propagate_from_guest(d, new_pte, &new_spte); - - if ( shadow_mode_refcounts(d) ) - { - old_spte = *shadow_pte_p; - - if ( l1e_get_intpte(old_spte) == l1e_get_intpte(new_spte) ) - { - // No accounting required... - // - perfc_incrc(validate_pte_changes1); - } - else if ( l1e_get_intpte(old_spte) == (l1e_get_intpte(new_spte)|_PAGE_RW) ) - { - // Fast path for PTEs that have merely been write-protected - // (e.g., during a Unix fork()). A strict reduction in privilege. - // - perfc_incrc(validate_pte_changes2); - if ( likely(l1e_get_flags(new_spte) & _PAGE_PRESENT) ) - shadow_put_page_type(d, mfn_to_page(l1e_get_pfn(new_spte))); - } - else if ( ((l1e_get_flags(old_spte) | l1e_get_flags(new_spte)) & - _PAGE_PRESENT ) && - l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) ) - { - // only do the ref counting if something important changed. - // - perfc_incrc(validate_pte_changes3); - - if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) - { - shadow_put_page_from_l1e(old_spte, d); - need_flush = 1; - } - if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(new_spte, d) ) { - new_spte = l1e_empty(); - need_flush = -1; /* need to unshadow the page */ - } - } - else - { - perfc_incrc(validate_pte_changes4); - } - } - - *shadow_pte_p = new_spte; - - return need_flush; -} - -// returns true if a tlb flush is needed -// -static int inline -validate_hl2e_change( - struct domain *d, - l2_pgentry_t new_gpde, - l1_pgentry_t *shadow_hl2e_p) -{ - l1_pgentry_t old_hl2e, new_hl2e; - int need_flush = 0; - - perfc_incrc(validate_hl2e_calls); - - old_hl2e = *shadow_hl2e_p; - hl2e_propagate_from_guest(d, new_gpde, &new_hl2e); - - // Only do the ref counting if something important changed. 
- // - if ( ((l1e_get_flags(old_hl2e) | l1e_get_flags(new_hl2e)) & _PAGE_PRESENT) && - l1e_has_changed(old_hl2e, new_hl2e, _PAGE_PRESENT) ) - { - perfc_incrc(validate_hl2e_changes); - - if ( (l1e_get_flags(new_hl2e) & _PAGE_PRESENT) && - !get_page(mfn_to_page(l1e_get_pfn(new_hl2e)), d) ) - new_hl2e = l1e_empty(); - if ( l1e_get_flags(old_hl2e) & _PAGE_PRESENT ) - { - put_page(mfn_to_page(l1e_get_pfn(old_hl2e))); - need_flush = 1; - } - } - - *shadow_hl2e_p = new_hl2e; - - return need_flush; -} - -// returns true if a tlb flush is needed -// -static int inline -validate_pde_change( - struct domain *d, - guest_l2_pgentry_t new_gpde, - l2_pgentry_t *shadow_pde_p) -{ - l2_pgentry_t old_spde, new_spde; - int need_flush = 0; - - perfc_incrc(validate_pde_calls); - - old_spde = *shadow_pde_p; - l2pde_propagate_from_guest(d, &new_gpde, &new_spde); - - // Only do the ref counting if something important changed. - // - if ( ((l2e_get_intpte(old_spde) | l2e_get_intpte(new_spde)) & _PAGE_PRESENT) && - l2e_has_changed(old_spde, new_spde, _PAGE_PRESENT) ) - { - perfc_incrc(validate_pde_changes); - - if ( (l2e_get_flags(new_spde) & _PAGE_PRESENT) && - !get_shadow_ref(l2e_get_pfn(new_spde)) ) - BUG(); - if ( l2e_get_flags(old_spde) & _PAGE_PRESENT ) - { - put_shadow_ref(l2e_get_pfn(old_spde)); - need_flush = 1; - } - } - - *shadow_pde_p = new_spde; - - return need_flush; -} - -/*********************************************************************/ - -#if SHADOW_HASH_DEBUG - -static void shadow_audit(struct domain *d, int print) -{ - int live = 0, free = 0, j = 0, abs; - struct shadow_status *a; - - for ( j = 0; j < shadow_ht_buckets; j++ ) - { - a = &d->arch.shadow_ht[j]; - if ( a->gpfn_and_flags ) - { - live++; - ASSERT(a->smfn); - } - else - ASSERT(!a->next); - - a = a->next; - while ( a && (live < 9999) ) - { - live++; - if ( (a->gpfn_and_flags == 0) || (a->smfn == 0) ) - { - printk("XXX live=%d gpfn+flags=%lx sp=%lx next=%p\n", - live, a->gpfn_and_flags, a->smfn, a->next); - BUG(); - } - ASSERT(a->smfn); - a = a->next; - } - ASSERT(live < 9999); - } - - for ( a = d->arch.shadow_ht_free; a != NULL; a = a->next ) - free++; - - if ( print ) - printk("Xlive=%d free=%d\n", live, free); - - // BUG: this only works if there's only a single domain which is - // using shadow tables. - // - abs = ( - perfc_value(shadow_l1_pages) + - perfc_value(shadow_l2_pages) + - perfc_value(hl2_table_pages) + - perfc_value(snapshot_pages) + - perfc_value(writable_pte_predictions) - ) - live; -#ifdef PERF_COUNTERS - if ( (abs < -1) || (abs > 1) ) - { - printk("live=%d free=%d l1=%d l2=%d hl2=%d snapshot=%d writable_ptes=%d\n", - live, free, - perfc_value(shadow_l1_pages), - perfc_value(shadow_l2_pages), - perfc_value(hl2_table_pages), - perfc_value(snapshot_pages), - perfc_value(writable_pte_predictions)); - BUG(); - } -#endif - - // XXX ought to add some code to audit the out-of-sync entries, too. - // -} -#else -#define shadow_audit(p, print) ((void)0) -#endif - - -static inline struct shadow_status *hash_bucket( - struct domain *d, unsigned int gpfn) -{ - return &d->arch.shadow_ht[gpfn % shadow_ht_buckets]; -} - - -/* - * N.B. This takes a guest pfn (i.e. a pfn in the guest's namespace, - * which, depending on full shadow mode, may or may not equal - * its mfn). - * It returns the shadow's mfn, or zero if it doesn't exist. 
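For readers skimming this removal: the __shadow_status() routine just below implements a pull-to-front bucket search over the shadow hash table, moving a hit to the head of its chain so that hot entries are found on the first comparison next time. A condensed, self-contained sketch of the idiom (bucket_node, key/val and lookup_pull_to_front() are simplified stand-ins, not names from the tree):

    /* Condensed sketch only; the audit/assert checks are omitted. */
    struct bucket_node { struct bucket_node *next; unsigned long key, val; };

    static unsigned long lookup_pull_to_front(struct bucket_node *head,
                                              unsigned long key)
    {
        struct bucket_node *prev = NULL, *x;
        unsigned long tmp;

        for ( x = head; x != NULL; prev = x, x = x->next )
        {
            if ( x->key != key )
                continue;
            if ( x != head )
            {
                /* Unlink the hit and reinsert it right after the head... */
                prev->next = x->next;
                x->next = head->next;
                head->next = x;
                /* ...then swap payloads so the hit occupies the head slot. */
                tmp = head->key; head->key = x->key; x->key = tmp;
                tmp = head->val; head->val = x->val; x->val = tmp;
            }
            return head->val;   /* hit */
        }
        return 0;               /* miss */
    }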
- */ -static inline unsigned long __shadow_status( - struct domain *d, unsigned long gpfn, unsigned long stype) -{ - struct shadow_status *p, *x, *head; - shadow_key_t key; -#if CONFIG_PAGING_LEVELS >= 3 - if ( d->arch.ops->guest_paging_levels == PAGING_L3 && stype == PGT_l4_shadow ) - key = gpfn | stype | index_to_key(get_cr3_idxval(current)); - else -#endif - key = gpfn | stype; - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(gpfn == (gpfn & PGT_mfn_mask)); - ASSERT(stype && !(stype & ~PGT_type_mask)); - - perfc_incrc(shadow_status_calls); - - x = head = hash_bucket(d, gpfn); - p = NULL; - - shadow_audit(d, 0); - - do - { - ASSERT(x->gpfn_and_flags || ((x == head) && (x->next == NULL))); - - if ( x->gpfn_and_flags == key ) - { -#if SHADOW_DEBUG - if ( unlikely(shadow_status_noswap) ) - return x->smfn; -#endif - /* Pull-to-front if 'x' isn't already the head item. */ - if ( unlikely(x != head) ) - { - /* Delete 'x' from list and reinsert immediately after head. */ - p->next = x->next; - x->next = head->next; - head->next = x; - - /* Swap 'x' contents with head contents. */ - SWAP(head->gpfn_and_flags, x->gpfn_and_flags); - SWAP(head->smfn, x->smfn); - } - else - { - perfc_incrc(shadow_status_hit_head); - } - - return head->smfn; - } - - p = x; - x = x->next; - } - while ( x != NULL ); - - perfc_incrc(shadow_status_miss); - return 0; -} - -/* - * Not clear if pull-to-front is worth while for this or not, - * as it generally needs to scan the entire bucket anyway. - * Much simpler without. - * - * Either returns PGT_none, or PGT_l{1,2,3,4}_page_table. - */ -static inline u32 -shadow_max_pgtable_type(struct domain *d, unsigned long gpfn, - unsigned long *smfn) -{ - struct shadow_status *x; - u32 pttype = PGT_none, type; - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(gpfn == (gpfn & PGT_mfn_mask)); - - perfc_incrc(shadow_max_type); - - x = hash_bucket(d, gpfn); - - while ( x && x->gpfn_and_flags ) - { - if ( (x->gpfn_and_flags & PGT_mfn_mask) == gpfn ) - { - type = x->gpfn_and_flags & PGT_type_mask; - - switch ( type ) - { - case PGT_hl2_shadow: - // Treat an HL2 as if it's an L1 - // - type = PGT_l1_shadow; - break; - case PGT_snapshot: - case PGT_writable_pred: - // Ignore snapshots -- they don't in and of themselves constitute - // treating a page as a page table - // - goto next; - case PGT_base_page_table: - // Early exit if we found the max possible value - // - return type; - default: - break; - } - - if ( type > pttype ) - { - pttype = type; - if ( smfn ) - *smfn = x->smfn; - } - } - next: - x = x->next; - } - - return pttype; -} - -static inline void delete_shadow_status( - struct domain *d, unsigned long gpfn, unsigned long gmfn, unsigned int stype, u64 index) -{ - struct shadow_status *p, *x, *n, *head; - - shadow_key_t key = gpfn | stype | index_to_key(index); - - ASSERT(shadow_lock_is_acquired(d)); - ASSERT(!(gpfn & ~PGT_mfn_mask)); - ASSERT(stype && !(stype & ~PGT_type_mask)); - - head = hash_bucket(d, gpfn); - - SH_VLOG("delete gpfn=%lx t=%08x bucket=%p", gpfn, stype, head); - shadow_audit(d, 0); - - /* Match on head item? */ - if ( head->gpfn_and_flags == key ) - { - if ( (n = head->next) != NULL ) - { - /* Overwrite head with contents of following node. */ - head->gpfn_and_flags = n->gpfn_and_flags; - head->smfn = n->smfn; - - /* Delete following node. */ - head->next = n->next; - - /* Add deleted node to the free list. */ - n->gpfn_and_flags = 0; - n->smfn = 0; - n->next = d->arch.shadow_ht_free; - d->arch.shadow_ht_free = n; - } - else - { - /* This bucket is now empty. 
Initialise the head node. */ - head->gpfn_and_flags = 0; - head->smfn = 0; - } - - goto found; - } - - p = head; - x = head->next; - - do - { - if ( x->gpfn_and_flags == key ) - { - /* Delete matching node. */ - p->next = x->next; - - /* Add deleted node to the free list. */ - x->gpfn_and_flags = 0; - x->smfn = 0; - x->next = d->arch.shadow_ht_free; - d->arch.shadow_ht_free = x; - - goto found; - } - - p = x; - x = x->next; - } - while ( x != NULL ); - - /* If we got here, it wasn't in the list! */ - BUG(); - - found: - // release ref to page - if ( stype != PGT_writable_pred ) - put_page(mfn_to_page(gmfn)); - - shadow_audit(d, 0); -} - -static inline void set_shadow_status( - struct domain *d, unsigned long gpfn, unsigned long gmfn, - unsigned long smfn, unsigned long stype, u64 index) -{ - struct shadow_status *x, *head, *extra; - int i; - - shadow_key_t key = gpfn | stype | index_to_key(index); - - SH_VVLOG("set gpfn=%lx gmfn=%lx smfn=%lx t=%lx", gpfn, gmfn, smfn, stype); - - ASSERT(shadow_lock_is_acquired(d)); - - ASSERT(shadow_mode_translate(d) || gpfn); - ASSERT(!(gpfn & ~PGT_mfn_mask)); - - // XXX - need to be more graceful. - ASSERT(VALID_MFN(gmfn)); - - ASSERT(stype && !(stype & ~PGT_type_mask)); - - x = head = hash_bucket(d, gpfn); - - SH_VLOG("set gpfn=%lx smfn=%lx t=%lx bucket=%p(%p)", - gpfn, smfn, stype, x, x->next); - shadow_audit(d, 0); - - // grab a reference to the guest page to represent the entry in the shadow - // hash table - // - // XXX - Should PGT_writable_pred grab a page ref? - // - Who/how are these hash table entry refs flushed if/when a page - // is given away by the domain? - // - if ( stype != PGT_writable_pred ) - get_page(mfn_to_page(gmfn), d); - - /* - * STEP 1. If page is already in the table, update it in place. - */ - do - { - if ( unlikely(x->gpfn_and_flags == key) ) - { - if ( stype != PGT_writable_pred ) - BUG(); // we should never replace entries into the hash table - x->smfn = smfn; - if ( stype != PGT_writable_pred ) - put_page(mfn_to_page(gmfn)); // already had a ref... - goto done; - } - - x = x->next; - } - while ( x != NULL ); - - /* - * STEP 2. The page must be inserted into the table. - */ - - /* If the bucket is empty then insert the new page as the head item. */ - if ( head->gpfn_and_flags == 0 ) - { - head->gpfn_and_flags = key; - head->smfn = smfn; - ASSERT(head->next == NULL); - goto done; - } - - /* We need to allocate a new node. Ensure the quicklist is non-empty. */ - if ( unlikely(d->arch.shadow_ht_free == NULL) ) - { - SH_VLOG("Allocate more shadow hashtable blocks."); - - extra = xmalloc_bytes( - sizeof(void *) + (shadow_ht_extra_size * sizeof(*x))); - - /* XXX Should be more graceful here. */ - if ( extra == NULL ) - BUG(); - - memset(extra, 0, sizeof(void *) + (shadow_ht_extra_size * sizeof(*x))); - - /* Record the allocation block so it can be correctly freed later. */ - d->arch.shadow_extras_count++; - *((struct shadow_status **)&extra[shadow_ht_extra_size]) = - d->arch.shadow_ht_extras; - d->arch.shadow_ht_extras = &extra[0]; - - /* Thread a free chain through the newly-allocated nodes. */ - for ( i = 0; i < (shadow_ht_extra_size - 1); i++ ) - extra[i].next = &extra[i+1]; - extra[i].next = NULL; - - /* Add the new nodes to the free list. */ - d->arch.shadow_ht_free = &extra[0]; - } - - /* Allocate a new node from the quicklist. */ - x = d->arch.shadow_ht_free; - d->arch.shadow_ht_free = x->next; - - /* Initialise the new node and insert directly after the head item. 
*/ - x->gpfn_and_flags = key; - x->smfn = smfn; - x->next = head->next; - head->next = x; - - done: - shadow_audit(d, 0); - - if ( stype <= PGT_l4_shadow ) - { - // add to front of list of pages to check when removing write - // permissions for a page... - // - } -} - -/************************************************************************/ - -static inline void guest_physmap_add_page( - struct domain *d, unsigned long gpfn, unsigned long mfn) -{ - struct domain_mmap_cache c1, c2; - - if ( likely(!shadow_mode_translate(d)) ) - return; - - domain_mmap_cache_init(&c1); - domain_mmap_cache_init(&c2); - shadow_lock(d); - shadow_sync_and_drop_references(d, mfn_to_page(mfn)); - set_p2m_entry(d, gpfn, mfn, &c1, &c2); - set_gpfn_from_mfn(mfn, gpfn); - shadow_unlock(d); - domain_mmap_cache_destroy(&c1); - domain_mmap_cache_destroy(&c2); -} - -static inline void guest_physmap_remove_page( - struct domain *d, unsigned long gpfn, unsigned long mfn) -{ - struct domain_mmap_cache c1, c2; - unsigned long type; - - if ( likely(!shadow_mode_translate(d)) ) - return; - - domain_mmap_cache_init(&c1); - domain_mmap_cache_init(&c2); - shadow_lock(d); - shadow_sync_and_drop_references(d, mfn_to_page(mfn)); - while ( (type = shadow_max_pgtable_type(d, gpfn, NULL)) != PGT_none ) - free_shadow_page(__shadow_status(d, gpfn, type)); - set_p2m_entry(d, gpfn, -1, &c1, &c2); - set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); - shadow_unlock(d); - domain_mmap_cache_destroy(&c1); - domain_mmap_cache_destroy(&c2); -} - -/************************************************************************/ - -void static inline -shadow_update_min_max(unsigned long smfn, int index) -{ - struct page_info *sl1page = mfn_to_page(smfn); - u32 min_max = sl1page->tlbflush_timestamp; - int min = SHADOW_MIN(min_max); - int max = SHADOW_MAX(min_max); - int update = 0; - - if ( index < min ) - { - min = index; - update = 1; - } - if ( index > max ) - { - max = index; - update = 1; - } - if ( update ) - sl1page->tlbflush_timestamp = SHADOW_ENCODE_MIN_MAX(min, max); -} - -#if CONFIG_PAGING_LEVELS <= 2 -extern void shadow_map_l1_into_current_l2(unsigned long va); - -void static inline -shadow_set_l1e(unsigned long va, l1_pgentry_t new_spte, int create_l1_shadow) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - l2_pgentry_t sl2e = {0}; - - __shadow_get_l2e(v, va, &sl2e); - if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) ) - { - /* - * Either the L1 is not shadowed, or the shadow isn't linked into - * the current shadow L2. - */ - if ( create_l1_shadow ) - { - perfc_incrc(shadow_set_l1e_force_map); - shadow_map_l1_into_current_l2(va); - } - else /* check to see if it exists; if so, link it in */ - { - l2_pgentry_t gpde = linear_l2_table(v)[l2_table_offset(va)]; - unsigned long gl1pfn = l2e_get_pfn(gpde); - unsigned long sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow); - - ASSERT( l2e_get_flags(gpde) & _PAGE_PRESENT ); - - if ( sl1mfn ) - { - perfc_incrc(shadow_set_l1e_unlinked); - if ( !get_shadow_ref(sl1mfn) ) - BUG(); - l2pde_general(d, &gpde, &sl2e, sl1mfn); - __guest_set_l2e(v, va, gpde); - __shadow_set_l2e(v, va, sl2e); - } - else - { - // no shadow exists, so there's nothing to do. - perfc_incrc(shadow_set_l1e_fail); - return; - } - } - } - - __shadow_get_l2e(v, va, &sl2e); - - if ( shadow_mode_refcounts(d) ) - { - l1_pgentry_t old_spte = shadow_linear_pg_table[l1_linear_offset(va)]; - - // only do the ref counting if something important changed. 
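
shadow_update_min_max() packs the lowest and highest L1 index written through a shadow into one 32-bit field (stored in tlbflush_timestamp). The SHADOW_MIN/SHADOW_MAX/SHADOW_ENCODE_MIN_MAX macros are not part of this hunk, so the sketch below simply assumes a 16-bit/16-bit split to show the update rule; the real encoding may differ:

    #include <stdint.h>

    /* Illustrative encoding only: low half = min index, high half = max index. */
    #define MM_MIN(mm)        ((int)((mm) & 0xffff))
    #define MM_MAX(mm)        ((int)((mm) >> 16))
    #define MM_ENCODE(mn, mx) (((uint32_t)(mx) << 16) | (uint32_t)(mn))

    /* Widen the recorded [min, max] range to cover 'index'. */
    static uint32_t update_min_max(uint32_t min_max, int index)
    {
        int mn = MM_MIN(min_max), mx = MM_MAX(min_max);

        if ( index < mn ) mn = index;
        if ( index > mx ) mx = index;
        return MM_ENCODE(mn, mx);
    }
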
- // - if ( l1e_has_changed(old_spte, new_spte, _PAGE_RW | _PAGE_PRESENT) ) - { - if ( (l1e_get_flags(new_spte) & _PAGE_PRESENT) && - !shadow_get_page_from_l1e(new_spte, d) ) - new_spte = l1e_empty(); - if ( l1e_get_flags(old_spte) & _PAGE_PRESENT ) - shadow_put_page_from_l1e(old_spte, d); - } - - } - - set_guest_back_ptr(d, new_spte, l2e_get_pfn(sl2e), l1_table_offset(va)); - shadow_linear_pg_table[l1_linear_offset(va)] = new_spte; - shadow_update_min_max(l2e_get_pfn(sl2e), l1_table_offset(va)); -} -#endif -/************************************************************************/ - -static inline int -shadow_mode_page_writable(unsigned long va, struct cpu_user_regs *regs, unsigned long gpfn) -{ - struct vcpu *v = current; - struct domain *d = v->domain; - unsigned long mfn = gmfn_to_mfn(d, gpfn); - u32 type = mfn_to_page(mfn)->u.inuse.type_info & PGT_type_mask; - - if ( shadow_mode_refcounts(d) && - (type == PGT_writable_page) ) - type = shadow_max_pgtable_type(d, gpfn, NULL); - - // Strange but true: writable page tables allow kernel-mode access - // to L1 page table pages via write-protected PTEs... Similarly, write - // access to all page table pages is granted for shadow_mode_write_all - // clients. - // - if ( ((shadow_mode_write_l1(d) && (type == PGT_l1_page_table)) || - (shadow_mode_write_all(d) && type && (type <= PGT_l4_page_table))) && - ((va < HYPERVISOR_VIRT_START) -#if defined(__x86_64__) - || (va >= HYPERVISOR_VIRT_END) -#endif - ) && - guest_kernel_mode(v, regs) ) - return 1; - - return 0; -} - -#if CONFIG_PAGING_LEVELS <= 2 -static inline l1_pgentry_t gva_to_gpte(unsigned long gva) -{ - l2_pgentry_t gpde; - l1_pgentry_t gpte; - struct vcpu *v = current; - - ASSERT( shadow_mode_translate(current->domain) ); - - __guest_get_l2e(v, gva, &gpde); - if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) ) - return l1e_empty();; - - // This is actually overkill - we only need to make sure the hl2 - // is in-sync. - // - shadow_sync_va(v, gva); - - if ( unlikely(__copy_from_user(&gpte, - &linear_pg_table[gva >> PAGE_SHIFT], - sizeof(gpte))) ) - { - FSH_LOG("gva_to_gpte got a fault on gva=%lx", gva); - return l1e_empty(); - } - - return gpte; -} - -static inline unsigned long gva_to_gpa(unsigned long gva) -{ - l1_pgentry_t gpte; - - gpte = gva_to_gpte(gva); - if ( !(l1e_get_flags(gpte) & _PAGE_PRESENT) ) - return 0; - - return l1e_get_paddr(gpte) + (gva & ~PAGE_MASK); -} -#endif - -static inline unsigned long gva_to_mfn(unsigned long gva) -{ - unsigned long gpa = gva_to_gpa(gva); - return get_mfn_from_gpfn(gpa >> PAGE_SHIFT); -} - -/************************************************************************/ - -extern void __update_pagetables(struct vcpu *v); -static inline void update_pagetables(struct vcpu *v) -{ - struct domain *d = v->domain; - int paging_enabled; - - if ( hvm_guest(v) ) - paging_enabled = hvm_paging_enabled(v); - else - // HACK ALERT: there's currently no easy way to figure out if a domU - // has set its arch.guest_table to zero, vs not yet initialized it. - // - paging_enabled = !!pagetable_get_paddr(v->arch.guest_table); - - /* - * We don't call __update_pagetables() when hvm guest paging is - * disabled as we want the linear_pg_table to be inaccessible so that - * we bail out early of shadow_fault() if the hvm guest tries illegal - * accesses while it thinks paging is turned off. 
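
gva_to_gpa() above is just "frame address from the guest PTE plus the offset-in-page bits of the virtual address". A standalone check of that arithmetic, assuming 4KB pages:

    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    /* Combine the physical frame address taken from a (present) PTE with the
     * byte offset within the page taken from the virtual address. */
    static uint64_t gva_to_gpa_example(uint64_t pte_paddr, uint64_t gva)
    {
        return (pte_paddr & PAGE_MASK) + (gva & ~PAGE_MASK);
    }

    /* e.g. pte_paddr = 0x12345000, gva = 0xc0001234  ->  gpa = 0x12345234 */
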
- */ - if ( unlikely(shadow_mode_enabled(d)) && paging_enabled ) - { - shadow_lock(d); - __update_pagetables(v); - shadow_unlock(d); - } - - if ( likely(!shadow_mode_external(d)) ) - { - if ( shadow_mode_enabled(d) ) - v->arch.monitor_table = v->arch.shadow_table; - else -#if CONFIG_PAGING_LEVELS == 4 - if ( !(v->arch.flags & TF_kernel_mode) ) - v->arch.monitor_table = v->arch.guest_table_user; - else -#endif - v->arch.monitor_table = v->arch.guest_table; - } -} - -void clear_all_shadow_status(struct domain *d); - -#if SHADOW_DEBUG -extern int _check_pagetable(struct vcpu *v, char *s); -extern int _check_all_pagetables(struct vcpu *v, char *s); - -#define check_pagetable(_v, _s) _check_pagetable(_v, _s) -//#define check_pagetable(_v, _s) _check_all_pagetables(_v, _s) - -#else -#define check_pagetable(_v, _s) ((void)0) -#endif - -#endif /* XEN_SHADOW_H */ +#endif /* _XEN_SHADOW_H */ /* * Local variables: diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/x86_32/page-2level.h --- a/xen/include/asm-x86/x86_32/page-2level.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/x86_32/page-2level.h Wed Aug 16 17:11:56 2006 +0100 @@ -46,6 +46,7 @@ typedef l2_pgentry_t root_pgentry_t; * 12-bit flags = (pte[11:0]) */ +#define _PAGE_NX_BIT 0U #define _PAGE_NX 0U /* Extract flags into 12-bit integer, or turn 12-bit flags into a pte mask. */ diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/x86_32/page-3level.h --- a/xen/include/asm-x86/x86_32/page-3level.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/x86_32/page-3level.h Wed Aug 16 17:11:56 2006 +0100 @@ -59,7 +59,8 @@ typedef l3_pgentry_t root_pgentry_t; * 32-bit flags = (pte[63:44],pte[11:0]) */ -#define _PAGE_NX (cpu_has_nx ? (1<<31) : 0) +#define _PAGE_NX_BIT (1U<<31) +#define _PAGE_NX (cpu_has_nx ? _PAGE_NX_BIT : 0) /* Extract flags into 32-bit integer, or turn 32-bit flags into a pte mask. */ #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF)) diff -r f2151423f729 -r 01345b08d012 xen/include/asm-x86/x86_64/page.h --- a/xen/include/asm-x86/x86_64/page.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/asm-x86/x86_64/page.h Wed Aug 16 17:11:56 2006 +0100 @@ -44,6 +44,8 @@ typedef l4_pgentry_t root_pgentry_t; /* Given a virtual address, get an entry offset into a linear page table. */ #define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT) #define l2_linear_offset(_a) (((_a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT) +#define l3_linear_offset(_a) (((_a) & VADDR_MASK) >> L3_PAGETABLE_SHIFT) +#define l4_linear_offset(_a) (((_a) & VADDR_MASK) >> L4_PAGETABLE_SHIFT) #define is_guest_l1_slot(_s) (1) #define is_guest_l2_slot(_t, _s) (1) @@ -70,7 +72,8 @@ typedef l4_pgentry_t root_pgentry_t; #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF)) /* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/ -#define _PAGE_NX (cpu_has_nx ? (1U<<23) : 0U) +#define _PAGE_NX_BIT (1U<<23) +#define _PAGE_NX (cpu_has_nx ? 
_PAGE_NX_BIT : 0U) #define L1_DISALLOW_MASK BASE_DISALLOW_MASK #define L2_DISALLOW_MASK BASE_DISALLOW_MASK diff -r f2151423f729 -r 01345b08d012 xen/include/public/dom0_ops.h --- a/xen/include/public/dom0_ops.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/public/dom0_ops.h Wed Aug 16 17:11:56 2006 +0100 @@ -262,6 +262,18 @@ DEFINE_XEN_GUEST_HANDLE(dom0_sched_id_t) #define DOM0_SHADOW_CONTROL_OP_CLEAN 11 #define DOM0_SHADOW_CONTROL_OP_PEEK 12 +/* Shadow2 operations */ +#define DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION 30 +#define DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION 31 +#define DOM0_SHADOW2_CONTROL_OP_ENABLE 32 + +/* Mode flags for Shadow2 enable op */ +#define DOM0_SHADOW2_CONTROL_FLAG_ENABLE (1 << 0) +#define DOM0_SHADOW2_CONTROL_FLAG_REFCOUNT (1 << 1) +#define DOM0_SHADOW2_CONTROL_FLAG_LOG_DIRTY (1 << 2) +#define DOM0_SHADOW2_CONTROL_FLAG_TRANSLATE (1 << 3) +#define DOM0_SHADOW2_CONTROL_FLAG_EXTERNAL (1 << 4) + struct dom0_shadow_control_stats { uint32_t fault_count; uint32_t dirty_count; @@ -277,7 +289,9 @@ struct dom0_shadow_control { uint32_t op; XEN_GUEST_HANDLE(ulong) dirty_bitmap; /* IN/OUT variables. */ - uint64_t pages; /* size of buffer, updated with actual size */ + uint64_t pages; /* size of buffer, updated with actual size */ + uint32_t mb; /* Shadow2 memory allocation in MB */ + uint32_t mode; /* Shadow2 mode to enable */ /* OUT variables. */ struct dom0_shadow_control_stats stats; }; diff -r f2151423f729 -r 01345b08d012 xen/include/xen/domain_page.h --- a/xen/include/xen/domain_page.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/xen/domain_page.h Wed Aug 16 17:11:56 2006 +0100 @@ -25,6 +25,13 @@ extern void *map_domain_page(unsigned lo * currently-executing VCPU via a call to map_domain_pages(). */ extern void unmap_domain_page(void *va); + +/* + * Convert a VA (within a page previously mapped in the context of the + * currently-executing VCPU via a call to map_domain_pages()) to a machine + * address + */ +extern paddr_t mapped_domain_page_to_maddr(void *va); /* * Similar to the above calls, except the mapping is accessible in all @@ -98,6 +105,7 @@ domain_mmap_cache_destroy(struct domain_ #define map_domain_page(pfn) maddr_to_virt((pfn)<<PAGE_SHIFT) #define unmap_domain_page(va) ((void)(va)) +#define mapped_domain_page_to_maddr(va) (virt_to_maddr(va)) #define map_domain_page_global(pfn) maddr_to_virt((pfn)<<PAGE_SHIFT) #define unmap_domain_page_global(va) ((void)(va)) @@ -112,4 +120,9 @@ struct domain_mmap_cache { #endif /* !CONFIG_DOMAIN_PAGE */ +#define HERE_I_AM \ +do { \ + printk("HERE I AM: %s %s %d\n", __func__, __FILE__, __LINE__); \ +} while (0) + #endif /* __XEN_DOMAIN_PAGE_H__ */ diff -r f2151423f729 -r 01345b08d012 xen/include/xen/lib.h --- a/xen/include/xen/lib.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/xen/lib.h Wed Aug 16 17:11:56 2006 +0100 @@ -18,7 +18,7 @@ extern void __bug(char *file, int line) #ifndef NDEBUG #define ASSERT(_p) \ do { \ - if ( !(_p) ) \ + if ( unlikely(!(_p)) ) \ { \ printk("Assertion '%s' failed, line %d, file %s\n", #_p , \ __LINE__, __FILE__); \ @@ -41,7 +41,7 @@ void cmdline_parse(char *cmdline); void cmdline_parse(char *cmdline); #ifndef NDEBUG -extern int debugtrace_send_to_console; +extern void debugtrace_toggle(void); extern void debugtrace_dump(void); extern void debugtrace_printk(const char *fmt, ...); #else diff -r f2151423f729 -r 01345b08d012 xen/include/xen/list.h --- a/xen/include/xen/list.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/xen/list.h Wed Aug 16 17:11:56 2006 +0100 @@ -160,6 +160,16 @@ static 
__inline__ void list_splice(struc #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) + +/** + * list_for_each_backwards_safe - iterate backwards over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_backwards_safe(pos, n, head) \ + for (pos = (head)->prev, n = pos->prev; pos != (head); \ + pos = n, n = pos->prev) /** * list_for_each_entry - iterate over list of given type diff -r f2151423f729 -r 01345b08d012 xen/include/xen/sched.h --- a/xen/include/xen/sched.h Wed Aug 16 16:48:45 2006 +0100 +++ b/xen/include/xen/sched.h Wed Aug 16 17:11:56 2006 +0100 @@ -376,9 +376,12 @@ extern struct domain *domain_list; /* VCPU is paused by the hypervisor? */ #define _VCPUF_paused 11 #define VCPUF_paused (1UL<<_VCPUF_paused) - /* VCPU is blocked awaiting an event to be consumed by Xen. */ +/* VCPU is blocked awaiting an event to be consumed by Xen. */ #define _VCPUF_blocked_in_xen 12 #define VCPUF_blocked_in_xen (1UL<<_VCPUF_blocked_in_xen) + /* HVM vcpu thinks CR0.PG == 0 */ +#define _VCPUF_shadow2_translate 13 +#define VCPUF_shadow2_translate (1UL<<_VCPUF_shadow2_translate) /* * Per-domain flags (domain_flags). diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/shadow2-common.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/shadow2-common.c Wed Aug 16 17:11:56 2006 +0100 @@ -0,0 +1,3394 @@ +/****************************************************************************** + * arch/x86/shadow2-common.c + * + * Shadow2 code that does not need to be multiply compiled. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
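
The new list_for_each_backwards_safe() macro is the reverse-direction twin of list_for_each_safe(); shadow2_prealloc() further down in this patch uses it to walk the top-level shadow list from the tail. A minimal usage sketch follows, assuming the surrounding xen/list.h and xmalloc environment; struct victim and its keep flag are invented for the example:

    /* Walk a list tail-first, deleting entries that match; 'pos' may be freed
     * inside the loop because 'n' already holds the previous element. */
    struct victim {
        struct list_head list;
        int keep;
    };

    static void prune_from_tail(struct list_head *head)
    {
        struct list_head *pos, *n;

        list_for_each_backwards_safe(pos, n, head)
        {
            struct victim *v = list_entry(pos, struct victim, list);
            if ( !v->keep )
            {
                list_del(&v->list);
                xfree(v);
            }
        }
    }
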
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define SHADOW2 1 + +#include <xen/config.h> +#include <xen/types.h> +#include <xen/mm.h> +#include <xen/trace.h> +#include <xen/sched.h> +#include <xen/perfc.h> +#include <xen/irq.h> +#include <xen/domain_page.h> +#include <xen/guest_access.h> +#include <asm/event.h> +#include <asm/page.h> +#include <asm/current.h> +#include <asm/flushtlb.h> +#include <asm/shadow2.h> +#include <asm/shadow2-private.h> + +#if SHADOW2_AUDIT +int shadow2_audit_enable = 0; +#endif + +static void sh2_free_log_dirty_bitmap(struct domain *d); + +int _shadow2_mode_refcounts(struct domain *d) +{ + return shadow2_mode_refcounts(d); +} + + +/**************************************************************************/ +/* x86 emulator support for the shadow2 code + */ + +static int +sh2_x86_emulate_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; + if ( hvm_guest(v) ) + { + *val = 0; + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that is only a user vs supervisor access check. + // + if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) ) + { +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, + addr, *val, bytes); +#endif + return X86EMUL_CONTINUE; + } + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating. */ + SHADOW2_PRINTK("read failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh2_x86_emulate_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, val, bytes); +#endif + if ( hvm_guest(v) ) + { + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that includes user vs supervisor, and + // write access. + // + if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) ) + return X86EMUL_CONTINUE; + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating, + * which should be handled by sh2_x86_emulate_write_emulated. 
*/ + SHADOW2_PRINTK("write failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh2_x86_emulate_write_emulated(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, val, bytes); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow2->x86_emulate_write(v, addr, &val, bytes, ctxt); + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh2_x86_emulate_cmpxchg_emulated(unsigned long addr, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, old, new, bytes); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow2->x86_emulate_cmpxchg(v, addr, old, new, + bytes, ctxt); + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh2_x86_emulate_cmpxchg8b_emulated(unsigned long addr, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW2_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n", + v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo, + new_hi, new_lo, ctxt); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow2->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi, + new_lo, new_hi, ctxt); + } + else + { + SHADOW2_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + + +struct x86_emulate_ops shadow2_emulator_ops = { + .read_std = sh2_x86_emulate_read_std, + .write_std = sh2_x86_emulate_write_std, + .read_emulated = sh2_x86_emulate_read_std, + .write_emulated = sh2_x86_emulate_write_emulated, + .cmpxchg_emulated = sh2_x86_emulate_cmpxchg_emulated, + .cmpxchg8b_emulated = sh2_x86_emulate_cmpxchg8b_emulated, +}; + + +/**************************************************************************/ +/* Code for "promoting" a guest page to the point where the shadow code is + * willing to let it be treated as a guest page table. This generally + * involves making sure there are no writable mappings available to the guest + * for this page. + */ +void shadow2_promote(struct vcpu *v, mfn_t gmfn, u32 type) +{ + struct page_info *page = mfn_to_page(gmfn); + unsigned long type_info; + + ASSERT(valid_mfn(gmfn)); + + /* We should never try to promote a gmfn that has writeable mappings */ + ASSERT(shadow2_remove_write_access(v, gmfn, 0, 0) == 0); + + // Is the page already shadowed? + if ( !test_and_set_bit(_PGC_page_table, &page->count_info) ) + { + // No prior shadow exists... + + // Grab a type-ref. We don't really care if we are racing with another + // vcpu or not, or even what kind of type we get; we just want the type + // count to be > 0. + // + do { + type_info = + page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask); + } while ( !get_page_type(page, type_info) ); + + // Now that the type ref is non-zero, we can safely use the + // shadow2_flags. 
+ // + page->shadow2_flags = 0; + } + + ASSERT(!test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags)); + set_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags); +} + +void shadow2_demote(struct vcpu *v, mfn_t gmfn, u32 type) +{ + struct page_info *page = mfn_to_page(gmfn); + + ASSERT(test_bit(_PGC_page_table, &page->count_info)); + ASSERT(test_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags)); + + clear_bit(type >> PGC_SH2_type_shift, &page->shadow2_flags); + + if ( (page->shadow2_flags & SH2F_page_type_mask) == 0 ) + { + // release the extra type ref + put_page_type(page); + + // clear the is-a-page-table bit. + clear_bit(_PGC_page_table, &page->count_info); + } +} + +/**************************************************************************/ +/* Validate a pagetable change from the guest and update the shadows. + * Returns a bitmask of SHADOW2_SET_* flags. */ + +static int +__shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size) +{ + int result = 0; + struct page_info *page = mfn_to_page(gmfn); + + sh2_mark_dirty(v->domain, gmfn); + + // Determine which types of shadows are affected, and update each. + // + // Always validate L1s before L2s to prevent another cpu with a linear + // mapping of this gmfn from seeing a walk that results from + // using the new L2 value and the old L1 value. (It is OK for such a + // guest to see a walk that uses the old L2 value with the new L1 value, + // as hardware could behave this way if one level of the pagewalk occurs + // before the store, and the next level of the pagewalk occurs after the + // store. + // + // Ditto for L2s before L3s, etc. + // + + if ( !(page->count_info & PGC_page_table) ) + return 0; /* Not shadowed at all */ + +#if CONFIG_PAGING_LEVELS == 2 + if ( page->shadow2_flags & SH2F_L1_32 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 2, 2) + (v, gmfn, entry, size); +#else + if ( page->shadow2_flags & SH2F_L1_32 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 2) + (v, gmfn, entry, size); +#endif + +#if CONFIG_PAGING_LEVELS == 2 + if ( page->shadow2_flags & SH2F_L2_32 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 2, 2) + (v, gmfn, entry, size); +#else + if ( page->shadow2_flags & SH2F_L2_32 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 2) + (v, gmfn, entry, size); +#endif + +#if CONFIG_PAGING_LEVELS >= 3 + if ( page->shadow2_flags & SH2F_L1_PAE ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L2_PAE ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L2H_PAE ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2he, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L3_PAE ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 3, 3) + (v, gmfn, entry, size); +#else /* 32-bit non-PAE hypervisor does not support PAE guests */ + ASSERT((page->shadow2_flags & (SH2F_L3_PAE|SH2F_L2_PAE|SH2F_L1_PAE)) == 0); +#endif + +#if CONFIG_PAGING_LEVELS >= 4 + if ( page->shadow2_flags & SH2F_L1_64 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl1e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L2_64 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl2e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow2_flags & SH2F_L3_64 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl3e, 4, 4) + (v, gmfn, entry, size); + if 
( page->shadow2_flags & SH2F_L4_64 ) + result |= SHADOW2_INTERNAL_NAME(sh2_map_and_validate_gl4e, 4, 4) + (v, gmfn, entry, size); +#else /* 32-bit/PAE hypervisor does not support 64-bit guests */ + ASSERT((page->shadow2_flags + & (SH2F_L4_64|SH2F_L3_64|SH2F_L2_64|SH2F_L1_64)) == 0); +#endif + + return result; +} + + +int +shadow2_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry) +/* This is the entry point from hypercalls. It returns a bitmask of all the + * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */ +{ + int rc; + + ASSERT(shadow2_lock_is_acquired(v->domain)); + rc = __shadow2_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t)); + shadow2_audit_tables(v); + return rc; +} + +void +shadow2_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size) +/* This is the entry point for emulated writes to pagetables in HVM guests */ +{ + struct domain *d = v->domain; + int rc; + + ASSERT(shadow2_lock_is_acquired(v->domain)); + rc = __shadow2_validate_guest_entry(v, gmfn, entry, size); + if ( rc & SHADOW2_SET_FLUSH ) + { + // Flush everyone except the local processor, which will flush when it + // re-enters the HVM guest. + // + cpumask_t mask = d->domain_dirty_cpumask; + cpu_clear(v->processor, mask); + flush_tlb_mask(mask); + } + if ( rc & SHADOW2_SET_ERROR ) + { + /* This page is probably not a pagetable any more: tear it out of the + * shadows, along with any tables that reference it */ + shadow2_remove_all_shadows_and_parents(v, gmfn); + } + /* We ignore the other bits: since we are about to change CR3 on + * VMENTER we don't need to do any extra TLB flushes. */ +} + + +/**************************************************************************/ +/* Memory management for shadow pages. */ + +/* Meaning of the count_info field in shadow pages + * ---------------------------------------------- + * + * A count of all references to this page from other shadow pages and + * guest CR3s (a.k.a. v->arch.shadow_table). + * + * The top bits hold the shadow type and the pinned bit. Top-level + * shadows are pinned so that they don't disappear when not in a CR3 + * somewhere. + * + * We don't need to use get|put_page for this as the updates are all + * protected by the shadow lock. We can't use get|put_page for this + * as the size of the count on shadow pages is different from that on + * normal guest pages. + */ + +/* Meaning of the type_info field in shadow pages + * ---------------------------------------------- + * + * type_info use depends on the shadow type (from count_info) + * + * PGC_SH2_none : This page is in the shadow2 free pool. type_info holds + * the chunk order for our freelist allocator. + * + * PGC_SH2_l*_shadow : This page is in use as a shadow. type_info + * holds the mfn of the guest page being shadowed, + * + * PGC_SH2_fl1_*_shadow : This page is being used to shatter a superpage. + * type_info holds the gfn being shattered. + * + * PGC_SH2_monitor_table : This page is part of a monitor table. + * type_info is not used. + */ + +/* Meaning of the _domain field in shadow pages + * -------------------------------------------- + * + * In shadow pages, this field will always have its least significant bit + * set. This ensures that all attempts to get_page() will fail (as all + * valid pickled domain pointers have a zero for their least significant bit). + * Instead, the remaining upper bits are used to record the shadow generation + * counter when the shadow was created. 
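
The TLB handling in shadow2_validate_guest_pt_write() above is worth restating on its own: flush every CPU that may hold stale shadow mappings for the domain, except the local one, which flushes anyway when it re-enters the guest. Condensed from the hunk, using the same cpumask primitives; the helper name is invented:

    /* Flush remote TLBs that may cache the domain's shadow mappings, skipping
     * the CPU we are running on (it flushes on the next guest entry). */
    static void flush_others(struct vcpu *v)
    {
        struct domain *d = v->domain;
        cpumask_t mask = d->domain_dirty_cpumask;

        cpu_clear(v->processor, mask);
        flush_tlb_mask(mask);
    }
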
+ */ + +/* Meaning of the shadow2_flags field + * ---------------------------------- + * + * In guest pages that are shadowed, one bit for each kind of shadow they have. + * + * In shadow pages, will be used for holding a representation of the populated + * entries in this shadow (either a min/max, or a bitmap, or ...) + * + * In monitor-table pages, holds the level of the particular page (to save + * spilling the shadow types into an extra bit by having three types of monitor + * page). + */ + +/* Meaning of the list_head struct in shadow pages + * ----------------------------------------------- + * + * In free shadow pages, this is used to hold the free-lists of chunks. + * + * In top-level shadow tables, this holds a linked-list of all top-level + * shadows (used for recovering memory and destroying shadows). + * + * In lower-level shadows, this holds the physical address of a higher-level + * shadow entry that holds a reference to this shadow (or zero). + */ + +/* Allocating shadow pages + * ----------------------- + * + * Most shadow pages are allocated singly, but there are two cases where we + * need to allocate multiple pages together. + * + * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows. + * A 32-bit guest l1 table covers 4MB of virtuial address space, + * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB + * of virtual address space each). Similarly, a 32-bit guest l2 table + * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va + * each). These multi-page shadows are contiguous and aligned; + * functions for handling offsets into them are defined in shadow2.c + * (shadow_l1_index() etc.) + * + * 2: Shadowing PAE top-level pages. Each guest page that contains + * any PAE top-level pages requires two shadow pages to shadow it. + * They contain alternating l3 tables and pae_l3_bookkeeping structs. + * + * This table shows the allocation behaviour of the different modes: + * + * Xen paging 32b pae pae 64b 64b 64b + * Guest paging 32b 32b pae 32b pae 64b + * PV or HVM * HVM * HVM HVM * + * Shadow paging 32b pae pae pae pae 64b + * + * sl1 size 4k 8k 4k 8k 4k 4k + * sl2 size 4k 16k 4k 16k 4k 4k + * sl3 size - - 8k - 8k 4k + * sl4 size - - - - - 4k + * + * We allocate memory from xen in four-page units and break them down + * with a simple buddy allocator. Can't use the xen allocator to handle + * this as it only works for contiguous zones, and a domain's shadow + * pool is made of fragments. + * + * In HVM guests, the p2m table is built out of shadow pages, and we provide + * a function for the p2m management to steal pages, in max-order chunks, from + * the free pool. We don't provide for giving them back, yet. + */ + +/* Figure out the least acceptable quantity of shadow memory. + * The minimum memory requirement for always being able to free up a + * chunk of memory is very small -- only three max-order chunks per + * vcpu to hold the top level shadows and pages with Xen mappings in them. + * + * But for a guest to be guaranteed to successfully execute a single + * instruction, we must be able to map a large number (about thirty) VAs + * at the same time, which means that to guarantee progress, we must + * allow for more than ninety allocated pages per vcpu. We round that + * up to 128 pages, or half a megabyte per vcpu. 
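
The comment above fixes the per-vcpu floor at 128 shadow pages (half a megabyte), and set_sh2_allocation() later adds roughly one page per megabyte of guest RAM (tot_pages/256). A small standalone sketch of that lower bound, assuming 4KB pages:

    #include <stdio.h>

    #define PAGE_SIZE          4096u
    #define MIN_PAGES_PER_VCPU 128u      /* 128 * 4KB = 512KB per vcpu */

    /* Lower bound on the shadow pool: the per-vcpu minimum plus roughly one
     * page per megabyte of guest memory (tot_pages / 256). */
    static unsigned int shadow_lower_bound(unsigned int vcpus,
                                           unsigned int tot_pages)
    {
        return vcpus * MIN_PAGES_PER_VCPU + tot_pages / 256;
    }

    int main(void)
    {
        /* e.g. 2 vcpus, 256MB guest (65536 pages): 256 + 256 = 512 pages = 2MB */
        printf("%u pages\n", shadow_lower_bound(2, 65536));
        return 0;
    }
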
*/ +unsigned int shadow2_min_acceptable_pages(struct domain *d) +{ + u32 vcpu_count = 0; + struct vcpu *v; + + for_each_vcpu(d, v) + vcpu_count++; + + return (vcpu_count * 128); +} + +/* Using the type_info field to store freelist order */ +#define SH2_PFN_ORDER(_p) ((_p)->u.inuse.type_info) +#define SH2_SET_PFN_ORDER(_p, _o) \ + do { (_p)->u.inuse.type_info = (_o); } while (0) + + +/* Figure out the order of allocation needed for a given shadow type */ +static inline u32 +shadow_order(u32 shadow_type) +{ +#if CONFIG_PAGING_LEVELS > 2 + static const u32 type_to_order[16] = { + 0, /* PGC_SH2_none */ + 1, /* PGC_SH2_l1_32_shadow */ + 1, /* PGC_SH2_fl1_32_shadow */ + 2, /* PGC_SH2_l2_32_shadow */ + 0, /* PGC_SH2_l1_pae_shadow */ + 0, /* PGC_SH2_fl1_pae_shadow */ + 0, /* PGC_SH2_l2_pae_shadow */ + 0, /* PGC_SH2_l2h_pae_shadow */ + 1, /* PGC_SH2_l3_pae_shadow */ + 0, /* PGC_SH2_l1_64_shadow */ + 0, /* PGC_SH2_fl1_64_shadow */ + 0, /* PGC_SH2_l2_64_shadow */ + 0, /* PGC_SH2_l3_64_shadow */ + 0, /* PGC_SH2_l4_64_shadow */ + 2, /* PGC_SH2_p2m_table */ + 0 /* PGC_SH2_monitor_table */ + }; + u32 type = (shadow_type & PGC_SH2_type_mask) >> PGC_SH2_type_shift; + return type_to_order[type]; +#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */ + return 0; +#endif +} + + +/* Do we have a free chunk of at least this order? */ +static inline int chunk_is_available(struct domain *d, int order) +{ + int i; + + for ( i = order; i <= SHADOW2_MAX_ORDER; i++ ) + if ( !list_empty(&d->arch.shadow2_freelists[i]) ) + return 1; + return 0; +} + +/* Dispatcher function: call the per-mode function that will unhook the + * non-Xen mappings in this top-level shadow mfn */ +void shadow2_unhook_mappings(struct vcpu *v, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + switch ( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift ) + { + case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift: +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,2,2)(v,smfn); +#else + SHADOW2_INTERNAL_NAME(sh2_unhook_32b_mappings,3,2)(v,smfn); +#endif + break; +#if CONFIG_PAGING_LEVELS >= 3 + case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_unhook_pae_mappings,3,3)(v,smfn); + break; +#endif +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_unhook_64b_mappings,4,4)(v,smfn); + break; +#endif + default: + SHADOW2_PRINTK("top-level shadow has bad type %08lx\n", + (unsigned long)((pg->count_info & PGC_SH2_type_mask) + >> PGC_SH2_type_shift)); + BUG(); + } +} + + +/* Make sure there is at least one chunk of the required order available + * in the shadow page pool. This must be called before any calls to + * shadow2_alloc(). Since this will free existing shadows to make room, + * it must be called early enough to avoid freeing shadows that the + * caller is currently working on. 
*/ +void shadow2_prealloc(struct domain *d, unsigned int order) +{ + /* Need a vpcu for calling unpins; for now, since we don't have + * per-vcpu shadows, any will do */ + struct vcpu *v = d->vcpu[0]; + struct list_head *l, *t; + struct page_info *pg; + mfn_t smfn; + + if ( chunk_is_available(d, order) ) return; + + /* Stage one: walk the list of top-level pages, unpinning them */ + perfc_incrc(shadow2_prealloc_1); + list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + smfn = page_to_mfn(pg); + +#if CONFIG_PAGING_LEVELS >= 3 + if ( (pg->count_info & PGC_SH2_type_mask) == PGC_SH2_l3_pae_shadow ) + { + /* For PAE, we need to unpin each subshadow on this shadow */ + SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); + } + else +#endif /* 32-bit code always takes this branch */ + { + /* Unpin this top-level shadow */ + sh2_unpin(v, smfn); + } + + /* See if that freed up a chunk of appropriate size */ + if ( chunk_is_available(d, order) ) return; + } + + /* Stage two: all shadow pages are in use in hierarchies that are + * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen + * mappings. */ + perfc_incrc(shadow2_prealloc_2); + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + /* Walk the list from the tail: recently used toplevels have been pulled + * to the head */ + list_for_each_backwards_safe(l, t, &d->arch.shadow2_toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + smfn = page_to_mfn(pg); + shadow2_unhook_mappings(v, smfn); + + /* Need to flush TLB if we've altered our own tables */ + if ( !shadow2_mode_external(d) + && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) ) + local_flush_tlb(); + + /* See if that freed up a chunk of appropriate size */ + if ( chunk_is_available(d, order) ) return; + } + + /* Nothing more we can do: all remaining shadows are of pages that + * hold Xen mappings for some vcpu. This can never happen. */ + SHADOW2_PRINTK("Can't pre-allocate %i shadow pages!\n" + " shadow pages total = %u, free = %u, p2m=%u\n", + 1 << order, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + BUG(); +} + + +/* Allocate another shadow's worth of (contiguous, aligned) pages, + * and fill in the type and backpointer fields of their page_infos. + * Never fails to allocate. */ +mfn_t shadow2_alloc(struct domain *d, + u32 shadow_type, + unsigned long backpointer) +{ + struct page_info *pg = NULL; + unsigned int order = shadow_order(shadow_type); + cpumask_t mask; + void *p; + int i; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(order <= SHADOW2_MAX_ORDER); + ASSERT(shadow_type != PGC_SH2_none); + perfc_incrc(shadow2_alloc); + + /* Find smallest order which can satisfy the request. */ + for ( i = order; i <= SHADOW2_MAX_ORDER; i++ ) + if ( !list_empty(&d->arch.shadow2_freelists[i]) ) + { + pg = list_entry(d->arch.shadow2_freelists[i].next, + struct page_info, list); + list_del(&pg->list); + + /* We may have to halve the chunk a number of times. 
*/ + while ( i != order ) + { + i--; + SH2_SET_PFN_ORDER(pg, i); + list_add_tail(&pg->list, &d->arch.shadow2_freelists[i]); + pg += 1 << i; + } + d->arch.shadow2_free_pages -= 1 << order; + + /* Init page info fields and clear the pages */ + for ( i = 0; i < 1<<order ; i++ ) + { + pg[i].u.inuse.type_info = backpointer; + pg[i].count_info = shadow_type; + pg[i].shadow2_flags = 0; + INIT_LIST_HEAD(&pg[i].list); + /* Before we overwrite the old contents of this page, + * we need to be sure that no TLB holds a pointer to it. */ + mask = d->domain_dirty_cpumask; + tlbflush_filter(mask, pg[i].tlbflush_timestamp); + if ( unlikely(!cpus_empty(mask)) ) + { + perfc_incrc(shadow2_alloc_tlbflush); + flush_tlb_mask(mask); + } + /* Now safe to clear the page for reuse */ + p = sh2_map_domain_page(page_to_mfn(pg+i)); + ASSERT(p != NULL); + clear_page(p); + sh2_unmap_domain_page(p); + perfc_incr(shadow2_alloc_count); + } + return page_to_mfn(pg); + } + + /* If we get here, we failed to allocate. This should never happen. + * It means that we didn't call shadow2_prealloc() correctly before + * we allocated. We can't recover by calling prealloc here, because + * we might free up higher-level pages that the caller is working on. */ + SHADOW2_PRINTK("Can't allocate %i shadow pages!\n", 1 << order); + BUG(); +} + + +/* Return some shadow pages to the pool. */ +void shadow2_free(struct domain *d, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + u32 shadow_type; + unsigned long order; + unsigned long mask; + int i; + + ASSERT(shadow2_lock_is_acquired(d)); + perfc_incrc(shadow2_free); + + shadow_type = pg->count_info & PGC_SH2_type_mask; + ASSERT(shadow_type != PGC_SH2_none); + ASSERT(shadow_type != PGC_SH2_p2m_table); + order = shadow_order(shadow_type); + + d->arch.shadow2_free_pages += 1 << order; + + for ( i = 0; i < 1<<order; i++ ) + { + /* Strip out the type: this is now a free shadow page */ + pg[i].count_info = 0; + /* Remember the TLB timestamp so we will know whether to flush + * TLBs when we reuse the page. Because the destructors leave the + * contents of the pages in place, we can delay TLB flushes until + * just before the allocator hands the page out again. */ + pg[i].tlbflush_timestamp = tlbflush_current_time(); + perfc_decr(shadow2_alloc_count); + } + + /* Merge chunks as far as possible. */ + while ( order < SHADOW2_MAX_ORDER ) + { + mask = 1 << order; + if ( (mfn_x(page_to_mfn(pg)) & mask) ) { + /* Merge with predecessor block? */ + if ( (((pg-mask)->count_info & PGC_SH2_type_mask) != PGT_none) + || (SH2_PFN_ORDER(pg-mask) != order) ) + break; + list_del(&(pg-mask)->list); + pg -= mask; + } else { + /* Merge with successor block? */ + if ( (((pg+mask)->count_info & PGC_SH2_type_mask) != PGT_none) + || (SH2_PFN_ORDER(pg+mask) != order) ) + break; + list_del(&(pg+mask)->list); + } + order++; + } + + SH2_SET_PFN_ORDER(pg, order); + list_add_tail(&pg->list, &d->arch.shadow2_freelists[order]); +} + +/* Divert some memory from the pool to be used by the p2m mapping. + * This action is irreversible: the p2m mapping only ever grows. + * That's OK because the p2m table only exists for external domains, + * and those domains can't ever turn off shadow mode. + * Also, we only ever allocate a max-order chunk, so as to preserve + * the invariant that shadow2_prealloc() always works. + * Returns 0 iff it can't get a chunk (the caller should then + * free up some pages in domheap and call set_sh2_allocation); + * returns non-zero on success. 
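
shadow2_alloc()/shadow2_free() above are a tiny buddy allocator over the shadow pool. The split-on-allocate and merge-on-free steps, stripped of the page_info bookkeeping, look roughly like the model below; the free_list_add()/buddy_is_free() callbacks stand in for the real freelist handling and count_info/order checks, and the real code also unlinks the buddy when merging:

    #define MAX_ORDER 2                  /* illustrative; real limit is SHADOW2_MAX_ORDER */

    /* Split a free chunk of order 'i' down to the requested 'order': at each
     * step the leading half goes back on the order-(i-1) free list and we
     * keep the trailing half, as the real allocation loop does. */
    static unsigned long buddy_split(unsigned long pfn, int i, int order,
                                     void (*free_list_add)(unsigned long, int))
    {
        while ( i != order )
        {
            i--;
            free_list_add(pfn, i);       /* leading half back on the free list */
            pfn += 1UL << i;             /* keep the trailing half */
        }
        return pfn;
    }

    /* On free, a chunk and its buddy (which differ only in bit 'order' of the
     * pfn) merge into one chunk of the next order up, provided the buddy is
     * itself free and of the same order. */
    static unsigned long buddy_merge(unsigned long pfn, int *order,
                                     int (*buddy_is_free)(unsigned long, int))
    {
        while ( *order < MAX_ORDER )
        {
            unsigned long buddy = pfn ^ (1UL << *order);
            if ( !buddy_is_free(buddy, *order) )
                break;
            if ( buddy < pfn )
                pfn = buddy;             /* merged chunk starts at the lower buddy */
            (*order)++;
        }
        return pfn;
    }
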
+ */ +static int +shadow2_alloc_p2m_pages(struct domain *d) +{ + struct page_info *pg; + u32 i; + ASSERT(shadow2_lock_is_acquired(d)); + + if ( d->arch.shadow2_total_pages + < (shadow2_min_acceptable_pages(d) + (1<<SHADOW2_MAX_ORDER)) ) + return 0; /* Not enough shadow memory: need to increase it first */ + + pg = mfn_to_page(shadow2_alloc(d, PGC_SH2_p2m_table, 0)); + d->arch.shadow2_p2m_pages += (1<<SHADOW2_MAX_ORDER); + d->arch.shadow2_total_pages -= (1<<SHADOW2_MAX_ORDER); + for (i = 0; i < (1<<SHADOW2_MAX_ORDER); i++) + { + /* Unlike shadow pages, mark p2m pages as owned by the domain */ + page_set_owner(&pg[i], d); + list_add_tail(&pg[i].list, &d->arch.shadow2_p2m_freelist); + } + return 1; +} + +// Returns 0 if no memory is available... +mfn_t +shadow2_alloc_p2m_page(struct domain *d) +{ + struct list_head *entry; + mfn_t mfn; + void *p; + + if ( list_empty(&d->arch.shadow2_p2m_freelist) && + !shadow2_alloc_p2m_pages(d) ) + return _mfn(0); + entry = d->arch.shadow2_p2m_freelist.next; + list_del(entry); + list_add_tail(entry, &d->arch.shadow2_p2m_inuse); + mfn = page_to_mfn(list_entry(entry, struct page_info, list)); + sh2_get_ref(mfn, 0); + p = sh2_map_domain_page(mfn); + clear_page(p); + sh2_unmap_domain_page(p); + + return mfn; +} + +#if CONFIG_PAGING_LEVELS == 3 +static void p2m_install_entry_in_monitors(struct domain *d, + l3_pgentry_t *l3e) +/* Special case, only used for external-mode domains on PAE hosts: + * update the mapping of the p2m table. Once again, this is trivial in + * other paging modes (one top-level entry points to the top-level p2m, + * no maintenance needed), but PAE makes life difficult by needing a + * copy the eight l3es of the p2m table in eight l2h slots in the + * monitor table. This function makes fresh copies when a p2m l3e + * changes. */ +{ + l2_pgentry_t *ml2e; + struct vcpu *v; + unsigned int index; + + index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t); + ASSERT(index < MACHPHYS_MBYTES>>1); + + for_each_vcpu(d, v) + { + if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) + continue; + ASSERT(shadow2_mode_external(v->domain)); + + SHADOW2_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n", + d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e)); + + if ( v == current ) /* OK to use linear map of monitor_table */ + ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START); + else + { + l3_pgentry_t *ml3e; + ml3e = sh2_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT); + ml2e = sh2_map_domain_page(_mfn(l3e_get_pfn(ml3e[3]))); + ml2e += l2_table_offset(RO_MPT_VIRT_START); + sh2_unmap_domain_page(ml3e); + } + ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR); + if ( v != current ) + sh2_unmap_domain_page(ml2e); + } +} +#endif + +// Find the next level's P2M entry, checking for out-of-range gfn's... +// Returns NULL on error. +// +static l1_pgentry_t * +p2m_find_entry(void *table, unsigned long *gfn_remainder, + unsigned long gfn, u32 shift, u32 max) +{ + u32 index; + + index = *gfn_remainder >> shift; + if ( index >= max ) + { + SHADOW2_DEBUG(P2M, "gfn=0x%lx out of range " + "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n", + gfn, *gfn_remainder, shift, index, max); + return NULL; + } + *gfn_remainder &= (1 << shift) - 1; + return (l1_pgentry_t *)table + index; +} + +// Walk one level of the P2M table, allocating a new table if required. +// Returns 0 on error. 
+// +static int +p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, + unsigned long *gfn_remainder, unsigned long gfn, u32 shift, + u32 max, unsigned long type) +{ + l1_pgentry_t *p2m_entry; + void *next; + + if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, + shift, max)) ) + return 0; + + if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) ) + { + mfn_t mfn = shadow2_alloc_p2m_page(d); + if ( mfn_x(mfn) == 0 ) + return 0; + *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); + mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated; + mfn_to_page(mfn)->count_info = 1; +#if CONFIG_PAGING_LEVELS == 3 + if (type == PGT_l2_page_table) + { + /* We have written to the p2m l3: need to sync the per-vcpu + * copies of it in the monitor tables */ + p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry); + } +#endif + /* The P2M can be shadowed: keep the shadows synced */ + if ( d->vcpu[0] ) + (void)__shadow2_validate_guest_entry(d->vcpu[0], *table_mfn, + p2m_entry, sizeof *p2m_entry); + } + *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); + next = sh2_map_domain_page(*table_mfn); + sh2_unmap_domain_page(*table); + *table = next; + + return 1; +} + +// Returns 0 on error (out of memory) +int +shadow2_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) +{ + // XXX -- this might be able to be faster iff current->domain == d + mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); + void *table = sh2_map_domain_page(table_mfn); + unsigned long gfn_remainder = gfn; + l1_pgentry_t *p2m_entry; + +#if CONFIG_PAGING_LEVELS >= 4 + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L4_PAGETABLE_SHIFT - PAGE_SHIFT, + L4_PAGETABLE_ENTRIES, PGT_l3_page_table) ) + return 0; +#endif +#if CONFIG_PAGING_LEVELS >= 3 + // When using PAE Xen, we only allow 33 bits of pseudo-physical + // address in translated guests (i.e. 8 GBytes). This restriction + // comes from wanting to map the P2M table into the 16MB RO_MPT hole + // in Xen's address space for translated PV guests. + // + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L3_PAGETABLE_SHIFT - PAGE_SHIFT, + (CONFIG_PAGING_LEVELS == 3 + ? 8 + : L3_PAGETABLE_ENTRIES), + PGT_l2_page_table) ) + return 0; +#endif + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) + return 0; + + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + 0, L1_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + if ( valid_mfn(mfn) ) + *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); + else + *p2m_entry = l1e_empty(); + + /* The P2M can be shadowed: keep the shadows synced */ + (void) __shadow2_validate_guest_entry(d->vcpu[0], table_mfn, + p2m_entry, sizeof *p2m_entry); + + sh2_unmap_domain_page(table); + + return 1; +} + +// Allocate a new p2m table for a domain. +// +// The structure of the p2m table is that of a pagetable for xen (i.e. it is +// controlled by CONFIG_PAGING_LEVELS). 
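
p2m_find_entry()/p2m_next_level() above walk the p2m like a hardware page-table walk: at each level the remaining gfn bits are shifted down to get an index, bounds-checked, and masked off. A standalone arithmetic sketch, assuming 512-entry tables at every level; the shifts are gfn-relative (PAGE_SHIFT already removed):

    #include <stdio.h>

    /* Per-level index extraction, mirroring p2m_find_entry(): shift the
     * remaining gfn down by the level's shift, then keep only the low bits
     * for the next level. Returns ~0u if the gfn is out of range. */
    static unsigned int p2m_index(unsigned long *gfn_remainder,
                                  unsigned int shift, unsigned int max)
    {
        unsigned int index = *gfn_remainder >> shift;
        if ( index >= max )
            return ~0u;
        *gfn_remainder &= (1UL << shift) - 1;
        return index;
    }

    int main(void)
    {
        /* e.g. gfn 0x123456 on a three-level walk with 512-entry tables:
         * index 4 at the top, then 0x11a, then 0x56. */
        unsigned long rem = 0x123456;
        printf("l3=%u ", p2m_index(&rem, 18, 512));
        printf("l2=%u ", p2m_index(&rem, 9, 512));
        printf("l1=%u\n", p2m_index(&rem, 0, 512));
        return 0;
    }
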
+// +// Returns 0 if p2m table could not be initialized +// +static int +shadow2_alloc_p2m_table(struct domain *d) +{ + mfn_t p2m_top; + struct list_head *entry; + unsigned int page_count = 0; + + SHADOW2_PRINTK("allocating p2m table\n"); + ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0); + + p2m_top = shadow2_alloc_p2m_page(d); + mfn_to_page(p2m_top)->count_info = 1; + mfn_to_page(p2m_top)->u.inuse.type_info = +#if CONFIG_PAGING_LEVELS == 4 + PGT_l4_page_table +#elif CONFIG_PAGING_LEVELS == 3 + PGT_l3_page_table +#elif CONFIG_PAGING_LEVELS == 2 + PGT_l2_page_table +#endif + | 1 | PGT_validated; + + if ( mfn_x(p2m_top) == 0 ) + return 0; + + d->arch.phys_table = pagetable_from_mfn(p2m_top); + + SHADOW2_PRINTK("populating p2m table\n"); + + for ( entry = d->page_list.next; + entry != &d->page_list; + entry = entry->next ) + { + struct page_info *page = list_entry(entry, struct page_info, list); + mfn_t mfn = page_to_mfn(page); + unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn)); + page_count++; + if ( +#ifdef __x86_64__ + (gfn != 0x5555555555555555L) +#else + (gfn != 0x55555555L) +#endif + && gfn != INVALID_M2P_ENTRY + && !shadow2_set_p2m_entry(d, gfn, mfn) ) + { + SHADOW2_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH2_PRI_mfn "\n", + gfn, mfn_x(mfn)); + return 0; + } + } + + SHADOW2_PRINTK("p2m table initialised (%u pages)\n", page_count); + return 1; +} + +mfn_t +sh2_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn) +/* Read another domain's p2m entries */ +{ + mfn_t mfn; + unsigned long addr = gpfn << PAGE_SHIFT; + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(shadow2_mode_translate(d)); + mfn = pagetable_get_mfn(d->arch.phys_table); + + +#if CONFIG_PAGING_LEVELS > 2 + if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) + /* This pfn is higher than the p2m map can hold */ + return _mfn(INVALID_MFN); +#endif + + +#if CONFIG_PAGING_LEVELS >= 4 + { + l4_pgentry_t *l4e = sh2_map_domain_page(mfn); + l4e += l4_table_offset(addr); + if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) + { + sh2_unmap_domain_page(l4e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l4e_get_pfn(*l4e)); + sh2_unmap_domain_page(l4e); + } +#endif +#if CONFIG_PAGING_LEVELS >= 3 + { + l3_pgentry_t *l3e = sh2_map_domain_page(mfn); + l3e += l3_table_offset(addr); + if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) + { + sh2_unmap_domain_page(l3e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l3e_get_pfn(*l3e)); + sh2_unmap_domain_page(l3e); + } +#endif + + l2e = sh2_map_domain_page(mfn); + l2e += l2_table_offset(addr); + if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) + { + sh2_unmap_domain_page(l2e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l2e_get_pfn(*l2e)); + sh2_unmap_domain_page(l2e); + + l1e = sh2_map_domain_page(mfn); + l1e += l1_table_offset(addr); + if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) + { + sh2_unmap_domain_page(l1e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l1e_get_pfn(*l1e)); + sh2_unmap_domain_page(l1e); + + return mfn; +} + +unsigned long +shadow2_gfn_to_mfn_foreign(unsigned long gpfn) +{ + return mfn_x(sh2_gfn_to_mfn_foreign(current->domain, gpfn)); +} + + +static void shadow2_p2m_teardown(struct domain *d) +/* Return all the p2m pages to Xen. 
+ * We know we don't have any extra mappings to these pages */ +{ + struct list_head *entry, *n; + struct page_info *pg; + + d->arch.phys_table = pagetable_null(); + + list_for_each_safe(entry, n, &d->arch.shadow2_p2m_inuse) + { + pg = list_entry(entry, struct page_info, list); + list_del(entry); + /* Should have just the one ref we gave it in alloc_p2m_page() */ + if ( (pg->count_info & PGC_SH2_count_mask) != 1 ) + { + SHADOW2_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n", + pg->count_info, pg->u.inuse.type_info); + } + ASSERT(page_get_owner(pg) == d); + /* Free should not decrement domain's total allocation, since + * these pages were allocated without an owner. */ + page_set_owner(pg, NULL); + free_domheap_pages(pg, 0); + d->arch.shadow2_p2m_pages--; + perfc_decr(shadow2_alloc_count); + } + list_for_each_safe(entry, n, &d->arch.shadow2_p2m_freelist) + { + list_del(entry); + pg = list_entry(entry, struct page_info, list); + ASSERT(page_get_owner(pg) == d); + /* Free should not decrement domain's total allocation. */ + page_set_owner(pg, NULL); + free_domheap_pages(pg, 0); + d->arch.shadow2_p2m_pages--; + perfc_decr(shadow2_alloc_count); + } + ASSERT(d->arch.shadow2_p2m_pages == 0); +} + +/* Set the pool of shadow pages to the required number of pages. + * Input will be rounded up to at least shadow2_min_acceptable_pages(), + * plus space for the p2m table. + * Returns 0 for success, non-zero for failure. */ +static unsigned int set_sh2_allocation(struct domain *d, + unsigned int pages, + int *preempted) +{ + struct page_info *pg; + unsigned int lower_bound; + int j; + + ASSERT(shadow2_lock_is_acquired(d)); + + /* Don't allocate less than the minimum acceptable, plus one page per + * megabyte of RAM (for the p2m table) */ + lower_bound = shadow2_min_acceptable_pages(d) + (d->tot_pages / 256); + if ( pages > 0 && pages < lower_bound ) + pages = lower_bound; + /* Round up to largest block size */ + pages = (pages + ((1<<SHADOW2_MAX_ORDER)-1)) & ~((1<<SHADOW2_MAX_ORDER)-1); + + SHADOW2_PRINTK("current %i target %i\n", + d->arch.shadow2_total_pages, pages); + + while ( d->arch.shadow2_total_pages != pages ) + { + if ( d->arch.shadow2_total_pages < pages ) + { + /* Need to allocate more memory from domheap */ + pg = alloc_domheap_pages(NULL, SHADOW2_MAX_ORDER, 0); + if ( pg == NULL ) + { + SHADOW2_PRINTK("failed to allocate shadow pages.\n"); + return -ENOMEM; + } + d->arch.shadow2_free_pages += 1<<SHADOW2_MAX_ORDER; + d->arch.shadow2_total_pages += 1<<SHADOW2_MAX_ORDER; + for ( j = 0; j < 1<<SHADOW2_MAX_ORDER; j++ ) + { + pg[j].u.inuse.type_info = 0; /* Free page */ + pg[j].tlbflush_timestamp = 0; /* Not in any TLB */ + } + SH2_SET_PFN_ORDER(pg, SHADOW2_MAX_ORDER); + list_add_tail(&pg->list, + &d->arch.shadow2_freelists[SHADOW2_MAX_ORDER]); + } + else if ( d->arch.shadow2_total_pages > pages ) + { + /* Need to return memory to domheap */ + shadow2_prealloc(d, SHADOW2_MAX_ORDER); + ASSERT(!list_empty(&d->arch.shadow2_freelists[SHADOW2_MAX_ORDER])); + pg = list_entry(d->arch.shadow2_freelists[SHADOW2_MAX_ORDER].next, + struct page_info, list); + list_del(&pg->list); + d->arch.shadow2_free_pages -= 1<<SHADOW2_MAX_ORDER; + d->arch.shadow2_total_pages -= 1<<SHADOW2_MAX_ORDER; + free_domheap_pages(pg, SHADOW2_MAX_ORDER); + } + + /* Check to see if we need to yield and try again */ + if ( preempted && hypercall_preempt_check() ) + { + *preempted = 1; + return 0; + } + } + + return 0; +} + +unsigned int shadow2_set_allocation(struct domain *d, + unsigned int megabytes, + int *preempted) +/* 
Hypercall interface to set the shadow memory allocation */ +{ + unsigned int rv; + shadow2_lock(d); + rv = set_sh2_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted); + SHADOW2_PRINTK("dom %u allocation now %u pages (%u MB)\n", + d->domain_id, + d->arch.shadow2_total_pages, + shadow2_get_allocation(d)); + shadow2_unlock(d); + return rv; +} + +/**************************************************************************/ +/* Hash table for storing the guest->shadow mappings */ + +/* Hash function that takes a gfn or mfn, plus another byte of type info */ +typedef u32 key_t; +static inline key_t sh2_hash(unsigned long n, u8 t) +{ + unsigned char *p = (unsigned char *)&n; + key_t k = t; + int i; + for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k; + return k; +} + +#if SHADOW2_AUDIT & (SHADOW2_AUDIT_HASH|SHADOW2_AUDIT_HASH_FULL) + +/* Before we get to the mechanism, define a pair of audit functions + * that sanity-check the contents of the hash table. */ +static void sh2_hash_audit_bucket(struct domain *d, int bucket) +/* Audit one bucket of the hash table */ +{ + struct shadow2_hash_entry *e, *x; + struct page_info *pg; + + if ( !(SHADOW2_AUDIT_ENABLE) ) + return; + + e = &d->arch.shadow2_hash_table[bucket]; + if ( e->t == 0 ) return; /* Bucket is empty */ + while ( e ) + { + /* Empty link? */ + BUG_ON( e->t == 0 ); + /* Bogus type? */ + BUG_ON( e->t > (PGC_SH2_max_shadow >> PGC_SH2_type_shift) ); + /* Wrong bucket? */ + BUG_ON( sh2_hash(e->n, e->t) % SHADOW2_HASH_BUCKETS != bucket ); + /* Duplicate entry? */ + for ( x = e->next; x; x = x->next ) + BUG_ON( x->n == e->n && x->t == e->t ); + /* Bogus MFN? */ + BUG_ON( !valid_mfn(e->smfn) ); + pg = mfn_to_page(e->smfn); + /* Not a shadow? */ + BUG_ON( page_get_owner(pg) != 0 ); + /* Wrong kind of shadow? */ + BUG_ON( (pg->count_info & PGC_SH2_type_mask) >> PGC_SH2_type_shift + != e->t ); + /* Bad backlink? */ + BUG_ON( pg->u.inuse.type_info != e->n ); + if ( e->t != (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift) + && e->t != (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift) + && e->t != (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) ) + { + /* Bad shadow flags on guest page? */ + BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow2_flags & (1<<e->t)) ); + } + /* That entry was OK; on we go */ + e = e->next; + } +} + +#else +#define sh2_hash_audit_bucket(_d, _b) +#endif /* Hashtable bucket audit */ + + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_HASH_FULL + +static void sh2_hash_audit(struct domain *d) +/* Full audit: audit every bucket in the table */ +{ + int i; + + if ( !(SHADOW2_AUDIT_ENABLE) ) + return; + + for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) + { + sh2_hash_audit_bucket(d, i); + } +} + +#else +#define sh2_hash_audit(_d) +#endif /* Hashtable bucket audit */ + +/* Memory management interface for bucket allocation. + * These ought to come out of shadow memory, but at least on 32-bit + * machines we are forced to allocate them from xenheap so that we can + * address them. */ +static struct shadow2_hash_entry *sh2_alloc_hash_entry(struct domain *d) +{ + struct shadow2_hash_entry *extra, *x; + int i; + + /* We need to allocate a new node. Ensure the free list is not empty. + * Allocate new entries in units the same size as the original table. */ + if ( unlikely(d->arch.shadow2_hash_freelist == NULL) ) + { + size_t sz = sizeof(void *) + (SHADOW2_HASH_BUCKETS * sizeof(*x)); + extra = xmalloc_bytes(sz); + + if ( extra == NULL ) + { + /* No memory left! 
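
sh2_hash() above is an sdbm-style hash run over the bytes of the frame number, seeded with the type byte: each step computes k = byte + k * 65599, expressed as shifts. The same arithmetic as a standalone, testable function; HASH_BUCKETS is an illustrative stand-in for SHADOW2_HASH_BUCKETS:

    #include <stdint.h>
    #include <stdio.h>

    #define HASH_BUCKETS 251   /* illustrative bucket count */

    /* sdbm-style hash over the bytes of 'n', seeded with the type byte 't'. */
    static uint32_t sh_hash(unsigned long n, uint8_t t)
    {
        const unsigned char *p = (const unsigned char *)&n;
        uint32_t k = t;
        unsigned int i;

        for ( i = 0; i < sizeof(n); i++ )
            k = (uint32_t)p[i] + (k << 6) + (k << 16) - k;   /* k = p[i] + k*65599 */
        return k;
    }

    int main(void)
    {
        /* Bucket selection is a simple modulo, as in the lookup/insert paths. */
        printf("bucket = %u\n", sh_hash(0x12345UL, 1) % HASH_BUCKETS);
        return 0;
    }
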
*/ + SHADOW2_ERROR("xmalloc() failed when allocating hash buckets.\n"); + domain_crash_synchronous(); + } + memset(extra, 0, sz); + + /* Record the allocation block so it can be correctly freed later. */ + *((struct shadow2_hash_entry **)&extra[SHADOW2_HASH_BUCKETS]) = + d->arch.shadow2_hash_allocations; + d->arch.shadow2_hash_allocations = &extra[0]; + + /* Thread a free chain through the newly-allocated nodes. */ + for ( i = 0; i < (SHADOW2_HASH_BUCKETS - 1); i++ ) + extra[i].next = &extra[i+1]; + extra[i].next = NULL; + + /* Add the new nodes to the free list. */ + d->arch.shadow2_hash_freelist = &extra[0]; + } + + /* Allocate a new node from the free list. */ + x = d->arch.shadow2_hash_freelist; + d->arch.shadow2_hash_freelist = x->next; + return x; +} + +static void sh2_free_hash_entry(struct domain *d, struct shadow2_hash_entry *e) +{ + /* Mark the bucket as empty and return it to the free list */ + e->t = 0; + e->next = d->arch.shadow2_hash_freelist; + d->arch.shadow2_hash_freelist = e; +} + + +/* Allocate and initialise the table itself. + * Returns 0 for success, 1 for error. */ +static int shadow2_hash_alloc(struct domain *d) +{ + struct shadow2_hash_entry *table; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(!d->arch.shadow2_hash_table); + + table = xmalloc_array(struct shadow2_hash_entry, SHADOW2_HASH_BUCKETS); + if ( !table ) return 1; + memset(table, 0, + SHADOW2_HASH_BUCKETS * sizeof (struct shadow2_hash_entry)); + d->arch.shadow2_hash_table = table; + return 0; +} + +/* Tear down the hash table and return all memory to Xen. + * This function does not care whether the table is populated. */ +static void shadow2_hash_teardown(struct domain *d) +{ + struct shadow2_hash_entry *a, *n; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_table); + + /* Return the table itself */ + xfree(d->arch.shadow2_hash_table); + d->arch.shadow2_hash_table = NULL; + + /* Return any extra allocations */ + a = d->arch.shadow2_hash_allocations; + while ( a ) + { + /* We stored a linked-list pointer at the end of each allocation */ + n = *((struct shadow2_hash_entry **)(&a[SHADOW2_HASH_BUCKETS])); + xfree(a); + a = n; + } + d->arch.shadow2_hash_allocations = NULL; + d->arch.shadow2_hash_freelist = NULL; +} + + +mfn_t shadow2_hash_lookup(struct vcpu *v, unsigned long n, u8 t) +/* Find an entry in the hash table. Returns the MFN of the shadow, + * or INVALID_MFN if it doesn't exist */ +{ + struct domain *d = v->domain; + struct shadow2_hash_entry *p, *x, *head; + key_t key; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_table); + ASSERT(t); + + sh2_hash_audit(d); + + perfc_incrc(shadow2_hash_lookups); + key = sh2_hash(n, t); + + x = head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS]; + p = NULL; + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); + + do + { + ASSERT(x->t || ((x == head) && (x->next == NULL))); + + if ( x->n == n && x->t == t ) + { + /* Pull-to-front if 'x' isn't already the head item */ + if ( unlikely(x != head) ) + { + if ( unlikely(d->arch.shadow2_hash_walking != 0) ) + /* Can't reorder: someone is walking the hash chains */ + return x->smfn; + else + { + /* Delete 'x' from list and reinsert after head. */ + p->next = x->next; + x->next = head->next; + head->next = x; + + /* Swap 'x' contents with head contents. 
*/ + SWAP(head->n, x->n); + SWAP(head->t, x->t); + SWAP(head->smfn, x->smfn); + } + } + else + { + perfc_incrc(shadow2_hash_lookup_head); + } + return head->smfn; + } + + p = x; + x = x->next; + } + while ( x != NULL ); + + perfc_incrc(shadow2_hash_lookup_miss); + return _mfn(INVALID_MFN); +} + +void shadow2_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) +/* Put a mapping (n,t)->smfn into the hash table */ +{ + struct domain *d = v->domain; + struct shadow2_hash_entry *x, *head; + key_t key; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_table); + ASSERT(t); + + sh2_hash_audit(d); + + perfc_incrc(shadow2_hash_inserts); + key = sh2_hash(n, t); + + head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS]; + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); + + /* If the bucket is empty then insert the new page as the head item. */ + if ( head->t == 0 ) + { + head->n = n; + head->t = t; + head->smfn = smfn; + ASSERT(head->next == NULL); + } + else + { + /* Insert a new entry directly after the head item. */ + x = sh2_alloc_hash_entry(d); + x->n = n; + x->t = t; + x->smfn = smfn; + x->next = head->next; + head->next = x; + } + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); +} + +void shadow2_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) +/* Excise the mapping (n,t)->smfn from the hash table */ +{ + struct domain *d = v->domain; + struct shadow2_hash_entry *p, *x, *head; + key_t key; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_table); + ASSERT(t); + + sh2_hash_audit(d); + + perfc_incrc(shadow2_hash_deletes); + key = sh2_hash(n, t); + + head = &d->arch.shadow2_hash_table[key % SHADOW2_HASH_BUCKETS]; + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); + + /* Match on head item? */ + if ( head->n == n && head->t == t ) + { + if ( (x = head->next) != NULL ) + { + /* Overwrite head with contents of following node. */ + head->n = x->n; + head->t = x->t; + head->smfn = x->smfn; + + /* Delete following node. */ + head->next = x->next; + sh2_free_hash_entry(d, x); + } + else + { + /* This bucket is now empty. Initialise the head node. */ + head->t = 0; + } + } + else + { + /* Not at the head; need to walk the chain */ + p = head; + x = head->next; + + while(1) + { + ASSERT(x); /* We can't have hit the end, since our target is + * still in the chain somehwere... */ + if ( x->n == n && x->t == t ) + { + /* Delete matching node. */ + p->next = x->next; + sh2_free_hash_entry(d, x); + break; + } + p = x; + x = x->next; + } + } + + sh2_hash_audit_bucket(d, key % SHADOW2_HASH_BUCKETS); +} + +typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn); + +static void hash_foreach(struct vcpu *v, + unsigned int callback_mask, + hash_callback_t callbacks[], + mfn_t callback_mfn) +/* Walk the hash table looking at the types of the entries and + * calling the appropriate callback function for each entry. + * The mask determines which shadow types we call back for, and the array + * of callbacks tells us which function to call. + * Any callback may return non-zero to let us skip the rest of the scan. + * + * WARNING: Callbacks MUST NOT add or remove hash entries unless they + * then return non-zero to terminate the scan. 
*/ +{ + int i, done = 0; + struct domain *d = v->domain; + struct shadow2_hash_entry *x; + + /* Say we're here, to stop hash-lookups reordering the chains */ + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d->arch.shadow2_hash_walking == 0); + d->arch.shadow2_hash_walking = 1; + + callback_mask &= ~1; /* Never attempt to call back on empty buckets */ + for ( i = 0; i < SHADOW2_HASH_BUCKETS; i++ ) + { + /* WARNING: This is not safe against changes to the hash table. + * The callback *must* return non-zero if it has inserted or + * deleted anything from the hash (lookups are OK, though). */ + for ( x = &d->arch.shadow2_hash_table[i]; x; x = x->next ) + { + if ( callback_mask & (1 << x->t) ) + { + ASSERT(x->t <= 15); + ASSERT(callbacks[x->t] != NULL); + if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 ) + break; + } + } + if ( done ) break; + } + d->arch.shadow2_hash_walking = 0; +} + + +/**************************************************************************/ +/* Destroy a shadow page: simple dispatcher to call the per-type destructor + * which will decrement refcounts appropriately and return memory to the + * free pool. */ + +void sh2_destroy_shadow(struct vcpu *v, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + u32 t = pg->count_info & PGC_SH2_type_mask; + + + SHADOW2_PRINTK("smfn=%#lx\n", mfn_x(smfn)); + + /* Double-check, if we can, that the shadowed page belongs to this + * domain, (by following the back-pointer). */ + ASSERT(t == PGC_SH2_fl1_32_shadow || + t == PGC_SH2_fl1_pae_shadow || + t == PGC_SH2_fl1_64_shadow || + t == PGC_SH2_monitor_table || + (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) + == v->domain)); + + /* The down-shifts here are so that the switch statement is on nice + * small numbers that the compiler will enjoy */ + switch ( t >> PGC_SH2_type_shift ) + { +#if CONFIG_PAGING_LEVELS == 2 + case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift: + case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 2, 2)(v, smfn); + break; + case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 2, 2)(v, smfn); + break; +#else /* PAE or 64bit */ + case PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift: + case PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 2)(v, smfn); + break; + case PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 2)(v, smfn); + break; +#endif + +#if CONFIG_PAGING_LEVELS >= 3 + case PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift: + case PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 3, 3)(v, smfn); + break; + case PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift: + case PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 3, 3)(v, smfn); + break; + case PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 3, 3)(v, smfn); + break; +#endif + +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift: + case PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l1_shadow, 4, 4)(v, smfn); + break; + case PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l2_shadow, 4, 4)(v, smfn); + break; + case PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift: + SHADOW2_INTERNAL_NAME(sh2_destroy_l3_shadow, 4, 4)(v, smfn); + break; + case PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift: + 
SHADOW2_INTERNAL_NAME(sh2_destroy_l4_shadow, 4, 4)(v, smfn); + break; +#endif + default: + SHADOW2_PRINTK("tried to destroy shadow of bad type %08lx\n", + (unsigned long)t); + BUG(); + } +} + +/**************************************************************************/ +/* Remove all writeable mappings of a guest frame from the shadow tables + * Returns non-zero if we need to flush TLBs. + * level and fault_addr desribe how we found this to be a pagetable; + * level==0 means we have some other reason for revoking write access.*/ + +int shadow2_remove_write_access(struct vcpu *v, mfn_t gmfn, + unsigned int level, + unsigned long fault_addr) +{ + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,2,2), /* fl1_32 */ +#else + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,2), /* fl1_32 */ +#endif + NULL, /* l2_32 */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* l1_pae */ + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,3,3), /* fl1_pae */ +#else + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#endif + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* l1_64 */ + SHADOW2_INTERNAL_NAME(sh2_remove_write_access,4,4), /* fl1_64 */ +#else + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#endif + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ + NULL, /* p2m */ + NULL /* unused */ + }; + + static unsigned int callback_mask = + 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) + ; + struct page_info *pg = mfn_to_page(gmfn); + + ASSERT(shadow2_lock_is_acquired(v->domain)); + + /* Only remove writable mappings if we are doing shadow refcounts. + * In guest refcounting, we trust Xen to already be restricting + * all the writes to the guest page tables, so we do not need to + * do more. */ + if ( !shadow2_mode_refcounts(v->domain) ) + return 0; + + /* Early exit if it's already a pagetable, or otherwise not writeable */ + if ( sh2_mfn_is_a_page_table(gmfn) + || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) + return 0; + + perfc_incrc(shadow2_writeable); + + /* If this isn't a "normal" writeable page, the domain is trying to + * put pagetables in special memory of some kind. We can't allow that. 
*/ + if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page ) + { + SHADOW2_ERROR("can't remove write access to mfn %lx, type_info is %" + PRtype_info "\n", + mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info); + domain_crash(v->domain); + } + +#if SHADOW2_OPTIMIZATIONS & SH2OPT_WRITABLE_HEURISTIC + if ( v == current && level != 0 ) + { + unsigned long gfn; + /* Heuristic: there is likely to be only one writeable mapping, + * and that mapping is likely to be in the current pagetable, + * either in the guest's linear map (linux, windows) or in a + * magic slot used to map high memory regions (linux HIGHTPTE) */ + +#define GUESS(_a, _h) do { \ + if ( v->arch.shadow2->guess_wrmap(v, (_a), gmfn) ) \ + perfc_incrc(shadow2_writeable_h_ ## _h); \ + if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \ + return 1; \ + } while (0) + + + /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */ + if ( v == current + && (gfn = sh2_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 ) + GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4); + + if ( v->arch.shadow2->guest_levels == 2 ) + { + if ( level == 1 ) + /* 32bit non-PAE w2k3: linear map at 0xC0000000 */ + GUESS(0xC0000000UL + (fault_addr >> 10), 1); + } +#if CONFIG_PAGING_LEVELS >= 3 + else if ( v->arch.shadow2->guest_levels == 3 ) + { + /* 32bit PAE w2k3: linear map at 0xC0000000 */ + switch ( level ) + { + case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break; + case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break; + } + } +#if CONFIG_PAGING_LEVELS >= 4 + else if ( v->arch.shadow2->guest_levels == 4 ) + { + /* 64bit w2k3: linear map at 0x0000070000000000 */ + switch ( level ) + { + case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break; + case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break; + case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break; + } + } +#endif /* CONFIG_PAGING_LEVELS >= 4 */ +#endif /* CONFIG_PAGING_LEVELS >= 3 */ + +#undef GUESS + + } +#endif + + /* Brute-force search of all the shadows, by walking the hash */ + perfc_incrc(shadow2_writeable_bf); + hash_foreach(v, callback_mask, callbacks, gmfn); + + /* If that didn't catch the mapping, something is very wrong */ + if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 ) + { + SHADOW2_ERROR("can't find all writeable mappings of mfn %lx: " + "%lu left\n", mfn_x(gmfn), + (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask)); + domain_crash(v->domain); + } + + /* We killed at least one writeable mapping, so must flush TLBs. */ + return 1; +} + + + +/**************************************************************************/ +/* Remove all mappings of a guest frame from the shadow tables. + * Returns non-zero if we need to flush TLBs. 
*/ + +int shadow2_remove_all_mappings(struct vcpu *v, mfn_t gmfn) +{ + struct page_info *page = mfn_to_page(gmfn); + int expected_count; + + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,2,2), /* fl1_32 */ +#else + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,2), /* fl1_32 */ +#endif + NULL, /* l2_32 */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* l1_pae */ + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,3,3), /* fl1_pae */ +#else + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#endif + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* l1_64 */ + SHADOW2_INTERNAL_NAME(sh2_remove_all_mappings,4,4), /* fl1_64 */ +#else + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#endif + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ + NULL, /* p2m */ + NULL /* unused */ + }; + + static unsigned int callback_mask = + 1 << (PGC_SH2_l1_32_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_32_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_l1_pae_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_pae_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_l1_64_shadow >> PGC_SH2_type_shift) + | 1 << (PGC_SH2_fl1_64_shadow >> PGC_SH2_type_shift) + ; + + perfc_incrc(shadow2_mappings); + if ( (page->count_info & PGC_count_mask) == 0 ) + return 0; + + ASSERT(shadow2_lock_is_acquired(v->domain)); + + /* XXX TODO: + * Heuristics for finding the (probably) single mapping of this gmfn */ + + /* Brute-force search of all the shadows, by walking the hash */ + perfc_incrc(shadow2_mappings_bf); + hash_foreach(v, callback_mask, callbacks, gmfn); + + /* If that didn't catch the mapping, something is very wrong */ + expected_count = (page->count_info & PGC_allocated) ? 1 : 0; + if ( (page->count_info & PGC_count_mask) != expected_count ) + { + /* Don't complain if we're in HVM and there's one extra mapping: + * The qemu helper process has an untyped mapping of this dom's RAM */ + if ( !(shadow2_mode_external(v->domain) + && (page->count_info & PGC_count_mask) <= 2 + && (page->u.inuse.type_info & PGT_count_mask) == 0) ) + { + SHADOW2_ERROR("can't find all mappings of mfn %lx: " + "c=%08x t=%08lx\n", mfn_x(gmfn), + page->count_info, page->u.inuse.type_info); + } + } + + /* We killed at least one mapping, so must flush TLBs. */ + return 1; +} + + +/**************************************************************************/ +/* Remove all shadows of a guest frame from the shadow tables */ + +static int sh2_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn) +/* Follow this shadow's up-pointer, if it has one, and remove the reference + * found there. 
Returns 1 if that was the only reference to this shadow */ +{ + struct page_info *pg = mfn_to_page(smfn); + mfn_t pmfn; + void *vaddr; + int rc; + + ASSERT((pg->count_info & PGC_SH2_type_mask) > 0); + ASSERT((pg->count_info & PGC_SH2_type_mask) < PGC_SH2_max_shadow); + ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l2_32_shadow); + ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l3_pae_shadow); + ASSERT((pg->count_info & PGC_SH2_type_mask) != PGC_SH2_l4_64_shadow); + + if (pg->up == 0) return 0; + pmfn = _mfn(pg->up >> PAGE_SHIFT); + ASSERT(valid_mfn(pmfn)); + vaddr = sh2_map_domain_page(pmfn); + ASSERT(vaddr); + vaddr += pg->up & (PAGE_SIZE-1); + ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn)); + + /* Is this the only reference to this shadow? */ + rc = ((pg->count_info & PGC_SH2_count_mask) == 1) ? 1 : 0; + + /* Blank the offending entry */ + switch ((pg->count_info & PGC_SH2_type_mask)) + { + case PGC_SH2_l1_32_shadow: + case PGC_SH2_l2_32_shadow: +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,2,2)(v, vaddr, pmfn); +#else + SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,2)(v, vaddr, pmfn); +#endif + break; +#if CONFIG_PAGING_LEVELS >=3 + case PGC_SH2_l1_pae_shadow: + case PGC_SH2_l2_pae_shadow: + case PGC_SH2_l2h_pae_shadow: + case PGC_SH2_l3_pae_shadow: + SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,3,3)(v, vaddr, pmfn); + break; +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH2_l1_64_shadow: + case PGC_SH2_l2_64_shadow: + case PGC_SH2_l3_64_shadow: + case PGC_SH2_l4_64_shadow: + SHADOW2_INTERNAL_NAME(sh2_clear_shadow_entry,4,4)(v, vaddr, pmfn); + break; +#endif +#endif + default: BUG(); /* Some wierd unknown shadow type */ + } + + sh2_unmap_domain_page(vaddr); + if ( rc ) + perfc_incrc(shadow2_up_pointer); + else + perfc_incrc(shadow2_unshadow_bf); + + return rc; +} + +void sh2_remove_shadows(struct vcpu *v, mfn_t gmfn, int all) +/* Remove the shadows of this guest page. + * If all != 0, find all shadows, if necessary by walking the tables. + * Otherwise, just try the (much faster) heuristics, which will remove + * at most one reference to each shadow of the page. */ +{ + struct page_info *pg; + mfn_t smfn; + u32 sh_flags; + unsigned char t; + + /* Dispatch table for getting per-type functions: each level must + * be called with the function to remove a lower-level shadow. 
*/ + static hash_callback_t callbacks[16] = { + NULL, /* none */ + NULL, /* l1_32 */ + NULL, /* fl1_32 */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,2,2), /* l2_32 */ +#else + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,2), /* l2_32 */ +#endif + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2_pae */ + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,3,3), /* l2h_pae */ + SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,3,3), /* l3_pae */ +#else + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#endif + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW2_INTERNAL_NAME(sh2_remove_l1_shadow,4,4), /* l2_64 */ + SHADOW2_INTERNAL_NAME(sh2_remove_l2_shadow,4,4), /* l3_64 */ + SHADOW2_INTERNAL_NAME(sh2_remove_l3_shadow,4,4), /* l4_64 */ +#else + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ +#endif + NULL, /* p2m */ + NULL /* unused */ + }; + + /* Another lookup table, for choosing which mask to use */ + static unsigned int masks[16] = { + 0, /* none */ + 1 << (PGC_SH2_l2_32_shadow >> PGC_SH2_type_shift), /* l1_32 */ + 0, /* fl1_32 */ + 0, /* l2_32 */ + ((1 << (PGC_SH2_l2h_pae_shadow >> PGC_SH2_type_shift)) + | (1 << (PGC_SH2_l2_pae_shadow >> PGC_SH2_type_shift))), /* l1_pae */ + 0, /* fl1_pae */ + 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2_pae */ + 1 << (PGC_SH2_l3_pae_shadow >> PGC_SH2_type_shift), /* l2h_pae */ + 0, /* l3_pae */ + 1 << (PGC_SH2_l2_64_shadow >> PGC_SH2_type_shift), /* l1_64 */ + 0, /* fl1_64 */ + 1 << (PGC_SH2_l3_64_shadow >> PGC_SH2_type_shift), /* l2_64 */ + 1 << (PGC_SH2_l4_64_shadow >> PGC_SH2_type_shift), /* l3_64 */ + 0, /* l4_64 */ + 0, /* p2m */ + 0 /* unused */ + }; + + SHADOW2_PRINTK("d=%d, v=%d, gmfn=%05lx\n", + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); + + ASSERT(shadow2_lock_is_acquired(v->domain)); + + pg = mfn_to_page(gmfn); + + /* Bale out now if the page is not shadowed */ + if ( (pg->count_info & PGC_page_table) == 0 ) + return; + + /* Search for this shadow in all appropriate shadows */ + perfc_incrc(shadow2_unshadow); + sh_flags = pg->shadow2_flags; + + /* Lower-level shadows need to be excised from upper-level shadows. + * This call to hash_foreach() looks dangerous but is in fact OK: each + * call will remove at most one shadow, and terminate immediately when + * it does remove it, so we never walk the hash after doing a deletion. 
*/ +#define DO_UNSHADOW(_type) do { \ + t = (_type) >> PGC_SH2_type_shift; \ + smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \ + if ( !sh2_remove_shadow_via_pointer(v, smfn) && all ) \ + hash_foreach(v, masks[t], callbacks, smfn); \ +} while (0) + + /* Top-level shadows need to be unpinned */ +#define DO_UNPIN(_type) do { \ + t = (_type) >> PGC_SH2_type_shift; \ + smfn = shadow2_hash_lookup(v, mfn_x(gmfn), t); \ + if ( mfn_to_page(smfn)->count_info & PGC_SH2_pinned ) \ + sh2_unpin(v, smfn); \ + if ( (_type) == PGC_SH2_l3_pae_shadow ) \ + SHADOW2_INTERNAL_NAME(sh2_unpin_all_l3_subshadows,3,3)(v, smfn); \ +} while (0) + + if ( sh_flags & SH2F_L1_32 ) DO_UNSHADOW(PGC_SH2_l1_32_shadow); + if ( sh_flags & SH2F_L2_32 ) DO_UNPIN(PGC_SH2_l2_32_shadow); +#if CONFIG_PAGING_LEVELS >= 3 + if ( sh_flags & SH2F_L1_PAE ) DO_UNSHADOW(PGC_SH2_l1_pae_shadow); + if ( sh_flags & SH2F_L2_PAE ) DO_UNSHADOW(PGC_SH2_l2_pae_shadow); + if ( sh_flags & SH2F_L2H_PAE ) DO_UNSHADOW(PGC_SH2_l2h_pae_shadow); + if ( sh_flags & SH2F_L3_PAE ) DO_UNPIN(PGC_SH2_l3_pae_shadow); +#if CONFIG_PAGING_LEVELS >= 4 + if ( sh_flags & SH2F_L1_64 ) DO_UNSHADOW(PGC_SH2_l1_64_shadow); + if ( sh_flags & SH2F_L2_64 ) DO_UNSHADOW(PGC_SH2_l2_64_shadow); + if ( sh_flags & SH2F_L3_64 ) DO_UNSHADOW(PGC_SH2_l3_64_shadow); + if ( sh_flags & SH2F_L4_64 ) DO_UNPIN(PGC_SH2_l4_64_shadow); +#endif +#endif + +#undef DO_UNSHADOW +#undef DO_UNPIN + + +#if CONFIG_PAGING_LEVELS > 2 + /* We may have caused some PAE l3 entries to change: need to + * fix up the copies of them in various places */ + if ( sh_flags & (SH2F_L2_PAE|SH2F_L2H_PAE) ) + sh2_pae_recopy(v->domain); +#endif + + /* If that didn't catch the shadows, something is wrong */ + if ( all && (pg->count_info & PGC_page_table) ) + { + SHADOW2_ERROR("can't find all shadows of mfn %05lx (shadow2_flags=%08x)\n", + mfn_x(gmfn), pg->shadow2_flags); + domain_crash(v->domain); + } +} + +void +shadow2_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn) +/* Even harsher: this is a HVM page that we thing is no longer a pagetable. + * Unshadow it, and recursively unshadow pages that reference it. */ +{ + shadow2_remove_all_shadows(v, gmfn); + /* XXX TODO: + * Rework this hashtable walker to return a linked-list of all + * the shadows it modified, then do breadth-first recursion + * to find the way up to higher-level tables and unshadow them too. + * + * The current code (just tearing down each page's shadows as we + * detect that it is not a pagetable) is correct, but very slow. + * It means extra emulated writes and slows down removal of mappings. */ +} + +/**************************************************************************/ + +void sh2_update_paging_modes(struct vcpu *v) +{ + struct domain *d = v->domain; + struct shadow2_entry_points *old_entries = v->arch.shadow2; + mfn_t old_guest_table; + + ASSERT(shadow2_lock_is_acquired(d)); + + // Valid transitions handled by this function: + // - For PV guests: + // - after a shadow mode has been changed + // - For HVM guests: + // - after a shadow mode has been changed + // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE + // + + // Avoid determining the current shadow2 mode for uninitialized CPUs, as + // we can not yet determine whether it is an HVM or PV domain. + // + if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + { + printk("%s: postponing determination of shadow2 mode\n", __func__); + return; + } + + // First, tear down any old shadow tables held by this vcpu. 
+ // + if ( v->arch.shadow2 ) + shadow2_detach_old_tables(v); + + if ( !hvm_guest(v) ) + { + /// + /// PV guest + /// +#if CONFIG_PAGING_LEVELS == 4 + if ( pv_32bit_guest(v) ) + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 3); + else + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4); +#elif CONFIG_PAGING_LEVELS == 3 + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3); +#elif CONFIG_PAGING_LEVELS == 2 + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2); +#else +#error unexpected paging mode +#endif + } + else + { + /// + /// HVM guest + /// + ASSERT(shadow2_mode_translate(d)); + ASSERT(shadow2_mode_external(d)); + + if ( !hvm_paging_enabled(v) ) + { + // paging disabled... + clear_bit(_VCPUF_shadow2_translate, &v->vcpu_flags); + + /* Set v->arch.guest_table to use the p2m map, and choose + * the appropriate shadow mode */ + old_guest_table = pagetable_get_mfn(v->arch.guest_table); +#if CONFIG_PAGING_LEVELS == 2 + v->arch.guest_table = + pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,2,2); +#elif CONFIG_PAGING_LEVELS == 3 + v->arch.guest_table = + pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3); +#else /* CONFIG_PAGING_LEVELS == 4 */ + { + l4_pgentry_t *l4e; + /* Use the start of the first l3 table as a PAE l3 */ + ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0); + l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); + ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); + v->arch.guest_table = + pagetable_from_pfn(l4e_get_pfn(l4e[0])); + sh2_unmap_domain_page(l4e); + } + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry,3,3); +#endif + /* Fix up refcounts on guest_table */ + get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d); + if ( mfn_x(old_guest_table) != 0 ) + put_page(mfn_to_page(old_guest_table)); + } + else + { + set_bit(_VCPUF_shadow2_translate, &v->vcpu_flags); + +#ifdef __x86_64__ + if ( hvm_long_mode_enabled(v) ) + { + // long mode guest... + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 4, 4); + } + else +#endif + if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE ) + { +#if CONFIG_PAGING_LEVELS >= 3 + // 32-bit PAE mode guest... + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 3); +#else + SHADOW2_ERROR("PAE not supported in 32-bit Xen\n"); + domain_crash(d); + return; +#endif + } + else + { + // 32-bit 2 level guest... +#if CONFIG_PAGING_LEVELS >= 3 + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 3, 2); +#else + v->arch.shadow2 = &SHADOW2_INTERNAL_NAME(shadow2_entry, 2, 2); +#endif + } + } + + if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) + { + mfn_t mmfn = shadow2_make_monitor_table(v); + v->arch.monitor_table = pagetable_from_mfn(mmfn); + v->arch.monitor_vtable = sh2_map_domain_page(mmfn); + } + + if ( v->arch.shadow2 != old_entries ) + { + SHADOW2_PRINTK("new paging mode: d=%u v=%u g=%u s=%u " + "(was g=%u s=%u)\n", + d->domain_id, v->vcpu_id, + v->arch.shadow2->guest_levels, + v->arch.shadow2->shadow_levels, + old_entries ? old_entries->guest_levels : 0, + old_entries ? 
old_entries->shadow_levels : 0); + if ( old_entries && + (v->arch.shadow2->shadow_levels != + old_entries->shadow_levels) ) + { + /* Need to make a new monitor table for the new mode */ + mfn_t new_mfn, old_mfn; + + if ( v != current ) + { + SHADOW2_ERROR("Some third party (d=%u v=%u) is changing " + "this HVM vcpu's (d=%u v=%u) paging mode!\n", + current->domain->domain_id, current->vcpu_id, + v->domain->domain_id, v->vcpu_id); + domain_crash(v->domain); + return; + } + + sh2_unmap_domain_page(v->arch.monitor_vtable); + old_mfn = pagetable_get_mfn(v->arch.monitor_table); + v->arch.monitor_table = pagetable_null(); + new_mfn = v->arch.shadow2->make_monitor_table(v); + v->arch.monitor_table = pagetable_from_mfn(new_mfn); + v->arch.monitor_vtable = sh2_map_domain_page(new_mfn); + SHADOW2_PRINTK("new monitor table %"SH2_PRI_mfn "\n", + mfn_x(new_mfn)); + + /* Don't be running on the old monitor table when we + * pull it down! Switch CR3, and warn the HVM code that + * its host cr3 has changed. */ + make_cr3(v, mfn_x(new_mfn)); + write_ptbase(v); + hvm_update_host_cr3(v); + old_entries->destroy_monitor_table(v, old_mfn); + } + } + + // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE. + // These are HARD: think about the case where two CPU's have + // different values for CR4.PSE and CR4.PGE at the same time. + // This *does* happen, at least for CR4.PGE... + } + + v->arch.shadow2->update_cr3(v); +} + +/**************************************************************************/ +/* Turning on and off shadow2 features */ + +static void sh2_new_mode(struct domain *d, u32 new_mode) +/* Inform all the vcpus that the shadow mode has been changed */ +{ + struct vcpu *v; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(d != current->domain); + d->arch.shadow2_mode = new_mode; + if ( new_mode & SHM2_translate ) + shadow2_audit_p2m(d); + for_each_vcpu(d, v) + sh2_update_paging_modes(v); +} + +static int shadow2_enable(struct domain *d, u32 mode) +/* Turn on "permanent" shadow features: external, translate, refcount. + * Can only be called once on a domain, and these features cannot be + * disabled. + * Returns 0 for success, -errno for failure. */ +{ + unsigned int old_pages; + int rv = 0; + + domain_pause(d); + shadow2_lock(d); + + /* Sanity check the arguments */ + if ( d == current->domain + || shadow2_mode_enabled(d) + || !(mode & SHM2_enable) + || ((mode & SHM2_external) && !(mode & SHM2_translate)) ) + { + rv = -EINVAL; + goto out; + } + + // XXX -- eventually would like to require that all memory be allocated + // *after* shadow2_enabled() is called... So here, we would test to make + // sure that d->page_list is empty. 
+#if 0 + spin_lock(&d->page_alloc_lock); + if ( !list_empty(&d->page_list) ) + { + spin_unlock(&d->page_alloc_lock); + rv = -EINVAL; + goto out; + } + spin_unlock(&d->page_alloc_lock); +#endif + + /* Init the shadow memory allocation if the user hasn't done so */ + old_pages = d->arch.shadow2_total_pages; + if ( old_pages == 0 ) + if ( set_sh2_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */ + { + set_sh2_allocation(d, 0, NULL); + rv = -ENOMEM; + goto out; + } + + /* Init the hash table */ + if ( shadow2_hash_alloc(d) != 0 ) + { + set_sh2_allocation(d, old_pages, NULL); + rv = -ENOMEM; + goto out; + } + + /* Init the P2M table */ + if ( mode & SHM2_translate ) + if ( !shadow2_alloc_p2m_table(d) ) + { + shadow2_hash_teardown(d); + set_sh2_allocation(d, old_pages, NULL); + shadow2_p2m_teardown(d); + rv = -ENOMEM; + goto out; + } + + /* Update the bits */ + sh2_new_mode(d, mode); + shadow2_audit_p2m(d); + out: + shadow2_unlock(d); + domain_unpause(d); + return 0; +} + +void shadow2_teardown(struct domain *d) +/* Destroy the shadow pagetables of this domain and free its shadow memory. + * Should only be called for dying domains. */ +{ + struct vcpu *v; + mfn_t mfn; + + ASSERT(test_bit(_DOMF_dying, &d->domain_flags)); + ASSERT(d != current->domain); + + if ( !shadow2_lock_is_acquired(d) ) + shadow2_lock(d); /* Keep various asserts happy */ + + if ( shadow2_mode_enabled(d) ) + { + /* Release the shadow and monitor tables held by each vcpu */ + for_each_vcpu(d, v) + { + if ( v->arch.shadow2 ) + shadow2_detach_old_tables(v); + if ( shadow2_mode_external(d) ) + { + mfn = pagetable_get_mfn(v->arch.monitor_table); + if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) ) + shadow2_destroy_monitor_table(v, mfn); + v->arch.monitor_table = pagetable_null(); + } + } + } + + if ( d->arch.shadow2_total_pages != 0 ) + { + SHADOW2_PRINTK("teardown of domain %u starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + /* Destroy all the shadows and release memory to domheap */ + set_sh2_allocation(d, 0, NULL); + /* Release the hash table back to xenheap */ + if (d->arch.shadow2_hash_table) + shadow2_hash_teardown(d); + /* Release the log-dirty bitmap of dirtied pages */ + sh2_free_log_dirty_bitmap(d); + /* Should not have any more memory held */ + SHADOW2_PRINTK("teardown done." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + ASSERT(d->arch.shadow2_total_pages == 0); + } + + /* We leave the "permanent" shadow modes enabled, but clear the + * log-dirty mode bit. We don't want any more mark_dirty() + * calls now that we've torn down the bitmap */ + d->arch.shadow2_mode &= ~SHM2_log_dirty; + + shadow2_unlock(d); +} + +void shadow2_final_teardown(struct domain *d) +/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */ +{ + + SHADOW2_PRINTK("dom %u final teardown starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + + /* Double-check that the domain didn't have any shadow memory. + * It is possible for a domain that never got domain_kill()ed + * to get here with its shadow allocation intact. */ + if ( d->arch.shadow2_total_pages != 0 ) + shadow2_teardown(d); + + /* It is now safe to pull down the p2m map. 
*/ + if ( d->arch.shadow2_p2m_pages != 0 ) + shadow2_p2m_teardown(d); + + SHADOW2_PRINTK("dom %u final teardown done." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); +} + +static int shadow2_one_bit_enable(struct domain *d, u32 mode) +/* Turn on a single shadow mode feature */ +{ + ASSERT(shadow2_lock_is_acquired(d)); + + /* Sanity check the call */ + if ( d == current->domain || (d->arch.shadow2_mode & mode) ) + { + return -EINVAL; + } + + if ( d->arch.shadow2_mode == 0 ) + { + /* Init the shadow memory allocation and the hash table */ + if ( set_sh2_allocation(d, 1, NULL) != 0 + || shadow2_hash_alloc(d) != 0 ) + { + set_sh2_allocation(d, 0, NULL); + return -ENOMEM; + } + } + + /* Update the bits */ + sh2_new_mode(d, d->arch.shadow2_mode | mode); + + return 0; +} + +static int shadow2_one_bit_disable(struct domain *d, u32 mode) +/* Turn off a single shadow mode feature */ +{ + struct vcpu *v; + ASSERT(shadow2_lock_is_acquired(d)); + + /* Sanity check the call */ + if ( d == current->domain || !(d->arch.shadow2_mode & mode) ) + { + return -EINVAL; + } + + /* Update the bits */ + sh2_new_mode(d, d->arch.shadow2_mode & ~mode); + if ( d->arch.shadow2_mode == 0 ) + { + /* Get this domain off shadows */ + SHADOW2_PRINTK("un-shadowing of domain %u starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + for_each_vcpu(d, v) + { + if ( v->arch.shadow2 ) + shadow2_detach_old_tables(v); +#if CONFIG_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user)); + else +#endif + make_cr3(v, pagetable_get_pfn(v->arch.guest_table)); + + } + + /* Pull down the memory allocation */ + if ( set_sh2_allocation(d, 0, NULL) != 0 ) + { + // XXX - How can this occur? + // Seems like a bug to return an error now that we've + // disabled the relevant shadow mode. + // + return -ENOMEM; + } + shadow2_hash_teardown(d); + SHADOW2_PRINTK("un-shadowing of domain %u done." 
+ " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow2_total_pages, + d->arch.shadow2_free_pages, + d->arch.shadow2_p2m_pages); + } + + return 0; +} + +/* Enable/disable ops for the "test" and "log-dirty" modes */ +int shadow2_test_enable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow2_lock(d); + + if ( shadow2_mode_enabled(d) ) + { + SHADOW2_ERROR("Don't support enabling test mode" + "on already shadowed doms\n"); + ret = -EINVAL; + goto out; + } + + ret = shadow2_one_bit_enable(d, SHM2_enable); + out: + shadow2_unlock(d); + domain_unpause(d); + + return ret; +} + +int shadow2_test_disable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow2_lock(d); + ret = shadow2_one_bit_disable(d, SHM2_enable); + shadow2_unlock(d); + domain_unpause(d); + + return ret; +} + +static int +sh2_alloc_log_dirty_bitmap(struct domain *d) +{ + ASSERT(d->arch.shadow_dirty_bitmap == NULL); + d->arch.shadow_dirty_bitmap_size = + (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) & + ~(BITS_PER_LONG - 1); + d->arch.shadow_dirty_bitmap = + xmalloc_array(unsigned long, + d->arch.shadow_dirty_bitmap_size / BITS_PER_LONG); + if ( d->arch.shadow_dirty_bitmap == NULL ) + { + d->arch.shadow_dirty_bitmap_size = 0; + return -ENOMEM; + } + memset(d->arch.shadow_dirty_bitmap, 0, d->arch.shadow_dirty_bitmap_size/8); + + return 0; +} + +static void +sh2_free_log_dirty_bitmap(struct domain *d) +{ + d->arch.shadow_dirty_bitmap_size = 0; + if ( d->arch.shadow_dirty_bitmap ) + { + xfree(d->arch.shadow_dirty_bitmap); + d->arch.shadow_dirty_bitmap = NULL; + } +} + +static int shadow2_log_dirty_enable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow2_lock(d); + + if ( shadow2_mode_log_dirty(d) ) + { + ret = -EINVAL; + goto out; + } + + if ( shadow2_mode_enabled(d) ) + { + SHADOW2_ERROR("Don't (yet) support enabling log-dirty" + "on already shadowed doms\n"); + ret = -EINVAL; + goto out; + } + + ret = sh2_alloc_log_dirty_bitmap(d); + if ( ret != 0 ) + { + sh2_free_log_dirty_bitmap(d); + goto out; + } + + ret = shadow2_one_bit_enable(d, SHM2_log_dirty); + if ( ret != 0 ) + sh2_free_log_dirty_bitmap(d); + + out: + shadow2_unlock(d); + domain_unpause(d); + return ret; +} + +static int shadow2_log_dirty_disable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow2_lock(d); + ret = shadow2_one_bit_disable(d, SHM2_log_dirty); + if ( !shadow2_mode_log_dirty(d) ) + sh2_free_log_dirty_bitmap(d); + shadow2_unlock(d); + domain_unpause(d); + + return ret; +} + +/**************************************************************************/ +/* P2M map manipulations */ + +static void +sh2_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) +{ + struct vcpu *v; + + if ( !shadow2_mode_translate(d) ) + return; + + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + + + SHADOW2_PRINTK("removing gfn=%#lx mfn=%#lx\n", gfn, mfn); + + ASSERT(mfn_x(sh2_gfn_to_mfn(d, gfn)) == mfn); + //ASSERT(sh2_mfn_to_gfn(d, mfn) == gfn); + + shadow2_remove_all_shadows_and_parents(v, _mfn(mfn)); + if ( shadow2_remove_all_mappings(v, _mfn(mfn)) ) + flush_tlb_mask(d->domain_dirty_cpumask); + shadow2_set_p2m_entry(d, gfn, _mfn(INVALID_MFN)); + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); +} + +void +shadow2_guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn) +{ + shadow2_lock(d); + shadow2_audit_p2m(d); + sh2_p2m_remove_page(d, gfn, mfn); + shadow2_audit_p2m(d); + shadow2_unlock(d); +} + +void +shadow2_guest_physmap_add_page(struct domain *d, 
unsigned long gfn, + unsigned long mfn) +{ + struct vcpu *v; + unsigned long ogfn; + mfn_t omfn; + + if ( !shadow2_mode_translate(d) ) + return; + + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + + shadow2_lock(d); + shadow2_audit_p2m(d); + + SHADOW2_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn); + + omfn = sh2_gfn_to_mfn(d, gfn); + if ( valid_mfn(omfn) ) + { + /* Get rid of the old mapping, especially any shadows */ + shadow2_remove_all_shadows_and_parents(v, omfn); + if ( shadow2_remove_all_mappings(v, omfn) ) + flush_tlb_mask(d->domain_dirty_cpumask); + set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); + } + + ogfn = sh2_mfn_to_gfn(d, _mfn(mfn)); + if ( +#ifdef __x86_64__ + (ogfn != 0x5555555555555555L) +#else + (ogfn != 0x55555555L) +#endif + && (ogfn != INVALID_M2P_ENTRY) + && (ogfn != gfn) ) + { + /* This machine frame is already mapped at another physical address */ + SHADOW2_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n", + mfn, ogfn, gfn); + if ( valid_mfn(omfn = sh2_gfn_to_mfn(d, ogfn)) ) + { + SHADOW2_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", + ogfn , mfn_x(omfn)); + if ( mfn_x(omfn) == mfn ) + sh2_p2m_remove_page(d, ogfn, mfn); + } + } + + shadow2_set_p2m_entry(d, gfn, _mfn(mfn)); + set_gpfn_from_mfn(mfn, gfn); + shadow2_audit_p2m(d); + shadow2_unlock(d); +} + +/**************************************************************************/ +/* Log-dirty mode support */ + +/* Convert a shadow to log-dirty mode. */ +void shadow2_convert_to_log_dirty(struct vcpu *v, mfn_t smfn) +{ + BUG(); +} + + +/* Read a domain's log-dirty bitmap and stats. + * If the operation is a CLEAN, clear the bitmap and stats as well. */ +static int shadow2_log_dirty_op(struct domain *d, dom0_shadow_control_t *sc) +{ + int i, rv = 0, clean = 0; + + domain_pause(d); + shadow2_lock(d); + + if ( sc->op == DOM0_SHADOW_CONTROL_OP_CLEAN + || sc->op == DOM0_SHADOW_CONTROL_OP_FLUSH ) + clean = 1; + else + ASSERT(sc->op == DOM0_SHADOW_CONTROL_OP_PEEK); + + SHADOW2_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", + (clean) ? "clean" : "peek", + d->domain_id, + d->arch.shadow_fault_count, + d->arch.shadow_dirty_count); + + sc->stats.fault_count = d->arch.shadow_fault_count; + sc->stats.dirty_count = d->arch.shadow_dirty_count; + + if ( clean ) + { + struct list_head *l, *t; + struct page_info *pg; + + /* Need to revoke write access to the domain's pages again. + * In future, we'll have a less heavy-handed approach to this, + * but for now, we just unshadow everything except Xen. */ + list_for_each_safe(l, t, &d->arch.shadow2_toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + shadow2_unhook_mappings(d->vcpu[0], page_to_mfn(pg)); + } + + d->arch.shadow_fault_count = 0; + d->arch.shadow_dirty_count = 0; + } + + if ( guest_handle_is_null(sc->dirty_bitmap) || + (d->arch.shadow_dirty_bitmap == NULL) ) + { + rv = -EINVAL; + goto out; + } + + if ( sc->pages > d->arch.shadow_dirty_bitmap_size ) + sc->pages = d->arch.shadow_dirty_bitmap_size; + +#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ + for ( i = 0; i < sc->pages; i += CHUNK ) + { + int bytes = ((((sc->pages - i) > CHUNK) + ? 
CHUNK + : (sc->pages - i)) + 7) / 8; + + if ( copy_to_guest_offset( + sc->dirty_bitmap, + i/(8*sizeof(unsigned long)), + d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))), + (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) ) + { + rv = -EINVAL; + goto out; + } + + if ( clean ) + memset(d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))), + 0, bytes); + } +#undef CHUNK + + out: + shadow2_unlock(d); + domain_unpause(d); + return 0; +} + + +/* Mark a page as dirty */ +void sh2_do_mark_dirty(struct domain *d, mfn_t gmfn) +{ + unsigned long pfn; + + ASSERT(shadow2_lock_is_acquired(d)); + ASSERT(shadow2_mode_log_dirty(d)); + + if ( !valid_mfn(gmfn) ) + return; + + ASSERT(d->arch.shadow_dirty_bitmap != NULL); + + /* We /really/ mean PFN here, even for non-translated guests. */ + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + + /* + * Values with the MSB set denote MFNs that aren't really part of the + * domain's pseudo-physical memory map (e.g., the shared info frame). + * Nothing to do here... + */ + if ( unlikely(!VALID_M2P(pfn)) ) + return; + + /* N.B. Can use non-atomic TAS because protected by shadow2_lock. */ + if ( likely(pfn < d->arch.shadow_dirty_bitmap_size) ) + { + if ( !__test_and_set_bit(pfn, d->arch.shadow_dirty_bitmap) ) + { + SHADOW2_DEBUG(LOGDIRTY, + "marked mfn %" SH2_PRI_mfn " (pfn=%lx), dom %d\n", + mfn_x(gmfn), pfn, d->domain_id); + d->arch.shadow_dirty_count++; + } + } + else + { + SHADOW2_PRINTK("mark_dirty OOR! " + "mfn=%" SH2_PRI_mfn " pfn=%lx max=%x (dom %d)\n" + "owner=%d c=%08x t=%" PRtype_info "\n", + mfn_x(gmfn), + pfn, + d->arch.shadow_dirty_bitmap_size, + d->domain_id, + (page_get_owner(mfn_to_page(gmfn)) + ? page_get_owner(mfn_to_page(gmfn))->domain_id + : -1), + mfn_to_page(gmfn)->count_info, + mfn_to_page(gmfn)->u.inuse.type_info); + } +} + + +/**************************************************************************/ +/* Shadow-control DOM0_OP dispatcher */ + +int shadow2_control_op(struct domain *d, + dom0_shadow_control_t *sc, + XEN_GUEST_HANDLE(dom0_op_t) u_dom0_op) +{ + int rc, preempted = 0; + + if ( unlikely(d == current->domain) ) + { + DPRINTK("Don't try to do a shadow op on yourself!\n"); + return -EINVAL; + } + + switch ( sc->op ) + { + case DOM0_SHADOW_CONTROL_OP_OFF: + if ( shadow2_mode_log_dirty(d) ) + if ( (rc = shadow2_log_dirty_disable(d)) != 0 ) + return rc; + if ( d->arch.shadow2_mode & SHM2_enable ) + if ( (rc = shadow2_test_disable(d)) != 0 ) + return rc; + return 0; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST: + return shadow2_test_enable(d); + + case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY: + return shadow2_log_dirty_enable(d); + + case DOM0_SHADOW_CONTROL_OP_FLUSH: + case DOM0_SHADOW_CONTROL_OP_CLEAN: + case DOM0_SHADOW_CONTROL_OP_PEEK: + return shadow2_log_dirty_op(d, sc); + + + + case DOM0_SHADOW2_CONTROL_OP_ENABLE: + return shadow2_enable(d, sc->mode << SHM2_shift); + + case DOM0_SHADOW2_CONTROL_OP_GET_ALLOCATION: + sc->mb = shadow2_get_allocation(d); + return 0; + + case DOM0_SHADOW2_CONTROL_OP_SET_ALLOCATION: + rc = shadow2_set_allocation(d, sc->mb, &preempted); + if ( preempted ) + /* Not finished. Set up to re-run the call. */ + rc = hypercall_create_continuation( + __HYPERVISOR_dom0_op, "h", u_dom0_op); + else + /* Finished. 
Return the new allocation */ + sc->mb = shadow2_get_allocation(d); + return rc; + + + default: + SHADOW2_ERROR("Bad shadow op %u\n", sc->op); + return -EINVAL; + } +} + + +/**************************************************************************/ +/* Auditing shadow tables */ + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL + +void shadow2_audit_tables(struct vcpu *v) +{ + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,2,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,2,2), /* fl1_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,2,2), /* l2_32 */ +#else + SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,2), /* l1_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,2), /* fl1_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,2), /* l2_32 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,3,3), /* l1_pae */ + SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,3,3), /* fl1_pae */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2_pae */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,3,3), /* l2h_pae */ + SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,3,3), /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW2_INTERNAL_NAME(sh2_audit_l1_table,4,4), /* l1_64 */ + SHADOW2_INTERNAL_NAME(sh2_audit_fl1_table,4,4), /* fl1_64 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l2_table,4,4), /* l2_64 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l3_table,4,4), /* l3_64 */ + SHADOW2_INTERNAL_NAME(sh2_audit_l4_table,4,4), /* l4_64 */ +#endif /* CONFIG_PAGING_LEVELS >= 4 */ +#endif /* CONFIG_PAGING_LEVELS > 2 */ + NULL /* All the rest */ + }; + unsigned int mask; + + if ( !(SHADOW2_AUDIT_ENABLE) ) + return; + + if ( SHADOW2_AUDIT & SHADOW2_AUDIT_ENTRIES_FULL ) + mask = ~1; /* Audit every table in the system */ + else + { + /* Audit only the current mode's tables */ + switch (v->arch.shadow2->guest_levels) + { + case 2: mask = (SH2F_L1_32|SH2F_FL1_32|SH2F_L2_32); break; + case 3: mask = (SH2F_L1_PAE|SH2F_FL1_PAE|SH2F_L2_PAE + |SH2F_L2H_PAE|SH2F_L3_PAE); break; + case 4: mask = (SH2F_L1_64|SH2F_FL1_64|SH2F_L2_64 + |SH2F_L3_64|SH2F_L4_64); break; + default: BUG(); + } + } + + hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN)); +} + +#endif /* Shadow audit */ + + +/**************************************************************************/ +/* Auditing p2m tables */ + +#if SHADOW2_AUDIT & SHADOW2_AUDIT_P2M + +void shadow2_audit_p2m(struct domain *d) +{ + struct list_head *entry; + struct page_info *page; + struct domain *od; + unsigned long mfn, gfn, m2pfn, lp2mfn = 0; + mfn_t p2mfn; + unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0; + int test_linear; + + if ( !(SHADOW2_AUDIT_ENABLE) || !shadow2_mode_translate(d) ) + return; + + //SHADOW2_PRINTK("p2m audit starts\n"); + + test_linear = ( (d == current->domain) && current->arch.monitor_vtable ); + if ( test_linear ) + local_flush_tlb(); + + /* Audit part one: walk the domain's page allocation list, checking + * the m2p entries. 
 */
+    for ( entry = d->page_list.next;
+          entry != &d->page_list;
+          entry = entry->next )
+    {
+        page = list_entry(entry, struct page_info, list);
+        mfn = mfn_x(page_to_mfn(page));
+
+        // SHADOW2_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
+
+        od = page_get_owner(page);
+
+        if ( od != d )
+        {
+            SHADOW2_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+                           mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+            continue;
+        }
+
+        gfn = get_gpfn_from_mfn(mfn);
+        if ( gfn == INVALID_M2P_ENTRY )
+        {
+            orphans_i++;
+            //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+            //               mfn);
+            continue;
+        }
+
+        if ( gfn == 0x55555555 )
+        {
+            orphans_d++;
+            //SHADOW2_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
+            //               mfn);
+            continue;
+        }
+
+        p2mfn = sh2_gfn_to_mfn_foreign(d, gfn);
+        if ( mfn_x(p2mfn) != mfn )
+        {
+            mpbad++;
+            SHADOW2_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+                           " (-> gfn %#lx)\n",
+                           mfn, gfn, mfn_x(p2mfn),
+                           (mfn_valid(p2mfn)
+                            ? get_gpfn_from_mfn(mfn_x(p2mfn))
+                            : -1u));
+            /* This m2p entry is stale: the domain has another frame in
+             * this physical slot.  No great disaster, but for neatness,
+             * blow away the m2p entry. */
+            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+        }
+
+        if ( test_linear )
+        {
+            lp2mfn = get_mfn_from_gpfn(gfn);
+            if ( lp2mfn != mfn_x(p2mfn) )
+            {
+                SHADOW2_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                               "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn);
+            }
+        }
+
+        // SHADOW2_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
+        //                mfn, gfn, p2mfn, lp2mfn);
+    }
+
+    /* Audit part two: walk the domain's p2m table, checking the entries. */
+    if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
+    {
+        l2_pgentry_t *l2e;
+        l1_pgentry_t *l1e;
+        int i1, i2;
+
+#if CONFIG_PAGING_LEVELS == 4
+        l4_pgentry_t *l4e;
+        l3_pgentry_t *l3e;
+        int i3, i4;
+        l4e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#elif CONFIG_PAGING_LEVELS == 3
+        l3_pgentry_t *l3e;
+        int i3;
+        l3e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        l2e = sh2_map_domain_page(pagetable_get_mfn(d->arch.phys_table));
+#endif
+
+        gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 3
+#if CONFIG_PAGING_LEVELS >= 4
+        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+        {
+            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+            {
+                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+                continue;
+            }
+            l3e = sh2_map_domain_page(_mfn(l4e_get_pfn(l4e[i4])));
+#endif /* now at levels 3 or 4... */
+            for ( i3 = 0;
+                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+                  i3++ )
+            {
+                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+                {
+                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                    continue;
+                }
+                l2e = sh2_map_domain_page(_mfn(l3e_get_pfn(l3e[i3])));
+#endif /* all levels... */
+                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+                {
+                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+                    {
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+                    l1e = sh2_map_domain_page(_mfn(l2e_get_pfn(l2e[i2])));
+
+                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+                    {
+                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                            continue;
+                        mfn = l1e_get_pfn(l1e[i1]);
+                        ASSERT(valid_mfn(_mfn(mfn)));
+                        m2pfn = get_gpfn_from_mfn(mfn);
+                        if ( m2pfn != gfn )
+                        {
+                            pmbad++;
+                            SHADOW2_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn, mfn, m2pfn);
+                            BUG();
+                        }
+                    }
+                    sh2_unmap_domain_page(l1e);
+                }
+#if CONFIG_PAGING_LEVELS >= 3
+                sh2_unmap_domain_page(l2e);
+            }
+#if CONFIG_PAGING_LEVELS >= 4
+            sh2_unmap_domain_page(l3e);
+        }
+#endif
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+        sh2_unmap_domain_page(l4e);
+#elif CONFIG_PAGING_LEVELS == 3
+        sh2_unmap_domain_page(l3e);
+#else /* CONFIG_PAGING_LEVELS == 2 */
+        sh2_unmap_domain_page(l2e);
+#endif
+
+    }
+
+    //SHADOW2_PRINTK("p2m audit complete\n");
+    //if ( orphans_i | orphans_d | mpbad | pmbad )
+    //    SHADOW2_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+    //                   orphans_i + orphans_d, orphans_i, orphans_d,
+    if ( mpbad | pmbad )
+        SHADOW2_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+                       pmbad, mpbad);
+}
+
+#endif /* p2m audit */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r f2151423f729 -r 01345b08d012 xen/arch/x86/shadow2.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/shadow2.c	Wed Aug 16 17:11:56 2006 +0100
@@ -0,0 +1,4469 @@
+/******************************************************************************
+ * arch/x86/shadow2.c
+ *
+ * Simple, mostly-synchronous shadow page tables.
+ * Parts of this code are Copyright (c) 2006 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+// DESIGN QUESTIONS:
+// Why use subshadows for PAE guests?
+//  - reduces pressure in the hash table
+//  - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3)
+//  - would need to find space in the page_info to store 7 more bits of
+//    backpointer
+//  - independent shadows of 32 byte chunks makes it non-obvious how to quickly
+//    figure out when to demote the guest page from l3 status
+//
+// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space.
+//  - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address
+//    space for both PV and HVM guests.
+//
+
+#define SHADOW2 1
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/perfc.h>
+#include <xen/domain_page.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <asm/shadow2.h>
+#include <asm/shadow2-private.h>
+#include <asm/shadow2-types.h>
+#include <asm/flushtlb.h>
+#include <asm/hvm/hvm.h>
+
+/* The first cut: an absolutely synchronous, trap-and-emulate version,
+ * supporting only HVM guests (and so only "external" shadow mode).
+ *
+ * THINGS TO DO LATER:
+ *
+ * FIX GVA_TO_GPA
+ * The current interface returns an unsigned long, which is not big enough
+ * to hold a physical address in PAE.  Should return a gfn instead.
+ *
+ * TEARDOWN HEURISTICS
+ * Also: have a heuristic for when to destroy a previous paging-mode's
+ * shadows.  When a guest is done with its start-of-day 32-bit tables
+ * and reuses the memory we want to drop those shadows.  Start with
+ * shadows in a page in two modes as a hint, but beware of clever tricks
+ * like reusing a pagetable for both PAE and 64-bit during boot...
+ *
+ * PAE LINEAR MAPS
+ * Rework shadow_get_l*e() to have the option of using map_domain_page()
+ * instead of linear maps.  Add appropriate unmap_l*e calls in the users.
+ * Then we can test the speed difference made by linear maps.  If the
+ * map_domain_page() version is OK on PAE, we could maybe allow a lightweight
+ * l3-and-l2h-only shadow mode for PAE PV guests that would allow them
+ * to share l2h pages again.
+ *
+ * PAE L3 COPYING
+ * In this code, we copy all 32 bytes of a PAE L3 every time we change an
+ * entry in it, and every time we change CR3.  We copy it for the linear
+ * mappings (ugh! PAE linear mappings) and we copy it to the low-memory
+ * buffer so it fits in CR3.  Maybe we can avoid some of this recopying
+ * by using the shadow directly in some places.
+ * Also, for SMP, need to actually respond to seeing shadow2_pae_flip_pending.
+ *
+ * GUEST_WALK_TABLES TLB FLUSH COALESCE
+ * guest_walk_tables can do up to three remote TLB flushes as it walks to
+ * the first l1 of a new pagetable.  Should coalesce the flushes to the end,
+ * and if we do flush, re-do the walk.  If anything has changed, then
+ * pause all the other vcpus and do the walk *again*.
+ *
+ * WP DISABLED
+ * Consider how to implement having the WP bit of CR0 set to 0.
+ * Since we need to be able to cause write faults to pagetables, this might
+ * end up looking like not having the (guest) pagetables present at all in
+ * HVM guests...
+ *
+ * PSE disabled / PSE36
+ * We don't support any modes other than PSE enabled, PSE36 disabled.
+ * Neither of those would be hard to change, but we'd need to be able to
+ * deal with shadows made in one mode and used in another.
+ */
+
+#define FETCH_TYPE_PREFETCH 1
+#define FETCH_TYPE_DEMAND   2
+#define FETCH_TYPE_WRITE    4
+typedef enum {
+    ft_prefetch     = FETCH_TYPE_PREFETCH,
+    ft_demand_read  = FETCH_TYPE_DEMAND,
+    ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE,
+} fetch_type_t;
+
+#ifndef NDEBUG
+static char *fetch_type_names[] = {
+    [ft_prefetch]     "prefetch",
+    [ft_demand_read]  "demand read",
+    [ft_demand_write] "demand write",
+};
+#endif
+
+/* XXX forward declarations */
+#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3)
+static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res);
+#endif
+static inline void sh2_update_linear_entries(struct vcpu *v);
+
+/**************************************************************************/
+/* Hash table mapping from guest pagetables to shadows
+ *
+ * Normal case: maps the mfn of a guest page to the mfn of its shadow page.
+ * FL1's: maps the *gfn* of the start of a superpage to the mfn of a
+ *        shadow L1 which maps its "splinters".
+ * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the
+ *           PAE L3 info page for that CR3 value.
+ */
+
+static inline mfn_t
+get_fl1_shadow_status(struct vcpu *v, gfn_t gfn)
+/* Look for FL1 shadows in the hash table */
+{
+    mfn_t smfn = shadow2_hash_lookup(v, gfn_x(gfn),
+                                     PGC_SH2_fl1_shadow >> PGC_SH2_type_shift);
+
+    if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )
+    {
+        struct page_info *page = mfn_to_page(smfn);
+        if ( !(page->count_info & PGC_SH2_log_dirty) )
+            shadow2_convert_to_log_dirty(v, smfn);
+    }
+
+    return smfn;
+}
+
+static inline mfn_t
+get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type)
+/* Look for shadows in the hash table */
+{
+    mfn_t smfn = shadow2_hash_lookup(v, mfn_x(gmfn),
+                                     shadow_type >> PGC_SH2_type_shift);
+    perfc_incrc(shadow2_get_shadow_status);
+
+    if ( unlikely(shadow2_mode_log_dirty(v->domain) && valid_mfn(smfn)) )

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog