[Xen-changelog] [xen-unstable] merge with xen-unstable.hg
# HG changeset patch # User awilliam@xxxxxxxxxxx # Node ID 684fdcfb251a443fa885c142b427d253ec033212 # Parent 896fcdd49c7ff59f7d28b6402fd4453e60c38232 # Parent f790546ecfda03193a4b8983f7bb6b0f65924603 merge with xen-unstable.hg --- xen/arch/x86/shadow2-common.c | 3407 --------------- xen/arch/x86/shadow2.c | 4492 --------------------- xen/include/asm-x86/page-guest32.h | 105 xen/include/asm-x86/shadow2-multi.h | 116 xen/include/asm-x86/shadow2-private.h | 593 -- xen/include/asm-x86/shadow2-types.h | 692 --- xen/include/asm-x86/shadow2.h | 626 -- docs/man/xend-config.sxp.pod.5 | 2 docs/misc/xend.tex | 4 docs/src/user.tex | 4 tools/Makefile | 1 tools/console/daemon/io.c | 18 tools/examples/vif-route | 6 tools/examples/xen-hotplug-common.sh | 2 tools/examples/xen-network-common.sh | 40 tools/examples/xend-config.sxp | 2 tools/firmware/hvmloader/smbios.c | 12 tools/firmware/hvmloader/util.c | 54 tools/firmware/hvmloader/util.h | 10 tools/ioemu/Makefile | 2 tools/ioemu/patches/qemu-logging | 1 tools/ioemu/patches/xen-build | 14 tools/ioemu/vl.c | 2 tools/libxc/xc_hvm_build.c | 2 tools/misc/xend | 2 tools/python/xen/util/bugtool.py | 4 tools/python/xen/xend/XendRoot.py | 2 tools/python/xen/xend/server/params.py | 4 tools/security/python/xensec_gen/main.py | 2 unmodified_drivers/linux-2.6/platform-pci/evtchn.c | 2 xen/arch/x86/Makefile | 21 xen/arch/x86/domain.c | 46 xen/arch/x86/domain_build.c | 8 xen/arch/x86/domctl.c | 2 xen/arch/x86/hvm/hvm.c | 6 xen/arch/x86/hvm/platform.c | 4 xen/arch/x86/hvm/svm/svm.c | 183 xen/arch/x86/hvm/svm/vmcb.c | 2 xen/arch/x86/hvm/vmx/vmcs.c | 4 xen/arch/x86/hvm/vmx/vmx.c | 20 xen/arch/x86/mm.c | 142 xen/arch/x86/mm/Makefile | 1 xen/arch/x86/mm/shadow/Makefile | 15 xen/arch/x86/mm/shadow/common.c | 3407 +++++++++++++++ xen/arch/x86/mm/shadow/multi.c | 4492 +++++++++++++++++++++ xen/arch/x86/mm/shadow/multi.h | 116 xen/arch/x86/mm/shadow/page-guest32.h | 105 xen/arch/x86/mm/shadow/private.h | 593 ++ xen/arch/x86/mm/shadow/types.h | 692 +++ xen/arch/x86/traps.c | 8 xen/include/asm-x86/domain.h | 18 xen/include/asm-x86/hvm/svm/vmcb.h | 45 xen/include/asm-x86/mm.h | 82 xen/include/asm-x86/perfc_defn.h | 102 xen/include/asm-x86/shadow.h | 614 ++ 55 files changed, 10488 insertions(+), 10463 deletions(-) diff -r 896fcdd49c7f -r 684fdcfb251a docs/man/xend-config.sxp.pod.5 --- a/docs/man/xend-config.sxp.pod.5 Mon Aug 28 16:16:07 2006 -0600 +++ b/docs/man/xend-config.sxp.pod.5 Mon Aug 28 16:26:37 2006 -0600 @@ -23,7 +23,7 @@ The following lists the daemon configura =item I<logfile> The location of the file to record runtime log messages. Defaults to -I</var/log/xend.log>. +I</var/log/xen/xend.log>. =item I<loglevel> diff -r 896fcdd49c7f -r 684fdcfb251a docs/misc/xend.tex --- a/docs/misc/xend.tex Mon Aug 28 16:16:07 2006 -0600 +++ b/docs/misc/xend.tex Mon Aug 28 16:26:37 2006 -0600 @@ -214,7 +214,7 @@ Configuration scripts ({\it e.g.} for ne Configuration scripts ({\it e.g.} for network-script) are looked for in {\tt /etc/xen} unless their name begins with '/'. -Xend sends its log output to {\tt /var/log/xend.log}. This is a rotating logfile, +Xend sends its log output to {\tt /var/log/xen/xend.log}. This is a rotating logfile, and logs are moved onto {\tt xend.log.1} {\it etc.} as they get large. Old logs may be deleted. @@ -411,7 +411,7 @@ allows access to some debugging function \end{itemize} When tracing is on xend logs all functions calls and exceptions to -{\tt /var/log/xend.trace}. +{\tt /var/log/xen/xend.trace}. 
\begin{thebibliography}{99} diff -r 896fcdd49c7f -r 684fdcfb251a docs/src/user.tex --- a/docs/src/user.tex Mon Aug 28 16:16:07 2006 -0600 +++ b/docs/src/user.tex Mon Aug 28 16:26:37 2006 -0600 @@ -973,8 +973,8 @@ using the \texttt{xm} tool. \subsection{Logging} -As \xend\ runs, events will be logged to \path{/var/log/xend.log} and -(less frequently) to \path{/var/log/xend-debug.log}. These, along with +As \xend\ runs, events will be logged to \path{/var/log/xen/xend.log} and +(less frequently) to \path{/var/log/xen/xend-debug.log}. These, along with the standard syslog files, are useful when troubleshooting problems. \subsection{Configuring \Xend\ } diff -r 896fcdd49c7f -r 684fdcfb251a tools/Makefile --- a/tools/Makefile Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/Makefile Mon Aug 28 16:26:37 2006 -0600 @@ -39,6 +39,7 @@ install: check done $(MAKE) ioemuinstall $(INSTALL_DIR) -p $(DESTDIR)/var/xen/dump + $(INSTALL_DIR) -p $(DESTDIR)/var/log/xen .PHONY: clean clean: check_clean diff -r 896fcdd49c7f -r 684fdcfb251a tools/console/daemon/io.c --- a/tools/console/daemon/io.c Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/console/daemon/io.c Mon Aug 28 16:26:37 2006 -0600 @@ -584,16 +584,14 @@ void handle_io(void) FD_ISSET(xc_evtchn_fd(d->xce_handle), &readfds)) handle_ring_read(d); - if (d->tty_fd != -1) { - if (FD_ISSET(d->tty_fd, &readfds)) - handle_tty_read(d); - - if (FD_ISSET(d->tty_fd, &writefds)) - handle_tty_write(d); - - if (d->is_dead) - cleanup_domain(d); - } + if (d->tty_fd != -1 && FD_ISSET(d->tty_fd, &readfds)) + handle_tty_read(d); + + if (d->tty_fd != -1 && FD_ISSET(d->tty_fd, &writefds)) + handle_tty_write(d); + + if (d->is_dead) + cleanup_domain(d); } } while (ret > -1); } diff -r 896fcdd49c7f -r 684fdcfb251a tools/examples/vif-route --- a/tools/examples/vif-route Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/examples/vif-route Mon Aug 28 16:26:37 2006 -0600 @@ -30,10 +30,12 @@ case "$command" in ifconfig ${vif} ${main_ip} netmask 255.255.255.255 up echo 1 >/proc/sys/net/ipv4/conf/${vif}/proxy_arp ipcmd='add' + cmdprefix='' ;; offline) - ifdown ${vif} + do_without_error ifdown ${vif} ipcmd='del' + cmdprefix='do_without_error' ;; esac @@ -41,7 +43,7 @@ if [ "${ip}" ] ; then # If we've been given a list of IP addresses, then add routes from dom0 to # the guest using those addresses. for addr in ${ip} ; do - ip route ${ipcmd} ${addr} dev ${vif} src ${main_ip} + ${cmdprefix} ip route ${ipcmd} ${addr} dev ${vif} src ${main_ip} done fi diff -r 896fcdd49c7f -r 684fdcfb251a tools/examples/xen-hotplug-common.sh --- a/tools/examples/xen-hotplug-common.sh Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/examples/xen-hotplug-common.sh Mon Aug 28 16:26:37 2006 -0600 @@ -21,7 +21,7 @@ dir=$(dirname "$0") . "$dir/xen-script-common.sh" . "$dir/locking.sh" -exec 2>>/var/log/xen-hotplug.log +exec 2>>/var/log/xen/xen-hotplug.log export PATH="/sbin:/bin:/usr/bin:/usr/sbin:$PATH" export LANG="POSIX" diff -r 896fcdd49c7f -r 684fdcfb251a tools/examples/xen-network-common.sh --- a/tools/examples/xen-network-common.sh Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/examples/xen-network-common.sh Mon Aug 28 16:26:37 2006 -0600 @@ -44,34 +44,18 @@ then } elif ! 
which ifup >/dev/null 2>/dev/null then - if [ -e /etc/conf.d/net ] - then - preiftransfer() - { - true - } - ifup() - { - /etc/init.d/net.$1 start - } - ifdown() - { - /etc/init.d/net.$1 stop - } - else - preiftransfer() - { - true - } - ifup() - { - false - } - ifdown() - { - false - } - fi + preiftransfer() + { + true + } + ifup() + { + false + } + ifdown() + { + false + } else preiftransfer() { diff -r 896fcdd49c7f -r 684fdcfb251a tools/examples/xend-config.sxp --- a/tools/examples/xend-config.sxp Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/examples/xend-config.sxp Mon Aug 28 16:26:37 2006 -0600 @@ -11,7 +11,7 @@ # Commented out entries show the default for that entry, unless otherwise # specified. -#(logfile /var/log/xend.log) +#(logfile /var/log/xen/xend.log) #(loglevel DEBUG) #(xend-http-server no) diff -r 896fcdd49c7f -r 684fdcfb251a tools/firmware/hvmloader/smbios.c --- a/tools/firmware/hvmloader/smbios.c Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/firmware/hvmloader/smbios.c Mon Aug 28 16:26:37 2006 -0600 @@ -116,8 +116,10 @@ smbios_table_size(uint32_t vcpus, const /* type 0: "Xen", xen_version, and release_date */ size += strlen("Xen") + strlen(xen_version) + 2; - /* type 1: "Xen", xen_version, "HVM domU" */ - size += strlen("Xen") + strlen("HVM domU") + strlen(xen_version) + 3; + /* type 1: "Xen", xen_version, "HVM domU", UUID as string for + serial number */ + size += strlen("Xen") + strlen("HVM domU") + strlen(xen_version) + + 36 + 4; /* type 3: "Xen" */ size += strlen("Xen") + 1; /* type 4: socket designation ("CPU n"), processor_manufacturer */ @@ -371,6 +373,7 @@ smbios_type_1_init(void *start, const ch smbios_type_1_init(void *start, const char *xen_version, uint8_t uuid[16]) { + char uuid_str[37]; struct smbios_type_1 *p = (struct smbios_type_1 *)start; p->header.type = 1; p->header.length = sizeof(struct smbios_type_1); @@ -379,7 +382,7 @@ smbios_type_1_init(void *start, const ch p->manufacturer_str = 1; p->product_name_str = 2; p->version_str = 3; - p->serial_number_str = 0; + p->serial_number_str = 4; memcpy(p->uuid, uuid, 16); @@ -395,6 +398,9 @@ smbios_type_1_init(void *start, const ch start += strlen("HVM domU") + 1; strcpy((char *)start, xen_version); start += strlen(xen_version) + 1; + uuid_to_string(uuid_str, uuid); + strcpy((char *)start, uuid_str); + start += strlen(uuid_str) + 1; *((uint8_t *)start) = 0; return start+1; diff -r 896fcdd49c7f -r 684fdcfb251a tools/firmware/hvmloader/util.c --- a/tools/firmware/hvmloader/util.c Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/firmware/hvmloader/util.c Mon Aug 28 16:26:37 2006 -0600 @@ -174,3 +174,57 @@ cpuid(uint32_t idx, uint32_t *eax, uint3 : "0" (idx) ); } +/* Write a two-character hex representation of 'byte' to digits[]. + Pre-condition: sizeof(digits) >= 2 */ +void +byte_to_hex(char *digits, uint8_t byte) +{ + uint8_t nybbel = byte >> 4; + + if (nybbel > 9) + digits[0] = 'a' + nybbel-10; + else + digits[0] = '0' + nybbel; + + nybbel = byte & 0x0f; + if (nybbel > 9) + digits[1] = 'a' + nybbel-10; + else + digits[1] = '0' + nybbel; +} + +/* Convert an array of 16 unsigned bytes to a DCE/OSF formatted UUID + string. 
+ + Pre-condition: sizeof(dest) >= 37 */ +void +uuid_to_string(char *dest, uint8_t *uuid) +{ + int i = 0; + char *p = dest; + + for (i = 0; i < 4; ++i) { + byte_to_hex(p, uuid[i]); + p += 2; + } + *p++ = '-'; + for (i = 4; i < 6; ++i) { + byte_to_hex(p, uuid[i]); + p += 2; + } + *p++ = '-'; + for (i = 6; i < 8; ++i) { + byte_to_hex(p, uuid[i]); + p += 2; + } + *p++ = '-'; + for (i = 8; i < 10; ++i) { + byte_to_hex(p, uuid[i]); + p += 2; + } + *p++ = '-'; + for (i = 10; i < 16; ++i) { + byte_to_hex(p, uuid[i]); + p += 2; + } +} diff -r 896fcdd49c7f -r 684fdcfb251a tools/firmware/hvmloader/util.h --- a/tools/firmware/hvmloader/util.h Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/firmware/hvmloader/util.h Mon Aug 28 16:26:37 2006 -0600 @@ -25,6 +25,16 @@ void *memset(void *s, int c, unsigned n) void *memset(void *s, int c, unsigned n); char *itoa(char *a, unsigned int i); +/* convert a byte to two lowercase hex digits, with no terminating NUL + character. digits[] must have at least two elements. */ +void byte_to_hex(char *digits, uint8_t byte); + +/* Convert an array of 16 unsigned bytes to a DCE/OSF formatted UUID + string. + + Pre-condition: sizeof(dest) >= 37 */ +void uuid_to_string(char *dest, uint8_t *uuid); + /* Debug output */ void puts(const char *s); diff -r 896fcdd49c7f -r 684fdcfb251a tools/ioemu/Makefile --- a/tools/ioemu/Makefile Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/ioemu/Makefile Mon Aug 28 16:26:37 2006 -0600 @@ -94,7 +94,7 @@ test speed test2: all $(MAKE) -C tests $@ TAGS: - etags *.[ch] tests/*.[ch] + etags *.[ch] target-i386-dm/*.[ch] hw/*.[ch] cscope: rm -f ./cscope.* diff -r 896fcdd49c7f -r 684fdcfb251a tools/ioemu/patches/qemu-logging --- a/tools/ioemu/patches/qemu-logging Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/ioemu/patches/qemu-logging Mon Aug 28 16:26:37 2006 -0600 @@ -43,7 +43,7 @@ Index: ioemu/vl.c /* default mac address of the first network interface */ + /* init debug */ -+ sprintf(qemu_dm_logfilename, "/var/log/qemu-dm.%d.log", getpid()); ++ sprintf(qemu_dm_logfilename, "/var/log/xen/qemu-dm.%d.log", getpid()); + cpu_set_log_filename(qemu_dm_logfilename); + cpu_set_log(0); + diff -r 896fcdd49c7f -r 684fdcfb251a tools/ioemu/patches/xen-build --- a/tools/ioemu/patches/xen-build Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/ioemu/patches/xen-build Mon Aug 28 16:26:37 2006 -0600 @@ -1,7 +1,7 @@ Index: ioemu/Makefile Index: ioemu/Makefile =================================================================== ---- ioemu.orig/Makefile 2006-08-06 02:03:44.915543858 +0100 -+++ ioemu/Makefile 2006-08-06 02:11:33.461331417 +0100 +--- ioemu.orig/Makefile 2006-08-28 20:19:23.000000000 +0100 ++++ ioemu/Makefile 2006-08-28 20:20:08.000000000 +0100 @@ -1,11 +1,14 @@ # Makefile for QEMU. 
@@ -60,6 +60,15 @@ Index: ioemu/Makefile ifndef CONFIG_WIN32 mkdir -p "$(DESTDIR)$(datadir)/keymaps" for x in $(KEYMAPS); do \ +@@ -89,7 +94,7 @@ + $(MAKE) -C tests $@ + + TAGS: +- etags *.[ch] tests/*.[ch] ++ etags *.[ch] target-i386-dm/*.[ch] hw/*.[ch] + + cscope: + rm -f ./cscope.* @@ -107,11 +112,11 @@ texi2dvi $< @@ -76,8 +85,8 @@ Index: ioemu/Makefile info: qemu-doc.info qemu-tech.info Index: ioemu/Makefile.target =================================================================== ---- ioemu.orig/Makefile.target 2006-08-06 02:03:44.922543079 +0100 -+++ ioemu/Makefile.target 2006-08-06 02:09:22.320951557 +0100 +--- ioemu.orig/Makefile.target 2006-08-28 20:19:23.000000000 +0100 ++++ ioemu/Makefile.target 2006-08-28 20:19:47.000000000 +0100 @@ -1,5 +1,8 @@ include config.mak @@ -149,8 +158,8 @@ Index: ioemu/Makefile.target include .depend Index: ioemu/configure =================================================================== ---- ioemu.orig/configure 2006-08-06 02:03:45.783447220 +0100 -+++ ioemu/configure 2006-08-06 02:09:41.076860544 +0100 +--- ioemu.orig/configure 2006-08-28 20:19:23.000000000 +0100 ++++ ioemu/configure 2006-08-28 20:19:47.000000000 +0100 @@ -18,8 +18,8 @@ # default parameters diff -r 896fcdd49c7f -r 684fdcfb251a tools/ioemu/vl.c --- a/tools/ioemu/vl.c Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/ioemu/vl.c Mon Aug 28 16:26:37 2006 -0600 @@ -5924,7 +5924,7 @@ int main(int argc, char **argv) /* default mac address of the first network interface */ /* init debug */ - sprintf(qemu_dm_logfilename, "/var/log/qemu-dm.%d.log", getpid()); + sprintf(qemu_dm_logfilename, "/var/log/xen/qemu-dm.%d.log", getpid()); cpu_set_log_filename(qemu_dm_logfilename); cpu_set_log(0); diff -r 896fcdd49c7f -r 684fdcfb251a tools/libxc/xc_hvm_build.c --- a/tools/libxc/xc_hvm_build.c Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/libxc/xc_hvm_build.c Mon Aug 28 16:26:37 2006 -0600 @@ -441,7 +441,7 @@ static int xc_hvm_build_internal(int xc_ goto error_out; } - /* HVM domains must be put into shadow2 mode at the start of day */ + /* HVM domains must be put into shadow mode at the start of day */ if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_ENABLE, NULL, 0, NULL, XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT | diff -r 896fcdd49c7f -r 684fdcfb251a tools/misc/xend --- a/tools/misc/xend Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/misc/xend Mon Aug 28 16:26:37 2006 -0600 @@ -86,7 +86,7 @@ def start_xenstored(): XENSTORED_TRACE = os.getenv("XENSTORED_TRACE") cmd = "xenstored --pid-file /var/run/xenstore.pid" if XENSTORED_TRACE: - cmd += " -T /var/log/xenstored-trace.log" + cmd += " -T /var/log/xen/xenstored-trace.log" s,o = commands.getstatusoutput(cmd) def start_consoled(): diff -r 896fcdd49c7f -r 684fdcfb251a tools/python/xen/util/bugtool.py --- a/tools/python/xen/util/bugtool.py Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/python/xen/util/bugtool.py Mon Aug 28 16:26:37 2006 -0600 @@ -43,8 +43,8 @@ TITLE_RE = re.compile(r'<title>(.*)</tit FILES_TO_SEND = [ '/var/log/' + x for x in [ 'syslog', 'messages', 'debug', - 'xend.log', 'xend-debug.log', 'xenstored-trace.log', - 'xen-hotplug.log' ] ] + 'xen/xend.log', 'xen/xend-debug.log', 'xen/xenstored-trace.log', + 'xen/xen-hotplug.log' ] ] #FILES_TO_SEND = [ ] diff -r 896fcdd49c7f -r 684fdcfb251a tools/python/xen/xend/XendRoot.py --- a/tools/python/xen/xend/XendRoot.py Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/python/xen/xend/XendRoot.py Mon Aug 28 16:26:37 2006 -0600 @@ -52,7 +52,7 @@ class XendRoot: block_script_dir = "/etc/xen/scripts" """Default path 
to the log file. """ - logfile_default = "/var/log/xend.log" + logfile_default = "/var/log/xen/xend.log" """Default level of information to be logged.""" loglevel_default = 'DEBUG' diff -r 896fcdd49c7f -r 684fdcfb251a tools/python/xen/xend/server/params.py --- a/tools/python/xen/xend/server/params.py Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/python/xen/xend/server/params.py Mon Aug 28 16:26:37 2006 -0600 @@ -39,8 +39,8 @@ def getenv(var, val, conv=None): # The following parameters could be placed in a configuration file. XEND_PID_FILE = '/var/run/xend.pid' -XEND_TRACE_FILE = '/var/log/xend.trace' -XEND_DEBUG_LOG = '/var/log/xend-debug.log' +XEND_TRACE_FILE = '/var/log/xen/xend.trace' +XEND_DEBUG_LOG = '/var/log/xen/xend-debug.log' XEND_USER = 'root' XEND_DEBUG = getenv("XEND_DEBUG", 0, conv=int) XEND_DAEMONIZE = getenv("XEND_DAEMONIZE", not XEND_DEBUG, conv=int) diff -r 896fcdd49c7f -r 684fdcfb251a tools/security/python/xensec_gen/main.py --- a/tools/security/python/xensec_gen/main.py Mon Aug 28 16:16:07 2006 -0600 +++ b/tools/security/python/xensec_gen/main.py Mon Aug 28 16:26:37 2006 -0600 @@ -34,7 +34,7 @@ import CGIHTTPServer gHttpPort = 7777 gHttpDir = '/var/lib/xensec_gen' -gLogFile = '/var/log/xensec_gen.log' +gLogFile = '/var/log/xen/xensec_gen.log' gUser = 'nobody' gGroup = 'nobody' diff -r 896fcdd49c7f -r 684fdcfb251a unmodified_drivers/linux-2.6/platform-pci/evtchn.c --- a/unmodified_drivers/linux-2.6/platform-pci/evtchn.c Mon Aug 28 16:16:07 2006 -0600 +++ b/unmodified_drivers/linux-2.6/platform-pci/evtchn.c Mon Aug 28 16:26:37 2006 -0600 @@ -4,7 +4,7 @@ * A simplified event channel for para-drivers in unmodified linux * * Copyright (c) 2002-2005, K A Fraser - * Copyright (c) 2005, <xiaofeng.ling@xxxxxxxxx> + * Copyright (c) 2005, Intel Corporation <xiaofeng.ling@xxxxxxxxx> * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/Makefile --- a/xen/arch/x86/Makefile Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/Makefile Mon Aug 28 16:26:37 2006 -0600 @@ -2,6 +2,7 @@ subdir-y += cpu subdir-y += cpu subdir-y += genapic subdir-y += hvm +subdir-y += mm subdir-y += oprofile subdir-$(x86_32) += x86_32 @@ -41,23 +42,6 @@ obj-y += usercopy.o obj-y += usercopy.o obj-y += x86_emulate.o -ifneq ($(pae),n) -obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s3.o shadow2_g3_on_s3.o -else -obj-$(x86_32) += shadow2-common.o shadow2_g2_on_s2.o -endif - -obj-$(x86_64) += shadow2-common.o shadow2_g4_on_s4.o shadow2_g3_on_s3.o \ - shadow2_g2_on_s3.o - -guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1)))))) -shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(subst shadow2_,,$(1)))))) -shadow2_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \ - -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1)) - -shadow2_%.o: shadow2.c $(HDRS) Makefile - $(CC) $(CFLAGS) $(call shadow2_defns,$(@F)) -c $< -o $@ - obj-$(crash_debug) += gdbstub.o $(TARGET): $(TARGET)-syms boot/mkelf32 @@ -86,9 +70,6 @@ boot/mkelf32: boot/mkelf32.c boot/mkelf32: boot/mkelf32.c $(HOSTCC) $(HOSTCFLAGS) -o $@ $< -shadow_guest32.o: shadow.c -shadow_guest32pae.o: shadow.c - .PHONY: clean clean:: rm -f asm-offsets.s xen.lds boot/*.o boot/*~ boot/core boot/mkelf32 diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/domain.c Mon Aug 28 16:26:37 2006 -0600 @@ 
-200,12 +200,12 @@ int arch_domain_create(struct domain *d) #endif /* __x86_64__ */ - shadow2_lock_init(d); - for ( i = 0; i <= SHADOW2_MAX_ORDER; i++ ) - INIT_LIST_HEAD(&d->arch.shadow2.freelists[i]); - INIT_LIST_HEAD(&d->arch.shadow2.p2m_freelist); - INIT_LIST_HEAD(&d->arch.shadow2.p2m_inuse); - INIT_LIST_HEAD(&d->arch.shadow2.toplevel_shadows); + shadow_lock_init(d); + for ( i = 0; i <= SHADOW_MAX_ORDER; i++ ) + INIT_LIST_HEAD(&d->arch.shadow.freelists[i]); + INIT_LIST_HEAD(&d->arch.shadow.p2m_freelist); + INIT_LIST_HEAD(&d->arch.shadow.p2m_inuse); + INIT_LIST_HEAD(&d->arch.shadow.toplevel_shadows); if ( !is_idle_domain(d) ) { @@ -236,7 +236,7 @@ int arch_domain_create(struct domain *d) void arch_domain_destroy(struct domain *d) { - shadow2_final_teardown(d); + shadow_final_teardown(d); free_xenheap_pages( d->arch.mm_perdomain_pt, @@ -342,10 +342,10 @@ int arch_set_info_guest( } } - /* Shadow2: make sure the domain has enough shadow memory to + /* Shadow: make sure the domain has enough shadow memory to * boot another vcpu */ - if ( shadow2_mode_enabled(d) - && d->arch.shadow2.total_pages < shadow2_min_acceptable_pages(d) ) + if ( shadow_mode_enabled(d) + && d->arch.shadow.total_pages < shadow_min_acceptable_pages(d) ) { destroy_gdt(v); return -ENOMEM; @@ -357,8 +357,8 @@ int arch_set_info_guest( /* Don't redo final setup */ set_bit(_VCPUF_initialised, &v->vcpu_flags); - if ( shadow2_mode_enabled(d) ) - shadow2_update_paging_modes(v); + if ( shadow_mode_enabled(d) ) + shadow_update_paging_modes(v); update_cr3(v); @@ -936,11 +936,11 @@ void domain_relinquish_resources(struct for_each_vcpu ( d, v ) { /* Drop ref to guest_table (from new_guest_cr3(), svm/vmx cr3 handling, - * or sh2_update_paging_modes()) */ + * or sh_update_paging_modes()) */ pfn = pagetable_get_pfn(v->arch.guest_table); if ( pfn != 0 ) { - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) put_page(mfn_to_page(pfn)); else put_page_and_type(mfn_to_page(pfn)); @@ -962,7 +962,7 @@ void domain_relinquish_resources(struct hvm_relinquish_guest_resources(d); /* Tear down shadow mode stuff. */ - shadow2_teardown(d); + shadow_teardown(d); /* * Relinquish GDT mappings. 
No need for explicit unmapping of the LDT as @@ -981,18 +981,18 @@ void domain_relinquish_resources(struct void arch_dump_domain_info(struct domain *d) { - if ( shadow2_mode_enabled(d) ) - { - printk(" shadow2 mode: "); - if ( d->arch.shadow2.mode & SHM2_enable ) + if ( shadow_mode_enabled(d) ) + { + printk(" shadow mode: "); + if ( d->arch.shadow.mode & SHM2_enable ) printk("enabled "); - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) printk("refcounts "); - if ( shadow2_mode_log_dirty(d) ) + if ( shadow_mode_log_dirty(d) ) printk("log_dirty "); - if ( shadow2_mode_translate(d) ) + if ( shadow_mode_translate(d) ) printk("translate "); - if ( shadow2_mode_external(d) ) + if ( shadow_mode_external(d) ) printk("external "); printk("\n"); } diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/domain_build.c Mon Aug 28 16:26:37 2006 -0600 @@ -679,8 +679,8 @@ int construct_dom0(struct domain *d, (void)alloc_vcpu(d, i, i); /* Set up CR3 value for write_ptbase */ - if ( shadow2_mode_enabled(v->domain) ) - shadow2_update_paging_modes(v); + if ( shadow_mode_enabled(v->domain) ) + shadow_update_paging_modes(v); else update_cr3(v); @@ -791,8 +791,8 @@ int construct_dom0(struct domain *d, new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start); if ( opt_dom0_shadow ) - if ( shadow2_test_enable(d) == 0 ) - shadow2_update_paging_modes(v); + if ( shadow_test_enable(d) == 0 ) + shadow_update_paging_modes(v); if ( supervisor_mode_kernel ) { diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/domctl.c --- a/xen/arch/x86/domctl.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/domctl.c Mon Aug 28 16:26:37 2006 -0600 @@ -39,7 +39,7 @@ long arch_do_domctl( d = find_domain_by_id(domctl->domain); if ( d != NULL ) { - ret = shadow2_domctl(d, &domctl->u.shadow_op, u_domctl); + ret = shadow_domctl(d, &domctl->u.shadow_op, u_domctl); put_domain(d); copy_to_guest(u_domctl, domctl, 1); } diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/hvm/hvm.c Mon Aug 28 16:26:37 2006 -0600 @@ -384,8 +384,8 @@ int hvm_copy(void *buf, unsigned long va if (count > size) count = size; - gfn = shadow2_gva_to_gfn(v, vaddr); - mfn = mfn_x(sh2_vcpu_gfn_to_mfn(v, gfn)); + gfn = shadow_gva_to_gfn(v, vaddr); + mfn = mfn_x(sh_vcpu_gfn_to_mfn(v, gfn)); if (mfn == INVALID_MFN) return 0; @@ -539,7 +539,7 @@ void hvm_do_hypercall(struct cpu_user_re return; } - if ( current->arch.shadow2.mode->guest_levels == 4 ) + if ( current->arch.shadow.mode->guest_levels == 4 ) { pregs->rax = hvm_hypercall64_table[pregs->rax](pregs->rdi, pregs->rsi, diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/platform.c --- a/xen/arch/x86/hvm/platform.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/hvm/platform.c Mon Aug 28 16:26:37 2006 -0600 @@ -721,7 +721,7 @@ void send_pio_req(struct cpu_user_regs * if (pvalid) { if (hvm_paging_enabled(current)) - p->u.data = shadow2_gva_to_gpa(current, value); + p->u.data = shadow_gva_to_gpa(current, value); else p->u.pdata = (void *) value; /* guest VA == guest PA */ } else @@ -771,7 +771,7 @@ void send_mmio_req( if (pvalid) { if (hvm_paging_enabled(v)) - p->u.data = shadow2_gva_to_gpa(v, value); + p->u.data = shadow_gva_to_gpa(v, value); else p->u.pdata = (void *) value; /* guest VA == guest PA */ } else diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/svm/svm.c --- a/xen/arch/x86/hvm/svm/svm.c Mon Aug 28 
16:16:07 2006 -0600 +++ b/xen/arch/x86/hvm/svm/svm.c Mon Aug 28 16:26:37 2006 -0600 @@ -29,7 +29,7 @@ #include <xen/domain_page.h> #include <asm/current.h> #include <asm/io.h> -#include <asm/shadow2.h> +#include <asm/shadow.h> #include <asm/regs.h> #include <asm/cpufeature.h> #include <asm/processor.h> @@ -402,6 +402,50 @@ static inline int long_mode_do_msr_write } return 1; } + + +#define loaddebug(_v,_reg) \ + __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_v)->debugreg[_reg])) +#define savedebug(_v,_reg) \ + __asm__ __volatile__ ("mov %%db" #_reg ",%0" : : "r" ((_v)->debugreg[_reg])) + + +static inline void svm_save_dr(struct vcpu *v) +{ + if (v->arch.hvm_vcpu.flag_dr_dirty) + { + /* clear the DR dirty flag and re-enable intercepts for DR accesses */ + v->arch.hvm_vcpu.flag_dr_dirty = 0; + v->arch.hvm_svm.vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES; + + savedebug(&v->arch.guest_context, 0); + savedebug(&v->arch.guest_context, 1); + savedebug(&v->arch.guest_context, 2); + savedebug(&v->arch.guest_context, 3); + } +} + + +static inline void __restore_debug_registers(struct vcpu *v) +{ + loaddebug(&v->arch.guest_context, 0); + loaddebug(&v->arch.guest_context, 1); + loaddebug(&v->arch.guest_context, 2); + loaddebug(&v->arch.guest_context, 3); +} + + +static inline void svm_restore_dr(struct vcpu *v) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + if (!vmcb) + return; + + if (unlikely(vmcb->dr7 & 0xFF)) + __restore_debug_registers(v); +} + static int svm_realmode(struct vcpu *v) { @@ -717,6 +761,7 @@ static void svm_ctxt_switch_from(struct static void svm_ctxt_switch_from(struct vcpu *v) { svm_freeze_time(v); + svm_save_dr(v); } static void svm_ctxt_switch_to(struct vcpu *v) @@ -732,6 +777,7 @@ static void svm_ctxt_switch_to(struct vc set_segment_register(es, 0); set_segment_register(ss, 0); #endif + svm_restore_dr(v); } @@ -746,10 +792,10 @@ static void svm_final_setup_guest(struct if ( v != d->vcpu[0] ) return; - if ( !shadow2_mode_external(d) ) + if ( !shadow_mode_external(d) ) { DPRINTK("Can't init HVM for dom %u vcpu %u: " - "not in shadow2 external mode\n", d->domain_id, v->vcpu_id); + "not in shadow external mode\n", d->domain_id, v->vcpu_id); domain_crash(d); } @@ -914,7 +960,7 @@ static int svm_do_page_fault(unsigned lo va, eip, (unsigned long)regs->error_code); //#endif - result = shadow2_fault(va, regs); + result = shadow_fault(va, regs); if( result ) { /* Let's make sure that the Guest TLB is flushed */ @@ -1183,55 +1229,16 @@ static inline void set_reg(unsigned int } -static void svm_dr_access (struct vcpu *v, unsigned int reg, unsigned int type, - struct cpu_user_regs *regs) -{ - unsigned long *reg_p = 0; - unsigned int gpreg = 0; - unsigned long eip; - int inst_len; - int index; - struct vmcb_struct *vmcb; - u8 buffer[MAX_INST_LEN]; - u8 prefix = 0; - - vmcb = v->arch.hvm_svm.vmcb; - - ASSERT(vmcb); - - eip = vmcb->rip; - inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer)); - index = skip_prefix_bytes(buffer, sizeof(buffer)); - - ASSERT(buffer[index+0] == 0x0f && (buffer[index+1] & 0xFD) == 0x21); - - if (index > 0 && (buffer[index-1] & 0xF0) == 0x40) - prefix = buffer[index-1]; - - gpreg = decode_src_reg(prefix, buffer[index + 2]); - ASSERT(reg == decode_dest_reg(prefix, buffer[index + 2])); - - HVM_DBG_LOG(DBG_LEVEL_1, "svm_dr_access : eip=%lx, reg=%d, gpreg = %x", - eip, reg, gpreg); - - reg_p = get_reg_p(gpreg, regs, vmcb); - - switch (type) - { - case TYPE_MOV_TO_DR: - inst_len = __get_instruction_length(vmcb, INSTR_MOV2DR, buffer); - 
v->arch.guest_context.debugreg[reg] = *reg_p; - break; - case TYPE_MOV_FROM_DR: - inst_len = __get_instruction_length(vmcb, INSTR_MOVDR2, buffer); - *reg_p = v->arch.guest_context.debugreg[reg]; - break; - default: - __hvm_bug(regs); - break; - } - ASSERT(inst_len > 0); - __update_guest_eip(vmcb, inst_len); +static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + v->arch.hvm_vcpu.flag_dr_dirty = 1; + + __restore_debug_registers(v); + + /* allow the guest full access to the debug registers */ + vmcb->dr_intercepts = 0; } @@ -1562,7 +1569,7 @@ static int svm_set_cr0(unsigned long val v->arch.guest_table = pagetable_from_pfn(mfn); if ( old_base_mfn ) put_page(mfn_to_page(old_base_mfn)); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); @@ -1588,14 +1595,14 @@ static int svm_set_cr0(unsigned long val svm_inject_exception(v, TRAP_gp_fault, 1, 0); return 0; } - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); } else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE ) { /* we should take care of this kind of situation */ - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); vmcb->cr3 = v->arch.hvm_vcpu.hw_cr3; set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); } @@ -1706,7 +1713,7 @@ static int mov_to_cr(int gpreg, int cr, mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); if (mfn != pagetable_get_pfn(v->arch.guest_table)) __hvm_bug(regs); - shadow2_update_cr3(v); + shadow_update_cr3(v); } else { @@ -1771,7 +1778,7 @@ static int mov_to_cr(int gpreg, int cr, v->arch.guest_table = pagetable_from_pfn(mfn); if ( old_base_mfn ) put_page(mfn_to_page(old_base_mfn)); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); @@ -1808,7 +1815,7 @@ static int mov_to_cr(int gpreg, int cr, if ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE)) { set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); } break; } @@ -2149,7 +2156,7 @@ void svm_handle_invlpg(const short invlp /* Overkill, we may not this */ set_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - shadow2_invlpg(v, g_vaddr); + shadow_invlpg(v, g_vaddr); } @@ -2520,7 +2527,7 @@ void walk_shadow_and_guest_pt(unsigned l struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned long gpa; - gpa = shadow2_gva_to_gpa(current, gva); + gpa = shadow_gva_to_gpa(current, gva); printk( "gva = %lx, gpa=%lx, gCR3=%x\n", gva, gpa, (u32)vmcb->cr3 ); if( !svm_paging_enabled(v) || mmio_space(gpa) ) return; @@ -2591,7 +2598,7 @@ asmlinkage void svm_vmexit_handler(struc if (svm_dbg_on && exit_reason == VMEXIT_EXCEPTION_PF) { if (svm_paging_enabled(v) && - !mmio_space(shadow2_gva_to_gpa(current, vmcb->exitinfo2))) + !mmio_space(shadow_gva_to_gpa(current, vmcb->exitinfo2))) { printk("I%08ld,ExC=%s(%d),IP=%x:%llx," "I1=%llx,I2=%llx,INT=%llx, " @@ -2601,7 +2608,7 @@ asmlinkage void svm_vmexit_handler(struc (unsigned long long) vmcb->exitinfo1, (unsigned long long) vmcb->exitinfo2, (unsigned long long) vmcb->exitintinfo.bytes, - (unsigned long long) shadow2_gva_to_gpa(current, vmcb->exitinfo2)); + (unsigned long long) shadow_gva_to_gpa(current, vmcb->exitinfo2)); } else 
{ @@ -2862,53 +2869,9 @@ asmlinkage void svm_vmexit_handler(struc case VMEXIT_CR8_WRITE: svm_cr_access(v, 8, TYPE_MOV_TO_CR, ®s); break; - - case VMEXIT_DR0_READ: - svm_dr_access(v, 0, TYPE_MOV_FROM_DR, ®s); - break; - - case VMEXIT_DR1_READ: - svm_dr_access(v, 1, TYPE_MOV_FROM_DR, ®s); - break; - - case VMEXIT_DR2_READ: - svm_dr_access(v, 2, TYPE_MOV_FROM_DR, ®s); - break; - - case VMEXIT_DR3_READ: - svm_dr_access(v, 3, TYPE_MOV_FROM_DR, ®s); - break; - - case VMEXIT_DR6_READ: - svm_dr_access(v, 6, TYPE_MOV_FROM_DR, ®s); - break; - - case VMEXIT_DR7_READ: - svm_dr_access(v, 7, TYPE_MOV_FROM_DR, ®s); - break; - - case VMEXIT_DR0_WRITE: - svm_dr_access(v, 0, TYPE_MOV_TO_DR, ®s); - break; - - case VMEXIT_DR1_WRITE: - svm_dr_access(v, 1, TYPE_MOV_TO_DR, ®s); - break; - - case VMEXIT_DR2_WRITE: - svm_dr_access(v, 2, TYPE_MOV_TO_DR, ®s); - break; - - case VMEXIT_DR3_WRITE: - svm_dr_access(v, 3, TYPE_MOV_TO_DR, ®s); - break; - - case VMEXIT_DR6_WRITE: - svm_dr_access(v, 6, TYPE_MOV_TO_DR, ®s); - break; - - case VMEXIT_DR7_WRITE: - svm_dr_access(v, 7, TYPE_MOV_TO_DR, ®s); + + case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE: + svm_dr_access(v, ®s); break; case VMEXIT_IOIO: diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/svm/vmcb.c --- a/xen/arch/x86/hvm/svm/vmcb.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/hvm/svm/vmcb.c Mon Aug 28 16:26:37 2006 -0600 @@ -121,7 +121,7 @@ static int construct_vmcb_controls(struc GENERAL2_INTERCEPT_SKINIT | GENERAL2_INTERCEPT_RDTSCP; /* read or write all debug registers 0 - 15 */ - vmcb->dr_intercepts = 0; + vmcb->dr_intercepts = DR_INTERCEPT_ALL_WRITES; /* RD/WR all control registers 0 - 15, but not read CR2 */ vmcb->cr_intercepts = ~(CR_INTERCEPT_CR2_READ | CR_INTERCEPT_CR2_WRITE); diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/vmx/vmcs.c --- a/xen/arch/x86/hvm/vmx/vmcs.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/hvm/vmx/vmcs.c Mon Aug 28 16:26:37 2006 -0600 @@ -35,7 +35,7 @@ #include <xen/event.h> #include <xen/kernel.h> #include <xen/keyhandler.h> -#include <asm/shadow2.h> +#include <asm/shadow.h> static int vmcs_size; static int vmcs_order; @@ -272,7 +272,7 @@ static void vmx_do_launch(struct vcpu *v error |= __vmwrite(GUEST_TR_BASE, 0); error |= __vmwrite(GUEST_TR_LIMIT, 0xff); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); printk("%s(): GUEST_CR3<=%08lx, HOST_CR3<=%08lx\n", __func__, v->arch.hvm_vcpu.hw_cr3, v->arch.cr3); __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/hvm/vmx/vmx.c --- a/xen/arch/x86/hvm/vmx/vmx.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/hvm/vmx/vmx.c Mon Aug 28 16:26:37 2006 -0600 @@ -40,7 +40,7 @@ #include <asm/hvm/vmx/vmx.h> #include <asm/hvm/vmx/vmcs.h> #include <asm/hvm/vmx/cpu.h> -#include <asm/shadow2.h> +#include <asm/shadow.h> #include <public/sched.h> #include <public/hvm/ioreq.h> #include <asm/hvm/vpic.h> @@ -66,10 +66,10 @@ static int vmx_initialize_guest_resource if ( v->vcpu_id != 0 ) return 1; - if ( !shadow2_mode_external(d) ) + if ( !shadow_mode_external(d) ) { DPRINTK("Can't init HVM for dom %u vcpu %u: " - "not in shadow2 external mode\n", + "not in shadow external mode\n", d->domain_id, v->vcpu_id); domain_crash(d); } @@ -865,7 +865,7 @@ static int vmx_do_page_fault(unsigned lo } #endif - result = shadow2_fault(va, regs); + result = shadow_fault(va, regs); TRACE_VMEXIT (2,result); #if 0 @@ -1039,7 +1039,7 @@ static void vmx_vmexit_do_invlpg(unsigne * We do the safest things first, then try to update the shadow * copying 
from guest */ - shadow2_invlpg(v, va); + shadow_invlpg(v, va); } @@ -1301,7 +1301,7 @@ vmx_world_restore(struct vcpu *v, struct skip_cr3: - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); if (!vmx_paging_enabled(v)) HVM_DBG_LOG(DBG_LEVEL_VMMU, "switching to vmxassist. use phys table"); else @@ -1504,7 +1504,7 @@ static int vmx_set_cr0(unsigned long val v->arch.guest_table = pagetable_from_pfn(mfn); if (old_base_mfn) put_page(mfn_to_page(old_base_mfn)); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", (unsigned long) (mfn << PAGE_SHIFT)); @@ -1577,7 +1577,7 @@ static int vmx_set_cr0(unsigned long val else if ( (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PE ) { __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr3); - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); } return 1; @@ -1662,7 +1662,7 @@ static int mov_to_cr(int gp, int cr, str mfn = get_mfn_from_gpfn(value >> PAGE_SHIFT); if (mfn != pagetable_get_pfn(v->arch.guest_table)) __hvm_bug(regs); - shadow2_update_cr3(v); + shadow_update_cr3(v); } else { /* * If different, make a shadow. Check if the PDBR is valid @@ -1755,7 +1755,7 @@ static int mov_to_cr(int gp, int cr, str * all TLB entries except global entries. */ if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) - shadow2_update_paging_modes(v); + shadow_update_paging_modes(v); break; } default: diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/mm.c Mon Aug 28 16:26:37 2006 -0600 @@ -454,12 +454,12 @@ int map_ldt_shadow_page(unsigned int off res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); - if ( !res && unlikely(shadow2_mode_refcounts(d)) ) - { - shadow2_lock(d); - shadow2_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0); + if ( !res && unlikely(shadow_mode_refcounts(d)) ) + { + shadow_lock(d); + shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0); res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); - shadow2_unlock(d); + shadow_unlock(d); } if ( unlikely(!res) ) @@ -527,7 +527,7 @@ get_linear_pagetable( struct page_info *page; unsigned long pfn; - ASSERT( !shadow2_mode_refcounts(d) ); + ASSERT( !shadow_mode_refcounts(d) ); if ( (root_get_flags(re) & _PAGE_RW) ) { @@ -602,12 +602,12 @@ get_page_from_l1e( d = dom_io; } - /* Foreign mappings into guests in shadow2 external mode don't + /* Foreign mappings into guests in shadow external mode don't * contribute to writeable mapping refcounts. (This allows the * qemu-dm helper process in dom0 to map the domain's memory without * messing up the count of "real" writable mappings.) */ okay = (((l1e_get_flags(l1e) & _PAGE_RW) && - !(unlikely(shadow2_mode_external(d) && (d != current->domain)))) + !(unlikely(shadow_mode_external(d) && (d != current->domain)))) ? 
get_page_and_type(page, d, PGT_writable_page) : get_page(page, d)); if ( !okay ) @@ -771,9 +771,9 @@ void put_page_from_l1e(l1_pgentry_t l1e, } /* Remember we didn't take a type-count of foreign writable mappings - * to shadow2 external domains */ + * to shadow external domains */ if ( (l1e_get_flags(l1e) & _PAGE_RW) && - !(unlikely((e != d) && shadow2_mode_external(e))) ) + !(unlikely((e != d) && shadow_mode_external(e))) ) { put_page_and_type(page); } @@ -830,7 +830,7 @@ static int alloc_l1_table(struct page_in l1_pgentry_t *pl1e; int i; - ASSERT(!shadow2_mode_refcounts(d)); + ASSERT(!shadow_mode_refcounts(d)); pl1e = map_domain_page(pfn); @@ -883,7 +883,7 @@ static int create_pae_xen_mappings(l3_pg * a. alloc_l3_table() calls this function and this check will fail * b. mod_l3_entry() disallows updates to slot 3 in an existing table * - * XXX -- this needs revisiting for shadow2_mode_refcount()==true... + * XXX -- this needs revisiting for shadow_mode_refcount()==true... */ page = l3e_get_page(l3e3); BUG_ON(page->u.inuse.type_info & PGT_pinned); @@ -1007,7 +1007,7 @@ static int alloc_l2_table(struct page_in l2_pgentry_t *pl2e; int i; - ASSERT(!shadow2_mode_refcounts(d)); + ASSERT(!shadow_mode_refcounts(d)); pl2e = map_domain_page(pfn); @@ -1059,7 +1059,7 @@ static int alloc_l3_table(struct page_in l3_pgentry_t *pl3e; int i; - ASSERT(!shadow2_mode_refcounts(d)); + ASSERT(!shadow_mode_refcounts(d)); #ifdef CONFIG_X86_PAE /* @@ -1120,7 +1120,7 @@ static int alloc_l4_table(struct page_in unsigned long vaddr; int i; - ASSERT(!shadow2_mode_refcounts(d)); + ASSERT(!shadow_mode_refcounts(d)); for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) { @@ -1234,8 +1234,8 @@ static inline int update_l1e(l1_pgentry_ struct vcpu *v) { int rv = 1; - if ( unlikely(shadow2_mode_enabled(v->domain)) ) - shadow2_lock(v->domain); + if ( unlikely(shadow_mode_enabled(v->domain)) ) + shadow_lock(v->domain); #ifndef PTE_UPDATE_WITH_CMPXCHG rv = (!__copy_to_user(pl1e, &nl1e, sizeof(nl1e))); #else @@ -1266,10 +1266,10 @@ static inline int update_l1e(l1_pgentry_ } } #endif - if ( unlikely(shadow2_mode_enabled(v->domain)) ) - { - shadow2_validate_guest_entry(v, _mfn(gl1mfn), pl1e); - shadow2_unlock(v->domain); + if ( unlikely(shadow_mode_enabled(v->domain)) ) + { + shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e); + shadow_unlock(v->domain); } return rv; } @@ -1339,13 +1339,13 @@ static int mod_l1_entry(l1_pgentry_t *pl #endif #define UPDATE_ENTRY(_t,_p,_o,_n,_m) ({ \ int rv; \ - if ( unlikely(shadow2_mode_enabled(current->domain)) ) \ - shadow2_lock(current->domain); \ + if ( unlikely(shadow_mode_enabled(current->domain)) ) \ + shadow_lock(current->domain); \ rv = _UPDATE_ENTRY(_t, _p, _o, _n); \ - if ( unlikely(shadow2_mode_enabled(current->domain)) ) \ + if ( unlikely(shadow_mode_enabled(current->domain)) ) \ { \ - shadow2_validate_guest_entry(current, _mfn(_m), (_p)); \ - shadow2_unlock(current->domain); \ + shadow_validate_guest_entry(current, _mfn(_m), (_p)); \ + shadow_unlock(current->domain); \ } \ rv; \ }) @@ -1581,21 +1581,21 @@ void free_page_type(struct page_info *pa */ this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_ALL_TLBS; - if ( unlikely(shadow2_mode_enabled(owner) - && !shadow2_lock_is_acquired(owner)) ) + if ( unlikely(shadow_mode_enabled(owner) + && !shadow_lock_is_acquired(owner)) ) { /* Raw page tables are rewritten during save/restore. 
*/ - if ( !shadow2_mode_translate(owner) ) + if ( !shadow_mode_translate(owner) ) mark_dirty(owner, page_to_mfn(page)); - if ( shadow2_mode_refcounts(owner) ) + if ( shadow_mode_refcounts(owner) ) return; gmfn = mfn_to_gmfn(owner, page_to_mfn(page)); ASSERT(VALID_M2P(gmfn)); - shadow2_lock(owner); - shadow2_remove_all_shadows(owner->vcpu[0], _mfn(gmfn)); - shadow2_unlock(owner); + shadow_lock(owner); + shadow_remove_all_shadows(owner->vcpu[0], _mfn(gmfn)); + shadow_unlock(owner); } } @@ -1760,7 +1760,7 @@ int get_page_type(struct page_info *page #endif /* Fixme: add code to propagate va_unknown to subtables. */ if ( ((type & PGT_type_mask) >= PGT_l2_page_table) && - !shadow2_mode_refcounts(page_get_owner(page)) ) + !shadow_mode_refcounts(page_get_owner(page)) ) return 0; /* This table is possibly mapped at multiple locations. */ nx &= ~PGT_va_mask; @@ -1810,7 +1810,7 @@ int new_guest_cr3(unsigned long mfn) if ( hvm_guest(v) && !hvm_paging_enabled(v) ) domain_crash_synchronous(); - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) { okay = get_page_from_pagenr(mfn, d); if ( unlikely(!okay) ) @@ -1858,7 +1858,7 @@ int new_guest_cr3(unsigned long mfn) if ( likely(old_base_mfn != 0) ) { - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) put_page(mfn_to_page(old_base_mfn)); else put_page_and_type(mfn_to_page(old_base_mfn)); @@ -2043,7 +2043,7 @@ int do_mmuext_op( type = PGT_root_page_table; pin_page: - if ( shadow2_mode_refcounts(FOREIGNDOM) ) + if ( shadow_mode_refcounts(FOREIGNDOM) ) break; okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM); @@ -2065,7 +2065,7 @@ int do_mmuext_op( break; case MMUEXT_UNPIN_TABLE: - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) break; if ( unlikely(!(okay = get_page_from_pagenr(mfn, d))) ) @@ -2078,11 +2078,11 @@ int do_mmuext_op( { put_page_and_type(page); put_page(page); - if ( shadow2_mode_enabled(d) ) + if ( shadow_mode_enabled(d) ) { - shadow2_lock(d); - shadow2_remove_all_shadows(v, _mfn(mfn)); - shadow2_unlock(d); + shadow_lock(d); + shadow_remove_all_shadows(v, _mfn(mfn)); + shadow_unlock(d); } } else @@ -2125,8 +2125,8 @@ int do_mmuext_op( break; case MMUEXT_INVLPG_LOCAL: - if ( !shadow2_mode_enabled(d) - || shadow2_invlpg(v, op.arg1.linear_addr) != 0 ) + if ( !shadow_mode_enabled(d) + || shadow_invlpg(v, op.arg1.linear_addr) != 0 ) local_flush_tlb_one(op.arg1.linear_addr); break; @@ -2173,7 +2173,7 @@ int do_mmuext_op( unsigned long ptr = op.arg1.linear_addr; unsigned long ents = op.arg2.nr_ents; - if ( shadow2_mode_external(d) ) + if ( shadow_mode_external(d) ) { MEM_LOG("ignoring SET_LDT hypercall from external " "domain %u", d->domain_id); @@ -2319,7 +2319,7 @@ int do_mmu_update( case PGT_l3_page_table: case PGT_l4_page_table: { - if ( shadow2_mode_refcounts(d) ) + if ( shadow_mode_refcounts(d) ) { DPRINTK("mmu update on shadow-refcounted domain!"); break; @@ -2372,16 +2372,16 @@ int do_mmu_update( if ( unlikely(!get_page_type(page, PGT_writable_page)) ) break; - if ( unlikely(shadow2_mode_enabled(d)) ) - shadow2_lock(d); + if ( unlikely(shadow_mode_enabled(d)) ) + shadow_lock(d); *(intpte_t *)va = req.val; okay = 1; - if ( unlikely(shadow2_mode_enabled(d)) ) + if ( unlikely(shadow_mode_enabled(d)) ) { - shadow2_validate_guest_entry(v, _mfn(mfn), va); - shadow2_unlock(d); + shadow_validate_guest_entry(v, _mfn(mfn), va); + shadow_unlock(d); } put_page_type(page); @@ -2405,8 +2405,8 @@ int do_mmu_update( break; } - if ( shadow2_mode_translate(FOREIGNDOM) ) - 
shadow2_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn); + if ( shadow_mode_translate(FOREIGNDOM) ) + shadow_guest_physmap_add_page(FOREIGNDOM, gpfn, mfn); else set_gpfn_from_mfn(mfn, gpfn); okay = 1; @@ -2492,7 +2492,7 @@ static int create_grant_pte_mapping( goto failed; } - if ( !shadow2_mode_refcounts(d) ) + if ( !shadow_mode_refcounts(d) ) put_page_from_l1e(ol1e, d); put_page_type(page); @@ -2590,7 +2590,7 @@ static int create_grant_va_mapping( l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) ) return GNTST_general_error; - if ( !shadow2_mode_refcounts(d) ) + if ( !shadow_mode_refcounts(d) ) put_page_from_l1e(ol1e, d); return GNTST_okay; @@ -2714,10 +2714,10 @@ int do_update_va_mapping(unsigned long v perfc_incrc(calls_to_update_va); - if ( unlikely(!__addr_ok(va) && !shadow2_mode_external(d)) ) + if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) ) return -EINVAL; - if ( unlikely(shadow2_mode_refcounts(d)) ) + if ( unlikely(shadow_mode_refcounts(d)) ) { DPRINTK("Grant op on a shadow-refcounted domain\n"); return -EINVAL; @@ -2725,11 +2725,11 @@ int do_update_va_mapping(unsigned long v LOCK_BIGLOCK(d); - if ( likely(rc == 0) && unlikely(shadow2_mode_enabled(d)) ) + if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) ) { if ( unlikely(this_cpu(percpu_mm_info).foreign && - (shadow2_mode_translate(d) || - shadow2_mode_translate( + (shadow_mode_translate(d) || + shadow_mode_translate( this_cpu(percpu_mm_info).foreign))) ) { /* @@ -2770,8 +2770,8 @@ int do_update_va_mapping(unsigned long v switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) ) { case UVMF_LOCAL: - if ( !shadow2_mode_enabled(d) - || (shadow2_invlpg(current, va) != 0) ) + if ( !shadow_mode_enabled(d) + || (shadow_invlpg(current, va) != 0) ) local_flush_tlb_one(va); break; case UVMF_ALL: @@ -3006,7 +3006,7 @@ long arch_memory_op(int op, XEN_GUEST_HA break; } - if ( !shadow2_mode_translate(d) || (mfn == 0) ) + if ( !shadow_mode_translate(d) || (mfn == 0) ) { put_domain(d); return -EINVAL; @@ -3196,21 +3196,21 @@ static int ptwr_emulated_update( pl1e = (l1_pgentry_t *)((unsigned long)pl1e + (addr & ~PAGE_MASK)); if ( do_cmpxchg ) { - if ( shadow2_mode_enabled(d) ) - shadow2_lock(d); + if ( shadow_mode_enabled(d) ) + shadow_lock(d); ol1e = l1e_from_intpte(old); if ( cmpxchg((intpte_t *)pl1e, old, val) != old ) { - if ( shadow2_mode_enabled(d) ) - shadow2_unlock(d); + if ( shadow_mode_enabled(d) ) + shadow_unlock(d); unmap_domain_page(pl1e); put_page_from_l1e(nl1e, d); return X86EMUL_CMPXCHG_FAILED; } - if ( unlikely(shadow2_mode_enabled(v->domain)) ) - { - shadow2_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e); - shadow2_unlock(v->domain); + if ( unlikely(shadow_mode_enabled(v->domain)) ) + { + shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e); + shadow_unlock(v->domain); } } else diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/arch/x86/traps.c Mon Aug 28 16:26:37 2006 -0600 @@ -870,8 +870,8 @@ static int fixup_page_fault(unsigned lon if ( unlikely(IN_HYPERVISOR_RANGE(addr)) ) { - if ( shadow2_mode_external(d) && guest_mode(regs) ) - return shadow2_fault(addr, regs); + if ( shadow_mode_external(d) && guest_mode(regs) ) + return shadow_fault(addr, regs); if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) return handle_gdt_ldt_mapping_fault( addr - GDT_LDT_VIRT_START, regs); @@ -890,8 +890,8 @@ static int fixup_page_fault(unsigned lon ptwr_do_page_fault(d, addr, regs) ) return EXCRET_fault_fixed; - if ( 
shadow2_mode_enabled(d) ) - return shadow2_fault(addr, regs); + if ( shadow_mode_enabled(d) ) + return shadow_fault(addr, regs); return 0; } diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/include/asm-x86/domain.h Mon Aug 28 16:26:37 2006 -0600 @@ -59,10 +59,10 @@ extern void hypercall_page_initialise(st struct shadow_domain { u32 mode; /* flags to control shadow operation */ - spinlock_t lock; /* shadow2 domain lock */ + spinlock_t lock; /* shadow domain lock */ int locker; /* processor which holds the lock */ const char *locker_function; /* Func that took it */ - struct list_head freelists[SHADOW2_MAX_ORDER + 1]; + struct list_head freelists[SHADOW_MAX_ORDER + 1]; struct list_head p2m_freelist; struct list_head p2m_inuse; struct list_head toplevel_shadows; @@ -70,10 +70,10 @@ struct shadow_domain { unsigned int free_pages; /* number of pages on freelists */ unsigned int p2m_pages; /* number of pages in p2m map */ - /* Shadow2 hashtable */ - struct shadow2_hash_entry *hash_table; - struct shadow2_hash_entry *hash_freelist; - struct shadow2_hash_entry *hash_allocations; + /* Shadow hashtable */ + struct shadow_hash_entry *hash_table; + struct shadow_hash_entry *hash_freelist; + struct shadow_hash_entry *hash_allocations; int hash_walking; /* Some function is walking the hash table */ /* Shadow log-dirty bitmap */ @@ -107,7 +107,7 @@ struct arch_domain /* Shadow-translated guest: Pseudophys base address of reserved area. */ unsigned long first_reserved_pfn; - struct shadow_domain shadow2; + struct shadow_domain shadow; /* Shadow translated domain: P2M mapping */ pagetable_t phys_table; @@ -135,7 +135,7 @@ struct pae_l3_cache { }; struct shadow_vcpu { /* Pointers to mode-specific entry points. */ - struct shadow2_paging_mode *mode; + struct shadow_paging_mode *mode; /* Last MFN that we emulated a write to. */ unsigned long last_emulated_mfn; /* HVM guest: paging enabled (CR0.PG)? */ @@ -201,7 +201,7 @@ struct arch_vcpu /* Current LDT details. 
*/ unsigned long shadow_ldt_mapcnt; - struct shadow_vcpu shadow2; + struct shadow_vcpu shadow; } __cacheline_aligned; /* shorthands to improve code legibility */ diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/hvm/svm/vmcb.h --- a/xen/include/asm-x86/hvm/svm/vmcb.h Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/include/asm-x86/hvm/svm/vmcb.h Mon Aug 28 16:26:37 2006 -0600 @@ -113,6 +113,51 @@ enum CRInterceptBits CR_INTERCEPT_CR14_WRITE = 1 << 30, CR_INTERCEPT_CR15_WRITE = 1 << 31, }; + + +/* debug register intercepts */ +enum DRInterceptBits +{ + DR_INTERCEPT_DR0_READ = 1 << 0, + DR_INTERCEPT_DR1_READ = 1 << 1, + DR_INTERCEPT_DR2_READ = 1 << 2, + DR_INTERCEPT_DR3_READ = 1 << 3, + DR_INTERCEPT_DR4_READ = 1 << 4, + DR_INTERCEPT_DR5_READ = 1 << 5, + DR_INTERCEPT_DR6_READ = 1 << 6, + DR_INTERCEPT_DR7_READ = 1 << 7, + DR_INTERCEPT_DR8_READ = 1 << 8, + DR_INTERCEPT_DR9_READ = 1 << 9, + DR_INTERCEPT_DR10_READ = 1 << 10, + DR_INTERCEPT_DR11_READ = 1 << 11, + DR_INTERCEPT_DR12_READ = 1 << 12, + DR_INTERCEPT_DR13_READ = 1 << 13, + DR_INTERCEPT_DR14_READ = 1 << 14, + DR_INTERCEPT_DR15_READ = 1 << 15, + DR_INTERCEPT_DR0_WRITE = 1 << 16, + DR_INTERCEPT_DR1_WRITE = 1 << 17, + DR_INTERCEPT_DR2_WRITE = 1 << 18, + DR_INTERCEPT_DR3_WRITE = 1 << 19, + DR_INTERCEPT_DR4_WRITE = 1 << 20, + DR_INTERCEPT_DR5_WRITE = 1 << 21, + DR_INTERCEPT_DR6_WRITE = 1 << 22, + DR_INTERCEPT_DR7_WRITE = 1 << 23, + DR_INTERCEPT_DR8_WRITE = 1 << 24, + DR_INTERCEPT_DR9_WRITE = 1 << 25, + DR_INTERCEPT_DR10_WRITE = 1 << 26, + DR_INTERCEPT_DR11_WRITE = 1 << 27, + DR_INTERCEPT_DR12_WRITE = 1 << 28, + DR_INTERCEPT_DR13_WRITE = 1 << 29, + DR_INTERCEPT_DR14_WRITE = 1 << 30, + DR_INTERCEPT_DR15_WRITE = 1 << 31, +}; + +/* for lazy save/restore we'd like to intercept all DR writes */ +#define DR_INTERCEPT_ALL_WRITES \ + (DR_INTERCEPT_DR0_WRITE|DR_INTERCEPT_DR1_WRITE|DR_INTERCEPT_DR2_WRITE \ + |DR_INTERCEPT_DR3_WRITE|DR_INTERCEPT_DR4_WRITE|DR_INTERCEPT_DR5_WRITE \ + |DR_INTERCEPT_DR6_WRITE|DR_INTERCEPT_DR7_WRITE) + enum VMEXIT_EXITCODE { diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/include/asm-x86/mm.h Mon Aug 28 16:26:37 2006 -0600 @@ -22,7 +22,7 @@ struct page_info /* Each frame can be threaded onto a doubly-linked list. */ union { struct list_head list; - /* Shadow2 uses this field as an up-pointer in lower-level shadows */ + /* Shadow uses this field as an up-pointer in lower-level shadows */ paddr_t up; }; @@ -59,7 +59,7 @@ struct page_info /* Only used on guest pages with a shadow. * Guest pages with a shadow must have a non-zero type count, so this * does not conflict with the tlbflush timestamp. */ - u32 shadow2_flags; + u32 shadow_flags; // XXX -- we expect to add another field here, to be used for min/max // purposes, which is only used for shadow pages. @@ -76,7 +76,7 @@ struct page_info #define PGT_ldt_page (6U<<29) /* using this page in an LDT? */ #define PGT_writable_page (7U<<29) /* has writable mappings of this page? */ -#ifndef SHADOW2 +#ifndef SHADOW #define PGT_l1_shadow PGT_l1_page_table #define PGT_l2_shadow PGT_l2_page_table #define PGT_l3_shadow PGT_l3_page_table @@ -117,7 +117,7 @@ struct page_info /* 16-bit count of uses of this frame as its current type. 
*/ #define PGT_count_mask ((1U<<16)-1) -#ifndef SHADOW2 +#ifndef SHADOW #ifdef __x86_64__ #define PGT_high_mfn_shift 52 #define PGT_high_mfn_mask (0xfffUL << PGT_high_mfn_shift) @@ -132,7 +132,7 @@ struct page_info #define PGT_score_shift 23 #define PGT_score_mask (((1U<<4)-1)<<PGT_score_shift) #endif -#endif /* SHADOW2 */ +#endif /* SHADOW */ /* Cleared when the owning guest 'frees' this page. */ #define _PGC_allocated 31 @@ -146,38 +146,38 @@ struct page_info /* 29-bit count of references to this frame. */ #define PGC_count_mask ((1U<<29)-1) -/* shadow2 uses the count_info on shadow pages somewhat differently */ -/* NB: please coordinate any changes here with the SH2F's in shadow2.h */ -#define PGC_SH2_none (0U<<28) /* on the shadow2 free list */ -#define PGC_SH2_min_shadow (1U<<28) -#define PGC_SH2_l1_32_shadow (1U<<28) /* shadowing a 32-bit L1 guest page */ -#define PGC_SH2_fl1_32_shadow (2U<<28) /* L1 shadow for a 32b 4M superpage */ -#define PGC_SH2_l2_32_shadow (3U<<28) /* shadowing a 32-bit L2 guest page */ -#define PGC_SH2_l1_pae_shadow (4U<<28) /* shadowing a pae L1 page */ -#define PGC_SH2_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */ -#define PGC_SH2_l2_pae_shadow (6U<<28) /* shadowing a pae L2-low page */ -#define PGC_SH2_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */ -#define PGC_SH2_l3_pae_shadow (8U<<28) /* shadowing a pae L3 page */ -#define PGC_SH2_l1_64_shadow (9U<<28) /* shadowing a 64-bit L1 page */ -#define PGC_SH2_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */ -#define PGC_SH2_l2_64_shadow (11U<<28) /* shadowing a 64-bit L2 page */ -#define PGC_SH2_l3_64_shadow (12U<<28) /* shadowing a 64-bit L3 page */ -#define PGC_SH2_l4_64_shadow (13U<<28) /* shadowing a 64-bit L4 page */ -#define PGC_SH2_max_shadow (13U<<28) -#define PGC_SH2_p2m_table (14U<<28) /* in use as the p2m table */ -#define PGC_SH2_monitor_table (15U<<28) /* in use as a monitor table */ -#define PGC_SH2_unused (15U<<28) - -#define PGC_SH2_type_mask (15U<<28) -#define PGC_SH2_type_shift 28 - -#define PGC_SH2_pinned (1U<<27) - -#define _PGC_SH2_log_dirty 26 -#define PGC_SH2_log_dirty (1U<<26) +/* shadow uses the count_info on shadow pages somewhat differently */ +/* NB: please coordinate any changes here with the SHF's in shadow.h */ +#define PGC_SH_none (0U<<28) /* on the shadow free list */ +#define PGC_SH_min_shadow (1U<<28) +#define PGC_SH_l1_32_shadow (1U<<28) /* shadowing a 32-bit L1 guest page */ +#define PGC_SH_fl1_32_shadow (2U<<28) /* L1 shadow for a 32b 4M superpage */ +#define PGC_SH_l2_32_shadow (3U<<28) /* shadowing a 32-bit L2 guest page */ +#define PGC_SH_l1_pae_shadow (4U<<28) /* shadowing a pae L1 page */ +#define PGC_SH_fl1_pae_shadow (5U<<28) /* L1 shadow for pae 2M superpg */ +#define PGC_SH_l2_pae_shadow (6U<<28) /* shadowing a pae L2-low page */ +#define PGC_SH_l2h_pae_shadow (7U<<28) /* shadowing a pae L2-high page */ +#define PGC_SH_l3_pae_shadow (8U<<28) /* shadowing a pae L3 page */ +#define PGC_SH_l1_64_shadow (9U<<28) /* shadowing a 64-bit L1 page */ +#define PGC_SH_fl1_64_shadow (10U<<28) /* L1 shadow for 64-bit 2M superpg */ +#define PGC_SH_l2_64_shadow (11U<<28) /* shadowing a 64-bit L2 page */ +#define PGC_SH_l3_64_shadow (12U<<28) /* shadowing a 64-bit L3 page */ +#define PGC_SH_l4_64_shadow (13U<<28) /* shadowing a 64-bit L4 page */ +#define PGC_SH_max_shadow (13U<<28) +#define PGC_SH_p2m_table (14U<<28) /* in use as the p2m table */ +#define PGC_SH_monitor_table (15U<<28) /* in use as a monitor table */ +#define PGC_SH_unused 
(15U<<28) + +#define PGC_SH_type_mask (15U<<28) +#define PGC_SH_type_shift 28 + +#define PGC_SH_pinned (1U<<27) + +#define _PGC_SH_log_dirty 26 +#define PGC_SH_log_dirty (1U<<26) /* 26 bit ref count for shadow pages */ -#define PGC_SH2_count_mask ((1U<<26) - 1) +#define PGC_SH_count_mask ((1U<<26) - 1) /* We trust the slab allocator in slab.c, and our use of it. */ #define PageSlab(page) (1) @@ -201,9 +201,9 @@ static inline u32 pickle_domptr(struct d /* The order of the largest allocation unit we use for shadow pages */ #if CONFIG_PAGING_LEVELS == 2 -#define SHADOW2_MAX_ORDER 0 /* Only ever need 4k allocations */ +#define SHADOW_MAX_ORDER 0 /* Only ever need 4k allocations */ #else -#define SHADOW2_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */ +#define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */ #endif #define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain)) @@ -227,7 +227,7 @@ extern int shadow_remove_all_write_acces extern int shadow_remove_all_write_access( struct domain *d, unsigned long gmfn, unsigned long mfn); extern u32 shadow_remove_all_access( struct domain *d, unsigned long gmfn); -extern int _shadow2_mode_refcounts(struct domain *d); +extern int _shadow_mode_refcounts(struct domain *d); static inline void put_page(struct page_info *page) { @@ -259,7 +259,7 @@ static inline int get_page(struct page_i unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ unlikely(d != _domain) ) /* Wrong owner? */ { - if ( !_shadow2_mode_refcounts(domain) ) + if ( !_shadow_mode_refcounts(domain) ) DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" PRtype_info "\n", page_to_mfn(page), domain, unpickle_domptr(d), @@ -345,11 +345,11 @@ int check_descriptor(struct desc_struct #define mfn_to_gmfn(_d, mfn) \ - ( (shadow2_mode_translate(_d)) \ + ( (shadow_mode_translate(_d)) \ ? 
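/*
 * Standalone illustration, not part of the changeset: how the renamed
 * PGC_SH_* constants above pack a shadow page's state into the 32-bit
 * count_info word: the shadow type in bits 28-31, the pinned bit in
 * bit 27, the log-dirty bit in bit 26, and a 26-bit reference count in
 * the low bits.  The constant values are copied from the hunk above;
 * the packing/unpacking below is only a sketch for clarity and compiles
 * on its own, outside Xen.
 */
#include <assert.h>
#include <stdio.h>

#define PGC_SH_type_shift    28
#define PGC_SH_type_mask     (15u << 28)
#define PGC_SH_pinned        (1u  << 27)
#define PGC_SH_log_dirty     (1u  << 26)
#define PGC_SH_count_mask    ((1u << 26) - 1)
#define PGC_SH_l2_pae_shadow (6u  << 28)   /* one of the type encodings */

int main(void)
{
    /* Pack: a pinned l2 PAE shadow with 3 outstanding references. */
    unsigned int count_info = PGC_SH_l2_pae_shadow | PGC_SH_pinned | 3u;

    /* Unpack the three independent fields again. */
    unsigned int type     = count_info & PGC_SH_type_mask;
    int          pinned   = !!(count_info & PGC_SH_pinned);
    unsigned int refcount = count_info & PGC_SH_count_mask;

    assert(type == PGC_SH_l2_pae_shadow && pinned && refcount == 3);
    printf("type index=%u pinned=%d refs=%u\n",
           type >> PGC_SH_type_shift, pinned, refcount);
    return 0;
}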
get_gpfn_from_mfn(mfn) \ : (mfn) ) -#define gmfn_to_mfn(_d, gpfn) mfn_x(sh2_gfn_to_mfn(_d, gpfn)) +#define gmfn_to_mfn(_d, gpfn) mfn_x(sh_gfn_to_mfn(_d, gpfn)) /* diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/perfc_defn.h --- a/xen/include/asm-x86/perfc_defn.h Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/include/asm-x86/perfc_defn.h Mon Aug 28 16:26:37 2006 -0600 @@ -30,59 +30,59 @@ PERFCOUNTER_CPU(exception_fixed, PERFCOUNTER_CPU(exception_fixed, "pre-exception fixed") -/* Shadow2 counters */ -PERFCOUNTER_CPU(shadow2_alloc, "calls to shadow2_alloc") -PERFCOUNTER_CPU(shadow2_alloc_tlbflush, "shadow2_alloc flushed TLBs") +/* Shadow counters */ +PERFCOUNTER_CPU(shadow_alloc, "calls to shadow_alloc") +PERFCOUNTER_CPU(shadow_alloc_tlbflush, "shadow_alloc flushed TLBs") /* STATUS counters do not reset when 'P' is hit */ -PERFSTATUS(shadow2_alloc_count, "number of shadow pages in use") -PERFCOUNTER_CPU(shadow2_free, "calls to shadow2_free") -PERFCOUNTER_CPU(shadow2_prealloc_1, "shadow2 recycles old shadows") -PERFCOUNTER_CPU(shadow2_prealloc_2, "shadow2 recycles in-use shadows") -PERFCOUNTER_CPU(shadow2_linear_map_failed, "shadow2 hit read-only linear map") -PERFCOUNTER_CPU(shadow2_a_update, "shadow2 A bit update") -PERFCOUNTER_CPU(shadow2_ad_update, "shadow2 A&D bit update") -PERFCOUNTER_CPU(shadow2_fault, "calls to shadow2_fault") -PERFCOUNTER_CPU(shadow2_fault_bail_bad_gfn, "shadow2_fault guest bad gfn") -PERFCOUNTER_CPU(shadow2_fault_bail_not_present, - "shadow2_fault guest not-present") -PERFCOUNTER_CPU(shadow2_fault_bail_nx, "shadow2_fault guest NX fault") -PERFCOUNTER_CPU(shadow2_fault_bail_ro_mapping, "shadow2_fault guest R/W fault") -PERFCOUNTER_CPU(shadow2_fault_bail_user_supervisor, - "shadow2_fault guest U/S fault") -PERFCOUNTER_CPU(shadow2_fault_emulate_read, "shadow2_fault emulates a read") -PERFCOUNTER_CPU(shadow2_fault_emulate_write, "shadow2_fault emulates a write") -PERFCOUNTER_CPU(shadow2_fault_emulate_failed, "shadow2_fault emulator fails") -PERFCOUNTER_CPU(shadow2_fault_mmio, "shadow2_fault handled as mmio") -PERFCOUNTER_CPU(shadow2_fault_fixed, "shadow2_fault fixed fault") -PERFCOUNTER_CPU(shadow2_ptwr_emulate, "shadow2 causes ptwr to emulate") -PERFCOUNTER_CPU(shadow2_validate_gl1e_calls, "calls to shadow2_validate_gl1e") -PERFCOUNTER_CPU(shadow2_validate_gl2e_calls, "calls to shadow2_validate_gl2e") -PERFCOUNTER_CPU(shadow2_validate_gl3e_calls, "calls to shadow2_validate_gl3e") -PERFCOUNTER_CPU(shadow2_validate_gl4e_calls, "calls to shadow2_validate_gl4e") -PERFCOUNTER_CPU(shadow2_hash_lookups, "calls to shadow2_hash_lookup") -PERFCOUNTER_CPU(shadow2_hash_lookup_head, "shadow2 hash hit in bucket head") -PERFCOUNTER_CPU(shadow2_hash_lookup_miss, "shadow2 hash misses") -PERFCOUNTER_CPU(shadow2_get_shadow_status, "calls to get_shadow_status") -PERFCOUNTER_CPU(shadow2_hash_inserts, "calls to shadow2_hash_insert") -PERFCOUNTER_CPU(shadow2_hash_deletes, "calls to shadow2_hash_delete") -PERFCOUNTER_CPU(shadow2_writeable, "shadow2 removes write access") -PERFCOUNTER_CPU(shadow2_writeable_h_1, "shadow2 writeable: 32b w2k3") -PERFCOUNTER_CPU(shadow2_writeable_h_2, "shadow2 writeable: 32pae w2k3") -PERFCOUNTER_CPU(shadow2_writeable_h_3, "shadow2 writeable: 64b w2k3") -PERFCOUNTER_CPU(shadow2_writeable_h_4, "shadow2 writeable: 32b linux low") -PERFCOUNTER_CPU(shadow2_writeable_bf, "shadow2 writeable brute-force") -PERFCOUNTER_CPU(shadow2_mappings, "shadow2 removes all mappings") -PERFCOUNTER_CPU(shadow2_mappings_bf, "shadow2 rm-mappings brute-force") 
-PERFCOUNTER_CPU(shadow2_early_unshadow, "shadow2 unshadows for fork/exit") -PERFCOUNTER_CPU(shadow2_early_unshadow_top, "shadow2 unhooks for fork/exit") -PERFCOUNTER_CPU(shadow2_unshadow, "shadow2 unshadows a page") -PERFCOUNTER_CPU(shadow2_up_pointer, "shadow2 unshadow by up-pointer") -PERFCOUNTER_CPU(shadow2_unshadow_bf, "shadow2 unshadow brute-force") -PERFCOUNTER_CPU(shadow2_get_page_fail, "shadow2_get_page_from_l1e failed") -PERFCOUNTER_CPU(shadow2_guest_walk, "shadow2 walks guest tables") -PERFCOUNTER_CPU(shadow2_walk_cache_hit, "shadow2 walk-cache hits") -PERFCOUNTER_CPU(shadow2_walk_cache_miss, "shadow2 walk-cache misses") +PERFSTATUS(shadow_alloc_count, "number of shadow pages in use") +PERFCOUNTER_CPU(shadow_free, "calls to shadow_free") +PERFCOUNTER_CPU(shadow_prealloc_1, "shadow recycles old shadows") +PERFCOUNTER_CPU(shadow_prealloc_2, "shadow recycles in-use shadows") +PERFCOUNTER_CPU(shadow_linear_map_failed, "shadow hit read-only linear map") +PERFCOUNTER_CPU(shadow_a_update, "shadow A bit update") +PERFCOUNTER_CPU(shadow_ad_update, "shadow A&D bit update") +PERFCOUNTER_CPU(shadow_fault, "calls to shadow_fault") +PERFCOUNTER_CPU(shadow_fault_bail_bad_gfn, "shadow_fault guest bad gfn") +PERFCOUNTER_CPU(shadow_fault_bail_not_present, + "shadow_fault guest not-present") +PERFCOUNTER_CPU(shadow_fault_bail_nx, "shadow_fault guest NX fault") +PERFCOUNTER_CPU(shadow_fault_bail_ro_mapping, "shadow_fault guest R/W fault") +PERFCOUNTER_CPU(shadow_fault_bail_user_supervisor, + "shadow_fault guest U/S fault") +PERFCOUNTER_CPU(shadow_fault_emulate_read, "shadow_fault emulates a read") +PERFCOUNTER_CPU(shadow_fault_emulate_write, "shadow_fault emulates a write") +PERFCOUNTER_CPU(shadow_fault_emulate_failed, "shadow_fault emulator fails") +PERFCOUNTER_CPU(shadow_fault_mmio, "shadow_fault handled as mmio") +PERFCOUNTER_CPU(shadow_fault_fixed, "shadow_fault fixed fault") +PERFCOUNTER_CPU(shadow_ptwr_emulate, "shadow causes ptwr to emulate") +PERFCOUNTER_CPU(shadow_validate_gl1e_calls, "calls to shadow_validate_gl1e") +PERFCOUNTER_CPU(shadow_validate_gl2e_calls, "calls to shadow_validate_gl2e") +PERFCOUNTER_CPU(shadow_validate_gl3e_calls, "calls to shadow_validate_gl3e") +PERFCOUNTER_CPU(shadow_validate_gl4e_calls, "calls to shadow_validate_gl4e") +PERFCOUNTER_CPU(shadow_hash_lookups, "calls to shadow_hash_lookup") +PERFCOUNTER_CPU(shadow_hash_lookup_head, "shadow hash hit in bucket head") +PERFCOUNTER_CPU(shadow_hash_lookup_miss, "shadow hash misses") +PERFCOUNTER_CPU(shadow_get_shadow_status, "calls to get_shadow_status") +PERFCOUNTER_CPU(shadow_hash_inserts, "calls to shadow_hash_insert") +PERFCOUNTER_CPU(shadow_hash_deletes, "calls to shadow_hash_delete") +PERFCOUNTER_CPU(shadow_writeable, "shadow removes write access") +PERFCOUNTER_CPU(shadow_writeable_h_1, "shadow writeable: 32b w2k3") +PERFCOUNTER_CPU(shadow_writeable_h_2, "shadow writeable: 32pae w2k3") +PERFCOUNTER_CPU(shadow_writeable_h_3, "shadow writeable: 64b w2k3") +PERFCOUNTER_CPU(shadow_writeable_h_4, "shadow writeable: 32b linux low") +PERFCOUNTER_CPU(shadow_writeable_bf, "shadow writeable brute-force") +PERFCOUNTER_CPU(shadow_mappings, "shadow removes all mappings") +PERFCOUNTER_CPU(shadow_mappings_bf, "shadow rm-mappings brute-force") +PERFCOUNTER_CPU(shadow_early_unshadow, "shadow unshadows for fork/exit") +PERFCOUNTER_CPU(shadow_early_unshadow_top, "shadow unhooks for fork/exit") +PERFCOUNTER_CPU(shadow_unshadow, "shadow unshadows a page") +PERFCOUNTER_CPU(shadow_up_pointer, "shadow unshadow by up-pointer") 
+PERFCOUNTER_CPU(shadow_unshadow_bf, "shadow unshadow brute-force") +PERFCOUNTER_CPU(shadow_get_page_fail, "shadow_get_page_from_l1e failed") +PERFCOUNTER_CPU(shadow_guest_walk, "shadow walks guest tables") +PERFCOUNTER_CPU(shadow_walk_cache_hit, "shadow walk-cache hits") +PERFCOUNTER_CPU(shadow_walk_cache_miss, "shadow walk-cache misses") /*#endif*/ /* __XEN_PERFC_DEFN_H__ */ diff -r 896fcdd49c7f -r 684fdcfb251a xen/include/asm-x86/shadow.h --- a/xen/include/asm-x86/shadow.h Mon Aug 28 16:16:07 2006 -0600 +++ b/xen/include/asm-x86/shadow.h Mon Aug 28 16:26:37 2006 -0600 @@ -1,7 +1,9 @@ /****************************************************************************** * include/asm-x86/shadow.h * - * Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,26 +23,608 @@ #ifndef _XEN_SHADOW_H #define _XEN_SHADOW_H -/* This file is just a wrapper around the new Shadow2 header, - * providing names that must be defined in any shadow implementation. */ - -#include <asm/shadow2.h> +#include <public/domctl.h> +#include <xen/sched.h> +#include <xen/perfc.h> +#include <asm/flushtlb.h> /* How to make sure a page is not referred to in a shadow PT */ /* This will need to be a for_each_vcpu if we go to per-vcpu shadows */ #define shadow_drop_references(_d, _p) \ - shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) + shadow_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) #define shadow_sync_and_drop_references(_d, _p) \ - shadow2_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) - -/* Whether we are translating the domain's frame numbers for it */ -#define shadow_mode_translate(d) shadow2_mode_translate(d) - -/* ...and if so, how to add and remove entries in the mapping */ + shadow_remove_all_mappings((_d)->vcpu[0], _mfn(page_to_mfn(_p))) + +/* How to add and remove entries in the p2m mapping. */ #define guest_physmap_add_page(_d, _p, _m) \ - shadow2_guest_physmap_add_page((_d), (_p), (_m)) + shadow_guest_physmap_add_page((_d), (_p), (_m)) #define guest_physmap_remove_page(_d, _p, _m ) \ - shadow2_guest_physmap_remove_page((_d), (_p), (_m)) + shadow_guest_physmap_remove_page((_d), (_p), (_m)) + +/* Shadow PT operation mode : shadow-mode variable in arch_domain. 
*/ + +#define SHM2_shift 10 +/* We're in one of the shadow modes */ +#define SHM2_enable (1U << SHM2_shift) +/* Refcounts based on shadow tables instead of guest tables */ +#define SHM2_refcounts (XEN_DOMCTL_SHADOW_ENABLE_REFCOUNT << SHM2_shift) +/* Enable log dirty mode */ +#define SHM2_log_dirty (XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY << SHM2_shift) +/* Xen does p2m translation, not guest */ +#define SHM2_translate (XEN_DOMCTL_SHADOW_ENABLE_TRANSLATE << SHM2_shift) +/* Xen does not steal address space from the domain for its own booking; + * requires VT or similar mechanisms */ +#define SHM2_external (XEN_DOMCTL_SHADOW_ENABLE_EXTERNAL << SHM2_shift) + +#define shadow_mode_enabled(_d) ((_d)->arch.shadow.mode) +#define shadow_mode_refcounts(_d) ((_d)->arch.shadow.mode & SHM2_refcounts) +#define shadow_mode_log_dirty(_d) ((_d)->arch.shadow.mode & SHM2_log_dirty) +#define shadow_mode_translate(_d) ((_d)->arch.shadow.mode & SHM2_translate) +#define shadow_mode_external(_d) ((_d)->arch.shadow.mode & SHM2_external) + +/* Xen traps & emulates all reads of all page table pages: + *not yet supported + */ +#define shadow_mode_trap_reads(_d) ({ (void)(_d); 0; }) + +// flags used in the return value of the shadow_set_lXe() functions... +#define SHADOW_SET_CHANGED 0x1 +#define SHADOW_SET_FLUSH 0x2 +#define SHADOW_SET_ERROR 0x4 +#define SHADOW_SET_L3PAE_RECOPY 0x8 + +// How do we tell that we have a 32-bit PV guest in a 64-bit Xen? +#ifdef __x86_64__ +#define pv_32bit_guest(_v) 0 // not yet supported +#else +#define pv_32bit_guest(_v) !hvm_guest(v) +#endif + +/* The shadow lock. + * + * This lock is per-domain. It is intended to allow us to make atomic + * updates to the software TLB that the shadow tables provide. + * + * Specifically, it protects: + * - all changes to shadow page table pages + * - the shadow hash table + * - the shadow page allocator + * - all changes to guest page table pages; if/when the notion of + * out-of-sync pages is added to this code, then the shadow lock is + * protecting all guest page table pages which are not listed as + * currently as both guest-writable and out-of-sync... + * XXX -- need to think about this relative to writable page tables. + * - all changes to the page_info->tlbflush_timestamp + * - the page_info->count fields on shadow pages + * - the shadow dirty bit array and count + * - XXX + */ +#ifndef CONFIG_SMP +#error shadow.h currently requires CONFIG_SMP +#endif + +#define shadow_lock_init(_d) \ + do { \ + spin_lock_init(&(_d)->arch.shadow.lock); \ + (_d)->arch.shadow.locker = -1; \ + (_d)->arch.shadow.locker_function = "nobody"; \ + } while (0) + +#define shadow_lock_is_acquired(_d) \ + (current->processor == (_d)->arch.shadow.locker) + +#define shadow_lock(_d) \ + do { \ + if ( unlikely((_d)->arch.shadow.locker == current->processor) ) \ + { \ + printk("Error: shadow lock held by %s\n", \ + (_d)->arch.shadow.locker_function); \ + BUG(); \ + } \ + spin_lock(&(_d)->arch.shadow.lock); \ + ASSERT((_d)->arch.shadow.locker == -1); \ + (_d)->arch.shadow.locker = current->processor; \ + (_d)->arch.shadow.locker_function = __func__; \ + } while (0) + +#define shadow_unlock(_d) \ + do { \ + ASSERT((_d)->arch.shadow.locker == current->processor); \ + (_d)->arch.shadow.locker = -1; \ + (_d)->arch.shadow.locker_function = "nobody"; \ + spin_unlock(&(_d)->arch.shadow.lock); \ + } while (0) + +/* + * Levels of self-test and paranoia + * XXX should go in config files somewhere? 
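/*
 * Standalone sketch, not part of the changeset, of the locking pattern
 * used by shadow_lock()/shadow_unlock() above: a plain lock plus an
 * owner field and the name of the acquiring function, so a recursive
 * acquisition can be reported instead of silently deadlocking.  The
 * pthread types and the tracked_lock names are stand-ins invented for
 * this illustration; as in the original, the recursion check is only
 * meaningful when performed by the current lock holder.
 */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct tracked_lock {
    pthread_mutex_t lock;
    pthread_t       locker;            /* valid only while held */
    int             held;
    const char     *locker_function;
};

#define tracked_lock_init(l) do {                                     \
        pthread_mutex_init(&(l)->lock, NULL);                         \
        (l)->held = 0;                                                \
        (l)->locker_function = "nobody";                              \
    } while (0)

#define tracked_lock_acquire(l) do {                                  \
        if ((l)->held && pthread_equal((l)->locker, pthread_self())) {\
            fprintf(stderr, "Error: lock held by %s\n",               \
                    (l)->locker_function);                            \
            abort();                                                  \
        }                                                             \
        pthread_mutex_lock(&(l)->lock);                               \
        (l)->locker = pthread_self();                                 \
        (l)->held = 1;                                                \
        (l)->locker_function = __func__;                              \
    } while (0)

#define tracked_lock_release(l) do {                                  \
        assert((l)->held &&                                           \
               pthread_equal((l)->locker, pthread_self()));           \
        (l)->held = 0;                                                \
        (l)->locker_function = "nobody";                              \
        pthread_mutex_unlock(&(l)->lock);                             \
    } while (0)

int main(void)
{
    struct tracked_lock l;
    tracked_lock_init(&l);
    tracked_lock_acquire(&l);     /* first acquire succeeds            */
    tracked_lock_release(&l);     /* release restores the "nobody" tag */
    printf("locker_function=%s\n", l.locker_function);
    return 0;
}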
+ */ +#define SHADOW_AUDIT_HASH 0x01 /* Check current hash bucket */ +#define SHADOW_AUDIT_HASH_FULL 0x02 /* Check every hash bucket */ +#define SHADOW_AUDIT_ENTRIES 0x04 /* Check this walk's shadows */ +#define SHADOW_AUDIT_ENTRIES_FULL 0x08 /* Check every shadow */ +#define SHADOW_AUDIT_ENTRIES_MFNS 0x10 /* Check gfn-mfn map in shadows */ +#define SHADOW_AUDIT_P2M 0x20 /* Check the p2m table */ + +#ifdef NDEBUG +#define SHADOW_AUDIT 0 +#define SHADOW_AUDIT_ENABLE 0 +#else +#define SHADOW_AUDIT 0x15 /* Basic audit of all except p2m. */ +#define SHADOW_AUDIT_ENABLE shadow_audit_enable +extern int shadow_audit_enable; +#endif + +/* + * Levels of optimization + * XXX should go in config files somewhere? + */ +#define SHOPT_WRITABLE_HEURISTIC 0x01 /* Guess at RW PTEs via linear maps */ +#define SHOPT_EARLY_UNSHADOW 0x02 /* Unshadow l1s on fork or exit */ + +#define SHADOW_OPTIMIZATIONS 0x03 + + +/* With shadow pagetables, the different kinds of address start + * to get get confusing. + * + * Virtual addresses are what they usually are: the addresses that are used + * to accessing memory while the guest is running. The MMU translates from + * virtual addresses to machine addresses. + * + * (Pseudo-)physical addresses are the abstraction of physical memory the + * guest uses for allocation and so forth. For the purposes of this code, + * we can largely ignore them. + * + * Guest frame numbers (gfns) are the entries that the guest puts in its + * pagetables. For normal paravirtual guests, they are actual frame numbers, + * with the translation done by the guest. + * + * Machine frame numbers (mfns) are the entries that the hypervisor puts + * in the shadow page tables. + * + * Elsewhere in the xen code base, the name "gmfn" is generally used to refer + * to a "machine frame number, from the guest's perspective", or in other + * words, pseudo-physical frame numbers. However, in the shadow code, the + * term "gmfn" means "the mfn of a guest page"; this combines naturally with + * other terms such as "smfn" (the mfn of a shadow page), gl2mfn (the mfn of a + * guest L2 page), etc... + */ + +/* With this defined, we do some ugly things to force the compiler to + * give us type safety between mfns and gfns and other integers. + * TYPE_SAFE(int foo) defines a foo_t, and _foo() and foo_x() functions + * that translate beween int and foo_t. + * + * It does have some performance cost because the types now have + * a different storage attribute, so may not want it on all the time. */ +#ifndef NDEBUG +#define TYPE_SAFETY 1 +#endif + +#ifdef TYPE_SAFETY +#define TYPE_SAFE(_type,_name) \ +typedef struct { _type _name; } _name##_t; \ +static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \ +static inline _type _name##_x(_name##_t n) { return n._name; } +#else +#define TYPE_SAFE(_type,_name) \ +typedef _type _name##_t; \ +static inline _name##_t _##_name(_type n) { return n; } \ +static inline _type _name##_x(_name##_t n) { return n; } +#endif + +TYPE_SAFE(unsigned long,mfn) +#define SH_PRI_mfn "05lx" + +static inline int +valid_mfn(mfn_t m) +{ + return VALID_MFN(mfn_x(m)); +} + +static inline mfn_t +pagetable_get_mfn(pagetable_t pt) +{ + return _mfn(pagetable_get_pfn(pt)); +} + +static inline pagetable_t +pagetable_from_mfn(mfn_t mfn) +{ + return pagetable_from_pfn(mfn_x(mfn)); +} + +static inline int +shadow_vcpu_mode_translate(struct vcpu *v) +{ + // Returns true if this VCPU needs to be using the P2M table to translate + // between GFNs and MFNs. 
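/*
 * Standalone illustration, not part of the changeset, of the TYPE_SAFE
 * trick defined above: with TYPE_SAFETY an mfn becomes a one-member
 * struct, so mixing raw longs and mfn_t values is a compile-time error;
 * without it, the wrappers compile away to nothing.  The macro body is
 * copied from the hunk above; the main() usage is illustrative only.
 */
#include <stdio.h>

#define TYPE_SAFETY 1

#ifdef TYPE_SAFETY
#define TYPE_SAFE(_type,_name)                                          \
typedef struct { _type _name; } _name##_t;                              \
static inline _name##_t _##_name(_type n) { return (_name##_t) { n }; } \
static inline _type _name##_x(_name##_t n) { return n._name; }
#else
#define TYPE_SAFE(_type,_name)                                          \
typedef _type _name##_t;                                                \
static inline _name##_t _##_name(_type n) { return n; }                 \
static inline _type _name##_x(_name##_t n) { return n; }
#endif

TYPE_SAFE(unsigned long, mfn)    /* gives mfn_t, _mfn() and mfn_x() */

int main(void)
{
    mfn_t m = _mfn(0x1234ul);        /* wrap a raw frame number           */
    unsigned long raw = mfn_x(m);    /* unwrap it again                   */
    /* unsigned long oops = m;          would not compile: type mismatch  */
    printf("mfn = %#lx\n", raw);
    return 0;
}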
+ // + // This is true of translated HVM domains on a vcpu which has paging + // enabled. (HVM vcpu's with paging disabled are using the p2m table as + // its paging table, so no translation occurs in this case.) + // + return v->arch.shadow.hvm_paging_enabled; +} + + +/**************************************************************************/ +/* Mode-specific entry points into the shadow code */ + +struct x86_emulate_ctxt; +struct shadow_paging_mode { + int (*page_fault )(struct vcpu *v, unsigned long va, + struct cpu_user_regs *regs); + int (*invlpg )(struct vcpu *v, unsigned long va); + unsigned long (*gva_to_gpa )(struct vcpu *v, unsigned long va); + unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va); + void (*update_cr3 )(struct vcpu *v); + int (*map_and_validate_gl1e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl2e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl2he)(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl3e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + int (*map_and_validate_gl4e )(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry, u32 size); + void (*detach_old_tables )(struct vcpu *v); + int (*x86_emulate_write )(struct vcpu *v, unsigned long va, + void *src, u32 bytes, + struct x86_emulate_ctxt *ctxt); + int (*x86_emulate_cmpxchg )(struct vcpu *v, unsigned long va, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt); + int (*x86_emulate_cmpxchg8b )(struct vcpu *v, unsigned long va, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt *ctxt); + mfn_t (*make_monitor_table )(struct vcpu *v); + void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn); +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC + int (*guess_wrmap )(struct vcpu *v, + unsigned long vaddr, mfn_t gmfn); +#endif + /* For outsiders to tell what mode we're in */ + unsigned int shadow_levels; + unsigned int guest_levels; +}; + +static inline int shadow_guest_paging_levels(struct vcpu *v) +{ + ASSERT(v->arch.shadow.mode != NULL); + return v->arch.shadow.mode->guest_levels; +} + +/**************************************************************************/ +/* Entry points into the shadow code */ + +/* Turning on shadow test mode */ +int shadow_test_enable(struct domain *d); + +/* Handler for shadow control ops: enabling and disabling shadow modes, + * and log-dirty bitmap ops all happen through here. 
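/*
 * Standalone sketch, not part of the changeset, of the dispatch pattern
 * behind struct shadow_paging_mode above: each paging mode supplies a
 * table of function pointers, a vcpu carries a pointer to its current
 * table, and callers indirect through it.  The toy_vcpu/toy_paging_mode
 * names and both translations are invented for illustration.
 */
#include <stdio.h>

struct toy_vcpu;

struct toy_paging_mode {
    unsigned long (*gva_to_gfn)(struct toy_vcpu *v, unsigned long va);
    unsigned int guest_levels;
};

struct toy_vcpu {
    const struct toy_paging_mode *mode;
};

static unsigned long gva_to_gfn_2level(struct toy_vcpu *v, unsigned long va)
{ (void)v; return va >> 12; }              /* pretend 4k pages, flat map */

static unsigned long gva_to_gfn_3level(struct toy_vcpu *v, unsigned long va)
{ (void)v; return (va >> 12) | 0x100000; } /* some other translation     */

static const struct toy_paging_mode mode_2 = { gva_to_gfn_2level, 2 };
static const struct toy_paging_mode mode_3 = { gva_to_gfn_3level, 3 };

int main(void)
{
    struct toy_vcpu v = { &mode_2 };
    printf("2-level: gfn=%#lx\n", v.mode->gva_to_gfn(&v, 0x8048000ul));

    v.mode = &mode_3;   /* analogous to switching modes at run time */
    printf("3-level: gfn=%#lx\n", v.mode->gva_to_gfn(&v, 0x8048000ul));
    return 0;
}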
*/ +int shadow_domctl(struct domain *d, + xen_domctl_shadow_op_t *sc, + XEN_GUEST_HANDLE(xen_domctl_t) u_domctl); + +/* Call when destroying a domain */ +void shadow_teardown(struct domain *d); + +/* Call once all of the references to the domain have gone away */ +void shadow_final_teardown(struct domain *d); + + +/* Mark a page as dirty in the bitmap */ +void sh_do_mark_dirty(struct domain *d, mfn_t gmfn); +static inline void mark_dirty(struct domain *d, unsigned long gmfn) +{ + if ( shadow_mode_log_dirty(d) ) + { + shadow_lock(d); + sh_do_mark_dirty(d, _mfn(gmfn)); + shadow_unlock(d); + } +} + +/* Internal version, for when the shadow lock is already held */ +static inline void sh_mark_dirty(struct domain *d, mfn_t gmfn) +{ + ASSERT(shadow_lock_is_acquired(d)); + if ( shadow_mode_log_dirty(d) ) + sh_do_mark_dirty(d, gmfn); +} + +static inline int +shadow_fault(unsigned long va, struct cpu_user_regs *regs) +/* Called from pagefault handler in Xen, and from the HVM trap handlers + * for pagefaults. Returns 1 if this fault was an artefact of the + * shadow code (and the guest should retry) or 0 if it is not (and the + * fault should be handled elsewhere or passed to the guest). */ +{ + struct vcpu *v = current; + perfc_incrc(shadow_fault); + return v->arch.shadow.mode->page_fault(v, va, regs); +} + +static inline int +shadow_invlpg(struct vcpu *v, unsigned long va) +/* Called when the guest requests an invlpg. Returns 1 if the invlpg + * instruction should be issued on the hardware, or 0 if it's safe not + * to do so. */ +{ + return v->arch.shadow.mode->invlpg(v, va); +} + +static inline unsigned long +shadow_gva_to_gpa(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + return v->arch.shadow.mode->gva_to_gpa(v, va); +} + +static inline unsigned long +shadow_gva_to_gfn(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + return v->arch.shadow.mode->gva_to_gfn(v, va); +} + +static inline void +shadow_update_cr3(struct vcpu *v) +/* Updates all the things that are derived from the guest's CR3. + * Called when the guest changes CR3. */ +{ + shadow_lock(v->domain); + v->arch.shadow.mode->update_cr3(v); + shadow_unlock(v->domain); +} + + +/* Should be called after CR3 is updated. + * Updates vcpu->arch.cr3 and, for HVM guests, vcpu->arch.hvm_vcpu.cpu_cr3. + * + * Also updates other state derived from CR3 (vcpu->arch.guest_vtable, + * shadow_vtable, etc). + * + * Uses values found in vcpu->arch.(guest_table and guest_table_user), and + * for HVM guests, arch.monitor_table and hvm's guest CR3. + * + * Update ref counts to shadow tables appropriately. + * For PAE, relocate L3 entries, if necessary, into low memory. + */ +static inline void update_cr3(struct vcpu *v) +{ + unsigned long cr3_mfn=0; + + if ( shadow_mode_enabled(v->domain) ) + { + shadow_update_cr3(v); + return; + } + +#if CONFIG_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + cr3_mfn = pagetable_get_pfn(v->arch.guest_table_user); + else +#endif + cr3_mfn = pagetable_get_pfn(v->arch.guest_table); + + make_cr3(v, cr3_mfn); +} + +extern void sh_update_paging_modes(struct vcpu *v); + +/* Should be called to initialise paging structures if the paging mode + * has changed, and when bringing up a VCPU for the first time. 
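/*
 * Standalone sketch, not part of the changeset, of the log-dirty
 * bookkeeping that mark_dirty()/sh_do_mark_dirty() above feed: one bit
 * per guest frame, set on write, read and cleared later by the
 * toolstack (for example during live migration).  The bitmap size,
 * helper names and frame numbers are arbitrary illustration values.
 */
#include <stdio.h>
#include <string.h>

#define NR_FRAMES     1024
#define BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long dirty_bitmap[NR_FRAMES / BITS_PER_LONG];

static void mark_frame_dirty(unsigned long pfn)
{
    if (pfn < NR_FRAMES)
        dirty_bitmap[pfn / BITS_PER_LONG] |= 1ul << (pfn % BITS_PER_LONG);
}

static int test_and_clear_dirty(unsigned long pfn)
{
    unsigned long bit = 1ul << (pfn % BITS_PER_LONG);
    int was_dirty = !!(dirty_bitmap[pfn / BITS_PER_LONG] & bit);
    dirty_bitmap[pfn / BITS_PER_LONG] &= ~bit;
    return was_dirty;
}

int main(void)
{
    memset(dirty_bitmap, 0, sizeof(dirty_bitmap));
    mark_frame_dirty(7);
    mark_frame_dirty(700);
    printf("pfn 7: %d, pfn 8: %d, pfn 700: %d\n",
           test_and_clear_dirty(7), test_and_clear_dirty(8),
           test_and_clear_dirty(700));   /* prints 1, 0, 1 */
    return 0;
}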
*/ +static inline void shadow_update_paging_modes(struct vcpu *v) +{ + ASSERT(shadow_mode_enabled(v->domain)); + shadow_lock(v->domain); + sh_update_paging_modes(v); + shadow_unlock(v->domain); +} + +static inline void +shadow_detach_old_tables(struct vcpu *v) +{ + if ( v->arch.shadow.mode ) + v->arch.shadow.mode->detach_old_tables(v); +} + +static inline mfn_t +shadow_make_monitor_table(struct vcpu *v) +{ + return v->arch.shadow.mode->make_monitor_table(v); +} + +static inline void +shadow_destroy_monitor_table(struct vcpu *v, mfn_t mmfn) +{ + v->arch.shadow.mode->destroy_monitor_table(v, mmfn); +} + +/* Validate a pagetable change from the guest and update the shadows. */ +extern int shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, + void *new_guest_entry); + +/* Update the shadows in response to a pagetable write from a HVM guest */ +extern void shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size); + +/* Remove all writeable mappings of a guest frame from the shadows. + * Returns non-zero if we need to flush TLBs. + * level and fault_addr desribe how we found this to be a pagetable; + * level==0 means we have some other reason for revoking write access. */ +extern int shadow_remove_write_access(struct vcpu *v, mfn_t readonly_mfn, + unsigned int level, + unsigned long fault_addr); + +/* Remove all mappings of the guest mfn from the shadows. + * Returns non-zero if we need to flush TLBs. */ +extern int shadow_remove_all_mappings(struct vcpu *v, mfn_t target_mfn); + +void +shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn); +/* This is a HVM page that we thing is no longer a pagetable. + * Unshadow it, and recursively unshadow pages that reference it. */ + +/* Remove all shadows of the guest mfn. */ +extern void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all); +static inline void shadow_remove_all_shadows(struct vcpu *v, mfn_t gmfn) +{ + sh_remove_shadows(v, gmfn, 1); +} + +/* Add a page to a domain */ +void +shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn, + unsigned long mfn); + +/* Remove a page from a domain */ +void +shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn); + +/* + * Definitions for the shadow_flags field in page_info. + * These flags are stored on *guest* pages... + * Bits 1-13 are encodings for the shadow types. 
+ */ +#define PGC_SH_type_to_index(_type) ((_type) >> PGC_SH_type_shift) +#define SHF_page_type_mask \ + (((1u << (PGC_SH_type_to_index(PGC_SH_max_shadow) + 1u)) - 1u) - \ + ((1u << PGC_SH_type_to_index(PGC_SH_min_shadow)) - 1u)) + +#define SHF_L1_32 (1u << PGC_SH_type_to_index(PGC_SH_l1_32_shadow)) +#define SHF_FL1_32 (1u << PGC_SH_type_to_index(PGC_SH_fl1_32_shadow)) +#define SHF_L2_32 (1u << PGC_SH_type_to_index(PGC_SH_l2_32_shadow)) +#define SHF_L1_PAE (1u << PGC_SH_type_to_index(PGC_SH_l1_pae_shadow)) +#define SHF_FL1_PAE (1u << PGC_SH_type_to_index(PGC_SH_fl1_pae_shadow)) +#define SHF_L2_PAE (1u << PGC_SH_type_to_index(PGC_SH_l2_pae_shadow)) +#define SHF_L2H_PAE (1u << PGC_SH_type_to_index(PGC_SH_l2h_pae_shadow)) +#define SHF_L3_PAE (1u << PGC_SH_type_to_index(PGC_SH_l3_pae_shadow)) +#define SHF_L1_64 (1u << PGC_SH_type_to_index(PGC_SH_l1_64_shadow)) +#define SHF_FL1_64 (1u << PGC_SH_type_to_index(PGC_SH_fl1_64_shadow)) +#define SHF_L2_64 (1u << PGC_SH_type_to_index(PGC_SH_l2_64_shadow)) +#define SHF_L3_64 (1u << PGC_SH_type_to_index(PGC_SH_l3_64_shadow)) +#define SHF_L4_64 (1u << PGC_SH_type_to_index(PGC_SH_l4_64_shadow)) + +/* Used for hysteresis when automatically unhooking mappings on fork/exit */ +#define SHF_unhooked_mappings (1u<<31) + +/* + * Allocation of shadow pages + */ + +/* Return the minumum acceptable number of shadow pages a domain needs */ +unsigned int shadow_min_acceptable_pages(struct domain *d); + +/* Set the pool of shadow pages to the required number of MB. + * Input will be rounded up to at least min_acceptable_shadow_pages(). + * Returns 0 for success, 1 for failure. */ +unsigned int shadow_set_allocation(struct domain *d, + unsigned int megabytes, + int *preempted); + +/* Return the size of the shadow pool, rounded up to the nearest MB */ +static inline unsigned int shadow_get_allocation(struct domain *d) +{ + unsigned int pg = d->arch.shadow.total_pages; + return ((pg >> (20 - PAGE_SHIFT)) + + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0)); +} + +/* + * Linked list for chaining entries in the shadow hash table. + */ +struct shadow_hash_entry { + struct shadow_hash_entry *next; + mfn_t smfn; /* MFN of the shadow */ +#ifdef _x86_64_ /* Shorten 'n' so we don't waste a whole word on storing 't' */ + unsigned long n:56; /* MFN of guest PT or GFN of guest superpage */ +#else + unsigned long n; /* MFN of guest PT or GFN of guest superpage */ +#endif + unsigned char t; /* shadow type bits, or 0 for empty */ +}; + +#define SHADOW_HASH_BUCKETS 251 +/* Other possibly useful primes are 509, 1021, 2039, 4093, 8191, 16381 */ + + +#if SHADOW_OPTIMIZATIONS & SHOPT_CACHE_WALKS +/* Optimization: cache the results of guest walks. This helps with MMIO + * and emulated writes, which tend to issue very similar walk requests + * repeatedly. We keep the results of the last few walks, and blow + * away the cache on guest cr3 write, mode change, or page fault. 
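/*
 * Standalone check, not part of the changeset, of the SHF_* encoding
 * introduced above: each shadow type's 4-bit count_info code becomes a
 * single bit (bits 1..13) in the guest page's shadow_flags, and
 * SHF_page_type_mask covers exactly those bits.  The constant values
 * and macro bodies are copied from the hunks above; main() only
 * verifies what they evaluate to.
 */
#include <assert.h>
#include <stdio.h>

#define PGC_SH_type_shift    28
#define PGC_SH_min_shadow    (1u  << 28)
#define PGC_SH_max_shadow    (13u << 28)
#define PGC_SH_l2_pae_shadow (6u  << 28)

#define PGC_SH_type_to_index(_type) ((_type) >> PGC_SH_type_shift)
#define SHF_page_type_mask                                              \
    (((1u << (PGC_SH_type_to_index(PGC_SH_max_shadow) + 1u)) - 1u) -    \
     ((1u << PGC_SH_type_to_index(PGC_SH_min_shadow)) - 1u))
#define SHF_L2_PAE (1u << PGC_SH_type_to_index(PGC_SH_l2_pae_shadow))

int main(void)
{
    assert(SHF_page_type_mask == 0x3ffeu);   /* bits 1..13 set */
    assert(SHF_L2_PAE == (1u << 6));
    assert(SHF_L2_PAE & SHF_page_type_mask);
    printf("SHF_page_type_mask=%#x SHF_L2_PAE=%#x\n",
           SHF_page_type_mask, SHF_L2_PAE);
    return 0;
}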
*/ + +#define SH_WALK_CACHE_ENTRIES 4 + +/* Rather than cache a guest walk, which would include mapped pointers + * to pages, we cache what a TLB would remember about the walk: the + * permissions and the l1 gfn */ +struct shadow_walk_cache { + unsigned long va; /* The virtual address (or 0 == unused) */ + unsigned long gfn; /* The gfn from the effective l1e */ + u32 permissions; /* The aggregated permission bits */ +}; +#endif + + +/**************************************************************************/ +/* Guest physmap (p2m) support */ + +/* Walk another domain's P2M table, mapping pages as we go */ +extern mfn_t +sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn); + + +/* General conversion function from gfn to mfn */ +static inline mfn_t +sh_gfn_to_mfn(struct domain *d, unsigned long gfn) +{ + if ( !shadow_mode_translate(d) ) + return _mfn(gfn); + else if ( likely(current->domain == d) ) + return _mfn(get_mfn_from_gpfn(gfn)); + else + return sh_gfn_to_mfn_foreign(d, gfn); +} + +// vcpu-specific version of gfn_to_mfn(). This is where we hide the dirty +// little secret that, for hvm guests with paging disabled, nearly all of the +// shadow code actually think that the guest is running on *untranslated* page +// tables (which is actually domain->phys_table). +// +static inline mfn_t +sh_vcpu_gfn_to_mfn(struct vcpu *v, unsigned long gfn) +{ + if ( !shadow_vcpu_mode_translate(v) ) + return _mfn(gfn); + if ( likely(current->domain == v->domain) ) + return _mfn(get_mfn_from_gpfn(gfn)); + return sh_gfn_to_mfn_foreign(v->domain, gfn); +} + +static inline unsigned long +sh_mfn_to_gfn(struct domain *d, mfn_t mfn) +{ + if ( shadow_mode_translate(d) ) + return get_gpfn_from_mfn(mfn_x(mfn)); + else + return mfn_x(mfn); +} + + #endif /* _XEN_SHADOW_H */ @@ -49,7 +633,7 @@ * mode: C * c-set-style: "BSD" * c-basic-offset: 4 - * tab-width: 4 * indent-tabs-mode: nil * End: */ + diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/mm/Makefile Mon Aug 28 16:26:37 2006 -0600 @@ -0,0 +1,1 @@ +subdir-y += shadow diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm/shadow/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/mm/shadow/Makefile Mon Aug 28 16:26:37 2006 -0600 @@ -0,0 +1,15 @@ +ifneq ($(pae),n) +obj-$(x86_32) += common.o g2_on_s3.o g3_on_s3.o +else +obj-$(x86_32) += common.o g2_on_s2.o +endif + +obj-$(x86_64) += common.o g4_on_s4.o g3_on_s3.o g2_on_s3.o + +guest_levels = $(subst g,,$(filter g%,$(subst ., ,$(subst _, ,$(1))))) +shadow_levels = $(subst s,,$(filter s%,$(subst ., ,$(subst _, ,$(1))))) +shadow_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) \ + -DSHADOW_PAGING_LEVELS=$(call shadow_levels,$(1)) + +g%.o: multi.c $(HDRS) Makefile + $(CC) $(CFLAGS) $(call shadow_defns,$(@F)) -c $< -o $@ diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm/shadow/common.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/mm/shadow/common.c Mon Aug 28 16:26:37 2006 -0600 @@ -0,0 +1,3407 @@ +/****************************************************************************** + * arch/x86/mm/shadow/common.c + * + * Shadow code that does not need to be multiply compiled. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define SHADOW 1 + +#include <xen/config.h> +#include <xen/types.h> +#include <xen/mm.h> +#include <xen/trace.h> +#include <xen/sched.h> +#include <xen/perfc.h> +#include <xen/irq.h> +#include <xen/domain_page.h> +#include <xen/guest_access.h> +#include <xen/keyhandler.h> +#include <asm/event.h> +#include <asm/page.h> +#include <asm/current.h> +#include <asm/flushtlb.h> +#include <asm/shadow.h> +#include "private.h" + +#if SHADOW_AUDIT +int shadow_audit_enable = 0; + +static void shadow_audit_key(unsigned char key) +{ + shadow_audit_enable = !shadow_audit_enable; + printk("%s shadow_audit_enable=%d\n", + __func__, shadow_audit_enable); +} + +static int __init shadow_audit_key_init(void) +{ + register_keyhandler( + 'O', shadow_audit_key, "toggle shadow audits"); + return 0; +} +__initcall(shadow_audit_key_init); +#endif /* SHADOW_AUDIT */ + +static void sh_free_log_dirty_bitmap(struct domain *d); + +int _shadow_mode_refcounts(struct domain *d) +{ + return shadow_mode_refcounts(d); +} + + +/**************************************************************************/ +/* x86 emulator support for the shadow code + */ + +static int +sh_x86_emulate_read_std(unsigned long addr, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; + if ( hvm_guest(v) ) + { + *val = 0; + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that is only a user vs supervisor access check. + // + if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) ) + { +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, + addr, *val, bytes); +#endif + return X86EMUL_CONTINUE; + } + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating. */ + SHADOW_PRINTK("read failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh_x86_emulate_write_std(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, val, bytes); +#endif + if ( hvm_guest(v) ) + { + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that includes user vs supervisor, and + // write access. + // + if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) ) + return X86EMUL_CONTINUE; + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. 
This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating, + * which should be handled by sh_x86_emulate_write_emulated. */ + SHADOW_PRINTK("write failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh_x86_emulate_write_emulated(unsigned long addr, + unsigned long val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, val, bytes); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt); + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh_x86_emulate_cmpxchg_emulated(unsigned long addr, + unsigned long old, + unsigned long new, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, addr, old, new, bytes); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new, + bytes, ctxt); + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + +static int +sh_x86_emulate_cmpxchg8b_emulated(unsigned long addr, + unsigned long old_lo, + unsigned long old_hi, + unsigned long new_lo, + unsigned long new_hi, + struct x86_emulate_ctxt *ctxt) +{ + struct vcpu *v = current; +#if 0 + SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx:%lx n:=%#lx:%lx\n", + v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo, + new_hi, new_lo, ctxt); +#endif + if ( hvm_guest(v) ) + { + return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi, + new_lo, new_hi, ctxt); + } + else + { + SHADOW_PRINTK("this operation is not emulated yet\n"); + return X86EMUL_UNHANDLEABLE; + } +} + + +struct x86_emulate_ops shadow_emulator_ops = { + .read_std = sh_x86_emulate_read_std, + .write_std = sh_x86_emulate_write_std, + .read_emulated = sh_x86_emulate_read_std, + .write_emulated = sh_x86_emulate_write_emulated, + .cmpxchg_emulated = sh_x86_emulate_cmpxchg_emulated, + .cmpxchg8b_emulated = sh_x86_emulate_cmpxchg8b_emulated, +}; + + +/**************************************************************************/ +/* Code for "promoting" a guest page to the point where the shadow code is + * willing to let it be treated as a guest page table. This generally + * involves making sure there are no writable mappings available to the guest + * for this page. + */ +void shadow_promote(struct vcpu *v, mfn_t gmfn, u32 type) +{ + struct page_info *page = mfn_to_page(gmfn); + unsigned long type_info; + + ASSERT(valid_mfn(gmfn)); + + /* We should never try to promote a gmfn that has writeable mappings */ + ASSERT(shadow_remove_write_access(v, gmfn, 0, 0) == 0); + + // Is the page already shadowed? + if ( !test_and_set_bit(_PGC_page_table, &page->count_info) ) + { + // No prior shadow exists... + + // Grab a type-ref. We don't really care if we are racing with another + // vcpu or not, or even what kind of type we get; we just want the type + // count to be > 0. 
+ // + do { + type_info = + page->u.inuse.type_info & (PGT_type_mask | PGT_va_mask); + } while ( !get_page_type(page, type_info) ); + + // Now that the type ref is non-zero, we can safely use the + // shadow_flags. + // + page->shadow_flags = 0; + } + + ASSERT(!test_bit(type >> PGC_SH_type_shift, &page->shadow_flags)); + set_bit(type >> PGC_SH_type_shift, &page->shadow_flags); +} + +void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type) +{ + struct page_info *page = mfn_to_page(gmfn); + + ASSERT(test_bit(_PGC_page_table, &page->count_info)); + ASSERT(test_bit(type >> PGC_SH_type_shift, &page->shadow_flags)); + + clear_bit(type >> PGC_SH_type_shift, &page->shadow_flags); + + if ( (page->shadow_flags & SHF_page_type_mask) == 0 ) + { + // release the extra type ref + put_page_type(page); + + // clear the is-a-page-table bit. + clear_bit(_PGC_page_table, &page->count_info); + } +} + +/**************************************************************************/ +/* Validate a pagetable change from the guest and update the shadows. + * Returns a bitmask of SHADOW_SET_* flags. */ + +static int +__shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size) +{ + int result = 0; + struct page_info *page = mfn_to_page(gmfn); + + sh_mark_dirty(v->domain, gmfn); + + // Determine which types of shadows are affected, and update each. + // + // Always validate L1s before L2s to prevent another cpu with a linear + // mapping of this gmfn from seeing a walk that results from + // using the new L2 value and the old L1 value. (It is OK for such a + // guest to see a walk that uses the old L2 value with the new L1 value, + // as hardware could behave this way if one level of the pagewalk occurs + // before the store, and the next level of the pagewalk occurs after the + // store. + // + // Ditto for L2s before L3s, etc. 
+ // + + if ( !(page->count_info & PGC_page_table) ) + return 0; /* Not shadowed at all */ + +#if CONFIG_PAGING_LEVELS == 2 + if ( page->shadow_flags & SHF_L1_32 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 2, 2) + (v, gmfn, entry, size); +#else + if ( page->shadow_flags & SHF_L1_32 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 2) + (v, gmfn, entry, size); +#endif + +#if CONFIG_PAGING_LEVELS == 2 + if ( page->shadow_flags & SHF_L2_32 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 2, 2) + (v, gmfn, entry, size); +#else + if ( page->shadow_flags & SHF_L2_32 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 2) + (v, gmfn, entry, size); +#endif + +#if CONFIG_PAGING_LEVELS >= 3 + if ( page->shadow_flags & SHF_L1_PAE ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L2_PAE ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L2H_PAE ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2he, 3, 3) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L3_PAE ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 3, 3) + (v, gmfn, entry, size); +#else /* 32-bit non-PAE hypervisor does not support PAE guests */ + ASSERT((page->shadow_flags & (SHF_L3_PAE|SHF_L2_PAE|SHF_L1_PAE)) == 0); +#endif + +#if CONFIG_PAGING_LEVELS >= 4 + if ( page->shadow_flags & SHF_L1_64 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl1e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L2_64 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl2e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L3_64 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl3e, 4, 4) + (v, gmfn, entry, size); + if ( page->shadow_flags & SHF_L4_64 ) + result |= SHADOW_INTERNAL_NAME(sh_map_and_validate_gl4e, 4, 4) + (v, gmfn, entry, size); +#else /* 32-bit/PAE hypervisor does not support 64-bit guests */ + ASSERT((page->shadow_flags + & (SHF_L4_64|SHF_L3_64|SHF_L2_64|SHF_L1_64)) == 0); +#endif + + return result; +} + + +int +shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry) +/* This is the entry point from hypercalls. It returns a bitmask of all the + * results of shadow_set_l*e() calls, so the caller knows to do TLB flushes. */ +{ + int rc; + + ASSERT(shadow_lock_is_acquired(v->domain)); + rc = __shadow_validate_guest_entry(v, gmfn, entry, sizeof(l1_pgentry_t)); + shadow_audit_tables(v); + return rc; +} + +void +shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size) +/* This is the entry point for emulated writes to pagetables in HVM guests */ +{ + struct domain *d = v->domain; + int rc; + + ASSERT(shadow_lock_is_acquired(v->domain)); + rc = __shadow_validate_guest_entry(v, gmfn, entry, size); + if ( rc & SHADOW_SET_FLUSH ) + { + // Flush everyone except the local processor, which will flush when it + // re-enters the HVM guest. + // + cpumask_t mask = d->domain_dirty_cpumask; + cpu_clear(v->processor, mask); + flush_tlb_mask(mask); + } + if ( rc & SHADOW_SET_ERROR ) + { + /* This page is probably not a pagetable any more: tear it out of the + * shadows, along with any tables that reference it */ + shadow_remove_all_shadows_and_parents(v, gmfn); + } + /* We ignore the other bits: since we are about to change CR3 on + * VMENTER we don't need to do any extra TLB flushes. 
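/*
 * Standalone sketch, not part of the changeset, of the "flush everyone
 * except the local processor" step in shadow_validate_guest_pt_write()
 * above, with a plain unsigned long standing in for cpumask_t.  The CPU
 * numbers and mask value are invented for illustration.
 */
#include <stdio.h>

int main(void)
{
    unsigned long domain_dirty_cpumask = 0x2dul;   /* cpus 0, 2, 3, 5 */
    unsigned int this_cpu = 2;

    unsigned long mask = domain_dirty_cpumask;
    mask &= ~(1ul << this_cpu);        /* like cpu_clear(v->processor, mask) */

    printf("would flush TLBs on cpus %#lx (skipping cpu %u)\n",
           mask, this_cpu);            /* prints 0x29 */
    return 0;
}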
*/ +} + + +/**************************************************************************/ +/* Memory management for shadow pages. */ + +/* Meaning of the count_info field in shadow pages + * ---------------------------------------------- + * + * A count of all references to this page from other shadow pages and + * guest CR3s (a.k.a. v->arch.shadow.table). + * + * The top bits hold the shadow type and the pinned bit. Top-level + * shadows are pinned so that they don't disappear when not in a CR3 + * somewhere. + * + * We don't need to use get|put_page for this as the updates are all + * protected by the shadow lock. We can't use get|put_page for this + * as the size of the count on shadow pages is different from that on + * normal guest pages. + */ + +/* Meaning of the type_info field in shadow pages + * ---------------------------------------------- + * + * type_info use depends on the shadow type (from count_info) + * + * PGC_SH_none : This page is in the shadow free pool. type_info holds + * the chunk order for our freelist allocator. + * + * PGC_SH_l*_shadow : This page is in use as a shadow. type_info + * holds the mfn of the guest page being shadowed, + * + * PGC_SH_fl1_*_shadow : This page is being used to shatter a superpage. + * type_info holds the gfn being shattered. + * + * PGC_SH_monitor_table : This page is part of a monitor table. + * type_info is not used. + */ + +/* Meaning of the _domain field in shadow pages + * -------------------------------------------- + * + * In shadow pages, this field will always have its least significant bit + * set. This ensures that all attempts to get_page() will fail (as all + * valid pickled domain pointers have a zero for their least significant bit). + * Instead, the remaining upper bits are used to record the shadow generation + * counter when the shadow was created. + */ + +/* Meaning of the shadow_flags field + * ---------------------------------- + * + * In guest pages that are shadowed, one bit for each kind of shadow they have. + * + * In shadow pages, will be used for holding a representation of the populated + * entries in this shadow (either a min/max, or a bitmap, or ...) + * + * In monitor-table pages, holds the level of the particular page (to save + * spilling the shadow types into an extra bit by having three types of monitor + * page). + */ + +/* Meaning of the list_head struct in shadow pages + * ----------------------------------------------- + * + * In free shadow pages, this is used to hold the free-lists of chunks. + * + * In top-level shadow tables, this holds a linked-list of all top-level + * shadows (used for recovering memory and destroying shadows). + * + * In lower-level shadows, this holds the physical address of a higher-level + * shadow entry that holds a reference to this shadow (or zero). + */ + +/* Allocating shadow pages + * ----------------------- + * + * Most shadow pages are allocated singly, but there are two cases where we + * need to allocate multiple pages together. + * + * 1: Shadowing 32-bit guest tables on PAE or 64-bit shadows. + * A 32-bit guest l1 table covers 4MB of virtuial address space, + * and needs to be shadowed by two PAE/64-bit l1 tables (covering 2MB + * of virtual address space each). Similarly, a 32-bit guest l2 table + * (4GB va) needs to be shadowed by four PAE/64-bit l2 tables (1GB va + * each). These multi-page shadows are contiguous and aligned; + * functions for handling offsets into them are defined in shadow.c + * (shadow_l1_index() etc.) 
+ * + * 2: Shadowing PAE top-level pages. Each guest page that contains + * any PAE top-level pages requires two shadow pages to shadow it. + * They contain alternating l3 tables and pae_l3_bookkeeping structs. + * + * This table shows the allocation behaviour of the different modes: + * + * Xen paging 32b pae pae 64b 64b 64b + * Guest paging 32b 32b pae 32b pae 64b + * PV or HVM * HVM * HVM HVM * + * Shadow paging 32b pae pae pae pae 64b + * + * sl1 size 4k 8k 4k 8k 4k 4k + * sl2 size 4k 16k 4k 16k 4k 4k + * sl3 size - - 8k - 8k 4k + * sl4 size - - - - - 4k + * + * We allocate memory from xen in four-page units and break them down + * with a simple buddy allocator. Can't use the xen allocator to handle + * this as it only works for contiguous zones, and a domain's shadow + * pool is made of fragments. + * + * In HVM guests, the p2m table is built out of shadow pages, and we provide + * a function for the p2m management to steal pages, in max-order chunks, from + * the free pool. We don't provide for giving them back, yet. + */ + +/* Figure out the least acceptable quantity of shadow memory. + * The minimum memory requirement for always being able to free up a + * chunk of memory is very small -- only three max-order chunks per + * vcpu to hold the top level shadows and pages with Xen mappings in them. + * + * But for a guest to be guaranteed to successfully execute a single + * instruction, we must be able to map a large number (about thirty) VAs + * at the same time, which means that to guarantee progress, we must + * allow for more than ninety allocated pages per vcpu. We round that + * up to 128 pages, or half a megabyte per vcpu. */ +unsigned int shadow_min_acceptable_pages(struct domain *d) +{ + u32 vcpu_count = 0; + struct vcpu *v; + + for_each_vcpu(d, v) + vcpu_count++; + + return (vcpu_count * 128); +} + +/* Using the type_info field to store freelist order */ +#define SH_PFN_ORDER(_p) ((_p)->u.inuse.type_info) +#define SH_SET_PFN_ORDER(_p, _o) \ + do { (_p)->u.inuse.type_info = (_o); } while (0) + + +/* Figure out the order of allocation needed for a given shadow type */ +static inline u32 +shadow_order(u32 shadow_type) +{ +#if CONFIG_PAGING_LEVELS > 2 + static const u32 type_to_order[16] = { + 0, /* PGC_SH_none */ + 1, /* PGC_SH_l1_32_shadow */ + 1, /* PGC_SH_fl1_32_shadow */ + 2, /* PGC_SH_l2_32_shadow */ + 0, /* PGC_SH_l1_pae_shadow */ + 0, /* PGC_SH_fl1_pae_shadow */ + 0, /* PGC_SH_l2_pae_shadow */ + 0, /* PGC_SH_l2h_pae_shadow */ + 1, /* PGC_SH_l3_pae_shadow */ + 0, /* PGC_SH_l1_64_shadow */ + 0, /* PGC_SH_fl1_64_shadow */ + 0, /* PGC_SH_l2_64_shadow */ + 0, /* PGC_SH_l3_64_shadow */ + 0, /* PGC_SH_l4_64_shadow */ + 2, /* PGC_SH_p2m_table */ + 0 /* PGC_SH_monitor_table */ + }; + u32 type = (shadow_type & PGC_SH_type_mask) >> PGC_SH_type_shift; + return type_to_order[type]; +#else /* 32-bit Xen only ever shadows 32-bit guests on 32-bit shadows. */ + return 0; +#endif +} + + +/* Do we have a free chunk of at least this order? 
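/*
 * Standalone arithmetic check, not part of the changeset, for the pool
 * sizing described above: 128 four-kilobyte pages per vcpu is half a
 * megabyte, and the rounding expression (taken from shadow_get_allocation()
 * earlier in this patch) rounds any partial megabyte up.  PAGE_SHIFT=12
 * is assumed, as on x86; the vcpu count is an arbitrary example.
 */
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned int pages_to_mb_rounded_up(unsigned int pg)
{
    return (pg >> (20 - PAGE_SHIFT)) +
           ((pg & ((1u << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0);
}

int main(void)
{
    unsigned int vcpus = 4;
    unsigned int min_pages = vcpus * 128;    /* as in the hunk above */

    printf("%u vcpus -> %u pages = %u KB minimum (512 KB per vcpu)\n",
           vcpus, min_pages, min_pages * 4);
    printf("%u pages -> %u MB (rounded up)\n",
           min_pages, pages_to_mb_rounded_up(min_pages));         /* 2 MB */
    printf("%u pages -> %u MB (rounded up)\n",
           min_pages + 1, pages_to_mb_rounded_up(min_pages + 1)); /* 3 MB */
    return 0;
}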
*/ +static inline int chunk_is_available(struct domain *d, int order) +{ + int i; + + for ( i = order; i <= SHADOW_MAX_ORDER; i++ ) + if ( !list_empty(&d->arch.shadow.freelists[i]) ) + return 1; + return 0; +} + +/* Dispatcher function: call the per-mode function that will unhook the + * non-Xen mappings in this top-level shadow mfn */ +void shadow_unhook_mappings(struct vcpu *v, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + switch ( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift ) + { + case PGC_SH_l2_32_shadow >> PGC_SH_type_shift: +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,2,2)(v,smfn); +#else + SHADOW_INTERNAL_NAME(sh_unhook_32b_mappings,3,2)(v,smfn); +#endif + break; +#if CONFIG_PAGING_LEVELS >= 3 + case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_unhook_pae_mappings,3,3)(v,smfn); + break; +#endif +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH_l4_64_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_unhook_64b_mappings,4,4)(v,smfn); + break; +#endif + default: + SHADOW_PRINTK("top-level shadow has bad type %08lx\n", + (unsigned long)((pg->count_info & PGC_SH_type_mask) + >> PGC_SH_type_shift)); + BUG(); + } +} + + +/* Make sure there is at least one chunk of the required order available + * in the shadow page pool. This must be called before any calls to + * shadow_alloc(). Since this will free existing shadows to make room, + * it must be called early enough to avoid freeing shadows that the + * caller is currently working on. */ +void shadow_prealloc(struct domain *d, unsigned int order) +{ + /* Need a vpcu for calling unpins; for now, since we don't have + * per-vcpu shadows, any will do */ + struct vcpu *v = d->vcpu[0]; + struct list_head *l, *t; + struct page_info *pg; + mfn_t smfn; + + if ( chunk_is_available(d, order) ) return; + + /* Stage one: walk the list of top-level pages, unpinning them */ + perfc_incrc(shadow_prealloc_1); + list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + smfn = page_to_mfn(pg); + +#if CONFIG_PAGING_LEVELS >= 3 + if ( (pg->count_info & PGC_SH_type_mask) == PGC_SH_l3_pae_shadow ) + { + /* For PAE, we need to unpin each subshadow on this shadow */ + SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn); + } + else +#endif /* 32-bit code always takes this branch */ + { + /* Unpin this top-level shadow */ + sh_unpin(v, smfn); + } + + /* See if that freed up a chunk of appropriate size */ + if ( chunk_is_available(d, order) ) return; + } + + /* Stage two: all shadow pages are in use in hierarchies that are + * loaded in cr3 on some vcpu. Walk them, unhooking the non-Xen + * mappings. */ + perfc_incrc(shadow_prealloc_2); + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + /* Walk the list from the tail: recently used toplevels have been pulled + * to the head */ + list_for_each_backwards_safe(l, t, &d->arch.shadow.toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + smfn = page_to_mfn(pg); + shadow_unhook_mappings(v, smfn); + + /* Need to flush TLB if we've altered our own tables */ + if ( !shadow_mode_external(d) + && pagetable_get_pfn(current->arch.shadow_table) == mfn_x(smfn) ) + local_flush_tlb(); + + /* See if that freed up a chunk of appropriate size */ + if ( chunk_is_available(d, order) ) return; + } + + /* Nothing more we can do: all remaining shadows are of pages that + * hold Xen mappings for some vcpu. This can never happen. 
*/ + SHADOW_PRINTK("Can't pre-allocate %i shadow pages!\n" + " shadow pages total = %u, free = %u, p2m=%u\n", + 1 << order, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + BUG(); +} + + +/* Allocate another shadow's worth of (contiguous, aligned) pages, + * and fill in the type and backpointer fields of their page_infos. + * Never fails to allocate. */ +mfn_t shadow_alloc(struct domain *d, + u32 shadow_type, + unsigned long backpointer) +{ + struct page_info *pg = NULL; + unsigned int order = shadow_order(shadow_type); + cpumask_t mask; + void *p; + int i; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(order <= SHADOW_MAX_ORDER); + ASSERT(shadow_type != PGC_SH_none); + perfc_incrc(shadow_alloc); + + /* Find smallest order which can satisfy the request. */ + for ( i = order; i <= SHADOW_MAX_ORDER; i++ ) + if ( !list_empty(&d->arch.shadow.freelists[i]) ) + { + pg = list_entry(d->arch.shadow.freelists[i].next, + struct page_info, list); + list_del(&pg->list); + + /* We may have to halve the chunk a number of times. */ + while ( i != order ) + { + i--; + SH_SET_PFN_ORDER(pg, i); + list_add_tail(&pg->list, &d->arch.shadow.freelists[i]); + pg += 1 << i; + } + d->arch.shadow.free_pages -= 1 << order; + + /* Init page info fields and clear the pages */ + for ( i = 0; i < 1<<order ; i++ ) + { + pg[i].u.inuse.type_info = backpointer; + pg[i].count_info = shadow_type; + pg[i].shadow_flags = 0; + INIT_LIST_HEAD(&pg[i].list); + /* Before we overwrite the old contents of this page, + * we need to be sure that no TLB holds a pointer to it. */ + mask = d->domain_dirty_cpumask; + tlbflush_filter(mask, pg[i].tlbflush_timestamp); + if ( unlikely(!cpus_empty(mask)) ) + { + perfc_incrc(shadow_alloc_tlbflush); + flush_tlb_mask(mask); + } + /* Now safe to clear the page for reuse */ + p = sh_map_domain_page(page_to_mfn(pg+i)); + ASSERT(p != NULL); + clear_page(p); + sh_unmap_domain_page(p); + perfc_incr(shadow_alloc_count); + } + return page_to_mfn(pg); + } + + /* If we get here, we failed to allocate. This should never happen. + * It means that we didn't call shadow_prealloc() correctly before + * we allocated. We can't recover by calling prealloc here, because + * we might free up higher-level pages that the caller is working on. */ + SHADOW_PRINTK("Can't allocate %i shadow pages!\n", 1 << order); + BUG(); +} + + +/* Return some shadow pages to the pool. */ +void shadow_free(struct domain *d, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + u32 shadow_type; + unsigned long order; + unsigned long mask; + int i; + + ASSERT(shadow_lock_is_acquired(d)); + perfc_incrc(shadow_free); + + shadow_type = pg->count_info & PGC_SH_type_mask; + ASSERT(shadow_type != PGC_SH_none); + ASSERT(shadow_type != PGC_SH_p2m_table); + order = shadow_order(shadow_type); + + d->arch.shadow.free_pages += 1 << order; + + for ( i = 0; i < 1<<order; i++ ) + { + /* Strip out the type: this is now a free shadow page */ + pg[i].count_info = 0; + /* Remember the TLB timestamp so we will know whether to flush + * TLBs when we reuse the page. Because the destructors leave the + * contents of the pages in place, we can delay TLB flushes until + * just before the allocator hands the page out again. */ + pg[i].tlbflush_timestamp = tlbflush_current_time(); + perfc_decr(shadow_alloc_count); + } + + /* Merge chunks as far as possible. */ + while ( order < SHADOW_MAX_ORDER ) + { + mask = 1 << order; + if ( (mfn_x(page_to_mfn(pg)) & mask) ) { + /* Merge with predecessor block? 
*/ + if ( (((pg-mask)->count_info & PGC_SH_type_mask) != PGT_none) + || (SH_PFN_ORDER(pg-mask) != order) ) + break; + list_del(&(pg-mask)->list); + pg -= mask; + } else { + /* Merge with successor block? */ + if ( (((pg+mask)->count_info & PGC_SH_type_mask) != PGT_none) + || (SH_PFN_ORDER(pg+mask) != order) ) + break; + list_del(&(pg+mask)->list); + } + order++; + } + + SH_SET_PFN_ORDER(pg, order); + list_add_tail(&pg->list, &d->arch.shadow.freelists[order]); +} + +/* Divert some memory from the pool to be used by the p2m mapping. + * This action is irreversible: the p2m mapping only ever grows. + * That's OK because the p2m table only exists for external domains, + * and those domains can't ever turn off shadow mode. + * Also, we only ever allocate a max-order chunk, so as to preserve + * the invariant that shadow_prealloc() always works. + * Returns 0 iff it can't get a chunk (the caller should then + * free up some pages in domheap and call set_sh_allocation); + * returns non-zero on success. + */ +static int +shadow_alloc_p2m_pages(struct domain *d) +{ + struct page_info *pg; + u32 i; + ASSERT(shadow_lock_is_acquired(d)); + + if ( d->arch.shadow.total_pages + < (shadow_min_acceptable_pages(d) + (1<<SHADOW_MAX_ORDER)) ) + return 0; /* Not enough shadow memory: need to increase it first */ + + pg = mfn_to_page(shadow_alloc(d, PGC_SH_p2m_table, 0)); + d->arch.shadow.p2m_pages += (1<<SHADOW_MAX_ORDER); + d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER); + for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++) + { + /* Unlike shadow pages, mark p2m pages as owned by the domain */ + page_set_owner(&pg[i], d); + list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist); + } + return 1; +} + +// Returns 0 if no memory is available... +mfn_t +shadow_alloc_p2m_page(struct domain *d) +{ + struct list_head *entry; + mfn_t mfn; + void *p; + + if ( list_empty(&d->arch.shadow.p2m_freelist) && + !shadow_alloc_p2m_pages(d) ) + return _mfn(0); + entry = d->arch.shadow.p2m_freelist.next; + list_del(entry); + list_add_tail(entry, &d->arch.shadow.p2m_inuse); + mfn = page_to_mfn(list_entry(entry, struct page_info, list)); + sh_get_ref(mfn, 0); + p = sh_map_domain_page(mfn); + clear_page(p); + sh_unmap_domain_page(p); + + return mfn; +} + +#if CONFIG_PAGING_LEVELS == 3 +static void p2m_install_entry_in_monitors(struct domain *d, + l3_pgentry_t *l3e) +/* Special case, only used for external-mode domains on PAE hosts: + * update the mapping of the p2m table. Once again, this is trivial in + * other paging modes (one top-level entry points to the top-level p2m, + * no maintenance needed), but PAE makes life difficult by needing a + * copy the eight l3es of the p2m table in eight l2h slots in the + * monitor table. This function makes fresh copies when a p2m l3e + * changes. 
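/*
 * Illustration only, not part of this changeset: the PAE-specific copy
 * described above (and implemented just below) has to know which of the
 * eight p2m l3 slots changed, and it derives that purely from the entry
 * pointer's byte offset within its page.  A minimal self-contained sketch
 * of that trick; PAGE_SIZE_SK and l3e_sk are stand-ins for the real
 * PAGE_SIZE and l3_pgentry_t.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_SK 4096u
typedef struct { uint64_t raw; } l3e_sk;

/* Recover an entry's slot number from its byte offset within the page. */
static unsigned int slot_from_entry_ptr(const l3e_sk *entry)
{
    uintptr_t off = (uintptr_t)entry & (PAGE_SIZE_SK - 1);
    return (unsigned int)(off / sizeof(l3e_sk));
}

int main(void)
{
    static l3e_sk table[8] __attribute__((aligned(4096)));
    printf("slot = %u\n", slot_from_entry_ptr(&table[5])); /* prints 5 */
    return 0;
}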
*/ +{ + l2_pgentry_t *ml2e; + struct vcpu *v; + unsigned int index; + + index = ((unsigned long)l3e & ~PAGE_MASK) / sizeof(l3_pgentry_t); + ASSERT(index < MACHPHYS_MBYTES>>1); + + for_each_vcpu(d, v) + { + if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) + continue; + ASSERT(shadow_mode_external(v->domain)); + + SHADOW_DEBUG(P2M, "d=%u v=%u index=%u mfn=%#lx\n", + d->domain_id, v->vcpu_id, index, l3e_get_pfn(*l3e)); + + if ( v == current ) /* OK to use linear map of monitor_table */ + ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START); + else + { + l3_pgentry_t *ml3e; + ml3e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT); + ml2e = sh_map_domain_page(_mfn(l3e_get_pfn(ml3e[3]))); + ml2e += l2_table_offset(RO_MPT_VIRT_START); + sh_unmap_domain_page(ml3e); + } + ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR); + if ( v != current ) + sh_unmap_domain_page(ml2e); + } +} +#endif + +// Find the next level's P2M entry, checking for out-of-range gfn's... +// Returns NULL on error. +// +static l1_pgentry_t * +p2m_find_entry(void *table, unsigned long *gfn_remainder, + unsigned long gfn, u32 shift, u32 max) +{ + u32 index; + + index = *gfn_remainder >> shift; + if ( index >= max ) + { + SHADOW_DEBUG(P2M, "gfn=0x%lx out of range " + "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n", + gfn, *gfn_remainder, shift, index, max); + return NULL; + } + *gfn_remainder &= (1 << shift) - 1; + return (l1_pgentry_t *)table + index; +} + +// Walk one level of the P2M table, allocating a new table if required. +// Returns 0 on error. +// +static int +p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, + unsigned long *gfn_remainder, unsigned long gfn, u32 shift, + u32 max, unsigned long type) +{ + l1_pgentry_t *p2m_entry; + void *next; + + if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, + shift, max)) ) + return 0; + + if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) ) + { + mfn_t mfn = shadow_alloc_p2m_page(d); + if ( mfn_x(mfn) == 0 ) + return 0; + *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); + mfn_to_page(mfn)->u.inuse.type_info = type | 1 | PGT_validated; + mfn_to_page(mfn)->count_info = 1; +#if CONFIG_PAGING_LEVELS == 3 + if (type == PGT_l2_page_table) + { + /* We have written to the p2m l3: need to sync the per-vcpu + * copies of it in the monitor tables */ + p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p2m_entry); + } +#endif + /* The P2M can be shadowed: keep the shadows synced */ + if ( d->vcpu[0] ) + (void)__shadow_validate_guest_entry(d->vcpu[0], *table_mfn, + p2m_entry, sizeof *p2m_entry); + } + *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); + next = sh_map_domain_page(*table_mfn); + sh_unmap_domain_page(*table); + *table = next; + + return 1; +} + +// Returns 0 on error (out of memory) +int +shadow_set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) +{ + // XXX -- this might be able to be faster iff current->domain == d + mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); + void *table = sh_map_domain_page(table_mfn); + unsigned long gfn_remainder = gfn; + l1_pgentry_t *p2m_entry; + +#if CONFIG_PAGING_LEVELS >= 4 + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L4_PAGETABLE_SHIFT - PAGE_SHIFT, + L4_PAGETABLE_ENTRIES, PGT_l3_page_table) ) + return 0; +#endif +#if CONFIG_PAGING_LEVELS >= 3 + // When using PAE Xen, we only allow 33 bits of pseudo-physical + // address in translated guests (i.e. 8 GBytes). 
This restriction + // comes from wanting to map the P2M table into the 16MB RO_MPT hole + // in Xen's address space for translated PV guests. + // + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L3_PAGETABLE_SHIFT - PAGE_SHIFT, + (CONFIG_PAGING_LEVELS == 3 + ? 8 + : L3_PAGETABLE_ENTRIES), + PGT_l2_page_table) ) + return 0; +#endif + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) + return 0; + + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + 0, L1_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + if ( valid_mfn(mfn) ) + *p2m_entry = l1e_from_pfn(mfn_x(mfn), __PAGE_HYPERVISOR|_PAGE_USER); + else + *p2m_entry = l1e_empty(); + + /* The P2M can be shadowed: keep the shadows synced */ + (void) __shadow_validate_guest_entry(d->vcpu[0], table_mfn, + p2m_entry, sizeof *p2m_entry); + + sh_unmap_domain_page(table); + + return 1; +} + +// Allocate a new p2m table for a domain. +// +// The structure of the p2m table is that of a pagetable for xen (i.e. it is +// controlled by CONFIG_PAGING_LEVELS). +// +// Returns 0 if p2m table could not be initialized +// +static int +shadow_alloc_p2m_table(struct domain *d) +{ + mfn_t p2m_top; + struct list_head *entry; + unsigned int page_count = 0; + + SHADOW_PRINTK("allocating p2m table\n"); + ASSERT(pagetable_get_pfn(d->arch.phys_table) == 0); + + p2m_top = shadow_alloc_p2m_page(d); + mfn_to_page(p2m_top)->count_info = 1; + mfn_to_page(p2m_top)->u.inuse.type_info = +#if CONFIG_PAGING_LEVELS == 4 + PGT_l4_page_table +#elif CONFIG_PAGING_LEVELS == 3 + PGT_l3_page_table +#elif CONFIG_PAGING_LEVELS == 2 + PGT_l2_page_table +#endif + | 1 | PGT_validated; + + if ( mfn_x(p2m_top) == 0 ) + return 0; + + d->arch.phys_table = pagetable_from_mfn(p2m_top); + + SHADOW_PRINTK("populating p2m table\n"); + + for ( entry = d->page_list.next; + entry != &d->page_list; + entry = entry->next ) + { + struct page_info *page = list_entry(entry, struct page_info, list); + mfn_t mfn = page_to_mfn(page); + unsigned long gfn = get_gpfn_from_mfn(mfn_x(mfn)); + page_count++; + if ( +#ifdef __x86_64__ + (gfn != 0x5555555555555555L) +#else + (gfn != 0x55555555L) +#endif + && gfn != INVALID_M2P_ENTRY + && !shadow_set_p2m_entry(d, gfn, mfn) ) + { + SHADOW_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" SH_PRI_mfn "\n", + gfn, mfn_x(mfn)); + return 0; + } + } + + SHADOW_PRINTK("p2m table initialised (%u pages)\n", page_count); + return 1; +} + +mfn_t +sh_gfn_to_mfn_foreign(struct domain *d, unsigned long gpfn) +/* Read another domain's p2m entries */ +{ + mfn_t mfn; + unsigned long addr = gpfn << PAGE_SHIFT; + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(shadow_mode_translate(d)); + mfn = pagetable_get_mfn(d->arch.phys_table); + + +#if CONFIG_PAGING_LEVELS > 2 + if ( gpfn > (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) + /* This pfn is higher than the p2m map can hold */ + return _mfn(INVALID_MFN); +#endif + + +#if CONFIG_PAGING_LEVELS >= 4 + { + l4_pgentry_t *l4e = sh_map_domain_page(mfn); + l4e += l4_table_offset(addr); + if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) + { + sh_unmap_domain_page(l4e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l4e_get_pfn(*l4e)); + sh_unmap_domain_page(l4e); + } +#endif +#if CONFIG_PAGING_LEVELS >= 3 + { + l3_pgentry_t *l3e = sh_map_domain_page(mfn); + l3e += l3_table_offset(addr); + if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) + { + sh_unmap_domain_page(l3e); + return _mfn(INVALID_MFN); + } + mfn = 
_mfn(l3e_get_pfn(*l3e)); + sh_unmap_domain_page(l3e); + } +#endif + + l2e = sh_map_domain_page(mfn); + l2e += l2_table_offset(addr); + if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) + { + sh_unmap_domain_page(l2e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l2e_get_pfn(*l2e)); + sh_unmap_domain_page(l2e); + + l1e = sh_map_domain_page(mfn); + l1e += l1_table_offset(addr); + if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) + { + sh_unmap_domain_page(l1e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l1e_get_pfn(*l1e)); + sh_unmap_domain_page(l1e); + + return mfn; +} + +unsigned long +shadow_gfn_to_mfn_foreign(unsigned long gpfn) +{ + return mfn_x(sh_gfn_to_mfn_foreign(current->domain, gpfn)); +} + + +static void shadow_p2m_teardown(struct domain *d) +/* Return all the p2m pages to Xen. + * We know we don't have any extra mappings to these pages */ +{ + struct list_head *entry, *n; + struct page_info *pg; + + d->arch.phys_table = pagetable_null(); + + list_for_each_safe(entry, n, &d->arch.shadow.p2m_inuse) + { + pg = list_entry(entry, struct page_info, list); + list_del(entry); + /* Should have just the one ref we gave it in alloc_p2m_page() */ + if ( (pg->count_info & PGC_SH_count_mask) != 1 ) + { + SHADOW_PRINTK("Odd p2m page count c=%#x t=%"PRtype_info"\n", + pg->count_info, pg->u.inuse.type_info); + } + ASSERT(page_get_owner(pg) == d); + /* Free should not decrement domain's total allocation, since + * these pages were allocated without an owner. */ + page_set_owner(pg, NULL); + free_domheap_pages(pg, 0); + d->arch.shadow.p2m_pages--; + perfc_decr(shadow_alloc_count); + } + list_for_each_safe(entry, n, &d->arch.shadow.p2m_freelist) + { + list_del(entry); + pg = list_entry(entry, struct page_info, list); + ASSERT(page_get_owner(pg) == d); + /* Free should not decrement domain's total allocation. */ + page_set_owner(pg, NULL); + free_domheap_pages(pg, 0); + d->arch.shadow.p2m_pages--; + perfc_decr(shadow_alloc_count); + } + ASSERT(d->arch.shadow.p2m_pages == 0); +} + +/* Set the pool of shadow pages to the required number of pages. + * Input will be rounded up to at least shadow_min_acceptable_pages(), + * plus space for the p2m table. + * Returns 0 for success, non-zero for failure. 
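/*
 * Illustration only, not part of this changeset: set_sh_allocation(), which
 * follows, rounds the requested pool size up to a lower bound and then to a
 * whole number of largest-order chunks using the usual mask arithmetic.  A
 * self-contained sketch of the round-to-chunk step; MAX_ORDER_SK stands in
 * for SHADOW_MAX_ORDER.
 */
#include <assert.h>

#define MAX_ORDER_SK 2u   /* chunks of 1 << 2 = 4 pages */

/* Round a page count up to a whole number of 2^order-page chunks. */
static unsigned int round_to_chunk(unsigned int pages, unsigned int order)
{
    unsigned int chunk = 1u << order;
    return (pages + (chunk - 1)) & ~(chunk - 1);
}

int main(void)
{
    assert(round_to_chunk(1, MAX_ORDER_SK)  == 4);
    assert(round_to_chunk(9, MAX_ORDER_SK)  == 12);
    assert(round_to_chunk(12, MAX_ORDER_SK) == 12);
    return 0;
}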
*/ +static unsigned int set_sh_allocation(struct domain *d, + unsigned int pages, + int *preempted) +{ + struct page_info *pg; + unsigned int lower_bound; + int j; + + ASSERT(shadow_lock_is_acquired(d)); + + /* Don't allocate less than the minimum acceptable, plus one page per + * megabyte of RAM (for the p2m table) */ + lower_bound = shadow_min_acceptable_pages(d) + (d->tot_pages / 256); + if ( pages > 0 && pages < lower_bound ) + pages = lower_bound; + /* Round up to largest block size */ + pages = (pages + ((1<<SHADOW_MAX_ORDER)-1)) & ~((1<<SHADOW_MAX_ORDER)-1); + + SHADOW_PRINTK("current %i target %i\n", + d->arch.shadow.total_pages, pages); + + while ( d->arch.shadow.total_pages != pages ) + { + if ( d->arch.shadow.total_pages < pages ) + { + /* Need to allocate more memory from domheap */ + pg = alloc_domheap_pages(NULL, SHADOW_MAX_ORDER, 0); + if ( pg == NULL ) + { + SHADOW_PRINTK("failed to allocate shadow pages.\n"); + return -ENOMEM; + } + d->arch.shadow.free_pages += 1<<SHADOW_MAX_ORDER; + d->arch.shadow.total_pages += 1<<SHADOW_MAX_ORDER; + for ( j = 0; j < 1<<SHADOW_MAX_ORDER; j++ ) + { + pg[j].u.inuse.type_info = 0; /* Free page */ + pg[j].tlbflush_timestamp = 0; /* Not in any TLB */ + } + SH_SET_PFN_ORDER(pg, SHADOW_MAX_ORDER); + list_add_tail(&pg->list, + &d->arch.shadow.freelists[SHADOW_MAX_ORDER]); + } + else if ( d->arch.shadow.total_pages > pages ) + { + /* Need to return memory to domheap */ + shadow_prealloc(d, SHADOW_MAX_ORDER); + ASSERT(!list_empty(&d->arch.shadow.freelists[SHADOW_MAX_ORDER])); + pg = list_entry(d->arch.shadow.freelists[SHADOW_MAX_ORDER].next, + struct page_info, list); + list_del(&pg->list); + d->arch.shadow.free_pages -= 1<<SHADOW_MAX_ORDER; + d->arch.shadow.total_pages -= 1<<SHADOW_MAX_ORDER; + free_domheap_pages(pg, SHADOW_MAX_ORDER); + } + + /* Check to see if we need to yield and try again */ + if ( preempted && hypercall_preempt_check() ) + { + *preempted = 1; + return 0; + } + } + + return 0; +} + +unsigned int shadow_set_allocation(struct domain *d, + unsigned int megabytes, + int *preempted) +/* Hypercall interface to set the shadow memory allocation */ +{ + unsigned int rv; + shadow_lock(d); + rv = set_sh_allocation(d, megabytes << (20 - PAGE_SHIFT), preempted); + SHADOW_PRINTK("dom %u allocation now %u pages (%u MB)\n", + d->domain_id, + d->arch.shadow.total_pages, + shadow_get_allocation(d)); + shadow_unlock(d); + return rv; +} + +/**************************************************************************/ +/* Hash table for storing the guest->shadow mappings */ + +/* Hash function that takes a gfn or mfn, plus another byte of type info */ +typedef u32 key_t; +static inline key_t sh_hash(unsigned long n, u8 t) +{ + unsigned char *p = (unsigned char *)&n; + key_t k = t; + int i; + for ( i = 0; i < sizeof(n) ; i++ ) k = (u32)p[i] + (k<<6) + (k<<16) - k; + return k; +} + +#if SHADOW_AUDIT & (SHADOW_AUDIT_HASH|SHADOW_AUDIT_HASH_FULL) + +/* Before we get to the mechanism, define a pair of audit functions + * that sanity-check the contents of the hash table. */ +static void sh_hash_audit_bucket(struct domain *d, int bucket) +/* Audit one bucket of the hash table */ +{ + struct shadow_hash_entry *e, *x; + struct page_info *pg; + + if ( !(SHADOW_AUDIT_ENABLE) ) + return; + + e = &d->arch.shadow.hash_table[bucket]; + if ( e->t == 0 ) return; /* Bucket is empty */ + while ( e ) + { + /* Empty link? */ + BUG_ON( e->t == 0 ); + /* Bogus type? */ + BUG_ON( e->t > (PGC_SH_max_shadow >> PGC_SH_type_shift) ); + /* Wrong bucket? 
*/ + BUG_ON( sh_hash(e->n, e->t) % SHADOW_HASH_BUCKETS != bucket ); + /* Duplicate entry? */ + for ( x = e->next; x; x = x->next ) + BUG_ON( x->n == e->n && x->t == e->t ); + /* Bogus MFN? */ + BUG_ON( !valid_mfn(e->smfn) ); + pg = mfn_to_page(e->smfn); + /* Not a shadow? */ + BUG_ON( page_get_owner(pg) != 0 ); + /* Wrong kind of shadow? */ + BUG_ON( (pg->count_info & PGC_SH_type_mask) >> PGC_SH_type_shift + != e->t ); + /* Bad backlink? */ + BUG_ON( pg->u.inuse.type_info != e->n ); + if ( e->t != (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift) + && e->t != (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift) + && e->t != (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) ) + { + /* Bad shadow flags on guest page? */ + BUG_ON( !(mfn_to_page(_mfn(e->n))->shadow_flags & (1<<e->t)) ); + } + /* That entry was OK; on we go */ + e = e->next; + } +} + +#else +#define sh_hash_audit_bucket(_d, _b) +#endif /* Hashtable bucket audit */ + + +#if SHADOW_AUDIT & SHADOW_AUDIT_HASH_FULL + +static void sh_hash_audit(struct domain *d) +/* Full audit: audit every bucket in the table */ +{ + int i; + + if ( !(SHADOW_AUDIT_ENABLE) ) + return; + + for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) + { + sh_hash_audit_bucket(d, i); + } +} + +#else +#define sh_hash_audit(_d) +#endif /* Hashtable bucket audit */ + +/* Memory management interface for bucket allocation. + * These ought to come out of shadow memory, but at least on 32-bit + * machines we are forced to allocate them from xenheap so that we can + * address them. */ +static struct shadow_hash_entry *sh_alloc_hash_entry(struct domain *d) +{ + struct shadow_hash_entry *extra, *x; + int i; + + /* We need to allocate a new node. Ensure the free list is not empty. + * Allocate new entries in units the same size as the original table. */ + if ( unlikely(d->arch.shadow.hash_freelist == NULL) ) + { + size_t sz = sizeof(void *) + (SHADOW_HASH_BUCKETS * sizeof(*x)); + extra = xmalloc_bytes(sz); + + if ( extra == NULL ) + { + /* No memory left! */ + SHADOW_ERROR("xmalloc() failed when allocating hash buckets.\n"); + domain_crash_synchronous(); + } + memset(extra, 0, sz); + + /* Record the allocation block so it can be correctly freed later. */ + *((struct shadow_hash_entry **)&extra[SHADOW_HASH_BUCKETS]) = + d->arch.shadow.hash_allocations; + d->arch.shadow.hash_allocations = &extra[0]; + + /* Thread a free chain through the newly-allocated nodes. */ + for ( i = 0; i < (SHADOW_HASH_BUCKETS - 1); i++ ) + extra[i].next = &extra[i+1]; + extra[i].next = NULL; + + /* Add the new nodes to the free list. */ + d->arch.shadow.hash_freelist = &extra[0]; + } + + /* Allocate a new node from the free list. */ + x = d->arch.shadow.hash_freelist; + d->arch.shadow.hash_freelist = x->next; + return x; +} + +static void sh_free_hash_entry(struct domain *d, struct shadow_hash_entry *e) +{ + /* Mark the bucket as empty and return it to the free list */ + e->t = 0; + e->next = d->arch.shadow.hash_freelist; + d->arch.shadow.hash_freelist = e; +} + + +/* Allocate and initialise the table itself. + * Returns 0 for success, 1 for error. */ +static int shadow_hash_alloc(struct domain *d) +{ + struct shadow_hash_entry *table; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(!d->arch.shadow.hash_table); + + table = xmalloc_array(struct shadow_hash_entry, SHADOW_HASH_BUCKETS); + if ( !table ) return 1; + memset(table, 0, + SHADOW_HASH_BUCKETS * sizeof (struct shadow_hash_entry)); + d->arch.shadow.hash_table = table; + return 0; +} + +/* Tear down the hash table and return all memory to Xen. 
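/*
 * Illustration only, not part of this changeset: sh_alloc_hash_entry()
 * above grows the hash-entry pool in blocks and hides, in the slot just
 * past the last entry of each block, a pointer to the previously allocated
 * block, so teardown can find and free every block even after individual
 * entries have been threaded onto free or hash chains.  A self-contained
 * sketch of that pattern; entry_sk and BLOCK_ENTRIES_SK are stand-ins.
 */
#include <stdlib.h>
#include <string.h>

struct entry_sk { int key; struct entry_sk *next; };

#define BLOCK_ENTRIES_SK 64

/* Allocate one block of entries plus a hidden chain pointer at the end. */
static struct entry_sk *alloc_block(struct entry_sk **all_blocks)
{
    size_t sz = sizeof(struct entry_sk *)
                + BLOCK_ENTRIES_SK * sizeof(struct entry_sk);
    struct entry_sk *blk = malloc(sz);
    if ( blk == NULL )
        return NULL;
    memset(blk, 0, sz);
    /* Chain this block onto the list of all allocations. */
    *(struct entry_sk **)&blk[BLOCK_ENTRIES_SK] = *all_blocks;
    *all_blocks = blk;
    return blk;
}

/* Walk the hidden chain and free every block at teardown. */
static void free_all_blocks(struct entry_sk *all_blocks)
{
    while ( all_blocks != NULL )
    {
        struct entry_sk *next =
            *(struct entry_sk **)&all_blocks[BLOCK_ENTRIES_SK];
        free(all_blocks);
        all_blocks = next;
    }
}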
+ * This function does not care whether the table is populated. */ +static void shadow_hash_teardown(struct domain *d) +{ + struct shadow_hash_entry *a, *n; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_table); + + /* Return the table itself */ + xfree(d->arch.shadow.hash_table); + d->arch.shadow.hash_table = NULL; + + /* Return any extra allocations */ + a = d->arch.shadow.hash_allocations; + while ( a ) + { + /* We stored a linked-list pointer at the end of each allocation */ + n = *((struct shadow_hash_entry **)(&a[SHADOW_HASH_BUCKETS])); + xfree(a); + a = n; + } + d->arch.shadow.hash_allocations = NULL; + d->arch.shadow.hash_freelist = NULL; +} + + +mfn_t shadow_hash_lookup(struct vcpu *v, unsigned long n, u8 t) +/* Find an entry in the hash table. Returns the MFN of the shadow, + * or INVALID_MFN if it doesn't exist */ +{ + struct domain *d = v->domain; + struct shadow_hash_entry *p, *x, *head; + key_t key; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_table); + ASSERT(t); + + sh_hash_audit(d); + + perfc_incrc(shadow_hash_lookups); + key = sh_hash(n, t); + + x = head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS]; + p = NULL; + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); + + do + { + ASSERT(x->t || ((x == head) && (x->next == NULL))); + + if ( x->n == n && x->t == t ) + { + /* Pull-to-front if 'x' isn't already the head item */ + if ( unlikely(x != head) ) + { + if ( unlikely(d->arch.shadow.hash_walking != 0) ) + /* Can't reorder: someone is walking the hash chains */ + return x->smfn; + else + { + /* Delete 'x' from list and reinsert after head. */ + p->next = x->next; + x->next = head->next; + head->next = x; + + /* Swap 'x' contents with head contents. */ + SWAP(head->n, x->n); + SWAP(head->t, x->t); + SWAP(head->smfn, x->smfn); + } + } + else + { + perfc_incrc(shadow_hash_lookup_head); + } + return head->smfn; + } + + p = x; + x = x->next; + } + while ( x != NULL ); + + perfc_incrc(shadow_hash_lookup_miss); + return _mfn(INVALID_MFN); +} + +void shadow_hash_insert(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) +/* Put a mapping (n,t)->smfn into the hash table */ +{ + struct domain *d = v->domain; + struct shadow_hash_entry *x, *head; + key_t key; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_table); + ASSERT(t); + + sh_hash_audit(d); + + perfc_incrc(shadow_hash_inserts); + key = sh_hash(n, t); + + head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS]; + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); + + /* If the bucket is empty then insert the new page as the head item. */ + if ( head->t == 0 ) + { + head->n = n; + head->t = t; + head->smfn = smfn; + ASSERT(head->next == NULL); + } + else + { + /* Insert a new entry directly after the head item. */ + x = sh_alloc_hash_entry(d); + x->n = n; + x->t = t; + x->smfn = smfn; + x->next = head->next; + head->next = x; + } + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); +} + +void shadow_hash_delete(struct vcpu *v, unsigned long n, u8 t, mfn_t smfn) +/* Excise the mapping (n,t)->smfn from the hash table */ +{ + struct domain *d = v->domain; + struct shadow_hash_entry *p, *x, *head; + key_t key; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_table); + ASSERT(t); + + sh_hash_audit(d); + + perfc_incrc(shadow_hash_deletes); + key = sh_hash(n, t); + + head = &d->arch.shadow.hash_table[key % SHADOW_HASH_BUCKETS]; + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); + + /* Match on head item? 
*/ + if ( head->n == n && head->t == t ) + { + if ( (x = head->next) != NULL ) + { + /* Overwrite head with contents of following node. */ + head->n = x->n; + head->t = x->t; + head->smfn = x->smfn; + + /* Delete following node. */ + head->next = x->next; + sh_free_hash_entry(d, x); + } + else + { + /* This bucket is now empty. Initialise the head node. */ + head->t = 0; + } + } + else + { + /* Not at the head; need to walk the chain */ + p = head; + x = head->next; + + while(1) + { + ASSERT(x); /* We can't have hit the end, since our target is + * still in the chain somehwere... */ + if ( x->n == n && x->t == t ) + { + /* Delete matching node. */ + p->next = x->next; + sh_free_hash_entry(d, x); + break; + } + p = x; + x = x->next; + } + } + + sh_hash_audit_bucket(d, key % SHADOW_HASH_BUCKETS); +} + +typedef int (*hash_callback_t)(struct vcpu *v, mfn_t smfn, mfn_t other_mfn); + +static void hash_foreach(struct vcpu *v, + unsigned int callback_mask, + hash_callback_t callbacks[], + mfn_t callback_mfn) +/* Walk the hash table looking at the types of the entries and + * calling the appropriate callback function for each entry. + * The mask determines which shadow types we call back for, and the array + * of callbacks tells us which function to call. + * Any callback may return non-zero to let us skip the rest of the scan. + * + * WARNING: Callbacks MUST NOT add or remove hash entries unless they + * then return non-zero to terminate the scan. */ +{ + int i, done = 0; + struct domain *d = v->domain; + struct shadow_hash_entry *x; + + /* Say we're here, to stop hash-lookups reordering the chains */ + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d->arch.shadow.hash_walking == 0); + d->arch.shadow.hash_walking = 1; + + callback_mask &= ~1; /* Never attempt to call back on empty buckets */ + for ( i = 0; i < SHADOW_HASH_BUCKETS; i++ ) + { + /* WARNING: This is not safe against changes to the hash table. + * The callback *must* return non-zero if it has inserted or + * deleted anything from the hash (lookups are OK, though). */ + for ( x = &d->arch.shadow.hash_table[i]; x; x = x->next ) + { + if ( callback_mask & (1 << x->t) ) + { + ASSERT(x->t <= 15); + ASSERT(callbacks[x->t] != NULL); + if ( (done = callbacks[x->t](v, x->smfn, callback_mfn)) != 0 ) + break; + } + } + if ( done ) break; + } + d->arch.shadow.hash_walking = 0; +} + + +/**************************************************************************/ +/* Destroy a shadow page: simple dispatcher to call the per-type destructor + * which will decrement refcounts appropriately and return memory to the + * free pool. */ + +void sh_destroy_shadow(struct vcpu *v, mfn_t smfn) +{ + struct page_info *pg = mfn_to_page(smfn); + u32 t = pg->count_info & PGC_SH_type_mask; + + + SHADOW_PRINTK("smfn=%#lx\n", mfn_x(smfn)); + + /* Double-check, if we can, that the shadowed page belongs to this + * domain, (by following the back-pointer). 
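/*
 * Illustration only, not part of this changeset: hash_foreach() above
 * dispatches on a small per-entry type code -- a bitmask selects which
 * types get a callback, and a non-zero return from any callback ends the
 * walk.  The same dispatch shape, stripped down to a flat array; all names
 * here are stand-ins.
 */
#include <stddef.h>

typedef int (*cb_sk)(unsigned long item);

static void foreach_by_type(const unsigned char *types,
                            const unsigned long *items, size_t n,
                            unsigned int mask, cb_sk callbacks[16])
{
    size_t i;
    for ( i = 0; i < n; i++ )
    {
        unsigned char t = types[i];
        if ( t >= 16 || !(mask & (1u << t)) )
            continue;                      /* type not selected by the mask */
        if ( callbacks[t] != NULL && callbacks[t](items[i]) != 0 )
            break;                         /* callback asked us to stop */
    }
}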
*/ + ASSERT(t == PGC_SH_fl1_32_shadow || + t == PGC_SH_fl1_pae_shadow || + t == PGC_SH_fl1_64_shadow || + t == PGC_SH_monitor_table || + (page_get_owner(mfn_to_page(_mfn(pg->u.inuse.type_info))) + == v->domain)); + + /* The down-shifts here are so that the switch statement is on nice + * small numbers that the compiler will enjoy */ + switch ( t >> PGC_SH_type_shift ) + { +#if CONFIG_PAGING_LEVELS == 2 + case PGC_SH_l1_32_shadow >> PGC_SH_type_shift: + case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 2, 2)(v, smfn); + break; + case PGC_SH_l2_32_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 2, 2)(v, smfn); + break; +#else /* PAE or 64bit */ + case PGC_SH_l1_32_shadow >> PGC_SH_type_shift: + case PGC_SH_fl1_32_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 2)(v, smfn); + break; + case PGC_SH_l2_32_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 2)(v, smfn); + break; +#endif + +#if CONFIG_PAGING_LEVELS >= 3 + case PGC_SH_l1_pae_shadow >> PGC_SH_type_shift: + case PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 3, 3)(v, smfn); + break; + case PGC_SH_l2_pae_shadow >> PGC_SH_type_shift: + case PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 3, 3)(v, smfn); + break; + case PGC_SH_l3_pae_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 3, 3)(v, smfn); + break; +#endif + +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH_l1_64_shadow >> PGC_SH_type_shift: + case PGC_SH_fl1_64_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l1_shadow, 4, 4)(v, smfn); + break; + case PGC_SH_l2_64_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l2_shadow, 4, 4)(v, smfn); + break; + case PGC_SH_l3_64_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l3_shadow, 4, 4)(v, smfn); + break; + case PGC_SH_l4_64_shadow >> PGC_SH_type_shift: + SHADOW_INTERNAL_NAME(sh_destroy_l4_shadow, 4, 4)(v, smfn); + break; +#endif + default: + SHADOW_PRINTK("tried to destroy shadow of bad type %08lx\n", + (unsigned long)t); + BUG(); + } +} + +/**************************************************************************/ +/* Remove all writeable mappings of a guest frame from the shadow tables + * Returns non-zero if we need to flush TLBs. 
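/*
 * Illustration only, not part of this changeset: the "down-shifts" comment
 * above refers to shifting the masked type field before switching on it, so
 * the cases become a dense 0..15 range the compiler can turn into a jump
 * table.  A tiny sketch; TYPE_MASK_SK/TYPE_SHIFT_SK are stand-ins for
 * PGC_SH_type_mask/PGC_SH_type_shift.
 */
#include <stdint.h>

#define TYPE_MASK_SK  0x000f0000u
#define TYPE_SHIFT_SK 16

static int classify(uint32_t count_info)
{
    switch ( (count_info & TYPE_MASK_SK) >> TYPE_SHIFT_SK )
    {
    case 1:  return 1;   /* e.g. an l1-style shadow  */
    case 2:  return 2;   /* e.g. an fl1-style shadow */
    default: return 0;
    }
}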
+ * level and fault_addr describe how we found this to be a pagetable; + * level==0 means we have some other reason for revoking write access.*/ + +int shadow_remove_write_access(struct vcpu *v, mfn_t gmfn, + unsigned int level, + unsigned long fault_addr) +{ + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_remove_write_access,2,2), /* fl1_32 */ +#else + SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_remove_write_access,3,2), /* fl1_32 */ +#endif + NULL, /* l2_32 */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* l1_pae */ + SHADOW_INTERNAL_NAME(sh_remove_write_access,3,3), /* fl1_pae */ +#else + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#endif + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* l1_64 */ + SHADOW_INTERNAL_NAME(sh_remove_write_access,4,4), /* fl1_64 */ +#else + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#endif + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ + NULL, /* p2m */ + NULL /* unused */ + }; + + static unsigned int callback_mask = + 1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) + ; + struct page_info *pg = mfn_to_page(gmfn); + + ASSERT(shadow_lock_is_acquired(v->domain)); + + /* Only remove writable mappings if we are doing shadow refcounts. + * In guest refcounting, we trust Xen to already be restricting + * all the writes to the guest page tables, so we do not need to + * do more. */ + if ( !shadow_mode_refcounts(v->domain) ) + return 0; + + /* Early exit if it's already a pagetable, or otherwise not writeable */ + if ( sh_mfn_is_a_page_table(gmfn) + || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) + return 0; + + perfc_incrc(shadow_writeable); + + /* If this isn't a "normal" writeable page, the domain is trying to + * put pagetables in special memory of some kind. We can't allow that.
*/ + if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_writable_page ) + { + SHADOW_ERROR("can't remove write access to mfn %lx, type_info is %" + PRtype_info "\n", + mfn_x(gmfn), mfn_to_page(gmfn)->u.inuse.type_info); + domain_crash(v->domain); + } + +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC + if ( v == current && level != 0 ) + { + unsigned long gfn; + /* Heuristic: there is likely to be only one writeable mapping, + * and that mapping is likely to be in the current pagetable, + * either in the guest's linear map (linux, windows) or in a + * magic slot used to map high memory regions (linux HIGHTPTE) */ + +#define GUESS(_a, _h) do { \ + if ( v->arch.shadow.mode->guess_wrmap(v, (_a), gmfn) ) \ + perfc_incrc(shadow_writeable_h_ ## _h); \ + if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \ + return 1; \ + } while (0) + + + /* Linux lowmem: first 1GB is mapped 1-to-1 above 0xC0000000 */ + if ( v == current + && (gfn = sh_mfn_to_gfn(v->domain, gmfn)) < 0x40000000 ) + GUESS(0xC0000000 + (gfn << PAGE_SHIFT), 4); + + if ( v->arch.shadow.mode->guest_levels == 2 ) + { + if ( level == 1 ) + /* 32bit non-PAE w2k3: linear map at 0xC0000000 */ + GUESS(0xC0000000UL + (fault_addr >> 10), 1); + } +#if CONFIG_PAGING_LEVELS >= 3 + else if ( v->arch.shadow.mode->guest_levels == 3 ) + { + /* 32bit PAE w2k3: linear map at 0xC0000000 */ + switch ( level ) + { + case 1: GUESS(0xC0000000UL + (fault_addr >> 9), 2); break; + case 2: GUESS(0xC0600000UL + (fault_addr >> 18), 2); break; + } + } +#if CONFIG_PAGING_LEVELS >= 4 + else if ( v->arch.shadow.mode->guest_levels == 4 ) + { + /* 64bit w2k3: linear map at 0x0000070000000000 */ + switch ( level ) + { + case 1: GUESS(0x70000000000UL + (fault_addr >> 9), 3); break; + case 2: GUESS(0x70380000000UL + (fault_addr >> 18), 3); break; + case 3: GUESS(0x70381C00000UL + (fault_addr >> 27), 3); break; + } + } +#endif /* CONFIG_PAGING_LEVELS >= 4 */ +#endif /* CONFIG_PAGING_LEVELS >= 3 */ + +#undef GUESS + + } +#endif + + /* Brute-force search of all the shadows, by walking the hash */ + perfc_incrc(shadow_writeable_bf); + hash_foreach(v, callback_mask, callbacks, gmfn); + + /* If that didn't catch the mapping, something is very wrong */ + if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 ) + { + SHADOW_ERROR("can't find all writeable mappings of mfn %lx: " + "%lu left\n", mfn_x(gmfn), + (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask)); + domain_crash(v->domain); + } + + /* We killed at least one writeable mapping, so must flush TLBs. */ + return 1; +} + + + +/**************************************************************************/ +/* Remove all mappings of a guest frame from the shadow tables. + * Returns non-zero if we need to flush TLBs. 
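/*
 * Illustration only, not part of this changeset: the GUESS heuristics above
 * turn a faulting virtual address into the guest address of the PTE that
 * maps it, assuming the guest keeps a linear mapping of its pagetables.
 * The slot is (va >> 12) * entry_size, which collapses to va >> 10 for the
 * 4-byte entries of a 2-level guest and va >> 9 for 8-byte entries (one
 * level up, with each 8-byte l2 entry covering 2MB, it becomes va >> 18).
 * A small sketch of that arithmetic:
 */
#include <stdint.h>

static uint64_t guess_l1_slot(uint64_t linear_base, uint64_t va,
                              unsigned int entry_size)
{
    /* Index of the page containing va, times the size of one PTE. */
    return linear_base + (va >> 12) * (uint64_t)entry_size;
}

/* Example: with a 0xC0000000 linear map and 8-byte entries,
 * va = 0x00400000 gives 0xC0000000 + 0x400 * 8 = 0xC0002000,
 * i.e. exactly 0xC0000000 + (va >> 9). */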
*/ + +int shadow_remove_all_mappings(struct vcpu *v, mfn_t gmfn) +{ + struct page_info *page = mfn_to_page(gmfn); + int expected_count; + + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,2,2), /* fl1_32 */ +#else + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,2), /* fl1_32 */ +#endif + NULL, /* l2_32 */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* l1_pae */ + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,3,3), /* fl1_pae */ +#else + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#endif + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* l1_64 */ + SHADOW_INTERNAL_NAME(sh_remove_all_mappings,4,4), /* fl1_64 */ +#else + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#endif + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ + NULL, /* p2m */ + NULL /* unused */ + }; + + static unsigned int callback_mask = + 1 << (PGC_SH_l1_32_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_32_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_l1_pae_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_pae_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_l1_64_shadow >> PGC_SH_type_shift) + | 1 << (PGC_SH_fl1_64_shadow >> PGC_SH_type_shift) + ; + + perfc_incrc(shadow_mappings); + if ( (page->count_info & PGC_count_mask) == 0 ) + return 0; + + ASSERT(shadow_lock_is_acquired(v->domain)); + + /* XXX TODO: + * Heuristics for finding the (probably) single mapping of this gmfn */ + + /* Brute-force search of all the shadows, by walking the hash */ + perfc_incrc(shadow_mappings_bf); + hash_foreach(v, callback_mask, callbacks, gmfn); + + /* If that didn't catch the mapping, something is very wrong */ + expected_count = (page->count_info & PGC_allocated) ? 1 : 0; + if ( (page->count_info & PGC_count_mask) != expected_count ) + { + /* Don't complain if we're in HVM and there's one extra mapping: + * The qemu helper process has an untyped mapping of this dom's RAM */ + if ( !(shadow_mode_external(v->domain) + && (page->count_info & PGC_count_mask) <= 2 + && (page->u.inuse.type_info & PGT_count_mask) == 0) ) + { + SHADOW_ERROR("can't find all mappings of mfn %lx: " + "c=%08x t=%08lx\n", mfn_x(gmfn), + page->count_info, page->u.inuse.type_info); + } + } + + /* We killed at least one mapping, so must flush TLBs. */ + return 1; +} + + +/**************************************************************************/ +/* Remove all shadows of a guest frame from the shadow tables */ + +static int sh_remove_shadow_via_pointer(struct vcpu *v, mfn_t smfn) +/* Follow this shadow's up-pointer, if it has one, and remove the reference + * found there. 
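/*
 * Illustration only, not part of this changeset: sh_remove_shadow_via_pointer()
 * below relies on pg->up packing both the parent shadow's frame number and
 * the byte offset of the referencing slot into a single word, unpacked with
 * ">> PAGE_SHIFT" and "& (PAGE_SIZE-1)".  The packing in a stand-alone
 * sketch (the _SK names are stand-ins):
 */
#include <stdint.h>
#include <assert.h>

#define PAGE_SHIFT_SK 12
#define PAGE_SIZE_SK  (1u << PAGE_SHIFT_SK)

/* One word carries both "which page" and "where in that page", because the
 * in-page offset never needs more than the low PAGE_SHIFT bits. */
static uint64_t pack_up(uint64_t frame, unsigned int offset)
{
    assert(offset < PAGE_SIZE_SK);
    return (frame << PAGE_SHIFT_SK) | offset;
}

static uint64_t     up_frame(uint64_t up)  { return up >> PAGE_SHIFT_SK; }
static unsigned int up_offset(uint64_t up) { return up & (PAGE_SIZE_SK - 1); }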
Returns 1 if that was the only reference to this shadow */ +{ + struct page_info *pg = mfn_to_page(smfn); + mfn_t pmfn; + void *vaddr; + int rc; + + ASSERT((pg->count_info & PGC_SH_type_mask) > 0); + ASSERT((pg->count_info & PGC_SH_type_mask) < PGC_SH_max_shadow); + ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l2_32_shadow); + ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l3_pae_shadow); + ASSERT((pg->count_info & PGC_SH_type_mask) != PGC_SH_l4_64_shadow); + + if (pg->up == 0) return 0; + pmfn = _mfn(pg->up >> PAGE_SHIFT); + ASSERT(valid_mfn(pmfn)); + vaddr = sh_map_domain_page(pmfn); + ASSERT(vaddr); + vaddr += pg->up & (PAGE_SIZE-1); + ASSERT(l1e_get_pfn(*(l1_pgentry_t *)vaddr) == mfn_x(smfn)); + + /* Is this the only reference to this shadow? */ + rc = ((pg->count_info & PGC_SH_count_mask) == 1) ? 1 : 0; + + /* Blank the offending entry */ + switch ((pg->count_info & PGC_SH_type_mask)) + { + case PGC_SH_l1_32_shadow: + case PGC_SH_l2_32_shadow: +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,2,2)(v, vaddr, pmfn); +#else + SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,2)(v, vaddr, pmfn); +#endif + break; +#if CONFIG_PAGING_LEVELS >=3 + case PGC_SH_l1_pae_shadow: + case PGC_SH_l2_pae_shadow: + case PGC_SH_l2h_pae_shadow: + case PGC_SH_l3_pae_shadow: + SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,3,3)(v, vaddr, pmfn); + break; +#if CONFIG_PAGING_LEVELS >= 4 + case PGC_SH_l1_64_shadow: + case PGC_SH_l2_64_shadow: + case PGC_SH_l3_64_shadow: + case PGC_SH_l4_64_shadow: + SHADOW_INTERNAL_NAME(sh_clear_shadow_entry,4,4)(v, vaddr, pmfn); + break; +#endif +#endif + default: BUG(); /* Some weird unknown shadow type */ + } + + sh_unmap_domain_page(vaddr); + if ( rc ) + perfc_incrc(shadow_up_pointer); + else + perfc_incrc(shadow_unshadow_bf); + + return rc; +} + +void sh_remove_shadows(struct vcpu *v, mfn_t gmfn, int all) +/* Remove the shadows of this guest page. + * If all != 0, find all shadows, if necessary by walking the tables. + * Otherwise, just try the (much faster) heuristics, which will remove + * at most one reference to each shadow of the page. */ +{ + struct page_info *pg; + mfn_t smfn; + u32 sh_flags; + unsigned char t; + + /* Dispatch table for getting per-type functions: each level must + * be called with the function to remove a lower-level shadow.
*/ + static hash_callback_t callbacks[16] = { + NULL, /* none */ + NULL, /* l1_32 */ + NULL, /* fl1_32 */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,2,2), /* l2_32 */ +#else + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,2), /* l2_32 */ +#endif + NULL, /* l1_pae */ + NULL, /* fl1_pae */ +#if CONFIG_PAGING_LEVELS >= 3 + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2_pae */ + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,3,3), /* l2h_pae */ + SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,3,3), /* l3_pae */ +#else + NULL, /* l2_pae */ + NULL, /* l2h_pae */ + NULL, /* l3_pae */ +#endif + NULL, /* l1_64 */ + NULL, /* fl1_64 */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW_INTERNAL_NAME(sh_remove_l1_shadow,4,4), /* l2_64 */ + SHADOW_INTERNAL_NAME(sh_remove_l2_shadow,4,4), /* l3_64 */ + SHADOW_INTERNAL_NAME(sh_remove_l3_shadow,4,4), /* l4_64 */ +#else + NULL, /* l2_64 */ + NULL, /* l3_64 */ + NULL, /* l4_64 */ +#endif + NULL, /* p2m */ + NULL /* unused */ + }; + + /* Another lookup table, for choosing which mask to use */ + static unsigned int masks[16] = { + 0, /* none */ + 1 << (PGC_SH_l2_32_shadow >> PGC_SH_type_shift), /* l1_32 */ + 0, /* fl1_32 */ + 0, /* l2_32 */ + ((1 << (PGC_SH_l2h_pae_shadow >> PGC_SH_type_shift)) + | (1 << (PGC_SH_l2_pae_shadow >> PGC_SH_type_shift))), /* l1_pae */ + 0, /* fl1_pae */ + 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2_pae */ + 1 << (PGC_SH_l3_pae_shadow >> PGC_SH_type_shift), /* l2h_pae */ + 0, /* l3_pae */ + 1 << (PGC_SH_l2_64_shadow >> PGC_SH_type_shift), /* l1_64 */ + 0, /* fl1_64 */ + 1 << (PGC_SH_l3_64_shadow >> PGC_SH_type_shift), /* l2_64 */ + 1 << (PGC_SH_l4_64_shadow >> PGC_SH_type_shift), /* l3_64 */ + 0, /* l4_64 */ + 0, /* p2m */ + 0 /* unused */ + }; + + ASSERT(shadow_lock_is_acquired(v->domain)); + + pg = mfn_to_page(gmfn); + + /* Bale out now if the page is not shadowed */ + if ( (pg->count_info & PGC_page_table) == 0 ) + return; + + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n", + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); + + /* Search for this shadow in all appropriate shadows */ + perfc_incrc(shadow_unshadow); + sh_flags = pg->shadow_flags; + + /* Lower-level shadows need to be excised from upper-level shadows. + * This call to hash_foreach() looks dangerous but is in fact OK: each + * call will remove at most one shadow, and terminate immediately when + * it does remove it, so we never walk the hash after doing a deletion. 
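/*
 * Illustration only, not part of this changeset: the safety argument just
 * above -- a destructive callback removes at most one entry and then ends
 * the walk, so the table is never traversed after being modified -- in
 * miniature, on a plain singly-linked list (node_sk is a stand-in).
 */
#include <stddef.h>

struct node_sk { int val; struct node_sk *next; };

/* Unlink at most one node, then stop scanning immediately so no possibly
 * stale 'next' pointer is followed afterwards (caller frees the node). */
static int scan_and_maybe_delete(struct node_sk **head, int doomed)
{
    struct node_sk **pprev = head, *cur;
    for ( cur = *head; cur != NULL; pprev = &cur->next, cur = cur->next )
    {
        if ( cur->val == doomed )
        {
            *pprev = cur->next;   /* unlink... */
            return 1;             /* ...and terminate the walk */
        }
    }
    return 0;
}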
*/ +#define DO_UNSHADOW(_type) do { \ + t = (_type) >> PGC_SH_type_shift; \ + smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \ + if ( !sh_remove_shadow_via_pointer(v, smfn) && all ) \ + hash_foreach(v, masks[t], callbacks, smfn); \ +} while (0) + + /* Top-level shadows need to be unpinned */ +#define DO_UNPIN(_type) do { \ + t = (_type) >> PGC_SH_type_shift; \ + smfn = shadow_hash_lookup(v, mfn_x(gmfn), t); \ + if ( mfn_to_page(smfn)->count_info & PGC_SH_pinned ) \ + sh_unpin(v, smfn); \ + if ( (_type) == PGC_SH_l3_pae_shadow ) \ + SHADOW_INTERNAL_NAME(sh_unpin_all_l3_subshadows,3,3)(v, smfn); \ +} while (0) + + if ( sh_flags & SHF_L1_32 ) DO_UNSHADOW(PGC_SH_l1_32_shadow); + if ( sh_flags & SHF_L2_32 ) DO_UNPIN(PGC_SH_l2_32_shadow); +#if CONFIG_PAGING_LEVELS >= 3 + if ( sh_flags & SHF_L1_PAE ) DO_UNSHADOW(PGC_SH_l1_pae_shadow); + if ( sh_flags & SHF_L2_PAE ) DO_UNSHADOW(PGC_SH_l2_pae_shadow); + if ( sh_flags & SHF_L2H_PAE ) DO_UNSHADOW(PGC_SH_l2h_pae_shadow); + if ( sh_flags & SHF_L3_PAE ) DO_UNPIN(PGC_SH_l3_pae_shadow); +#if CONFIG_PAGING_LEVELS >= 4 + if ( sh_flags & SHF_L1_64 ) DO_UNSHADOW(PGC_SH_l1_64_shadow); + if ( sh_flags & SHF_L2_64 ) DO_UNSHADOW(PGC_SH_l2_64_shadow); + if ( sh_flags & SHF_L3_64 ) DO_UNSHADOW(PGC_SH_l3_64_shadow); + if ( sh_flags & SHF_L4_64 ) DO_UNPIN(PGC_SH_l4_64_shadow); +#endif +#endif + +#undef DO_UNSHADOW +#undef DO_UNPIN + + +#if CONFIG_PAGING_LEVELS > 2 + /* We may have caused some PAE l3 entries to change: need to + * fix up the copies of them in various places */ + if ( sh_flags & (SHF_L2_PAE|SHF_L2H_PAE) ) + sh_pae_recopy(v->domain); +#endif + + /* If that didn't catch the shadows, something is wrong */ + if ( all && (pg->count_info & PGC_page_table) ) + { + SHADOW_ERROR("can't find all shadows of mfn %05lx (shadow_flags=%08x)\n", + mfn_x(gmfn), pg->shadow_flags); + domain_crash(v->domain); + } +} + +void +shadow_remove_all_shadows_and_parents(struct vcpu *v, mfn_t gmfn) +/* Even harsher: this is a HVM page that we think is no longer a pagetable. + * Unshadow it, and recursively unshadow pages that reference it. */ +{ + shadow_remove_all_shadows(v, gmfn); + /* XXX TODO: + * Rework this hashtable walker to return a linked-list of all + * the shadows it modified, then do breadth-first recursion + * to find the way up to higher-level tables and unshadow them too. + * + * The current code (just tearing down each page's shadows as we + * detect that it is not a pagetable) is correct, but very slow. + * It means extra emulated writes and slows down removal of mappings. */ +} + +/**************************************************************************/ + +void sh_update_paging_modes(struct vcpu *v) +{ + struct domain *d = v->domain; + struct shadow_paging_mode *old_mode = v->arch.shadow.mode; + mfn_t old_guest_table; + + ASSERT(shadow_lock_is_acquired(d)); + + // Valid transitions handled by this function: + // - For PV guests: + // - after a shadow mode has been changed + // - For HVM guests: + // - after a shadow mode has been changed + // - changes in CR0.PG, CR4.PAE, CR4.PSE, or CR4.PGE + // + + // Avoid determining the current shadow mode for uninitialized CPUs, as + // we can not yet determine whether it is an HVM or PV domain. + // + if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + { + printk("%s: postponing determination of shadow mode\n", __func__); + return; + } + + // First, tear down any old shadow tables held by this vcpu.
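/*
 * Illustration only, not part of this changeset: the HVM branch of the mode
 * ladder below chooses guest pagetable levels from architectural state
 * (paging enabled, long mode, CR4.PAE).  Its decision logic, reduced to a
 * sketch that ignores the PV and shadow-level details:
 */
#include <stdbool.h>

static int guest_paging_levels(bool paging_enabled, bool long_mode,
                               bool cr4_pae)
{
    if ( !paging_enabled )
        return 0;   /* handled specially: guest runs on the p2m map */
    if ( long_mode )
        return 4;   /* 64-bit guest */
    if ( cr4_pae )
        return 3;   /* 32-bit PAE guest */
    return 2;       /* plain 32-bit, 2-level guest */
}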
+ // + shadow_detach_old_tables(v); + + if ( !hvm_guest(v) ) + { + /// + /// PV guest + /// +#if CONFIG_PAGING_LEVELS == 4 + if ( pv_32bit_guest(v) ) + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,3); + else + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,4,4); +#elif CONFIG_PAGING_LEVELS == 3 + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3); +#elif CONFIG_PAGING_LEVELS == 2 + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2); +#else +#error unexpected paging mode +#endif + } + else + { + /// + /// HVM guest + /// + ASSERT(shadow_mode_translate(d)); + ASSERT(shadow_mode_external(d)); + + v->arch.shadow.hvm_paging_enabled = !!hvm_paging_enabled(v); + if ( !v->arch.shadow.hvm_paging_enabled ) + { + + /* Set v->arch.guest_table to use the p2m map, and choose + * the appropriate shadow mode */ + old_guest_table = pagetable_get_mfn(v->arch.guest_table); +#if CONFIG_PAGING_LEVELS == 2 + v->arch.guest_table = + pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,2,2); +#elif CONFIG_PAGING_LEVELS == 3 + v->arch.guest_table = + pagetable_from_pfn(pagetable_get_pfn(d->arch.phys_table)); + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3); +#else /* CONFIG_PAGING_LEVELS == 4 */ + { + l4_pgentry_t *l4e; + /* Use the start of the first l3 table as a PAE l3 */ + ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0); + l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); + ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); + v->arch.guest_table = + pagetable_from_pfn(l4e_get_pfn(l4e[0])); + sh_unmap_domain_page(l4e); + } + v->arch.shadow.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode,3,3); +#endif + /* Fix up refcounts on guest_table */ + get_page(mfn_to_page(pagetable_get_mfn(v->arch.guest_table)), d); + if ( mfn_x(old_guest_table) != 0 ) + put_page(mfn_to_page(old_guest_table)); + } + else + { +#ifdef __x86_64__ + if ( hvm_long_mode_enabled(v) ) + { + // long mode guest... + v->arch.shadow.mode = + &SHADOW_INTERNAL_NAME(sh_paging_mode, 4, 4); + } + else +#endif + if ( hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PAE ) + { +#if CONFIG_PAGING_LEVELS >= 3 + // 32-bit PAE mode guest... + v->arch.shadow.mode = + &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 3); +#else + SHADOW_ERROR("PAE not supported in 32-bit Xen\n"); + domain_crash(d); + return; +#endif + } + else + { + // 32-bit 2 level guest... +#if CONFIG_PAGING_LEVELS >= 3 + v->arch.shadow.mode = + &SHADOW_INTERNAL_NAME(sh_paging_mode, 3, 2); +#else + v->arch.shadow.mode = + &SHADOW_INTERNAL_NAME(sh_paging_mode, 2, 2); +#endif + } + } + + if ( pagetable_get_pfn(v->arch.monitor_table) == 0 ) + { + mfn_t mmfn = shadow_make_monitor_table(v); + v->arch.monitor_table = pagetable_from_mfn(mmfn); + v->arch.monitor_vtable = sh_map_domain_page(mmfn); + } + + if ( v->arch.shadow.mode != old_mode ) + { + SHADOW_PRINTK("new paging mode: d=%u v=%u g=%u s=%u " + "(was g=%u s=%u)\n", + d->domain_id, v->vcpu_id, + v->arch.shadow.mode->guest_levels, + v->arch.shadow.mode->shadow_levels, + old_mode ? old_mode->guest_levels : 0, + old_mode ? 
old_mode->shadow_levels : 0); + if ( old_mode && + (v->arch.shadow.mode->shadow_levels != + old_mode->shadow_levels) ) + { + /* Need to make a new monitor table for the new mode */ + mfn_t new_mfn, old_mfn; + + if ( v != current ) + { + SHADOW_ERROR("Some third party (d=%u v=%u) is changing " + "this HVM vcpu's (d=%u v=%u) paging mode!\n", + current->domain->domain_id, current->vcpu_id, + v->domain->domain_id, v->vcpu_id); + domain_crash(v->domain); + return; + } + + sh_unmap_domain_page(v->arch.monitor_vtable); + old_mfn = pagetable_get_mfn(v->arch.monitor_table); + v->arch.monitor_table = pagetable_null(); + new_mfn = v->arch.shadow.mode->make_monitor_table(v); + v->arch.monitor_table = pagetable_from_mfn(new_mfn); + v->arch.monitor_vtable = sh_map_domain_page(new_mfn); + SHADOW_PRINTK("new monitor table %"SH_PRI_mfn "\n", + mfn_x(new_mfn)); + + /* Don't be running on the old monitor table when we + * pull it down! Switch CR3, and warn the HVM code that + * its host cr3 has changed. */ + make_cr3(v, mfn_x(new_mfn)); + write_ptbase(v); + hvm_update_host_cr3(v); + old_mode->destroy_monitor_table(v, old_mfn); + } + } + + // XXX -- Need to deal with changes in CR4.PSE and CR4.PGE. + // These are HARD: think about the case where two CPU's have + // different values for CR4.PSE and CR4.PGE at the same time. + // This *does* happen, at least for CR4.PGE... + } + + v->arch.shadow.mode->update_cr3(v); +} + +/**************************************************************************/ +/* Turning on and off shadow features */ + +static void sh_new_mode(struct domain *d, u32 new_mode) +/* Inform all the vcpus that the shadow mode has been changed */ +{ + struct vcpu *v; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(d != current->domain); + d->arch.shadow.mode = new_mode; + if ( new_mode & SHM2_translate ) + shadow_audit_p2m(d); + for_each_vcpu(d, v) + sh_update_paging_modes(v); +} + +static int shadow_enable(struct domain *d, u32 mode) +/* Turn on "permanent" shadow features: external, translate, refcount. + * Can only be called once on a domain, and these features cannot be + * disabled. + * Returns 0 for success, -errno for failure. */ +{ + unsigned int old_pages; + int rv = 0; + + mode |= SHM2_enable; + + domain_pause(d); + shadow_lock(d); + + /* Sanity check the arguments */ + if ( (d == current->domain) || + shadow_mode_enabled(d) || + ((mode & SHM2_external) && !(mode & SHM2_translate)) ) + { + rv = -EINVAL; + goto out; + } + + // XXX -- eventually would like to require that all memory be allocated + // *after* shadow_enabled() is called... So here, we would test to make + // sure that d->page_list is empty. 
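/*
 * Illustration only, not part of this changeset: shadow_enable(), continuing
 * below, sets up the shadow pool, the hash table and (for translate mode)
 * the p2m in order, and on failure unwinds only the steps already done.
 * The same shape in a self-contained sketch; the init_/teardown_ functions
 * are illustrative stand-ins, with the last step pretending to fail.
 */
#include <errno.h>

static int  init_a(void)     { return 0; }
static int  init_b(void)     { return 0; }
static int  init_c(void)     { return -ENOMEM; }
static void teardown_a(void) { }
static void teardown_b(void) { }

static int enable_sketch(void)
{
    int rv;

    if ( (rv = init_a()) != 0 )
        return rv;
    if ( (rv = init_b()) != 0 )
        goto undo_a;
    if ( (rv = init_c()) != 0 )
        goto undo_b;
    return 0;

 undo_b:
    teardown_b();
 undo_a:
    teardown_a();
    return rv;
}

int main(void)
{
    return enable_sketch() == 0 ? 0 : 1;
}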
+#if 0 + spin_lock(&d->page_alloc_lock); + if ( !list_empty(&d->page_list) ) + { + spin_unlock(&d->page_alloc_lock); + rv = -EINVAL; + goto out; + } + spin_unlock(&d->page_alloc_lock); +#endif + + /* Init the shadow memory allocation if the user hasn't done so */ + old_pages = d->arch.shadow.total_pages; + if ( old_pages == 0 ) + if ( set_sh_allocation(d, 256, NULL) != 0 ) /* Use at least 1MB */ + { + set_sh_allocation(d, 0, NULL); + rv = -ENOMEM; + goto out; + } + + /* Init the hash table */ + if ( shadow_hash_alloc(d) != 0 ) + { + set_sh_allocation(d, old_pages, NULL); + rv = -ENOMEM; + goto out; + } + + /* Init the P2M table */ + if ( mode & SHM2_translate ) + if ( !shadow_alloc_p2m_table(d) ) + { + shadow_hash_teardown(d); + set_sh_allocation(d, old_pages, NULL); + shadow_p2m_teardown(d); + rv = -ENOMEM; + goto out; + } + + /* Update the bits */ + sh_new_mode(d, mode); + shadow_audit_p2m(d); + out: + shadow_unlock(d); + domain_unpause(d); + return 0; +} + +void shadow_teardown(struct domain *d) +/* Destroy the shadow pagetables of this domain and free its shadow memory. + * Should only be called for dying domains. */ +{ + struct vcpu *v; + mfn_t mfn; + + ASSERT(test_bit(_DOMF_dying, &d->domain_flags)); + ASSERT(d != current->domain); + + if ( !shadow_lock_is_acquired(d) ) + shadow_lock(d); /* Keep various asserts happy */ + + if ( shadow_mode_enabled(d) ) + { + /* Release the shadow and monitor tables held by each vcpu */ + for_each_vcpu(d, v) + { + shadow_detach_old_tables(v); + if ( shadow_mode_external(d) ) + { + mfn = pagetable_get_mfn(v->arch.monitor_table); + if ( valid_mfn(mfn) && (mfn_x(mfn) != 0) ) + shadow_destroy_monitor_table(v, mfn); + v->arch.monitor_table = pagetable_null(); + } + } + } + + if ( d->arch.shadow.total_pages != 0 ) + { + SHADOW_PRINTK("teardown of domain %u starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + /* Destroy all the shadows and release memory to domheap */ + set_sh_allocation(d, 0, NULL); + /* Release the hash table back to xenheap */ + if (d->arch.shadow.hash_table) + shadow_hash_teardown(d); + /* Release the log-dirty bitmap of dirtied pages */ + sh_free_log_dirty_bitmap(d); + /* Should not have any more memory held */ + SHADOW_PRINTK("teardown done." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + ASSERT(d->arch.shadow.total_pages == 0); + } + + /* We leave the "permanent" shadow modes enabled, but clear the + * log-dirty mode bit. We don't want any more mark_dirty() + * calls now that we've torn down the bitmap */ + d->arch.shadow.mode &= ~SHM2_log_dirty; + + shadow_unlock(d); +} + +void shadow_final_teardown(struct domain *d) +/* Called by arch_domain_destroy(), when it's safe to pull down the p2m map. */ +{ + + SHADOW_PRINTK("dom %u final teardown starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + + /* Double-check that the domain didn't have any shadow memory. + * It is possible for a domain that never got domain_kill()ed + * to get here with its shadow allocation intact. */ + if ( d->arch.shadow.total_pages != 0 ) + shadow_teardown(d); + + /* It is now safe to pull down the p2m map. */ + if ( d->arch.shadow.p2m_pages != 0 ) + shadow_p2m_teardown(d); + + SHADOW_PRINTK("dom %u final teardown done." 
+ " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); +} + +static int shadow_one_bit_enable(struct domain *d, u32 mode) +/* Turn on a single shadow mode feature */ +{ + ASSERT(shadow_lock_is_acquired(d)); + + /* Sanity check the call */ + if ( d == current->domain || (d->arch.shadow.mode & mode) ) + { + return -EINVAL; + } + + if ( d->arch.shadow.mode == 0 ) + { + /* Init the shadow memory allocation and the hash table */ + if ( set_sh_allocation(d, 1, NULL) != 0 + || shadow_hash_alloc(d) != 0 ) + { + set_sh_allocation(d, 0, NULL); + return -ENOMEM; + } + } + + /* Update the bits */ + sh_new_mode(d, d->arch.shadow.mode | mode); + + return 0; +} + +static int shadow_one_bit_disable(struct domain *d, u32 mode) +/* Turn off a single shadow mode feature */ +{ + struct vcpu *v; + ASSERT(shadow_lock_is_acquired(d)); + + /* Sanity check the call */ + if ( d == current->domain || !(d->arch.shadow.mode & mode) ) + { + return -EINVAL; + } + + /* Update the bits */ + sh_new_mode(d, d->arch.shadow.mode & ~mode); + if ( d->arch.shadow.mode == 0 ) + { + /* Get this domain off shadows */ + SHADOW_PRINTK("un-shadowing of domain %u starts." + " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + for_each_vcpu(d, v) + { + shadow_detach_old_tables(v); +#if CONFIG_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + make_cr3(v, pagetable_get_pfn(v->arch.guest_table_user)); + else +#endif + make_cr3(v, pagetable_get_pfn(v->arch.guest_table)); + + } + + /* Pull down the memory allocation */ + if ( set_sh_allocation(d, 0, NULL) != 0 ) + { + // XXX - How can this occur? + // Seems like a bug to return an error now that we've + // disabled the relevant shadow mode. + // + return -ENOMEM; + } + shadow_hash_teardown(d); + SHADOW_PRINTK("un-shadowing of domain %u done." 
+ " Shadow pages total = %u, free = %u, p2m=%u\n", + d->domain_id, + d->arch.shadow.total_pages, + d->arch.shadow.free_pages, + d->arch.shadow.p2m_pages); + } + + return 0; +} + +/* Enable/disable ops for the "test" and "log-dirty" modes */ +int shadow_test_enable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow_lock(d); + + if ( shadow_mode_enabled(d) ) + { + SHADOW_ERROR("Don't support enabling test mode" + "on already shadowed doms\n"); + ret = -EINVAL; + goto out; + } + + ret = shadow_one_bit_enable(d, SHM2_enable); + out: + shadow_unlock(d); + domain_unpause(d); + + return ret; +} + +int shadow_test_disable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow_lock(d); + ret = shadow_one_bit_disable(d, SHM2_enable); + shadow_unlock(d); + domain_unpause(d); + + return ret; +} + +static int +sh_alloc_log_dirty_bitmap(struct domain *d) +{ + ASSERT(d->arch.shadow.dirty_bitmap == NULL); + d->arch.shadow.dirty_bitmap_size = + (d->shared_info->arch.max_pfn + (BITS_PER_LONG - 1)) & + ~(BITS_PER_LONG - 1); + d->arch.shadow.dirty_bitmap = + xmalloc_array(unsigned long, + d->arch.shadow.dirty_bitmap_size / BITS_PER_LONG); + if ( d->arch.shadow.dirty_bitmap == NULL ) + { + d->arch.shadow.dirty_bitmap_size = 0; + return -ENOMEM; + } + memset(d->arch.shadow.dirty_bitmap, 0, d->arch.shadow.dirty_bitmap_size/8); + + return 0; +} + +static void +sh_free_log_dirty_bitmap(struct domain *d) +{ + d->arch.shadow.dirty_bitmap_size = 0; + if ( d->arch.shadow.dirty_bitmap ) + { + xfree(d->arch.shadow.dirty_bitmap); + d->arch.shadow.dirty_bitmap = NULL; + } +} + +static int shadow_log_dirty_enable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow_lock(d); + + if ( shadow_mode_log_dirty(d) ) + { + ret = -EINVAL; + goto out; + } + + if ( shadow_mode_enabled(d) ) + { + SHADOW_ERROR("Don't (yet) support enabling log-dirty" + "on already shadowed doms\n"); + ret = -EINVAL; + goto out; + } + + ret = sh_alloc_log_dirty_bitmap(d); + if ( ret != 0 ) + { + sh_free_log_dirty_bitmap(d); + goto out; + } + + ret = shadow_one_bit_enable(d, SHM2_log_dirty); + if ( ret != 0 ) + sh_free_log_dirty_bitmap(d); + + out: + shadow_unlock(d); + domain_unpause(d); + return ret; +} + +static int shadow_log_dirty_disable(struct domain *d) +{ + int ret; + + domain_pause(d); + shadow_lock(d); + ret = shadow_one_bit_disable(d, SHM2_log_dirty); + if ( !shadow_mode_log_dirty(d) ) + sh_free_log_dirty_bitmap(d); + shadow_unlock(d); + domain_unpause(d); + + return ret; +} + +/**************************************************************************/ +/* P2M map manipulations */ + +static void +sh_p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) +{ + struct vcpu *v; + + if ( !shadow_mode_translate(d) ) + return; + + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + + + SHADOW_DEBUG(P2M, "removing gfn=%#lx mfn=%#lx\n", gfn, mfn); + + ASSERT(mfn_x(sh_gfn_to_mfn(d, gfn)) == mfn); + //ASSERT(sh_mfn_to_gfn(d, mfn) == gfn); + + shadow_remove_all_shadows_and_parents(v, _mfn(mfn)); + if ( shadow_remove_all_mappings(v, _mfn(mfn)) ) + flush_tlb_mask(d->domain_dirty_cpumask); + shadow_set_p2m_entry(d, gfn, _mfn(INVALID_MFN)); + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); +} + +void +shadow_guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn) +{ + shadow_lock(d); + shadow_audit_p2m(d); + sh_p2m_remove_page(d, gfn, mfn); + shadow_audit_p2m(d); + shadow_unlock(d); +} + +void +shadow_guest_physmap_add_page(struct domain *d, unsigned long gfn, + unsigned long mfn) +{ + 
struct vcpu *v; + unsigned long ogfn; + mfn_t omfn; + + if ( !shadow_mode_translate(d) ) + return; + + v = current; + if ( v->domain != d ) + v = d->vcpu[0]; + + shadow_lock(d); + shadow_audit_p2m(d); + + SHADOW_DEBUG(P2M, "adding gfn=%#lx mfn=%#lx\n", gfn, mfn); + + omfn = sh_gfn_to_mfn(d, gfn); + if ( valid_mfn(omfn) ) + { + /* Get rid of the old mapping, especially any shadows */ + shadow_remove_all_shadows_and_parents(v, omfn); + if ( shadow_remove_all_mappings(v, omfn) ) + flush_tlb_mask(d->domain_dirty_cpumask); + set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); + } + + ogfn = sh_mfn_to_gfn(d, _mfn(mfn)); + if ( +#ifdef __x86_64__ + (ogfn != 0x5555555555555555L) +#else + (ogfn != 0x55555555L) +#endif + && (ogfn != INVALID_M2P_ENTRY) + && (ogfn != gfn) ) + { + /* This machine frame is already mapped at another physical address */ + SHADOW_DEBUG(P2M, "aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n", + mfn, ogfn, gfn); + if ( valid_mfn(omfn = sh_gfn_to_mfn(d, ogfn)) ) + { + SHADOW_DEBUG(P2M, "old gfn=%#lx -> mfn %#lx\n", + ogfn , mfn_x(omfn)); + if ( mfn_x(omfn) == mfn ) + sh_p2m_remove_page(d, ogfn, mfn); + } + } + + shadow_set_p2m_entry(d, gfn, _mfn(mfn)); + set_gpfn_from_mfn(mfn, gfn); + shadow_audit_p2m(d); + shadow_unlock(d); +} + +/**************************************************************************/ +/* Log-dirty mode support */ + +/* Convert a shadow to log-dirty mode. */ +void shadow_convert_to_log_dirty(struct vcpu *v, mfn_t smfn) +{ + BUG(); +} + + +/* Read a domain's log-dirty bitmap and stats. + * If the operation is a CLEAN, clear the bitmap and stats as well. */ +static int shadow_log_dirty_op( + struct domain *d, struct xen_domctl_shadow_op *sc) +{ + int i, rv = 0, clean = 0; + + domain_pause(d); + shadow_lock(d); + + clean = (sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN); + + SHADOW_DEBUG(LOGDIRTY, "log-dirty %s: dom %u faults=%u dirty=%u\n", + (clean) ? "clean" : "peek", + d->domain_id, + d->arch.shadow.fault_count, + d->arch.shadow.dirty_count); + + sc->stats.fault_count = d->arch.shadow.fault_count; + sc->stats.dirty_count = d->arch.shadow.dirty_count; + + if ( clean ) + { + struct list_head *l, *t; + struct page_info *pg; + + /* Need to revoke write access to the domain's pages again. + * In future, we'll have a less heavy-handed approach to this, + * but for now, we just unshadow everything except Xen. */ + list_for_each_safe(l, t, &d->arch.shadow.toplevel_shadows) + { + pg = list_entry(l, struct page_info, list); + shadow_unhook_mappings(d->vcpu[0], page_to_mfn(pg)); + } + + d->arch.shadow.fault_count = 0; + d->arch.shadow.dirty_count = 0; + } + + if ( guest_handle_is_null(sc->dirty_bitmap) || + (d->arch.shadow.dirty_bitmap == NULL) ) + { + rv = -EINVAL; + goto out; + } + + if ( sc->pages > d->arch.shadow.dirty_bitmap_size ) + sc->pages = d->arch.shadow.dirty_bitmap_size; + +#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ + for ( i = 0; i < sc->pages; i += CHUNK ) + { + int bytes = ((((sc->pages - i) > CHUNK) + ? 
CHUNK + : (sc->pages - i)) + 7) / 8; + + if ( copy_to_guest_offset( + sc->dirty_bitmap, + i/(8*sizeof(unsigned long)), + d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))), + (bytes + sizeof(unsigned long) - 1) / sizeof(unsigned long)) ) + { + rv = -EINVAL; + goto out; + } + + if ( clean ) + memset(d->arch.shadow.dirty_bitmap + (i/(8*sizeof(unsigned long))), + 0, bytes); + } +#undef CHUNK + + out: + shadow_unlock(d); + domain_unpause(d); + return rv; +} + + +/* Mark a page as dirty */ +void sh_do_mark_dirty(struct domain *d, mfn_t gmfn) +{ + unsigned long pfn; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(shadow_mode_log_dirty(d)); + + if ( !valid_mfn(gmfn) ) + return; + + ASSERT(d->arch.shadow.dirty_bitmap != NULL); + + /* We /really/ mean PFN here, even for non-translated guests. */ + pfn = get_gpfn_from_mfn(mfn_x(gmfn)); + + /* + * Values with the MSB set denote MFNs that aren't really part of the + * domain's pseudo-physical memory map (e.g., the shared info frame). + * Nothing to do here... + */ + if ( unlikely(!VALID_M2P(pfn)) ) + return; + + /* N.B. Can use non-atomic TAS because protected by shadow_lock. */ + if ( likely(pfn < d->arch.shadow.dirty_bitmap_size) ) + { + if ( !__test_and_set_bit(pfn, d->arch.shadow.dirty_bitmap) ) + { + SHADOW_DEBUG(LOGDIRTY, + "marked mfn %" SH_PRI_mfn " (pfn=%lx), dom %d\n", + mfn_x(gmfn), pfn, d->domain_id); + d->arch.shadow.dirty_count++; + } + } + else + { + SHADOW_PRINTK("mark_dirty OOR! " + "mfn=%" SH_PRI_mfn " pfn=%lx max=%x (dom %d)\n" + "owner=%d c=%08x t=%" PRtype_info "\n", + mfn_x(gmfn), + pfn, + d->arch.shadow.dirty_bitmap_size, + d->domain_id, + (page_get_owner(mfn_to_page(gmfn)) + ? page_get_owner(mfn_to_page(gmfn))->domain_id + : -1), + mfn_to_page(gmfn)->count_info, + mfn_to_page(gmfn)->u.inuse.type_info); + } +} + + +/**************************************************************************/ +/* Shadow-control XEN_DOMCTL dispatcher */ + +int shadow_domctl(struct domain *d, + xen_domctl_shadow_op_t *sc, + XEN_GUEST_HANDLE(xen_domctl_t) u_domctl) +{ + int rc, preempted = 0; + + if ( unlikely(d == current->domain) ) + { + DPRINTK("Don't try to do a shadow op on yourself!\n"); + return -EINVAL; + } + + switch ( sc->op ) + { + case XEN_DOMCTL_SHADOW_OP_OFF: + if ( shadow_mode_log_dirty(d) ) + if ( (rc = shadow_log_dirty_disable(d)) != 0 ) + return rc; + if ( d->arch.shadow.mode & SHM2_enable ) + if ( (rc = shadow_test_disable(d)) != 0 ) + return rc; + return 0; + + case XEN_DOMCTL_SHADOW_OP_ENABLE_TEST: + return shadow_test_enable(d); + + case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY: + return shadow_log_dirty_enable(d); + + case XEN_DOMCTL_SHADOW_OP_ENABLE_TRANSLATE: + return shadow_enable(d, SHM2_refcounts|SHM2_translate); + + case XEN_DOMCTL_SHADOW_OP_CLEAN: + case XEN_DOMCTL_SHADOW_OP_PEEK: + return shadow_log_dirty_op(d, sc); + + case XEN_DOMCTL_SHADOW_OP_ENABLE: + if ( sc->mode & XEN_DOMCTL_SHADOW_ENABLE_LOG_DIRTY ) + return shadow_log_dirty_enable(d); + return shadow_enable(d, sc->mode << SHM2_shift); + + case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: + sc->mb = shadow_get_allocation(d); + return 0; + + case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: + rc = shadow_set_allocation(d, sc->mb, &preempted); + if ( preempted ) + /* Not finished. Set up to re-run the call. */ + rc = hypercall_create_continuation( + __HYPERVISOR_domctl, "h", u_domctl); + else + /* Finished.
Return the new allocation */ + sc->mb = shadow_get_allocation(d); + return rc; + + default: + SHADOW_ERROR("Bad shadow op %u\n", sc->op); + return -EINVAL; + } +} + + +/**************************************************************************/ +/* Auditing shadow tables */ + +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL + +void shadow_audit_tables(struct vcpu *v) +{ + /* Dispatch table for getting per-type functions */ + static hash_callback_t callbacks[16] = { + NULL, /* none */ +#if CONFIG_PAGING_LEVELS == 2 + SHADOW_INTERNAL_NAME(sh_audit_l1_table,2,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_audit_fl1_table,2,2), /* fl1_32 */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,2,2), /* l2_32 */ +#else + SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,2), /* l1_32 */ + SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,2), /* fl1_32 */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,2), /* l2_32 */ + SHADOW_INTERNAL_NAME(sh_audit_l1_table,3,3), /* l1_pae */ + SHADOW_INTERNAL_NAME(sh_audit_fl1_table,3,3), /* fl1_pae */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2_pae */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,3,3), /* l2h_pae */ + SHADOW_INTERNAL_NAME(sh_audit_l3_table,3,3), /* l3_pae */ +#if CONFIG_PAGING_LEVELS >= 4 + SHADOW_INTERNAL_NAME(sh_audit_l1_table,4,4), /* l1_64 */ + SHADOW_INTERNAL_NAME(sh_audit_fl1_table,4,4), /* fl1_64 */ + SHADOW_INTERNAL_NAME(sh_audit_l2_table,4,4), /* l2_64 */ + SHADOW_INTERNAL_NAME(sh_audit_l3_table,4,4), /* l3_64 */ + SHADOW_INTERNAL_NAME(sh_audit_l4_table,4,4), /* l4_64 */ +#endif /* CONFIG_PAGING_LEVELS >= 4 */ +#endif /* CONFIG_PAGING_LEVELS > 2 */ + NULL /* All the rest */ + }; + unsigned int mask; + + if ( !(SHADOW_AUDIT_ENABLE) ) + return; + + if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL ) + mask = ~1; /* Audit every table in the system */ + else + { + /* Audit only the current mode's tables */ + switch ( v->arch.shadow.mode->guest_levels ) + { + case 2: mask = (SHF_L1_32|SHF_FL1_32|SHF_L2_32); break; + case 3: mask = (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE + |SHF_L2H_PAE|SHF_L3_PAE); break; + case 4: mask = (SHF_L1_64|SHF_FL1_64|SHF_L2_64 + |SHF_L3_64|SHF_L4_64); break; + default: BUG(); + } + } + + hash_foreach(v, ~1, callbacks, _mfn(INVALID_MFN)); +} + +#endif /* Shadow audit */ + + +/**************************************************************************/ +/* Auditing p2m tables */ + +#if SHADOW_AUDIT & SHADOW_AUDIT_P2M + +void shadow_audit_p2m(struct domain *d) +{ + struct list_head *entry; + struct page_info *page; + struct domain *od; + unsigned long mfn, gfn, m2pfn, lp2mfn = 0; + mfn_t p2mfn; + unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0; + int test_linear; + + if ( !(SHADOW_AUDIT_ENABLE) || !shadow_mode_translate(d) ) + return; + + //SHADOW_PRINTK("p2m audit starts\n"); + + test_linear = ( (d == current->domain) && current->arch.monitor_vtable ); + if ( test_linear ) + local_flush_tlb(); + + /* Audit part one: walk the domain's page allocation list, checking + * the m2p entries. 
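 * As a compact, informal statement of the invariant being audited (writing
 * p2m() and m2p() for the gfn->mfn and mfn->gfn lookups used below): part
 * one checks that p2m(m2p(mfn)) == mfn for every page the domain owns, and
 * part two (below) checks that m2p(p2m(gfn)) == gfn for every present p2m
 * entry.  A mismatch of the first kind is simply repaired by clearing the
 * stale m2p entry; a mismatch of the second kind hits a BUG().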
*/ + for ( entry = d->page_list.next; + entry != &d->page_list; + entry = entry->next ) + { + page = list_entry(entry, struct page_info, list); + mfn = mfn_x(page_to_mfn(page)); + + // SHADOW_PRINTK("auditing guest page, mfn=%#lx\n", mfn); + + od = page_get_owner(page); + + if ( od != d ) + { + SHADOW_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n", + mfn, od, (od?od->domain_id:-1), d, d->domain_id); + continue; + } + + gfn = get_gpfn_from_mfn(mfn); + if ( gfn == INVALID_M2P_ENTRY ) + { + orphans_i++; + //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n", + // mfn); + continue; + } + + if ( gfn == 0x55555555 ) + { + orphans_d++; + //SHADOW_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", + // mfn); + continue; + } + + p2mfn = sh_gfn_to_mfn_foreign(d, gfn); + if ( mfn_x(p2mfn) != mfn ) + { + mpbad++; + SHADOW_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx" + " (-> gfn %#lx)\n", + mfn, gfn, mfn_x(p2mfn), + (mfn_valid(p2mfn) + ? get_gpfn_from_mfn(mfn_x(p2mfn)) + : -1u)); + /* This m2p entry is stale: the domain has another frame in + * this physical slot. No great disaster, but for neatness, + * blow away the m2p entry. */ + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); + } + + if ( test_linear ) + { + lp2mfn = get_mfn_from_gpfn(gfn); + if ( lp2mfn != mfn_x(p2mfn) ) + { + SHADOW_PRINTK("linear mismatch gfn %#lx -> mfn %#lx " + "(!= mfn %#lx)\n", gfn, lp2mfn, p2mfn); + } + } + + // SHADOW_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", + // mfn, gfn, p2mfn, lp2mfn); + } + + /* Audit part two: walk the domain's p2m table, checking the entries. */ + if ( pagetable_get_pfn(d->arch.phys_table) != 0 ) + { + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + int i1, i2; + +#if CONFIG_PAGING_LEVELS == 4 + l4_pgentry_t *l4e; + l3_pgentry_t *l3e; + int i3, i4; + l4e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); +#elif CONFIG_PAGING_LEVELS == 3 + l3_pgentry_t *l3e; + int i3; + l3e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); +#else /* CONFIG_PAGING_LEVELS == 2 */ + l2e = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); +#endif + + gfn = 0; +#if CONFIG_PAGING_LEVELS >= 3 +#if CONFIG_PAGING_LEVELS >= 4 + for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ ) + { + if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l3e = sh_map_domain_page(_mfn(l4e_get_pfn(l4e[i4]))); +#endif /* now at levels 3 or 4... */ + for ( i3 = 0; + i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); + i3++ ) + { + if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[i3]))); +#endif /* all levels... 
*/ + for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) + { + if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l1e = sh_map_domain_page(_mfn(l2e_get_pfn(l2e[i2]))); + + for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) + { + if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) ) + continue; + mfn = l1e_get_pfn(l1e[i1]); + ASSERT(valid_mfn(_mfn(mfn))); + m2pfn = get_gpfn_from_mfn(mfn); + if ( m2pfn != gfn ) + { + pmbad++; + SHADOW_PRINTK("mismatch: gfn %#lx -> mfn %#lx" + " -> gfn %#lx\n", gfn, mfn, m2pfn); + BUG(); + } + } + sh_unmap_domain_page(l1e); + } +#if CONFIG_PAGING_LEVELS >= 3 + sh_unmap_domain_page(l2e); + } +#if CONFIG_PAGING_LEVELS >= 4 + sh_unmap_domain_page(l3e); + } +#endif +#endif + +#if CONFIG_PAGING_LEVELS == 4 + sh_unmap_domain_page(l4e); +#elif CONFIG_PAGING_LEVELS == 3 + sh_unmap_domain_page(l3e); +#else /* CONFIG_PAGING_LEVELS == 2 */ + sh_unmap_domain_page(l2e); +#endif + + } + + //SHADOW_PRINTK("p2m audit complete\n"); + //if ( orphans_i | orphans_d | mpbad | pmbad ) + // SHADOW_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n", + // orphans_i + orphans_d, orphans_i, orphans_d, + if ( mpbad | pmbad ) + SHADOW_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n", + pmbad, mpbad); +} + +#endif /* p2m audit */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 896fcdd49c7f -r 684fdcfb251a xen/arch/x86/mm/shadow/multi.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/mm/shadow/multi.c Mon Aug 28 16:26:37 2006 -0600 @@ -0,0 +1,4492 @@ +/****************************************************************************** + * arch/x86/mm/shadow/multi.c + * + * Simple, mostly-synchronous shadow page tables. + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +// DESIGN QUESTIONS: +// Why use subshadows for PAE guests? +// - reduces pressure in the hash table +// - reduces shadow size (64-vs-4096 bytes of shadow for 32 bytes of guest L3) +// - would need to find space in the page_info to store 7 more bits of +// backpointer +// - independent shadows of 32 byte chunks makes it non-obvious how to quickly +// figure out when to demote the guest page from l3 status +// +// PAE Xen HVM guests are restricted to 8GB of pseudo-physical address space. +// - Want to map the P2M table into the 16MB RO_MPT hole in Xen's address +// space for both PV and HVM guests. 
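// (A quick sanity check on that 8GB figure, assuming 8-byte PAE p2m entries
//  and 4kB frames: 16MB / 8 bytes = 2^21 entries, and 2^21 * 4kB = 8GB of
//  pseudo-physical address space, i.e. the whole p2m for such a guest fits
//  in the RO_MPT hole.)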
+// + +#define SHADOW 1 + +#include <xen/config.h> +#include <xen/types.h> +#include <xen/mm.h> +#include <xen/trace.h> +#include <xen/sched.h> +#include <xen/perfc.h> +#include <xen/domain_page.h> +#include <asm/page.h> +#include <asm/current.h> +#include <asm/shadow.h> +#include <asm/flushtlb.h> +#include <asm/hvm/hvm.h> +#include "private.h" +#include "types.h" + +/* The first cut: an absolutely synchronous, trap-and-emulate version, + * supporting only HVM guests (and so only "external" shadow mode). + * + * THINGS TO DO LATER: + * + * FIX GVA_TO_GPA + * The current interface returns an unsigned long, which is not big enough + * to hold a physical address in PAE. Should return a gfn instead. + * + * TEARDOWN HEURISTICS + * Also: have a heuristic for when to destroy a previous paging-mode's + * shadows. When a guest is done with its start-of-day 32-bit tables + * and reuses the memory we want to drop those shadows. Start with + * shadows in a page in two modes as a hint, but beware of clever tricks + * like reusing a pagetable for both PAE and 64-bit during boot... + * + * PAE LINEAR MAPS + * Rework shadow_get_l*e() to have the option of using map_domain_page() + * instead of linear maps. Add appropriate unmap_l*e calls in the users. + * Then we can test the speed difference made by linear maps. If the + * map_domain_page() version is OK on PAE, we could maybe allow a lightweight + * l3-and-l2h-only shadow mode for PAE PV guests that would allow them + * to share l2h pages again. + * + * PAE L3 COPYING + * In this code, we copy all 32 bytes of a PAE L3 every time we change an + * entry in it, and every time we change CR3. We copy it for the linear + * mappings (ugh! PAE linear mappings) and we copy it to the low-memory + * buffer so it fits in CR3. Maybe we can avoid some of this recopying + * by using the shadow directly in some places. + * Also, for SMP, need to actually respond to seeing shadow.pae_flip_pending. + * + * GUEST_WALK_TABLES TLB FLUSH COALESCE + * guest_walk_tables can do up to three remote TLB flushes as it walks to + * the first l1 of a new pagetable. Should coalesce the flushes to the end, + * and if we do flush, re-do the walk. If anything has changed, then + * pause all the other vcpus and do the walk *again*. + * + * WP DISABLED + * Consider how to implement having the WP bit of CR0 set to 0. + * Since we need to be able to cause write faults to pagetables, this might + * end up looking like not having the (guest) pagetables present at all in + * HVM guests... + * + * PSE disabled / PSE36 + * We don't support any modes other than PSE enabled, PSE36 disabled. + * Neither of those would be hard to change, but we'd need to be able to + * deal with shadows made in one mode and used in another. 
+ */ + +#define FETCH_TYPE_PREFETCH 1 +#define FETCH_TYPE_DEMAND 2 +#define FETCH_TYPE_WRITE 4 +typedef enum { + ft_prefetch = FETCH_TYPE_PREFETCH, + ft_demand_read = FETCH_TYPE_DEMAND, + ft_demand_write = FETCH_TYPE_DEMAND | FETCH_TYPE_WRITE, +} fetch_type_t; + +#ifdef DEBUG_TRACE_DUMP +static char *fetch_type_names[] = { + [ft_prefetch] "prefetch", + [ft_demand_read] "demand read", + [ft_demand_write] "demand write", +}; +#endif + +/* XXX forward declarations */ +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +static unsigned long hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res); +#endif +static inline void sh_update_linear_entries(struct vcpu *v); + +/**************************************************************************/ +/* Hash table mapping from guest pagetables to shadows + * + * Normal case: maps the mfn of a guest page to the mfn of its shadow page. + * FL1's: maps the *gfn* of the start of a superpage to the mfn of a + * shadow L1 which maps its "splinters". + * PAE CR3s: maps the 32-byte aligned, 32-bit CR3 value to the mfn of the + * PAE L3 info page for that CR3 value. + */ + +static inline mfn_t +get_fl1_shadow_status(struct vcpu *v, gfn_t gfn) +/* Look for FL1 shadows in the hash table */ +{ + mfn_t smfn = shadow_hash_lookup(v, gfn_x(gfn), + PGC_SH_fl1_shadow >> PGC_SH_type_shift); + + if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) ) + { + struct page_info *page = mfn_to_page(smfn); + if ( !(page->count_info & PGC_SH_log_dirty) ) + shadow_convert_to_log_dirty(v, smfn); + } + + return smfn; +} + +static inline mfn_t +get_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type) +/* Look for shadows in the hash table */ +{ + mfn_t smfn = shadow_hash_lookup(v, mfn_x(gmfn), + shadow_type >> PGC_SH_type_shift); + perfc_incrc(shadow_get_shadow_status); + + if ( unlikely(shadow_mode_log_dirty(v->domain) && valid_mfn(smfn)) ) + { + struct page_info *page = mfn_to_page(smfn); + if ( !(page->count_info & PGC_SH_log_dirty) ) + shadow_convert_to_log_dirty(v, smfn); + } + + return smfn; +} + +static inline void +set_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) +/* Put an FL1 shadow into the hash table */ +{ + SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n", + gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn)); + + if ( unlikely(shadow_mode_log_dirty(v->domain)) ) + // mark this shadow as a log dirty shadow... + set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info); + else + clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info); + + shadow_hash_insert(v, gfn_x(gfn), + PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn); +} + +static inline void +set_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) +/* Put a shadow into the hash table */ +{ + struct domain *d = v->domain; + int res; + + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", + d->domain_id, v->vcpu_id, mfn_x(gmfn), + shadow_type, mfn_x(smfn)); + + if ( unlikely(shadow_mode_log_dirty(d)) ) + // mark this shadow as a log dirty shadow... 
+ set_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info); + else + clear_bit(_PGC_SH_log_dirty, &mfn_to_page(smfn)->count_info); + + res = get_page(mfn_to_page(gmfn), d); + ASSERT(res == 1); + + shadow_hash_insert(v, mfn_x(gmfn), shadow_type >> PGC_SH_type_shift, + smfn); +} + +static inline void +delete_fl1_shadow_status(struct vcpu *v, gfn_t gfn, mfn_t smfn) +/* Remove a shadow from the hash table */ +{ + SHADOW_PRINTK("gfn=%"SH_PRI_gfn", type=%08x, smfn=%05lx\n", + gfn_x(gfn), PGC_SH_fl1_shadow, mfn_x(smfn)); + + shadow_hash_delete(v, gfn_x(gfn), + PGC_SH_fl1_shadow >> PGC_SH_type_shift, smfn); +} + +static inline void +delete_shadow_status(struct vcpu *v, mfn_t gmfn, u32 shadow_type, mfn_t smfn) +/* Remove a shadow from the hash table */ +{ + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, type=%08x, smfn=%05lx\n", + v->domain->domain_id, v->vcpu_id, + mfn_x(gmfn), shadow_type, mfn_x(smfn)); + shadow_hash_delete(v, mfn_x(gmfn), + shadow_type >> PGC_SH_type_shift, smfn); + put_page(mfn_to_page(gmfn)); +} + +/**************************************************************************/ +/* CPU feature support querying */ + +static inline int +guest_supports_superpages(struct vcpu *v) +{ + /* The _PAGE_PSE bit must be honoured in HVM guests, whenever + * CR4.PSE is set or the guest is in PAE or long mode */ + return (hvm_guest(v) && (GUEST_PAGING_LEVELS != 2 + || (hvm_get_guest_ctrl_reg(v, 4) & X86_CR4_PSE))); +} + +static inline int +guest_supports_nx(struct vcpu *v) +{ + if ( !hvm_guest(v) ) + return cpu_has_nx; + + // XXX - fix this! + return 1; +} + + +/**************************************************************************/ +/* Functions for walking the guest page tables */ + + +/* Walk the guest pagetables, filling the walk_t with what we see. + * Takes an uninitialised walk_t. The caller must call unmap_walk() + * on the walk_t before discarding it or calling guest_walk_tables again. + * If "guest_op" is non-zero, we are serving a genuine guest memory access, + * and must (a) be under the shadow lock, and (b) remove write access + * from any guest PT pages we see, as we will be using their contents to + * perform shadow updates. + * Returns 0 for success or non-zero if the guest pagetables are malformed. + * N.B. Finding a not-present entry does not cause a non-zero return code. */ +static inline int +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, int guest_op) +{ + ASSERT(!guest_op || shadow_lock_is_acquired(v->domain)); + + perfc_incrc(shadow_guest_walk); + memset(gw, 0, sizeof(*gw)); + gw->va = va; + +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + /* Get l4e from the top level table */ + gw->l4mfn = pagetable_get_mfn(v->arch.guest_table); + gw->l4e = (guest_l4e_t *)v->arch.guest_vtable + guest_l4_table_offset(va); + /* Walk down to the l3e */ + if ( !(guest_l4e_get_flags(*gw->l4e) & _PAGE_PRESENT) ) return 0; + gw->l3mfn = vcpu_gfn_to_mfn(v, guest_l4e_get_gfn(*gw->l4e)); + if ( !valid_mfn(gw->l3mfn) ) return 1; + /* This mfn is a pagetable: make sure the guest can't write to it. */ + if ( guest_op && shadow_remove_write_access(v, gw->l3mfn, 3, va) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + gw->l3e = ((guest_l3e_t *)sh_map_domain_page(gw->l3mfn)) + + guest_l3_table_offset(va); +#else /* PAE only... */ + /* Get l3e from the top level table */ + gw->l3mfn = pagetable_get_mfn(v->arch.guest_table); + gw->l3e = (guest_l3e_t *)v->arch.guest_vtable + guest_l3_table_offset(va); +#endif /* PAE or 64...
*/ + /* Walk down to the l2e */ + if ( !(guest_l3e_get_flags(*gw->l3e) & _PAGE_PRESENT) ) return 0; + gw->l2mfn = vcpu_gfn_to_mfn(v, guest_l3e_get_gfn(*gw->l3e)); + if ( !valid_mfn(gw->l2mfn) ) return 1; + /* This mfn is a pagetable: make sure the guest can't write to it. */ + if ( guest_op && shadow_remove_write_access(v, gw->l2mfn, 2, va) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + gw->l2e = ((guest_l2e_t *)sh_map_domain_page(gw->l2mfn)) + + guest_l2_table_offset(va); +#else /* 32-bit only... */ + /* Get l2e from the top level table */ + gw->l2mfn = pagetable_get_mfn(v->arch.guest_table); + gw->l2e = (guest_l2e_t *)v->arch.guest_vtable + guest_l2_table_offset(va); +#endif /* All levels... */ + + if ( !(guest_l2e_get_flags(*gw->l2e) & _PAGE_PRESENT) ) return 0; + if ( guest_supports_superpages(v) && + (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) ) + { + /* Special case: this guest VA is in a PSE superpage, so there's + * no guest l1e. We make one up so that the propagation code + * can generate a shadow l1 table. Start with the gfn of the + * first 4k-page of the superpage. */ + gfn_t start = guest_l2e_get_gfn(*gw->l2e); + /* Grant full access in the l1e, since all the guest entry's + * access controls are enforced in the shadow l2e. This lets + * us reflect l2 changes later without touching the l1s. */ + int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| + _PAGE_ACCESSED|_PAGE_DIRTY); + /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7 + * of the level 1 */ + if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) ) + flags |= _PAGE_PAT; + /* Increment the pfn by the right number of 4k pages. + * The ~0x1 is to mask out the PAT bit mentioned above. */ + start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va)); + gw->eff_l1e = guest_l1e_from_gfn(start, flags); + gw->l1e = NULL; + gw->l1mfn = _mfn(INVALID_MFN); + } + else + { + /* Not a superpage: carry on and find the l1e. */ + gw->l1mfn = vcpu_gfn_to_mfn(v, guest_l2e_get_gfn(*gw->l2e)); + if ( !valid_mfn(gw->l1mfn) ) return 1; + /* This mfn is a pagetable: make sure the guest can't write to it. */ + if ( guest_op + && shadow_remove_write_access(v, gw->l1mfn, 1, va) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + gw->l1e = ((guest_l1e_t *)sh_map_domain_page(gw->l1mfn)) + + guest_l1_table_offset(va); + gw->eff_l1e = *gw->l1e; + } + + return 0; +} + +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding frame number. */ +static inline gfn_t +guest_walk_to_gfn(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) ) + return _gfn(INVALID_GFN); + return guest_l1e_get_gfn(gw->eff_l1e); +} + +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding physical address. */ +static inline paddr_t +guest_walk_to_gpa(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->eff_l1e) & _PAGE_PRESENT) ) + return 0; + return guest_l1e_get_paddr(gw->eff_l1e) + (gw->va & ~PAGE_MASK); +} + + +/* Unmap (and reinitialise) a guest walk. 
+ * Call this to dispose of any walk filled in by guest_walk_tables() */ +static void unmap_walk(struct vcpu *v, walk_t *gw) +{ +#if GUEST_PAGING_LEVELS >= 3 +#if GUEST_PAGING_LEVELS >= 4 + if ( gw->l3e != NULL ) sh_unmap_domain_page(gw->l3e); +#endif + if ( gw->l2e != NULL ) sh_unmap_domain_page(gw->l2e); +#endif + if ( gw->l1e != NULL ) sh_unmap_domain_page(gw->l1e); +#ifdef DEBUG + memset(gw, 0, sizeof(*gw)); +#endif +} + + +/* Pretty-print the contents of a guest-walk */ +static inline void print_gw(walk_t *gw) +{ + SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va); +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + SHADOW_PRINTK(" l4mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l4mfn)); + SHADOW_PRINTK(" l4e=%p\n", gw->l4e); + if ( gw->l4e ) + SHADOW_PRINTK(" *l4e=%" SH_PRI_gpte "\n", gw->l4e->l4); +#endif /* PAE or 64... */ + SHADOW_PRINTK(" l3mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l3mfn)); + SHADOW_PRINTK(" l3e=%p\n", gw->l3e); + if ( gw->l3e ) + SHADOW_PRINTK(" *l3e=%" SH_PRI_gpte "\n", gw->l3e->l3); +#endif /* All levels... */ + SHADOW_PRINTK(" l2mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l2mfn)); + SHADOW_PRINTK(" l2e=%p\n", gw->l2e); + if ( gw->l2e ) + SHADOW_PRINTK(" *l2e=%" SH_PRI_gpte "\n", gw->l2e->l2); + SHADOW_PRINTK(" l1mfn=%" SH_PRI_mfn "\n", mfn_x(gw->l1mfn)); + SHADOW_PRINTK(" l1e=%p\n", gw->l1e); + if ( gw->l1e ) + SHADOW_PRINTK(" *l1e=%" SH_PRI_gpte "\n", gw->l1e->l1); + SHADOW_PRINTK(" eff_l1e=%" SH_PRI_gpte "\n", gw->eff_l1e.l1); +} + + +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES +/* Lightweight audit: pass all the shadows associated with this guest walk + * through the audit mechanisms */ +static void sh_audit_gw(struct vcpu *v, walk_t *gw) +{ + mfn_t smfn; + + if ( !(SHADOW_AUDIT_ENABLE) ) + return; + +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + if ( valid_mfn(gw->l4mfn) + && valid_mfn((smfn = get_shadow_status(v, gw->l4mfn, + PGC_SH_l4_shadow))) ) + (void) sh_audit_l4_table(v, smfn, _mfn(INVALID_MFN)); +#endif /* PAE or 64... */ + if ( valid_mfn(gw->l3mfn) + && valid_mfn((smfn = get_shadow_status(v, gw->l3mfn, + PGC_SH_l3_shadow))) ) + (void) sh_audit_l3_table(v, smfn, _mfn(INVALID_MFN)); +#endif /* All levels... */ + if ( valid_mfn(gw->l2mfn) ) + { + if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, + PGC_SH_l2_shadow))) ) + (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); +#if GUEST_PAGING_LEVELS == 3 + if ( valid_mfn((smfn = get_shadow_status(v, gw->l2mfn, + PGC_SH_l2h_shadow))) ) + (void) sh_audit_l2_table(v, smfn, _mfn(INVALID_MFN)); +#endif + } + if ( valid_mfn(gw->l1mfn) + && valid_mfn((smfn = get_shadow_status(v, gw->l1mfn, + PGC_SH_l1_shadow))) ) + (void) sh_audit_l1_table(v, smfn, _mfn(INVALID_MFN)); + else if ( gw->l2e + && (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE) + && valid_mfn( + (smfn = get_fl1_shadow_status(v, guest_l2e_get_gfn(*gw->l2e)))) ) + (void) sh_audit_fl1_table(v, smfn, _mfn(INVALID_MFN)); +} + +#else +#define sh_audit_gw(_v, _gw) do {} while(0) +#endif /* audit code */ + + + +/**************************************************************************/ +/* Function to write to the guest tables, for propagating accessed and + * dirty bits from the shadow to the guest. + * Takes a guest mfn, a pointer to the guest entry, the level of pagetable, + * and an operation type. The guest entry is always passed as an l1e: + * since we only ever write flags, that's OK. + * Returns the new flag bits of the guest entry. 
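 * As an illustration (this is not the code itself, and it leaves out the
 * write-back and shadow-resync bookkeeping), the decision reduces to:
 *   if ( demand write && entry has a D bit (l1e, or 32bit/PAE PSE l2e) )
 *       set _PAGE_ACCESSED | _PAGE_DIRTY unless both are already set;
 *   else
 *       set _PAGE_ACCESSED unless it is already set;
 * with PAE guest l3es skipped entirely, since they carry no A/D bits.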
*/ + +static u32 guest_set_ad_bits(struct vcpu *v, + mfn_t gmfn, + guest_l1e_t *ep, + unsigned int level, + fetch_type_t ft) +{ + u32 flags, shflags, bit; + struct page_info *pg; + int res = 0; + + ASSERT(valid_mfn(gmfn) + && (sh_mfn_is_a_page_table(gmfn) + || ((mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) + == 0))); + ASSERT(ep && !(((unsigned long)ep) & ((sizeof *ep) - 1))); + ASSERT(level <= GUEST_PAGING_LEVELS); + ASSERT(ft == ft_demand_read || ft == ft_demand_write); + ASSERT(shadow_lock_is_acquired(v->domain)); + + flags = guest_l1e_get_flags(*ep); + + /* PAE l3s do not have A and D bits */ + if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) ) + return flags; + + /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */ + if ( ft == ft_demand_write + && (level == 1 || + (level == 2 && GUEST_PAGING_LEVELS < 4 + && (flags & _PAGE_PSE) && guest_supports_superpages(v))) ) + { + if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) + == (_PAGE_DIRTY | _PAGE_ACCESSED) ) + return flags; /* Guest already has A and D bits set */ + flags |= _PAGE_DIRTY | _PAGE_ACCESSED; + perfc_incrc(shadow_ad_update); + } + else + { + if ( flags & _PAGE_ACCESSED ) + return flags; /* Guest already has A bit set */ + flags |= _PAGE_ACCESSED; + perfc_incrc(shadow_a_update); + } + + /* Set the bit(s) */ + sh_mark_dirty(v->domain, gmfn); + SHADOW_DEBUG(A_AND_D, "gfn = %"SH_PRI_gfn", " + "old flags = %#x, new flags = %#x\n", + guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags); + *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags); + + /* May need to propagate this change forward to other kinds of shadow */ + pg = mfn_to_page(gmfn); + if ( !sh_mfn_is_a_page_table(gmfn) ) + { + /* This guest pagetable is not yet shadowed at all. */ + // MAF: I think this assert is busted... If this gmfn has not yet + // been promoted, then it seems perfectly reasonable for there to be + // outstanding type refs to it... + /* TJD: No. If the gmfn has not been promoted, we must at least + * have recognised that it is a pagetable, and pulled write access. + * The type count should only be non-zero if it is actually a page + * table. The test above was incorrect, though, so I've fixed it. */ + ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0); + return flags; + } + + shflags = pg->shadow_flags & SHF_page_type_mask; + while ( shflags ) + { + bit = find_first_set_bit(shflags); + ASSERT(shflags & (1u << bit)); + shflags &= ~(1u << bit); + if ( !(pg->shadow_flags & (1u << bit)) ) + continue; + switch ( bit ) + { + case PGC_SH_type_to_index(PGC_SH_l1_shadow): + if (level != 1) + res |= sh_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep)); + break; + case PGC_SH_type_to_index(PGC_SH_l2_shadow): + if (level != 2) + res |= sh_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep)); + break; +#if GUEST_PAGING_LEVELS == 3 /* PAE only */ + case PGC_SH_type_to_index(PGC_SH_l2h_shadow): + if (level != 2) + res |= sh_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep)); + break; +#endif +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ + case PGC_SH_type_to_index(PGC_SH_l3_shadow): + if (level != 3) + res |= sh_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep)); + break; +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... 
*/ + case PGC_SH_type_to_index(PGC_SH_l4_shadow): + if (level != 4) + res |= sh_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep)); + break; +#endif +#endif + default: + SHADOW_ERROR("mfn %"SH_PRI_mfn" is shadowed in multiple " + "modes: A&D bits may be out of sync (flags=%#x).\n", + mfn_x(gmfn), pg->shadow_flags); + /* XXX Shadows in other modes will not be updated, so will + * have their A and D bits out of sync. */ + } + } + + /* We should never need to flush the TLB or recopy PAE entries */ + ASSERT( res == 0 || res == SHADOW_SET_CHANGED ); + return flags; +} + +/**************************************************************************/ +/* Functions to compute the correct index into a shadow page, given an + * index into the guest page (as returned by guest_get_index()). + * This is trivial when the shadow and guest use the same sized PTEs, but + * gets more interesting when those sizes are mismatched (e.g. 32-bit guest, + * PAE- or 64-bit shadows). + * + * These functions also increment the shadow mfn, when necessary. When PTE + * sizes are mismatched, it takes 2 shadow L1 pages for a single guest L1 + * page. In this case, we allocate 2 contiguous pages for the shadow L1, and + * use simple pointer arithmetic on a pointer to the guest L1e to figure out + * which shadow page we really want. Similarly, when PTE sizes are + * mismatched, we shadow a guest L2 page with 4 shadow L2 pages. (The easiest + * way to see this is: a 32-bit guest L2 page maps 4GB of virtual address + * space, while a PAE- or 64-bit shadow L2 page maps 1GB of virtual address + * space.) + * + * For PAE guests, for every 32-bytes of guest L3 page table, we use 64-bytes + * of shadow (to store both the shadow, and the info that would normally be + * stored in page_info fields). This arrangement allows the shadow and the + * "page_info" fields to always be stored in the same page (in fact, in + * the same cache line), avoiding an extra call to map_domain_page(). + */ + +static inline u32 +guest_index(void *ptr) +{ + return (u32)((unsigned long)ptr & ~PAGE_MASK) / sizeof(guest_l1e_t); +} + +static inline u32 +shadow_l1_index(mfn_t *smfn, u32 guest_index) +{ +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) + *smfn = _mfn(mfn_x(*smfn) + + (guest_index / SHADOW_L1_PAGETABLE_ENTRIES)); + return (guest_index % SHADOW_L1_PAGETABLE_ENTRIES); +#else + return guest_index; +#endif +} + +static inline u32 +shadow_l2_index(mfn_t *smfn, u32 guest_index) +{ +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) + // Because we use 2 shadow l2 entries for each guest entry, the number of + // guest entries per shadow page is SHADOW_L2_PAGETABLE_ENTRIES/2 + // + *smfn = _mfn(mfn_x(*smfn) + + (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2))); + + // We multiply by two to get the index of the first of the two entries + // used to shadow the specified guest entry. + return (guest_index % (SHADOW_L2_PAGETABLE_ENTRIES / 2)) * 2; +#else + return guest_index; +#endif +} + +#if GUEST_PAGING_LEVELS >= 3 + +static inline u32 +shadow_l3_index(mfn_t *smfn, u32 guest_index) +{ +#if GUEST_PAGING_LEVELS == 3 + u32 group_id; + + // Because we use twice the space in L3 shadows as was consumed in guest + // L3s, the number of guest entries per shadow page is + // SHADOW_L2_PAGETABLE_ENTRIES/2. (Note this is *not* + // SHADOW_L3_PAGETABLE_ENTRIES, which in this case is 4...)
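// (Worked examples of this index arithmetic, for a 32-bit guest on PAE or
//  64-bit shadows: guest l1 index 700 selects the second of the two shadow
//  l1 pages at slot 700 - 512 = 188; guest l2 index 700 selects the third
//  of the four shadow l2 pages (700 / 256 == 2) at slot (700 % 256) * 2
//  == 376.  For the PAE l3 case handled here, guest index 13 falls in
//  group 13 / 4 == 3 at offset 1, giving effective shadow index 25.)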
+ // + *smfn = _mfn(mfn_x(*smfn) + + (guest_index / (SHADOW_L2_PAGETABLE_ENTRIES / 2))); + + // We store PAE L3 shadows in groups of 4, alternating shadows and + // pae_l3_bookkeeping structs. So the effective shadow index is + // the group_id * 8 + the offset within the group. + // + guest_index %= (SHADOW_L2_PAGETABLE_ENTRIES / 2); + group_id = guest_index / 4; + return (group_id * 8) + (guest_index % 4); +#else + return guest_index; +#endif +} + +#endif // GUEST_PAGING_LEVELS >= 3 + +#if GUEST_PAGING_LEVELS >= 4 + +static inline u32 +shadow_l4_index(mfn_t *smfn, u32 guest_index) +{ + return guest_index; +} + +#endif // GUEST_PAGING_LEVELS >= 4 + + +/**************************************************************************/ +/* Functions which compute shadow entries from their corresponding guest + * entries. + * + * These are the "heart" of the shadow code. + * + * There are two sets of these: those that are called on demand faults (read + * faults and write faults), and those that are essentially called to + * "prefetch" (or propagate) entries from the guest into the shadow. The read + * fault and write fault are handled as two separate cases for L1 entries (due + * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together + * into the respective demand_fault functions. + */ + +#define CHECK(_cond) \ +do { \ + if (unlikely(!(_cond))) \ + { \ + printk("%s %s %d ASSERTION (%s) FAILED\n", \ + __func__, __FILE__, __LINE__, #_cond); \ + return -1; \ + } \ +} while (0); + +// The function below tries to capture all of the flag manipulation for the +// demand and propagate functions into one place. +// +static always_inline u32 +sh_propagate_flags(struct vcpu *v, mfn_t target_mfn, + u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, + int mmio, int level, fetch_type_t ft) +{ + struct domain *d = v->domain; + u32 pass_thru_flags; + u32 sflags; + + // XXX -- might want to think about PAT support for HVM guests... + +#ifndef NDEBUG + // MMIO can only occur from L1e's + // + if ( mmio ) + CHECK(level == 1); + + // We should always have a pointer to the guest entry if it's a non-PSE + // non-MMIO demand access. + if ( ft & FETCH_TYPE_DEMAND ) + CHECK(guest_entry_ptr || level == 1); +#endif + + // A not-present guest entry has a special signature in the shadow table, + // so that we do not have to consult the guest tables multiple times... + // + if ( unlikely(!(gflags & _PAGE_PRESENT)) ) + return _PAGE_SHADOW_GUEST_NOT_PRESENT; + + // Must have a valid target_mfn, unless this is mmio, or unless this is a + // prefetch. In the case of a prefetch, an invalid mfn means that we can + // not usefully shadow anything, and so we return early. + // + if ( !valid_mfn(target_mfn) ) + { + CHECK((ft == ft_prefetch) || mmio); + if ( !mmio ) + return 0; + } + + // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's... + // + if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) ) + pass_thru_flags = _PAGE_PRESENT; + else + { + pass_thru_flags = (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER | + _PAGE_RW | _PAGE_PRESENT); + if ( guest_supports_nx(v) ) + pass_thru_flags |= _PAGE_NX_BIT; + } + + // PAE guests can not put NX, RW, USER, ACCESSED, or DIRTY bits into their + // L3e's; they are all implied. So we emulate them here. + // + if ( (GUEST_PAGING_LEVELS == 3) && (level == 3) ) + gflags = pass_thru_flags; + + // Propagate bits from the guest to the shadow. + // Some of these may be overwritten, below.
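// (Worked example of the net effect for an ordinary writable guest l1e,
//  ignoring log-dirty mode and pagetable protection: a demand read fault
//  sets A in the guest entry but installs a read-only shadow l1e, because
//  D is still clear and the D-bit rule below strips _PAGE_RW; the guest's
//  first write to the page then faults again, sets D, and only that fault
//  installs a writable shadow entry.)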
+ // Since we know the guest's PRESENT bit is set, we also set the shadow's + // SHADOW_PRESENT bit. + // + sflags = (gflags & pass_thru_flags) | _PAGE_SHADOW_PRESENT; + + // Copy the guest's RW bit into the SHADOW_RW bit. + // + if ( gflags & _PAGE_RW ) + sflags |= _PAGE_SHADOW_RW; + + // Set the A&D bits for higher level shadows. + // Higher level entries do not, strictly speaking, have dirty bits, but + // since we use shadow linear tables, each of these entries may, at some + // point in time, also serve as a shadow L1 entry. + // By setting both the A&D bits in each of these, we eliminate the burden + // on the hardware to update these bits on initial accesses. + // + if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) ) + sflags |= _PAGE_ACCESSED | _PAGE_DIRTY; + + + // Set the A and D bits in the guest entry, if we need to. + if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) ) + gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft); + + // If the A or D bit has not yet been set in the guest, then we must + // prevent the corresponding kind of access. + // + if ( unlikely(!((GUEST_PAGING_LEVELS == 3) && (level == 3)) && + !(gflags & _PAGE_ACCESSED)) ) + sflags &= ~_PAGE_PRESENT; + + /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */ + if ( unlikely( ((level == 1) + || ((level == 2) && (GUEST_PAGING_LEVELS < 4) + && guest_supports_superpages(v) && + (gflags & _PAGE_PSE))) + && !(gflags & _PAGE_DIRTY)) ) + sflags &= ~_PAGE_RW; + + // MMIO caching + // + // MMIO mappings are marked as not present, but we set the SHADOW_MMIO bit + // to cache the fact that this entry is in MMIO space. + // + if ( (level == 1) && mmio ) + { + sflags &= ~(_PAGE_PRESENT); + sflags |= _PAGE_SHADOW_MMIO; + } + else + { + // shadow_mode_log_dirty support + // + // Only allow the guest write access to a page a) on a demand fault, + // or b) if the page is already marked as dirty. + // + if ( unlikely((level == 1) && + !(ft & FETCH_TYPE_WRITE) && + shadow_mode_log_dirty(d) && + !sh_mfn_is_dirty(d, target_mfn)) ) + { + sflags &= ~_PAGE_RW; + } + + // protect guest page tables + // + if ( unlikely((level == 1) && + sh_mfn_is_a_page_table(target_mfn)) ) + { + if ( shadow_mode_trap_reads(d) ) + { + // if we are trapping both reads & writes, then mark this page + // as not present... + // + sflags &= ~_PAGE_PRESENT; + } + else + { + // otherwise, just prevent any writes... 
+ // + sflags &= ~_PAGE_RW; + } + } + } + + return sflags; +} + +#undef CHECK + +#if GUEST_PAGING_LEVELS >= 4 +static void +l4e_propagate_from_guest(struct vcpu *v, + guest_l4e_t *gl4e, + mfn_t gl4mfn, + mfn_t sl3mfn, + shadow_l4e_t *sl4p, + fetch_type_t ft) +{ + u32 gflags = guest_l4e_get_flags(*gl4e); + u32 sflags = sh_propagate_flags(v, sl3mfn, gflags, (guest_l1e_t *) gl4e, + gl4mfn, 0, 4, ft); + + *sl4p = shadow_l4e_from_mfn(sl3mfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "%s gl4e=%" SH_PRI_gpte " sl4e=%" SH_PRI_pte "\n", + fetch_type_names[ft], gl4e->l4, sl4p->l4); + ASSERT(sflags != -1); +} +#endif // GUEST_PAGING_LEVELS >= 4 + +#if GUEST_PAGING_LEVELS >= 3 +static void +l3e_propagate_from_guest(struct vcpu *v, + guest_l3e_t *gl3e, + mfn_t gl3mfn, + mfn_t sl2mfn, + shadow_l3e_t *sl3p, + fetch_type_t ft) +{ + u32 gflags = guest_l3e_get_flags(*gl3e); + u32 sflags = sh_propagate_flags(v, sl2mfn, gflags, (guest_l1e_t *) gl3e, + gl3mfn, 0, 3, ft); + + *sl3p = shadow_l3e_from_mfn(sl2mfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "%s gl3e=%" SH_PRI_gpte " sl3e=%" SH_PRI_pte "\n", + fetch_type_names[ft], gl3e->l3, sl3p->l3); + ASSERT(sflags != -1); +} +#endif // GUEST_PAGING_LEVELS >= 3 + +static void +l2e_propagate_from_guest(struct vcpu *v, + guest_l2e_t *gl2e, + mfn_t gl2mfn, + mfn_t sl1mfn, + shadow_l2e_t *sl2p, + fetch_type_t ft) +{ + u32 gflags = guest_l2e_get_flags(*gl2e); + u32 sflags = sh_propagate_flags(v, sl1mfn, gflags, (guest_l1e_t *) gl2e, + gl2mfn, 0, 2, ft); + + *sl2p = shadow_l2e_from_mfn(sl1mfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "%s gl2e=%" SH_PRI_gpte " sl2e=%" SH_PRI_pte "\n", + fetch_type_names[ft], gl2e->l2, sl2p->l2); + ASSERT(sflags != -1); +} + +static inline int +l1e_read_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p, + int mmio) +/* returns 1 if emulation is required, and 0 otherwise */ +{ + struct domain *d = v->domain; + u32 gflags = guest_l1e_get_flags(gw->eff_l1e); + u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn, + mmio, 1, ft_demand_read); + + if ( shadow_mode_trap_reads(d) && !mmio && sh_mfn_is_a_page_table(gmfn) ) + { + // emulation required! + *sl1p = shadow_l1e_empty(); + return 1; + } + + *sl1p = shadow_l1e_from_mfn(gmfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n", + (void *)gw->va, gw->eff_l1e.l1, sl1p->l1); + + ASSERT(sflags != -1); + return 0; +} + +static inline int +l1e_write_fault(struct vcpu *v, walk_t *gw, mfn_t gmfn, shadow_l1e_t *sl1p, + int mmio) +/* returns 1 if emulation is required, and 0 otherwise */ +{ + struct domain *d = v->domain; + u32 gflags = guest_l1e_get_flags(gw->eff_l1e); + u32 sflags = sh_propagate_flags(v, gmfn, gflags, gw->l1e, gw->l1mfn, + mmio, 1, ft_demand_write); + + sh_mark_dirty(d, gmfn); + + if ( !mmio && sh_mfn_is_a_page_table(gmfn) ) + { + // emulation required! + *sl1p = shadow_l1e_empty(); + return 1; + } + + *sl1p = shadow_l1e_from_mfn(gmfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "va=%p eff_gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n", + (void *)gw->va, gw->eff_l1e.l1, sl1p->l1); + + ASSERT(sflags != -1); + return 0; +} + +static inline void +l1e_propagate_from_guest(struct vcpu *v, guest_l1e_t gl1e, shadow_l1e_t *sl1p, + int mmio) +{ + gfn_t gfn = guest_l1e_get_gfn(gl1e); + mfn_t gmfn = (mmio) ? 
_mfn(gfn_x(gfn)) : vcpu_gfn_to_mfn(v, gfn); + u32 gflags = guest_l1e_get_flags(gl1e); + u32 sflags = sh_propagate_flags(v, gmfn, gflags, 0, _mfn(INVALID_MFN), + mmio, 1, ft_prefetch); + + *sl1p = shadow_l1e_from_mfn(gmfn, sflags); + + SHADOW_DEBUG(PROPAGATE, + "gl1e=%" SH_PRI_gpte " sl1e=%" SH_PRI_pte "\n", + gl1e.l1, sl1p->l1); + + ASSERT(sflags != -1); +} + + +/**************************************************************************/ +/* These functions update shadow entries (and do bookkeeping on the shadow + * tables they are in). It is intended that they are the only + * functions which ever write (non-zero) data onto a shadow page. + * + * They return a set of flags: + * SHADOW_SET_CHANGED -- we actually wrote a new value to the shadow. + * SHADOW_SET_FLUSH -- the caller must cause a TLB flush. + * SHADOW_SET_ERROR -- the input is not a valid entry (for example, if + * shadow_get_page_from_l1e() fails). + * SHADOW_SET_L3PAE_RECOPY -- one or more vcpu's need to have their local + * copies of their PAE L3 entries re-copied. + */ + +static inline void safe_write_entry(void *dst, void *src) +/* Copy one PTE safely when processors might be running on the + * destination pagetable. This does *not* give safety against + * concurrent writes (that's what the shadow lock is for), just + * stops the hardware picking up partially written entries. */ +{ + volatile unsigned long *d = dst; + unsigned long *s = src; + ASSERT(!((unsigned long) d & (sizeof (shadow_l1e_t) - 1))); +#if CONFIG_PAGING_LEVELS == 3 + /* In PAE mode, pagetable entries are larger + * than machine words, so won't get written atomically. We need to make + * sure any other cpu running on these shadows doesn't see a + * half-written entry. Do this by marking the entry not-present first, + * then writing the high word before the low word. */ + BUILD_BUG_ON(sizeof (shadow_l1e_t) != 2 * sizeof (unsigned long)); + d[0] = 0; + d[1] = s[1]; + d[0] = s[0]; +#else + /* In 32-bit and 64-bit, sizeof(pte) == sizeof(ulong) == 1 word, + * which will be an atomic write, since the entry is aligned. */ + BUILD_BUG_ON(sizeof (shadow_l1e_t) != sizeof (unsigned long)); + *d = *s; +#endif +} + + +static inline void +shadow_write_entries(void *d, void *s, int entries, mfn_t mfn) +/* This function does the actual writes to shadow pages. + * It must not be called directly, since it doesn't do the bookkeeping + * that shadow_set_l*e() functions do. */ +{ + shadow_l1e_t *dst = d; + shadow_l1e_t *src = s; + void *map = NULL; + int i; + + /* Because we mirror access rights at all levels in the shadow, an + * l2 (or higher) entry with the RW bit cleared will leave us with + * no write access through the linear map. + * We detect that by writing to the shadow with copy_to_user() and + * using map_domain_page() to get a writeable mapping if we need to. */ + if ( __copy_to_user(d, d, sizeof (unsigned long)) != 0 ) + { + perfc_incrc(shadow_linear_map_failed); + map = sh_map_domain_page(mfn); + ASSERT(map != NULL); + dst = map + ((unsigned long)dst & (PAGE_SIZE - 1)); + } + + + for ( i = 0; i < entries; i++ ) + safe_write_entry(dst++, src++); + + if ( map != NULL ) sh_unmap_domain_page(map); + + /* XXX TODO: + * Update min/max field in page_info struct of this mfn */ +} + +static inline int +perms_strictly_increased(u32 old_flags, u32 new_flags) +/* Given the flags of two entries, are the new flags a strict + * increase in rights over the old ones? 
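 * For example, going from (PRESENT|USER) to (PRESENT|USER|RW) gives
 * of^nf == RW and (of | (of^nf)) == nf, so the change counts as a strict
 * increase and the caller need not force a TLB flush; going the other way
 * leaves RW set in of but not in nf, so the test fails and the caller
 * flushes.  NX is inverted first because it is the one bit whose clearing
 * grants a right.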
*/ +{ + u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX); + u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX); + /* Flip the NX bit, since it's the only one that decreases rights; + * we calculate as if it were an "X" bit. */ + of ^= _PAGE_NX_BIT; + nf ^= _PAGE_NX_BIT; + /* If the changed bits are all set in the new flags, then rights strictly + * increased between old and new. */ + return ((of | (of ^ nf)) == nf); +} + +static int inline +shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) +{ + int res; + mfn_t mfn; + struct domain *owner; + shadow_l1e_t sanitized_sl1e = + shadow_l1e_remove_flags(sl1e, _PAGE_SHADOW_RW | _PAGE_SHADOW_PRESENT); + + //ASSERT(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT); + //ASSERT((shadow_l1e_get_flags(sl1e) & L1_DISALLOW_MASK) == 0); + + if ( !shadow_mode_refcounts(d) ) + return 1; + + res = get_page_from_l1e(sanitized_sl1e, d); + + // If a privileged domain is attempting to install a map of a page it does + // not own, we let it succeed anyway. + // + if ( unlikely(!res) && + IS_PRIV(d) && + !shadow_mode_translate(d) && + valid_mfn(mfn = shadow_l1e_get_mfn(sl1e)) && + (owner = page_get_owner(mfn_to_page(mfn))) && + (d != owner) ) + { + res = get_page_from_l1e(sanitized_sl1e, owner); + SHADOW_PRINTK("privileged domain %d installs map of mfn %05lx " + "which is owned by domain %d: %s\n", + d->domain_id, mfn_x(mfn), owner->domain_id, + res ? "success" : "failed"); + } + + if ( unlikely(!res) ) + { + perfc_incrc(shadow_get_page_fail); + SHADOW_PRINTK("failed: l1e=" SH_PRI_pte "\n"); + } + + return res; +} + +static void inline +shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) +{ + if ( !shadow_mode_refcounts(d) ) + return; + + put_page_from_l1e(sl1e, d); +} + +#if GUEST_PAGING_LEVELS >= 4 +static int shadow_set_l4e(struct vcpu *v, + shadow_l4e_t *sl4e, + shadow_l4e_t new_sl4e, + mfn_t sl4mfn) +{ + int flags = 0; + shadow_l4e_t old_sl4e; + paddr_t paddr; + ASSERT(sl4e != NULL); + old_sl4e = *sl4e; + + if ( old_sl4e.l4 == new_sl4e.l4 ) return 0; /* Nothing to do */ + + paddr = ((((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) + | (((unsigned long)sl4e) & ~PAGE_MASK)); + + if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) + { + /* About to install a new reference */ + sh_get_ref(shadow_l4e_get_mfn(new_sl4e), paddr); + } + + /* Write the new entry */ + shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn); + flags |= SHADOW_SET_CHANGED; + + if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT ) + { + /* We lost a reference to an old mfn. 
*/ + mfn_t osl3mfn = shadow_l4e_get_mfn(old_sl4e); + if ( (mfn_x(osl3mfn) != mfn_x(shadow_l4e_get_mfn(new_sl4e))) + || !perms_strictly_increased(shadow_l4e_get_flags(old_sl4e), + shadow_l4e_get_flags(new_sl4e)) ) + { + flags |= SHADOW_SET_FLUSH; + } + sh_put_ref(v, osl3mfn, paddr); + } + return flags; +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + +#if GUEST_PAGING_LEVELS >= 3 +static int shadow_set_l3e(struct vcpu *v, + shadow_l3e_t *sl3e, + shadow_l3e_t new_sl3e, + mfn_t sl3mfn) +{ + int flags = 0; + shadow_l3e_t old_sl3e; + paddr_t paddr; + ASSERT(sl3e != NULL); + old_sl3e = *sl3e; + + if ( old_sl3e.l3 == new_sl3e.l3 ) return 0; /* Nothing to do */ + + paddr = ((((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) + | (((unsigned long)sl3e) & ~PAGE_MASK)); + + if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT ) + { + /* About to install a new reference */ + sh_get_ref(shadow_l3e_get_mfn(new_sl3e), paddr); + } + + /* Write the new entry */ + shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn); + flags |= SHADOW_SET_CHANGED; + +#if GUEST_PAGING_LEVELS == 3 + /* We wrote a guest l3e in a PAE pagetable. This table is copied in + * the linear pagetable entries of its l2s, and may also be copied + * to a low memory location to make it fit in CR3. Report that we + * need to resync those copies (we can't wait for the guest to flush + * the TLB because it might be an increase in rights). */ + { + struct vcpu *vcpu; + + struct pae_l3_bookkeeping *info = sl3p_to_info(sl3e); + for_each_vcpu(v->domain, vcpu) + { + if (info->vcpus & (1 << vcpu->vcpu_id)) + { + // Remember that this flip/update needs to occur. + vcpu->arch.shadow.pae_flip_pending = 1; + flags |= SHADOW_SET_L3PAE_RECOPY; + } + } + } +#endif + + if ( shadow_l3e_get_flags(old_sl3e) & _PAGE_PRESENT ) + { + /* We lost a reference to an old mfn. */ + mfn_t osl2mfn = shadow_l3e_get_mfn(old_sl3e); + if ( (mfn_x(osl2mfn) != mfn_x(shadow_l3e_get_mfn(new_sl3e))) || + !perms_strictly_increased(shadow_l3e_get_flags(old_sl3e), + shadow_l3e_get_flags(new_sl3e)) ) + { + flags |= SHADOW_SET_FLUSH; + } + sh_put_ref(v, osl2mfn, paddr); + } + return flags; +} +#endif /* GUEST_PAGING_LEVELS >= 3 */ + +static int shadow_set_l2e(struct vcpu *v, + shadow_l2e_t *sl2e, + shadow_l2e_t new_sl2e, + mfn_t sl2mfn) +{ + int flags = 0; + shadow_l2e_t old_sl2e; + paddr_t paddr; + +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 + /* In 2-on-3 we work with pairs of l2es pointing at two-page + * shadows. Reference counting and up-pointers track from the first + * page of the shadow to the first l2e, so make sure that we're + * working with those: + * Align the pointer down so it's pointing at the first of the pair */ + sl2e = (shadow_l2e_t *)((unsigned long)sl2e & ~(sizeof(shadow_l2e_t))); + /* Align the mfn of the shadow entry too */ + new_sl2e.l2 &= ~(1<<PAGE_SHIFT); +#endif + + ASSERT(sl2e != NULL); + old_sl2e = *sl2e; + + if ( old_sl2e.l2 == new_sl2e.l2 ) return 0; /* Nothing to do */ + + paddr = ((((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) + | (((unsigned long)sl2e) & ~PAGE_MASK)); + + if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) + { + /* About to install a new reference */ + sh_get_ref(shadow_l2e_get_mfn(new_sl2e), paddr); + } + + /* Write the new entry */ +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 + { + shadow_l2e_t pair[2] = { new_sl2e, new_sl2e }; + /* The l1 shadow is two pages long and need to be pointed to by + * two adjacent l1es. 
The pair have the same flags, but point + * at odd and even MFNs */ + ASSERT(!(pair[0].l2 & (1<<PAGE_SHIFT))); + pair[1].l2 |= (1<<PAGE_SHIFT); + shadow_write_entries(sl2e, &pair, 2, sl2mfn); + } +#else /* normal case */ + shadow_write_entries(sl2e, &new_sl2e, 1, sl2mfn); +#endif + flags |= SHADOW_SET_CHANGED; + + if ( shadow_l2e_get_flags(old_sl2e) & _PAGE_PRESENT ) + { + /* We lost a reference to an old mfn. */ + mfn_t osl1mfn = shadow_l2e_get_mfn(old_sl2e); + if ( (mfn_x(osl1mfn) != mfn_x(shadow_l2e_get_mfn(new_sl2e))) || + !perms_strictly_increased(shadow_l2e_get_flags(old_sl2e), + shadow_l2e_get_flags(new_sl2e)) ) + { + flags |= SHADOW_SET_FLUSH; + } + sh_put_ref(v, osl1mfn, paddr); + } + return flags; +} + +static int shadow_set_l1e(struct vcpu *v, + shadow_l1e_t *sl1e, + shadow_l1e_t new_sl1e, + mfn_t sl1mfn) +{ + int flags = 0; + struct domain *d = v->domain; + shadow_l1e_t old_sl1e; + ASSERT(sl1e != NULL); + + old_sl1e = *sl1e; + + if ( old_sl1e.l1 == new_sl1e.l1 ) return 0; /* Nothing to do */ + + if ( shadow_l1e_get_flags(new_sl1e) & _PAGE_PRESENT ) + { + /* About to install a new reference */ + if ( shadow_mode_refcounts(d) ) { + if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 ) + { + /* Doesn't look like a pagetable. */ + flags |= SHADOW_SET_ERROR; + new_sl1e = shadow_l1e_empty(); + } + } + } + + /* Write the new entry */ + shadow_write_entries(sl1e, &new_sl1e, 1, sl1mfn); + flags |= SHADOW_SET_CHANGED; + + if ( shadow_l1e_get_flags(old_sl1e) & _PAGE_PRESENT ) + { + /* We lost a reference to an old mfn. */ + /* N.B. Unlike higher-level sets, never need an extra flush + * when writing an l1e. Because it points to the same guest frame + * as the guest l1e did, it's the guest's responsibility to + * trigger a flush later. */ + if ( shadow_mode_refcounts(d) ) + { + shadow_put_page_from_l1e(old_sl1e, d); + } + } + return flags; +} + + +/**************************************************************************/ +/* These functions take a vcpu and a virtual address, and return a pointer + * to the appropriate level N entry from the shadow tables. + * If the necessary tables are not present in the shadow, they return NULL. */ + +/* N.B. The use of GUEST_PAGING_LEVELS here is correct. If the shadow has + * more levels than the guest, the upper levels are always fixed and do not + * reflect any information from the guest, so we do not use these functions + * to access them. */ + +#if GUEST_PAGING_LEVELS >= 4 +static shadow_l4e_t * +shadow_get_l4e(struct vcpu *v, unsigned long va) +{ + /* Reading the top level table is always valid. */ + return sh_linear_l4_table(v) + shadow_l4_linear_offset(va); +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + + +#if GUEST_PAGING_LEVELS >= 3 +static shadow_l3e_t * +shadow_get_l3e(struct vcpu *v, unsigned long va) +{ +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ + /* Get the l4 */ + shadow_l4e_t *sl4e = shadow_get_l4e(v, va); + ASSERT(sl4e != NULL); + if ( !(shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT) ) + return NULL; + ASSERT(valid_mfn(shadow_l4e_get_mfn(*sl4e))); + /* l4 was present; OK to get the l3 */ + return sh_linear_l3_table(v) + shadow_l3_linear_offset(va); +#else /* PAE... */ + /* Top level is always mapped */ + ASSERT(v->arch.shadow_vtable); + return ((shadow_l3e_t *)v->arch.shadow_vtable) + shadow_l3_linear_offset(va); +#endif +} +#endif /* GUEST_PAGING_LEVELS >= 3 */ + + +static shadow_l2e_t * +shadow_get_l2e(struct vcpu *v, unsigned long va) +{ +#if GUEST_PAGING_LEVELS >= 3 /* 64bit/PAE... 
*/ + /* Get the l3 */ + shadow_l3e_t *sl3e = shadow_get_l3e(v, va); + if ( sl3e == NULL || !(shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) + return NULL; + ASSERT(valid_mfn(shadow_l3e_get_mfn(*sl3e))); + /* l3 was present; OK to get the l2 */ +#endif + return sh_linear_l2_table(v) + shadow_l2_linear_offset(va); +} + + +#if 0 // avoid the compiler warning for now... + +static shadow_l1e_t * +shadow_get_l1e(struct vcpu *v, unsigned long va) +{ + /* Get the l2 */ + shadow_l2e_t *sl2e = shadow_get_l2e(v, va); + if ( sl2e == NULL || !(shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT) ) + return NULL; + ASSERT(valid_mfn(shadow_l2e_get_mfn(*sl2e))); + /* l2 was present; OK to get the l1 */ + return sh_linear_l1_table(v) + shadow_l1_linear_offset(va); +} + +#endif + + +/**************************************************************************/ +/* Macros to walk pagetables. These take the shadow of a pagetable and + * walk every "interesting" entry. That is, they don't touch Xen mappings, + * and for 32-bit l2s shadowed onto PAE or 64-bit, they only touch every + * second entry (since pairs of entries are managed together). For multi-page + * shadows they walk all pages. + * + * Arguments are an MFN, the variable to point to each entry, a variable + * to indicate that we are done (we will shortcut to the end of the scan + * when _done != 0), a variable to indicate that we should avoid Xen mappings, + * and the code. + * + * WARNING: These macros have side-effects. They change the values of both + * the pointer and the MFN. */ + +static inline void increment_ptr_to_guest_entry(void *ptr) +{ + if ( ptr ) + { + guest_l1e_t **entry = ptr; + (*entry)++; + } +} + +/* All kinds of l1: touch all entries */ +#define _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ +do { \ + int _i; \ + shadow_l1e_t *_sp = map_shadow_page((_sl1mfn)); \ + ASSERT((mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l1_shadow \ + || (mfn_to_page(_sl1mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_fl1_shadow); \ + for ( _i = 0; _i < SHADOW_L1_PAGETABLE_ENTRIES; _i++ ) \ + { \ + (_sl1e) = _sp + _i; \ + if ( shadow_l1e_get_flags(*(_sl1e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl1p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +/* 32-bit l1, on PAE or 64-bit shadows: need to walk both pages of shadow */ +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 +#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ +do { \ + int __done = 0; \ + _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ + ({ (__done = _done); }), _code); \ + _sl1mfn = _mfn(mfn_x(_sl1mfn) + 1); \ + if ( !__done ) \ + _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, \ + ({ (__done = _done); }), _code); \ +} while (0) +#else /* Everything else; l1 shadows are only one page */ +#define SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) \ + _SHADOW_FOREACH_L1E(_sl1mfn, _sl1e, _gl1p, _done, _code) +#endif + + +#if GUEST_PAGING_LEVELS == 2 && SHADOW_PAGING_LEVELS > 2 + +/* 32-bit l2 on PAE/64: four pages, touch every second entry, and avoid Xen */ +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i, _j, __done = 0; \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2_32_shadow); \ + for ( _j = 0; _j < 4 && !__done; _j++ ) \ + { \ + shadow_l2e_t *_sp = map_shadow_page(_sl2mfn); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i += 2 ) \ + if ( (!(_xen)) \ + || ((_j * SHADOW_L2_PAGETABLE_ENTRIES) + _i) \ + < 
(HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT) ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( (__done = (_done)) ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ + _sl2mfn = _mfn(mfn_x(_sl2mfn) + 1); \ + } \ +} while (0) + +#elif GUEST_PAGING_LEVELS == 2 + +/* 32-bit on 32-bit: avoid Xen entries */ +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2_32_shadow); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ + if ( (!(_xen)) \ + || \ + (_i < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#elif GUEST_PAGING_LEVELS == 3 + +/* PAE: if it's an l2h, don't touch Xen mappings */ +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2_pae_shadow \ + || (mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2h_pae_shadow); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ + if ( (!(_xen)) \ + || ((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + != PGC_SH_l2h_pae_shadow) \ + || ((_i + (3 * SHADOW_L2_PAGETABLE_ENTRIES)) \ + < (HYPERVISOR_VIRT_START >> SHADOW_L2_PAGETABLE_SHIFT)) ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#else + +/* 64-bit l2: touch all entries */ +#define SHADOW_FOREACH_L2E(_sl2mfn, _sl2e, _gl2p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l2e_t *_sp = map_shadow_page((_sl2mfn)); \ + ASSERT((mfn_to_page(_sl2mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l2_64_shadow); \ + for ( _i = 0; _i < SHADOW_L2_PAGETABLE_ENTRIES; _i++ ) \ + { \ + (_sl2e) = _sp + _i; \ + if ( shadow_l2e_get_flags(*(_sl2e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl2p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#endif /* different kinds of l2 */ + +#if GUEST_PAGING_LEVELS == 3 + +/* PAE l3 subshadow: touch all entries (FOREACH_L2E will find Xen l2es). 
*/ +#define SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p, _done, _code) \ +do { \ + int _i; \ + for ( _i = 0; _i < 4; _i++ ) \ + { \ + if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + _sl3e++; \ + increment_ptr_to_guest_entry(_gl3p); \ + } \ +} while (0) + +/* PAE l3 full shadow: call subshadow walk on all valid l3 subshadows */ +#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \ +do { \ + int _i, _j, _k, __done = 0; \ + ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l3_pae_shadow); \ + /* The subshadows are split, 64 on each page of the shadow */ \ + for ( _j = 0; _j < 2 && !__done; _j++ ) \ + { \ + void *_sp = sh_map_domain_page(_sl3mfn); \ + for ( _i = 0; _i < 64; _i++ ) \ + { \ + /* Every second 32-byte region is a bookkeeping entry */ \ + _sl3e = (shadow_l3e_t *)(_sp + (64 * _i)); \ + if ( (sl3p_to_info(_sl3e))->refcount > 0 ) \ + SHADOW_FOREACH_L3E_SUB(_sl3e, _gl3p, \ + ({ __done = (_done); __done; }), \ + _code); \ + else \ + for ( _k = 0 ; _k < 4 ; _k++ ) \ + increment_ptr_to_guest_entry(_gl3p); \ + if ( __done ) break; \ + } \ + sh_unmap_domain_page(_sp); \ + _sl3mfn = _mfn(mfn_x(_sl3mfn) + 1); \ + } \ +} while (0) + +#elif GUEST_PAGING_LEVELS == 4 + +/* 64-bit l3: touch all entries */ +#define SHADOW_FOREACH_L3E(_sl3mfn, _sl3e, _gl3p, _done, _code) \ +do { \ + int _i; \ + shadow_l3e_t *_sp = map_shadow_page((_sl3mfn)); \ + ASSERT((mfn_to_page(_sl3mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l3_64_shadow); \ + for ( _i = 0; _i < SHADOW_L3_PAGETABLE_ENTRIES; _i++ ) \ + { \ + (_sl3e) = _sp + _i; \ + if ( shadow_l3e_get_flags(*(_sl3e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + increment_ptr_to_guest_entry(_gl3p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +/* 64-bit l4: avoid Xen mappings */ +#define SHADOW_FOREACH_L4E(_sl4mfn, _sl4e, _gl4p, _done, _xen, _code) \ +do { \ + int _i; \ + shadow_l4e_t *_sp = map_shadow_page((_sl4mfn)); \ + ASSERT((mfn_to_page(_sl4mfn)->count_info & PGC_SH_type_mask) \ + == PGC_SH_l4_64_shadow); \ + for ( _i = 0; _i < SHADOW_L4_PAGETABLE_ENTRIES; _i++ ) \ + { \ + if ( (!(_xen)) || is_guest_l4_slot(_i) ) \ + { \ + (_sl4e) = _sp + _i; \ + if ( shadow_l4e_get_flags(*(_sl4e)) & _PAGE_PRESENT ) \ + {_code} \ + if ( _done ) break; \ + } \ + increment_ptr_to_guest_entry(_gl4p); \ + } \ + unmap_shadow_page(_sp); \ +} while (0) + +#endif + + + +/**************************************************************************/ +/* Functions to install Xen mappings and linear mappings in shadow pages */ + +static mfn_t sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type); + +// XXX -- this function should probably be moved to shadow-common.c, but that +// probably wants to wait until the shadow types have been moved from +// shadow-types.h to shadow-private.h +// +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4 +void sh_install_xen_entries_in_l4(struct vcpu *v, mfn_t gl4mfn, mfn_t sl4mfn) +{ + struct domain *d = v->domain; + shadow_l4e_t *sl4e; + + sl4e = sh_map_domain_page(sl4mfn); + ASSERT(sl4e != NULL); + ASSERT(sizeof (l4_pgentry_t) == sizeof (shadow_l4e_t)); + + /* Copy the common Xen mappings from the idle domain */ + memcpy(&sl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], + &idle_pg_table[ROOT_PAGETABLE_FIRST_XEN_SLOT], + ROOT_PAGETABLE_XEN_SLOTS * sizeof(l4_pgentry_t)); + + /* Install the per-domain mappings for this domain */ + sl4e[shadow_l4_table_offset(PERDOMAIN_VIRT_START)] = + 
shadow_l4e_from_mfn(page_to_mfn(virt_to_page(d->arch.mm_perdomain_l3)), + __PAGE_HYPERVISOR); + + /* Linear mapping */ + sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR); + sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = + shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR); + + if ( shadow_mode_translate(v->domain) ) + { + /* install domain-specific P2M table */ + sl4e[shadow_l4_table_offset(RO_MPT_VIRT_START)] = + shadow_l4e_from_mfn(pagetable_get_mfn(d->arch.phys_table), + __PAGE_HYPERVISOR); + } + + sh_unmap_domain_page(sl4e); +} +#endif + +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3 +// For 3-on-3 PV guests, we need to make sure the xen mappings are in +// place, which means that we need to populate the l2h entry in the l3 +// table. + +void sh_install_xen_entries_in_l2h(struct vcpu *v, + mfn_t sl2hmfn) +{ + struct domain *d = v->domain; + shadow_l2e_t *sl2e; + int i; + + sl2e = sh_map_domain_page(sl2hmfn); + ASSERT(sl2e != NULL); + ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t)); + + /* Copy the common Xen mappings from the idle domain */ + memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)], + &idle_pg_table_l2[L2_PAGETABLE_FIRST_XEN_SLOT], + L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); + + /* Install the per-domain mappings for this domain */ + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] = + shadow_l2e_from_mfn( + page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i), + __PAGE_HYPERVISOR); + + /* We don't set up a linear mapping here because we can't until this + * l2h is installed in an l3e. sh_update_linear_entries() handles + * the linear mappings when the l3 is loaded. */ + + if ( shadow_mode_translate(d) ) + { + /* Install the domain-specific p2m table */ + l3_pgentry_t *p2m; + ASSERT(pagetable_get_pfn(d->arch.phys_table) != 0); + p2m = sh_map_domain_page(pagetable_get_mfn(d->arch.phys_table)); + for ( i = 0; i < MACHPHYS_MBYTES>>1; i++ ) + { + sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START) + i] = + shadow_l2e_from_mfn(_mfn(l3e_get_pfn(p2m[i])), + __PAGE_HYPERVISOR); + } + sh_unmap_domain_page(p2m); + } + + sh_unmap_domain_page(sl2e); +} + +void sh_install_xen_entries_in_l3(struct vcpu *v, mfn_t gl3mfn, mfn_t sl3mfn) +{ + shadow_l3e_t *sl3e; + guest_l3e_t *gl3e = v->arch.guest_vtable; + shadow_l3e_t new_sl3e; + gfn_t l2gfn; + mfn_t l2gmfn, l2smfn; + int r; + + ASSERT(!shadow_mode_external(v->domain)); + ASSERT(guest_l3e_get_flags(gl3e[3]) & _PAGE_PRESENT); + l2gfn = guest_l3e_get_gfn(gl3e[3]); + l2gmfn = sh_gfn_to_mfn(v->domain, gfn_x(l2gfn)); + l2smfn = get_shadow_status(v, l2gmfn, PGC_SH_l2h_shadow); + if ( !valid_mfn(l2smfn) ) + { + l2smfn = sh_make_shadow(v, l2gmfn, PGC_SH_l2h_shadow); + } + l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e, + ft_prefetch); + sl3e = sh_map_domain_page(sl3mfn); + r = shadow_set_l3e(v, &sl3e[3], new_sl3e, sl3mfn); + sh_unmap_domain_page(sl3e); +} +#endif + + +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2 +void sh_install_xen_entries_in_l2(struct vcpu *v, mfn_t gl2mfn, mfn_t sl2mfn) +{ + struct domain *d = v->domain; + shadow_l2e_t *sl2e; + int i; + + sl2e = sh_map_domain_page(sl2mfn); + ASSERT(sl2e != NULL); + ASSERT(sizeof (l2_pgentry_t) == sizeof (shadow_l2e_t)); + + /* Copy the common Xen mappings from the idle domain */ + memcpy(&sl2e[L2_PAGETABLE_FIRST_XEN_SLOT], + &idle_pg_table[L2_PAGETABLE_FIRST_XEN_SLOT], + L2_PAGETABLE_XEN_SLOTS * sizeof(l2_pgentry_t)); + + 
/* Install the per-domain mappings for this domain */ + for ( i = 0; i < PDPT_L2_ENTRIES; i++ ) + sl2e[shadow_l2_table_offset(PERDOMAIN_VIRT_START) + i] = + shadow_l2e_from_mfn( + page_to_mfn(virt_to_page(d->arch.mm_perdomain_pt) + i), + __PAGE_HYPERVISOR); + + /* Linear mapping */ + sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR); + sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR); + + if ( shadow_mode_translate(d) ) + { + /* install domain-specific P2M table */ + sl2e[shadow_l2_table_offset(RO_MPT_VIRT_START)] = + shadow_l2e_from_mfn(pagetable_get_mfn(d->arch.phys_table), + __PAGE_HYPERVISOR); + } + + sh_unmap_domain_page(sl2e); +} +#endif + + + + + +/**************************************************************************/ +/* Create a shadow of a given guest page. + */ +static mfn_t +sh_make_shadow(struct vcpu *v, mfn_t gmfn, u32 shadow_type) +{ + mfn_t smfn = shadow_alloc(v->domain, shadow_type, mfn_x(gmfn)); + SHADOW_DEBUG(MAKE_SHADOW, "(%05lx, %u)=>%05lx\n", + mfn_x(gmfn), shadow_type, mfn_x(smfn)); + + if ( shadow_type != PGC_SH_guest_root_type ) + /* Lower-level shadow, not yet linked form a higher level */ + mfn_to_page(smfn)->up = 0; + + // Create the Xen mappings... + if ( !shadow_mode_external(v->domain) ) + { + switch (shadow_type) + { +#if CONFIG_PAGING_LEVELS == 4 && GUEST_PAGING_LEVELS == 4 + case PGC_SH_l4_shadow: + sh_install_xen_entries_in_l4(v, gmfn, smfn); break; +#endif +#if CONFIG_PAGING_LEVELS == 3 && GUEST_PAGING_LEVELS == 3 + case PGC_SH_l3_shadow: + sh_install_xen_entries_in_l3(v, gmfn, smfn); break; + case PGC_SH_l2h_shadow: + sh_install_xen_entries_in_l2h(v, smfn); break; +#endif +#if CONFIG_PAGING_LEVELS == 2 && GUEST_PAGING_LEVELS == 2 + case PGC_SH_l2_shadow: + sh_install_xen_entries_in_l2(v, gmfn, smfn); break; +#endif + default: /* Do nothing */ break; + } + } + + shadow_promote(v, gmfn, shadow_type); + set_shadow_status(v, gmfn, shadow_type, smfn); + + return smfn; +} + +/* Make a splintered superpage shadow */ +static mfn_t +make_fl1_shadow(struct vcpu *v, gfn_t gfn) +{ + mfn_t smfn = shadow_alloc(v->domain, PGC_SH_fl1_shadow, + (unsigned long) gfn_x(gfn)); + + SHADOW_DEBUG(MAKE_SHADOW, "(%" SH_PRI_gfn ")=>%" SH_PRI_mfn "\n", + gfn_x(gfn), mfn_x(smfn)); + + set_fl1_shadow_status(v, gfn, smfn); + return smfn; +} + + +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS +mfn_t +sh_make_monitor_table(struct vcpu *v) +{ + + ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0); + +#if CONFIG_PAGING_LEVELS == 4 + { + struct domain *d = v->domain; + mfn_t m4mfn; + m4mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + sh_install_xen_entries_in_l4(v, m4mfn, m4mfn); + /* Remember the level of this table */ + mfn_to_page(m4mfn)->shadow_flags = 4; +#if SHADOW_PAGING_LEVELS < 4 + // Install a monitor l3 table in slot 0 of the l4 table. + // This is used for shadow linear maps. 
+ { + mfn_t m3mfn; + l4_pgentry_t *l4e; + m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + mfn_to_page(m3mfn)->shadow_flags = 3; + l4e = sh_map_domain_page(m4mfn); + l4e[0] = l4e_from_pfn(mfn_x(m3mfn), __PAGE_HYPERVISOR); + sh_unmap_domain_page(l4e); + } +#endif /* SHADOW_PAGING_LEVELS < 4 */ + return m4mfn; + } + +#elif CONFIG_PAGING_LEVELS == 3 + + { + struct domain *d = v->domain; + mfn_t m3mfn, m2mfn; + l3_pgentry_t *l3e; + l2_pgentry_t *l2e; + int i; + + m3mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + /* Remember the level of this table */ + mfn_to_page(m3mfn)->shadow_flags = 3; + + // Install a monitor l2 table in slot 3 of the l3 table. + // This is used for all Xen entries, including linear maps + m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + mfn_to_page(m2mfn)->shadow_flags = 2; + l3e = sh_map_domain_page(m3mfn); + l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT); + sh_install_xen_entries_in_l2h(v, m2mfn); + /* Install the monitor's own linear map */ + l2e = sh_map_domain_page(m2mfn); + for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) + l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] = + (l3e_get_flags(l3e[i]) & _PAGE_PRESENT) + ? l2e_from_pfn(l3e_get_pfn(l3e[i]), __PAGE_HYPERVISOR) + : l2e_empty(); + sh_unmap_domain_page(l2e); + sh_unmap_domain_page(l3e); + + SHADOW_PRINTK("new monitor table: %#lx\n", mfn_x(m3mfn)); + return m3mfn; + } + +#elif CONFIG_PAGING_LEVELS == 2 + + { + struct domain *d = v->domain; + mfn_t m2mfn; + m2mfn = shadow_alloc(d, PGC_SH_monitor_table, 0); + sh_install_xen_entries_in_l2(v, m2mfn, m2mfn); + /* Remember the level of this table */ + mfn_to_page(m2mfn)->shadow_flags = 2; + return m2mfn; + } + +#else +#error this should not happen +#endif /* CONFIG_PAGING_LEVELS */ +} +#endif /* SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS */ + +/**************************************************************************/ +/* These functions also take a virtual address and return the level-N + * shadow table mfn and entry, but they create the shadow pagetables if + * they are needed. The "demand" argument is non-zero when handling + * a demand fault (so we know what to do about accessed bits &c). + * If the necessary tables are not present in the guest, they return NULL. */ +#if GUEST_PAGING_LEVELS >= 4 +static shadow_l4e_t * shadow_get_and_create_l4e(struct vcpu *v, + walk_t *gw, + mfn_t *sl4mfn) +{ + /* There is always a shadow of the top level table. Get it. */ + *sl4mfn = pagetable_get_mfn(v->arch.shadow_table); + /* Reading the top level table is always valid. */ + return sh_linear_l4_table(v) + shadow_l4_linear_offset(gw->va); +} +#endif /* GUEST_PAGING_LEVELS >= 4 */ + + +#if GUEST_PAGING_LEVELS >= 3 +static shadow_l3e_t * shadow_get_and_create_l3e(struct vcpu *v, + walk_t *gw, + mfn_t *sl3mfn, + fetch_type_t ft) +{ +#if GUEST_PAGING_LEVELS >= 4 /* 64bit... */ + mfn_t sl4mfn; + shadow_l4e_t *sl4e; + if ( !valid_mfn(gw->l3mfn) ) return NULL; /* No guest page. */ + /* Get the l4e */ + sl4e = shadow_get_and_create_l4e(v, gw, &sl4mfn); + ASSERT(sl4e != NULL); + if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) + { + *sl3mfn = shadow_l4e_get_mfn(*sl4e); + ASSERT(valid_mfn(*sl3mfn)); + } + else + { + int r; + shadow_l4e_t new_sl4e; + /* No l3 shadow installed: find and install it. */ + *sl3mfn = get_shadow_status(v, gw->l3mfn, PGC_SH_l3_shadow); + if ( !valid_mfn(*sl3mfn) ) + { + /* No l3 shadow of this page exists at all: make one. 
*/ + *sl3mfn = sh_make_shadow(v, gw->l3mfn, PGC_SH_l3_shadow); + } + /* Install the new sl3 table in the sl4e */ + l4e_propagate_from_guest(v, gw->l4e, gw->l4mfn, + *sl3mfn, &new_sl4e, ft); + r = shadow_set_l4e(v, sl4e, new_sl4e, sl4mfn); + ASSERT((r & SHADOW_SET_FLUSH) == 0); + } + /* Now follow it down a level. Guaranteed to succeed. */ + return sh_linear_l3_table(v) + shadow_l3_linear_offset(gw->va); +#else /* PAE... */ + /* There is always a shadow of the top level table. Get it. */ + *sl3mfn = pagetable_get_mfn(v->arch.shadow_table); + /* This next line is important: the shadow l3 table is in an 8k + * shadow and we need to return the right mfn of the pair. This call + * will set it for us as a side-effect. */ + (void) shadow_l3_index(sl3mfn, guest_index(gw->l3e)); + ASSERT(v->arch.shadow_vtable); + return ((shadow_l3e_t *)v->arch.shadow_vtable) + + shadow_l3_table_offset(gw->va); +#endif /* GUEST_PAGING_LEVELS >= 4 */ +} +#endif /* GUEST_PAGING_LEVELS >= 3 */ + + +static shadow_l2e_t * shadow_get_and_create_l2e(struct vcpu *v, + walk_t *gw, + mfn_t *sl2mfn, + fetch_type_t ft) +{ +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64bit... */ + mfn_t sl3mfn = _mfn(INVALID_MFN); + shadow_l3e_t *sl3e; + if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */ + /* Get the l3e */ + sl3e = shadow_get_and_create_l3e(v, gw, &sl3mfn, ft); + ASSERT(sl3e != NULL); /* Since we know guest PT is valid this far */ + if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) + { + *sl2mfn = shadow_l3e_get_mfn(*sl3e); + ASSERT(valid_mfn(*sl2mfn)); + } + else + { + int r; + shadow_l3e_t new_sl3e; + /* No l2 shadow installed: find and install it. */ + *sl2mfn = get_shadow_status(v, gw->l2mfn, PGC_SH_l2_shadow); + if ( !valid_mfn(*sl2mfn) ) + { + /* No l2 shadow of this page exists at all: make one. */ + *sl2mfn = sh_make_shadow(v, gw->l2mfn, PGC_SH_l2_shadow); + } + /* Install the new sl2 table in the sl3e */ + l3e_propagate_from_guest(v, gw->l3e, gw->l3mfn, + *sl2mfn, &new_sl3e, ft); + r = shadow_set_l3e(v, sl3e, new_sl3e, sl3mfn); + ASSERT((r & SHADOW_SET_FLUSH) == 0); +#if GUEST_PAGING_LEVELS == 3 + /* Need to sync up the linear maps, as we are about to use them */ + ASSERT( r & SHADOW_SET_L3PAE_RECOPY ); + sh_pae_recopy(v->domain); +#endif + } + /* Now follow it down a level. Guaranteed to succeed. */ + return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); +#else /* 32bit... */ + /* There is always a shadow of the top level table. Get it. */ + *sl2mfn = pagetable_get_mfn(v->arch.shadow_table); + /* This next line is important: the guest l2 has a 16k + * shadow, we need to return the right mfn of the four. This + * call will set it for us as a side-effect. */ + (void) shadow_l2_index(sl2mfn, guest_index(gw->l2e)); + /* Reading the top level table is always valid. */ + return sh_linear_l2_table(v) + shadow_l2_linear_offset(gw->va); +#endif +} + + +static shadow_l1e_t * shadow_get_and_create_l1e(struct vcpu *v, + walk_t *gw, + mfn_t *sl1mfn, + fetch_type_t ft) +{ + mfn_t sl2mfn; + shadow_l2e_t *sl2e; + + /* Get the l2e */ + sl2e = shadow_get_and_create_l2e(v, gw, &sl2mfn, ft); + if ( sl2e == NULL ) return NULL; + if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) + { + *sl1mfn = shadow_l2e_get_mfn(*sl2e); + ASSERT(valid_mfn(*sl1mfn)); + } + else + { + shadow_l2e_t new_sl2e; + int r, flags = guest_l2e_get_flags(*gw->l2e); + /* No l1 shadow installed: find and install it. */ + if ( !(flags & _PAGE_PRESENT) ) + return NULL; /* No guest page. 
*/ + if ( guest_supports_superpages(v) && (flags & _PAGE_PSE) ) + { + /* Splintering a superpage */ + gfn_t l2gfn = guest_l2e_get_gfn(*gw->l2e); + *sl1mfn = get_fl1_shadow_status(v, l2gfn); + if ( !valid_mfn(*sl1mfn) ) + { + /* No fl1 shadow of this superpage exists at all: make one. */ + *sl1mfn = make_fl1_shadow(v, l2gfn); + } + } + else + { + /* Shadowing an actual guest l1 table */ + if ( !valid_mfn(gw->l2mfn) ) return NULL; /* No guest page. */ + *sl1mfn = get_shadow_status(v, gw->l1mfn, PGC_SH_l1_shadow); + if ( !valid_mfn(*sl1mfn) ) + { + /* No l1 shadow of this page exists at all: make one. */ + *sl1mfn = sh_make_shadow(v, gw->l1mfn, PGC_SH_l1_shadow); + } + } + /* Install the new sl1 table in the sl2e */ + l2e_propagate_from_guest(v, gw->l2e, gw->l2mfn, + *sl1mfn, &new_sl2e, ft); + r = shadow_set_l2e(v, sl2e, new_sl2e, sl2mfn); + ASSERT((r & SHADOW_SET_FLUSH) == 0); + /* This next line is important: in 32-on-PAE and 32-on-64 modes, + * the guest l1 table has an 8k shadow, and we need to return + * the right mfn of the pair. This call will set it for us as a + * side-effect. (In all other cases, it's a no-op and will be + * compiled out.) */ + (void) shadow_l1_index(sl1mfn, guest_l1_table_offset(gw->va)); + } + /* Now follow it down a level. Guaranteed to succeed. */ + return sh_linear_l1_table(v) + shadow_l1_linear_offset(gw->va); +} + + + +/**************************************************************************/ +/* Destructors for shadow tables: + * Unregister the shadow, decrement refcounts of any entries present in it, + * and release the memory. + * + * N.B. These destructors do not clear the contents of the shadows. + * This allows us to delay TLB shootdowns until the page is being reused. + * See shadow_alloc() and shadow_free() for how this is handled. 
+ */ + +#if GUEST_PAGING_LEVELS >= 4 +void sh_destroy_l4_shadow(struct vcpu *v, mfn_t smfn) +{ + shadow_l4e_t *sl4e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask; + mfn_t gmfn, sl4mfn; + int xen_mappings; + + SHADOW_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH_l4_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow_status(v, gmfn, t, smfn); + shadow_demote(v, gmfn, t); + /* Take this shadow off the list of root shadows */ + list_del_init(&mfn_to_page(smfn)->list); + + /* Decrement refcounts of all the old entries */ + xen_mappings = (!shadow_mode_external(v->domain)); + sl4mfn = smfn; + SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, { + if ( shadow_l4e_get_flags(*sl4e) & _PAGE_PRESENT ) + { + sh_put_ref(v, shadow_l4e_get_mfn(*sl4e), + (((paddr_t)mfn_x(sl4mfn)) << PAGE_SHIFT) + | ((unsigned long)sl4e & ~PAGE_MASK)); + } + }); + + /* Put the memory back in the pool */ + shadow_free(v->domain, smfn); +} +#endif + +#if GUEST_PAGING_LEVELS >= 3 +void sh_destroy_l3_shadow(struct vcpu *v, mfn_t smfn) +{ + shadow_l3e_t *sl3e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask; + mfn_t gmfn, sl3mfn; + + SHADOW_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH_l3_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow_status(v, gmfn, t, smfn); + shadow_demote(v, gmfn, t); +#if GUEST_PAGING_LEVELS == 3 + /* Take this shadow off the list of root shadows */ + list_del_init(&mfn_to_page(smfn)->list); +#endif + + /* Decrement refcounts of all the old entries */ + sl3mfn = smfn; + SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, { + if ( shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT ) + sh_put_ref(v, shadow_l3e_get_mfn(*sl3e), + (((paddr_t)mfn_x(sl3mfn)) << PAGE_SHIFT) + | ((unsigned long)sl3e & ~PAGE_MASK)); + }); + + /* Put the memory back in the pool */ + shadow_free(v->domain, smfn); +} +#endif + + +#if GUEST_PAGING_LEVELS == 3 +static void sh_destroy_l3_subshadow(struct vcpu *v, + shadow_l3e_t *sl3e) +/* Tear down just a single 4-entry l3 on a 2-page l3 shadow. 
*/ +{ + int i; + ASSERT((unsigned long)sl3e % (4 * sizeof (shadow_l3e_t)) == 0); + for ( i = 0; i < GUEST_L3_PAGETABLE_ENTRIES; i++ ) + if ( shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT ) + sh_put_ref(v, shadow_l3e_get_mfn(sl3e[i]), + maddr_from_mapped_domain_page(sl3e)); +} +#endif + +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +void sh_unpin_all_l3_subshadows(struct vcpu *v, mfn_t smfn) +/* Walk a full PAE l3 shadow, unpinning all of the subshadows on it */ +{ + int i, j; + struct pae_l3_bookkeeping *bk; + + ASSERT((mfn_to_page(smfn)->count_info & PGC_SH_type_mask) + == PGC_SH_l3_pae_shadow); + /* The subshadows are split, 64 on each page of the shadow */ + for ( i = 0; i < 2; i++ ) + { + void *p = sh_map_domain_page(_mfn(mfn_x(smfn) + i)); + for ( j = 0; j < 64; j++ ) + { + /* Every second 32-byte region is a bookkeeping entry */ + bk = (struct pae_l3_bookkeeping *)(p + (64 * j) + 32); + if ( bk->pinned ) + sh_unpin_l3_subshadow(v, (shadow_l3e_t *)(p + (64*j)), smfn); + /* Check whether we've just freed the whole shadow */ + if ( (mfn_to_page(smfn)->count_info & PGC_SH_count_mask) == 0 ) + { + sh_unmap_domain_page(p); + return; + } + } + sh_unmap_domain_page(p); + } +} +#endif + +void sh_destroy_l2_shadow(struct vcpu *v, mfn_t smfn) +{ + shadow_l2e_t *sl2e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask; + mfn_t gmfn, sl2mfn; + int xen_mappings; + + SHADOW_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH_l2_shadow + || t == PGC_SH_l2h_pae_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow_status(v, gmfn, t, smfn); + shadow_demote(v, gmfn, t); +#if GUEST_PAGING_LEVELS == 2 + /* Take this shadow off the list of root shadows */ + list_del_init(&mfn_to_page(smfn)->list); +#endif + + /* Decrement refcounts of all the old entries */ + sl2mfn = smfn; + xen_mappings = (!shadow_mode_external(v->domain) && + ((GUEST_PAGING_LEVELS == 2) || + ((GUEST_PAGING_LEVELS == 3) && + (t == PGC_SH_l2h_pae_shadow)))); + SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, { + if ( shadow_l2e_get_flags(*sl2e) & _PAGE_PRESENT ) + sh_put_ref(v, shadow_l2e_get_mfn(*sl2e), + (((paddr_t)mfn_x(sl2mfn)) << PAGE_SHIFT) + | ((unsigned long)sl2e & ~PAGE_MASK)); + }); + + /* Put the memory back in the pool */ + shadow_free(v->domain, smfn); +} + +void sh_destroy_l1_shadow(struct vcpu *v, mfn_t smfn) +{ + struct domain *d = v->domain; + shadow_l1e_t *sl1e; + u32 t = mfn_to_page(smfn)->count_info & PGC_SH_type_mask; + + SHADOW_DEBUG(DESTROY_SHADOW, + "%s(%05lx)\n", __func__, mfn_x(smfn)); + ASSERT(t == PGC_SH_l1_shadow || t == PGC_SH_fl1_shadow); + + /* Record that the guest page isn't shadowed any more (in this type) */ + if ( t == PGC_SH_fl1_shadow ) + { + gfn_t gfn = _gfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_fl1_shadow_status(v, gfn, smfn); + } + else + { + mfn_t gmfn = _mfn(mfn_to_page(smfn)->u.inuse.type_info); + delete_shadow_status(v, gmfn, t, smfn); + shadow_demote(v, gmfn, t); + } + + if ( shadow_mode_refcounts(d) ) + { + /* Decrement refcounts of all the old entries */ + mfn_t sl1mfn = smfn; + SHADOW_FOREACH_L1E(sl1mfn, sl1e, 0, 0, { + if ( shadow_l1e_get_flags(*sl1e) & _PAGE_PRESENT ) + shadow_put_page_from_l1e(*sl1e, d); + }); + } + + /* Put the memory back in the pool */ + shadow_free(v->domain, smfn); +} + +#if SHADOW_PAGING_LEVELS == GUEST_PAGING_LEVELS +void sh_destroy_monitor_table(struct vcpu *v, mfn_t mmfn) +{ + struct domain 
*d = v->domain; + ASSERT((mfn_to_page(mmfn)->count_info & PGC_SH_type_mask) + == PGC_SH_monitor_table); + +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS != 4) + /* Need to destroy the l3 monitor page in slot 0 too */ + { + l4_pgentry_t *l4e = sh_map_domain_page(mmfn); + ASSERT(l4e_get_flags(l4e[0]) & _PAGE_PRESENT); + shadow_free(d, _mfn(l4e_get_pfn(l4e[0]))); + sh_unmap_domain_page(l4e); + } +#elif CONFIG_PAGING_LEVELS == 3 + /* Need to destroy the l2 monitor page in slot 4 too */ + { + l3_pgentry_t *l3e = sh_map_domain_page(mmfn); + ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); + shadow_free(d, _mfn(l3e_get_pfn(l3e[3]))); + sh_unmap_domain_page(l3e); + } +#endif + + /* Put the memory back in the pool */ + shadow_free(d, mmfn); +} +#endif + +/**************************************************************************/ +/* Functions to destroy non-Xen mappings in a pagetable hierarchy. + * These are called from common code when we are running out of shadow + * memory, and unpinning all the top-level shadows hasn't worked. + * + * This implementation is pretty crude and slow, but we hope that it won't + * be called very often. */ + +#if GUEST_PAGING_LEVELS == 2 + +void sh_unhook_32b_mappings(struct vcpu *v, mfn_t sl2mfn) +{ + shadow_l2e_t *sl2e; + int xen_mappings = !shadow_mode_external(v->domain); + SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, xen_mappings, { + (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); + }); +} + +#elif GUEST_PAGING_LEVELS == 3 + +void sh_unhook_pae_mappings(struct vcpu *v, mfn_t sl3mfn) +/* Walk a full PAE l3 shadow, unhooking entries from all the subshadows */ +{ + shadow_l3e_t *sl3e; + SHADOW_FOREACH_L3E(sl3mfn, sl3e, 0, 0, { + if ( (shadow_l3e_get_flags(*sl3e) & _PAGE_PRESENT) ) { + mfn_t sl2mfn = shadow_l3e_get_mfn(*sl3e); + if ( (mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) + == PGC_SH_l2h_pae_shadow ) + { + /* High l2: need to pick particular l2es to unhook */ + shadow_l2e_t *sl2e; + SHADOW_FOREACH_L2E(sl2mfn, sl2e, 0, 0, 1, { + (void) shadow_set_l2e(v, sl2e, shadow_l2e_empty(), sl2mfn); + }); + } + else + { + /* Normal l2: can safely unhook the whole l3e */ + (void) shadow_set_l3e(v, sl3e, shadow_l3e_empty(), sl3mfn); + } + } + }); + /* We've changed PAE L3 entries: must sync up various copies of them */ + sh_pae_recopy(v->domain); +} + +#elif GUEST_PAGING_LEVELS == 4 + +void sh_unhook_64b_mappings(struct vcpu *v, mfn_t sl4mfn) +{ + shadow_l4e_t *sl4e; + int xen_mappings = !shadow_mode_external(v->domain); + SHADOW_FOREACH_L4E(sl4mfn, sl4e, 0, 0, xen_mappings, { + (void) shadow_set_l4e(v, sl4e, shadow_l4e_empty(), sl4mfn); + }); +} + +#endif + +/**************************************************************************/ +/* Internal translation functions. + * These functions require a pointer to the shadow entry that will be updated. + */ + +/* These functions take a new guest entry, translate it to shadow and write + * the shadow entry. + * + * They return the same bitmaps as the shadow_set_lXe() functions. 
+ */ + +#if GUEST_PAGING_LEVELS >= 4 +static int validate_gl4e(struct vcpu *v, void *new_ge, mfn_t sl4mfn, void *se) +{ + shadow_l4e_t new_sl4e; + guest_l4e_t *new_gl4e = new_ge; + shadow_l4e_t *sl4p = se; + mfn_t sl3mfn = _mfn(INVALID_MFN); + int result = 0; + + perfc_incrc(shadow_validate_gl4e_calls); + + if ( guest_l4e_get_flags(*new_gl4e) & _PAGE_PRESENT ) + { + gfn_t gl3gfn = guest_l4e_get_gfn(*new_gl4e); + mfn_t gl3mfn = vcpu_gfn_to_mfn(v, gl3gfn); + if ( valid_mfn(gl3mfn) ) + sl3mfn = get_shadow_status(v, gl3mfn, PGC_SH_l3_shadow); + else + result |= SHADOW_SET_ERROR; + } + l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN), + sl3mfn, &new_sl4e, ft_prefetch); + result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn); + return result; +} +#endif // GUEST_PAGING_LEVELS >= 4 + +#if GUEST_PAGING_LEVELS >= 3 +static int validate_gl3e(struct vcpu *v, void *new_ge, mfn_t sl3mfn, void *se) +{ + shadow_l3e_t new_sl3e; + guest_l3e_t *new_gl3e = new_ge; + shadow_l3e_t *sl3p = se; + mfn_t sl2mfn = _mfn(INVALID_MFN); + int result = 0; + + perfc_incrc(shadow_validate_gl3e_calls); + + if ( guest_l3e_get_flags(*new_gl3e) & _PAGE_PRESENT ) + { + gfn_t gl2gfn = guest_l3e_get_gfn(*new_gl3e); + mfn_t gl2mfn = vcpu_gfn_to_mfn(v, gl2gfn); + if ( valid_mfn(gl2mfn) ) + sl2mfn = get_shadow_status(v, gl2mfn, PGC_SH_l2_shadow); + else + result |= SHADOW_SET_ERROR; + } + l3e_propagate_from_guest(v, new_gl3e, _mfn(INVALID_MFN), + sl2mfn, &new_sl3e, ft_prefetch); + result |= shadow_set_l3e(v, sl3p, new_sl3e, sl3mfn); + +#if GUEST_PAGING_LEVELS == 3 + /* We have changed a PAE l3 entry: need to sync up the possible copies + * of it */ + if ( result & SHADOW_SET_L3PAE_RECOPY ) + sh_pae_recopy(v->domain); +#endif + + return result; +} +#endif // GUEST_PAGING_LEVELS >= 3 + +static int validate_gl2e(struct vcpu *v, void *new_ge, mfn_t sl2mfn, void *se) +{ + shadow_l2e_t new_sl2e; + guest_l2e_t *new_gl2e = new_ge; + shadow_l2e_t *sl2p = se; + mfn_t sl1mfn = _mfn(INVALID_MFN); + int result = 0; + + perfc_incrc(shadow_validate_gl2e_calls); + + if ( guest_l2e_get_flags(*new_gl2e) & _PAGE_PRESENT ) + { + gfn_t gl1gfn = guest_l2e_get_gfn(*new_gl2e); + if ( guest_supports_superpages(v) && + (guest_l2e_get_flags(*new_gl2e) & _PAGE_PSE) ) + { + // superpage -- need to look up the shadow L1 which holds the + // splitters... + sl1mfn = get_fl1_shadow_status(v, gl1gfn); +#if 0 + // XXX - it's possible that we want to do some kind of prefetch + // for superpage fl1's here, but this is *not* on the demand path, + // so we'll hold off trying that for now... + // + if ( !valid_mfn(sl1mfn) ) + sl1mfn = make_fl1_shadow(v, gl1gfn); +#endif + } + else + { + mfn_t gl1mfn = vcpu_gfn_to_mfn(v, gl1gfn); + if ( valid_mfn(gl1mfn) ) + sl1mfn = get_shadow_status(v, gl1mfn, PGC_SH_l1_shadow); + else + result |= SHADOW_SET_ERROR; + } + } + l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN), + sl1mfn, &new_sl2e, ft_prefetch); + result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn); + + return result; +} + +static int validate_gl1e(struct vcpu *v, void *new_ge, mfn_t sl1mfn, void *se) +{ + shadow_l1e_t new_sl1e; + guest_l1e_t *new_gl1e = new_ge; + shadow_l1e_t *sl1p = se; + gfn_t gfn; + mfn_t mfn; + int result = 0; + + perfc_incrc(shadow_validate_gl1e_calls); + + gfn = guest_l1e_get_gfn(*new_gl1e); + mfn = vcpu_gfn_to_mfn(v, gfn); + + l1e_propagate_from_guest(v, *new_gl1e, &new_sl1e, + /* mmio? 
*/ !valid_mfn(mfn)); + + result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn); + return result; +} + + +/**************************************************************************/ +/* Functions which translate and install the shadows of arbitrary guest + * entries that we have just seen the guest write. */ + + +static inline int +sh_map_and_validate(struct vcpu *v, mfn_t gmfn, + void *new_gp, u32 size, u32 sh_type, + u32 (*shadow_index)(mfn_t *smfn, u32 idx), + int (*validate_ge)(struct vcpu *v, void *ge, + mfn_t smfn, void *se)) +/* Generic function for mapping and validating. */ +{ + mfn_t smfn, smfn2, map_mfn; + shadow_l1e_t *sl1p; + u32 shadow_idx, guest_idx; + int result = 0; + + /* Align address and size to guest entry boundaries */ + size += (unsigned long)new_gp & (sizeof (guest_l1e_t) - 1); + new_gp = (void *)((unsigned long)new_gp & ~(sizeof (guest_l1e_t) - 1)); + size = (size + sizeof (guest_l1e_t) - 1) & ~(sizeof (guest_l1e_t) - 1); + ASSERT(size + (((unsigned long)new_gp) & ~PAGE_MASK) <= PAGE_SIZE); + + /* Map the shadow page */ + smfn = get_shadow_status(v, gmfn, sh_type); + ASSERT(valid_mfn(smfn)); /* Otherwise we would not have been called */ + guest_idx = guest_index(new_gp); + map_mfn = smfn; + shadow_idx = shadow_index(&map_mfn, guest_idx); + sl1p = map_shadow_page(map_mfn); + + /* Validate one entry at a time */ + while ( size ) + { + smfn2 = smfn; + guest_idx = guest_index(new_gp); + shadow_idx = shadow_index(&smfn2, guest_idx); + if ( mfn_x(smfn2) != mfn_x(map_mfn) ) + { + /* We have moved to another page of the shadow */ + map_mfn = smfn2; + unmap_shadow_page(sl1p); + sl1p = map_shadow_page(map_mfn); + } + result |= validate_ge(v, + new_gp, + map_mfn, + &sl1p[shadow_idx]); + size -= sizeof(guest_l1e_t); + new_gp += sizeof(guest_l1e_t); + } + unmap_shadow_page(sl1p); + return result; +} + + +int +sh_map_and_validate_gl4e(struct vcpu *v, mfn_t gl4mfn, + void *new_gl4p, u32 size) +{ +#if GUEST_PAGING_LEVELS >= 4 + return sh_map_and_validate(v, gl4mfn, new_gl4p, size, + PGC_SH_l4_shadow, + shadow_l4_index, + validate_gl4e); +#else // ! GUEST_PAGING_LEVELS >= 4 + SHADOW_PRINTK("called in wrong paging mode!\n"); + BUG(); + return 0; +#endif +} + +int +sh_map_and_validate_gl3e(struct vcpu *v, mfn_t gl3mfn, + void *new_gl3p, u32 size) +{ +#if GUEST_PAGING_LEVELS >= 3 + return sh_map_and_validate(v, gl3mfn, new_gl3p, size, + PGC_SH_l3_shadow, + shadow_l3_index, + validate_gl3e); +#else // ! 
GUEST_PAGING_LEVELS >= 3 + SHADOW_PRINTK("called in wrong paging mode!\n"); + BUG(); + return 0; +#endif +} + +int +sh_map_and_validate_gl2e(struct vcpu *v, mfn_t gl2mfn, + void *new_gl2p, u32 size) +{ + return sh_map_and_validate(v, gl2mfn, new_gl2p, size, + PGC_SH_l2_shadow, + shadow_l2_index, + validate_gl2e); +} + +int +sh_map_and_validate_gl2he(struct vcpu *v, mfn_t gl2mfn, + void *new_gl2p, u32 size) +{ +#if GUEST_PAGING_LEVELS == 3 + return sh_map_and_validate(v, gl2mfn, new_gl2p, size, + PGC_SH_l2h_shadow, + shadow_l2_index, + validate_gl2e); +#else /* Non-PAE guests don't have different kinds of l2 table */ + SHADOW_PRINTK("called in wrong paging mode!\n"); + BUG(); + return 0; +#endif +} + +int +sh_map_and_validate_gl1e(struct vcpu *v, mfn_t gl1mfn, + void *new_gl1p, u32 size) +{ + return sh_map_and_validate(v, gl1mfn, new_gl1p, size, + PGC_SH_l1_shadow, + shadow_l1_index, + validate_gl1e); +} + + +/**************************************************************************/ +/* Optimization: If we see two emulated writes of zeros to the same + * page-table without another kind of page fault in between, we guess + * that this is a batch of changes (for process destruction) and + * unshadow the page so we don't take a pagefault on every entry. This + * should also make finding writeable mappings of pagetables much + * easier. */ + +/* Look to see if this is the second emulated write in a row to this + * page, and unshadow/unhook if it is */ +static inline void check_for_early_unshadow(struct vcpu *v, mfn_t gmfn) +{ +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW + if ( v->arch.shadow.last_emulated_mfn == mfn_x(gmfn) && + sh_mfn_is_a_page_table(gmfn) ) + { + u32 flags = mfn_to_page(gmfn)->shadow_flags; + mfn_t smfn; + if ( !(flags & (SHF_L2_32|SHF_L3_PAE|SHF_L4_64)) ) + { + perfc_incrc(shadow_early_unshadow); + sh_remove_shadows(v, gmfn, 0 /* Can fail to unshadow */ ); + return; + } + /* SHF_unhooked_mappings is set to make sure we only unhook + * once in a single batch of updates. It is reset when this + * top-level page is loaded into CR3 again */ + if ( !(flags & SHF_unhooked_mappings) ) + { + perfc_incrc(shadow_early_unshadow_top); + mfn_to_page(gmfn)->shadow_flags |= SHF_unhooked_mappings; + if ( flags & SHF_L2_32 ) + { + smfn = get_shadow_status(v, gmfn, PGC_SH_l2_32_shadow); + shadow_unhook_mappings(v, smfn); + } + if ( flags & SHF_L3_PAE ) + { + smfn = get_shadow_status(v, gmfn, PGC_SH_l3_pae_shadow); + shadow_unhook_mappings(v, smfn); + } + if ( flags & SHF_L4_64 ) + { + smfn = get_shadow_status(v, gmfn, PGC_SH_l4_64_shadow); + shadow_unhook_mappings(v, smfn); + } + } + } + v->arch.shadow.last_emulated_mfn = mfn_x(gmfn); +#endif +} + +/* Stop counting towards early unshadows, as we've seen a real page fault */ +static inline void reset_early_unshadow(struct vcpu *v) +{ +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW + v->arch.shadow.last_emulated_mfn = INVALID_MFN; +#endif +} + + + +/**************************************************************************/ +/* Entry points into the shadow code */ + +/* Called from pagefault handler in Xen, and from the HVM trap handlers + * for pagefaults. Returns 1 if this fault was an artefact of the + * shadow code (and the guest should retry) or 0 if it is not (and the + * fault should be handled elsewhere or passed to the guest). 
*/ + +static int sh_page_fault(struct vcpu *v, + unsigned long va, + struct cpu_user_regs *regs) +{ + struct domain *d = v->domain; + walk_t gw; + u32 accumulated_gflags; + gfn_t gfn; + mfn_t gmfn, sl1mfn=_mfn(0); + shadow_l1e_t sl1e, *ptr_sl1e; + paddr_t gpa; + struct cpu_user_regs emul_regs; + struct x86_emulate_ctxt emul_ctxt; + int r, mmio; + fetch_type_t ft = 0; + + // + // XXX: Need to think about eventually mapping superpages directly in the + // shadow (when possible), as opposed to splintering them into a + // bunch of 4K maps. + // + + SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n", + v->domain->domain_id, v->vcpu_id, va, regs->error_code); + + shadow_lock(d); + + shadow_audit_tables(v); + + if ( guest_walk_tables(v, va, &gw, 1) != 0 ) + { + SHADOW_PRINTK("malformed guest pagetable!"); + print_gw(&gw); + } + + sh_audit_gw(v, &gw); + + // We do not look at the gw->l1e, as that will not exist for superpages. + // Instead, we use the gw->eff_l1e... + // + // We need not check all the levels of the guest page table entries for + // present vs not-present, as the eff_l1e will always be not present if + // one of the higher level entries is not present. + // + if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) ) + { + if ( hvm_guest(v) && !shadow_vcpu_mode_translate(v) ) + { + /* Not present in p2m map, means this is mmio */ + gpa = va; + goto mmio; + } + + perfc_incrc(shadow_fault_bail_not_present); + goto not_a_shadow_fault; + } + + // All levels of the guest page table are now known to be present. + accumulated_gflags = accumulate_guest_flags(&gw); + + // Check for attempts to access supervisor-only pages from user mode, + // i.e. ring 3. Such errors are not caused or dealt with by the shadow + // code. + // + if ( (regs->error_code & PFEC_user_mode) && + !(accumulated_gflags & _PAGE_USER) ) + { + /* illegal user-mode access to supervisor-only page */ + perfc_incrc(shadow_fault_bail_user_supervisor); + goto not_a_shadow_fault; + } + + // Was it a write fault? + // + if ( regs->error_code & PFEC_write_access ) + { + if ( unlikely(!(accumulated_gflags & _PAGE_RW)) ) + { + perfc_incrc(shadow_fault_bail_ro_mapping); + goto not_a_shadow_fault; + } + } + else // must have been either an insn fetch or read fault + { + // Check for NX bit violations: attempts to execute code that is + // marked "do not execute". Such errors are not caused or dealt with + // by the shadow code. + // + if ( regs->error_code & PFEC_insn_fetch ) + { + if ( accumulated_gflags & _PAGE_NX_BIT ) + { + /* NX prevented this code fetch */ + perfc_incrc(shadow_fault_bail_nx); + goto not_a_shadow_fault; + } + } + } + + /* Is this an MMIO access? */ + gfn = guest_l1e_get_gfn(gw.eff_l1e); + mmio = ( hvm_guest(v) + && shadow_vcpu_mode_translate(v) + && mmio_space(gfn_to_paddr(gfn)) ); + + /* For MMIO, the shadow holds the *gfn*; for normal accesses, if holds + * the equivalent mfn. */ + if ( mmio ) + gmfn = _mfn(gfn_x(gfn)); + else + { + gmfn = vcpu_gfn_to_mfn(v, gfn); + if ( !valid_mfn(gmfn) ) + { + perfc_incrc(shadow_fault_bail_bad_gfn); + SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"SH_PRI_mfn"\n", + gfn_x(gfn), mfn_x(gmfn)); + goto not_a_shadow_fault; + } + } + + /* Make sure there is enough free shadow memory to build a chain of + * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough + * to allocate all we need. (We never allocate a top-level shadow + * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */ + shadow_prealloc(d, SHADOW_MAX_ORDER); + + /* Acquire the shadow. 
This must happen before we figure out the rights + * for the shadow entry, since we might promote a page here. */ + // XXX -- this code will need to change somewhat if/when the shadow code + // can directly map superpages... + ft = ((regs->error_code & PFEC_write_access) ? + ft_demand_write : ft_demand_read); + ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft); + ASSERT(ptr_sl1e); + + /* Calculate the shadow entry */ + if ( ft == ft_demand_write ) + { + if ( l1e_write_fault(v, &gw, gmfn, &sl1e, mmio) ) + { + perfc_incrc(shadow_fault_emulate_write); + goto emulate; + } + } + else if ( l1e_read_fault(v, &gw, gmfn, &sl1e, mmio) ) + { + perfc_incrc(shadow_fault_emulate_read); + goto emulate; + } + + /* Quick sanity check: we never make an MMIO entry that's got the + * _PAGE_PRESENT flag set in it. */ + ASSERT(!mmio || !(shadow_l1e_get_flags(sl1e) & _PAGE_PRESENT)); + + r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn); + + if ( mmio ) + { + gpa = guest_walk_to_gpa(&gw); + goto mmio; + } + +#if 0 + if ( !(r & SHADOW_SET_CHANGED) ) + debugtrace_printk("%s: shadow_set_l1e(va=%p, sl1e=%" SH_PRI_pte + ") did not change anything\n", + __func__, gw.va, l1e_get_intpte(sl1e)); +#endif + + perfc_incrc(shadow_fault_fixed); + d->arch.shadow.fault_count++; + reset_early_unshadow(v); + + done: + sh_audit_gw(v, &gw); + unmap_walk(v, &gw); + SHADOW_PRINTK("fixed\n"); + shadow_audit_tables(v); + shadow_unlock(d); + return EXCRET_fault_fixed; + + emulate: + + /* Take the register set we were called with */ + emul_regs = *regs; + if ( hvm_guest(v) ) + { + /* Add the guest's segment selectors, rip, rsp. rflags */ + hvm_store_cpu_guest_regs(v, &emul_regs, NULL); + } + emul_ctxt.regs = &emul_regs; + emul_ctxt.cr2 = va; + emul_ctxt.mode = hvm_guest(v) ? hvm_guest_x86_mode(v) : X86EMUL_MODE_HOST; + + SHADOW_PRINTK("emulate: eip=%#lx\n", emul_regs.eip); + + v->arch.shadow.propagate_fault = 0; + if ( x86_emulate_memop(&emul_ctxt, &shadow_emulator_ops) ) + { + SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n", + mfn_x(gmfn)); + perfc_incrc(shadow_fault_emulate_failed); + /* If this is actually a page table, then we have a bug, and need + * to support more operations in the emulator. More likely, + * though, this is a hint that this page should not be shadowed. */ + shadow_remove_all_shadows(v, gmfn); + /* This means that actual missing operations will cause the + * guest to loop on the same page fault. */ + goto done; + } + if ( v->arch.shadow.propagate_fault ) + { + /* Emulation triggered another page fault */ + goto not_a_shadow_fault; + } + + /* Emulator has changed the user registers: write back */ + if ( hvm_guest(v) ) + { + /* Write back the guest's segment selectors, rip, rsp. rflags */ + hvm_load_cpu_guest_regs(v, &emul_regs); + /* And don't overwrite those in the caller's regs. */ + emul_regs.eip = regs->eip; + emul_regs.cs = regs->cs; + emul_regs.eflags = regs->eflags; + emul_regs.esp = regs->esp; + emul_regs.ss = regs->ss; + emul_regs.es = regs->es; + emul_regs.ds = regs->ds; + emul_regs.fs = regs->fs; + emul_regs.gs = regs->gs; + } + *regs = emul_regs; + + goto done; + + mmio: + perfc_incrc(shadow_fault_mmio); + if ( !hvm_apic_support(d) && (gpa >= 0xFEC00000) ) + { + /* Need to deal with these disabled-APIC accesses, as + * handle_mmio() apparently does not currently do that. */ + /* TJD: What about it, then? For now, I'm turning this BUG() + * into a domain_crash() since we don't want to kill Xen. 
*/ + SHADOW_ERROR("disabled-APIC access: not supported.\n"); + domain_crash(d); + } + sh_audit_gw(v, &gw); + unmap_walk(v, &gw); + SHADOW_PRINTK("mmio\n"); + shadow_audit_tables(v); + reset_early_unshadow(v); + shadow_unlock(d); + sh_log_mmio(v, gpa); + handle_mmio(va, gpa); + return EXCRET_fault_fixed; + + not_a_shadow_fault: + sh_audit_gw(v, &gw); + unmap_walk(v, &gw); + SHADOW_PRINTK("not a shadow fault\n"); + shadow_audit_tables(v); + reset_early_unshadow(v); + shadow_unlock(d); + return 0; +} + + +static int +sh_invlpg(struct vcpu *v, unsigned long va) +/* Called when the guest requests an invlpg. Returns 1 if the invlpg + * instruction should be issued on the hardware, or 0 if it's safe not + * to do so. */ +{ + shadow_l2e_t *ptr_sl2e = shadow_get_l2e(v, va); + + // XXX -- might be a good thing to prefetch the va into the shadow + + // no need to flush anything if there's no SL2... + // + if ( !ptr_sl2e ) + return 0; + + // If there's nothing shadowed for this particular sl2e, then + // there is no need to do an invlpg, either... + // + if ( !(shadow_l2e_get_flags(*ptr_sl2e) & _PAGE_PRESENT) ) + return 0; + + // Check to see if the SL2 is a splintered superpage... + // If so, then we'll need to flush the entire TLB (because that's + // easier than invalidating all of the individual 4K pages). + // + if ( (mfn_to_page(shadow_l2e_get_mfn(*ptr_sl2e))->count_info & + PGC_SH_type_mask) == PGC_SH_fl1_shadow ) + { + local_flush_tlb(); + return 0; + } + + return 1; +} + +static unsigned long +sh_gva_to_gfn(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + walk_t gw; + gfn_t gfn; + + guest_walk_tables(v, va, &gw, 0); + gfn = guest_walk_to_gfn(&gw); + unmap_walk(v, &gw); + + return gfn_x(gfn); +} + + +static unsigned long +sh_gva_to_gpa(struct vcpu *v, unsigned long va) +/* Called to translate a guest virtual address to what the *guest* + * pagetables would map it to. */ +{ + unsigned long gfn = sh_gva_to_gfn(v, va); + if ( gfn == INVALID_GFN ) + return 0; + else + return (gfn << PAGE_SHIFT) | (va & ~PAGE_MASK); +} + + +// XXX -- should this be in this file? +// Or should it be moved to shadow-common.c? +// +/* returns a lowmem machine address of the copied HVM L3 root table + * If clear_res != 0, then clear the PAE-l3 reserved bits in the copy, + * otherwise blank out any entries with reserved bits in them. */ +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +static unsigned long +hvm_pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab, int clear_res) +{ + int i, f; + int res = (_PAGE_RW|_PAGE_NX_BIT|_PAGE_USER|_PAGE_ACCESSED|_PAGE_DIRTY); + l3_pgentry_t new_l3e, *copy = v->arch.hvm_vcpu.hvm_lowmem_l3tab; + memcpy(copy, l3tab, 4 * sizeof(l3_pgentry_t)); + for ( i = 0; i < 4; i++ ) + { + f = l3e_get_flags(l3tab[i]); + if ( (f & _PAGE_PRESENT) && (!(f & res) || clear_res) ) + new_l3e = l3e_from_pfn(l3e_get_pfn(l3tab[i]), f & ~res); + else + new_l3e = l3e_empty(); + safe_write_entry(&copy[i], &new_l3e); + } + return __pa(copy); +} +#endif + + +static inline void +sh_update_linear_entries(struct vcpu *v) +/* Sync up all the linear mappings for this vcpu's pagetables */ +{ + struct domain *d = v->domain; + + /* Linear pagetables in PV guests + * ------------------------------ + * + * Guest linear pagetables, which map the guest pages, are at + * LINEAR_PT_VIRT_START. Shadow linear pagetables, which map the + * shadows, are at SH_LINEAR_PT_VIRT_START. 
Most of the time these + * are set up at shadow creation time, but (of course!) the PAE case + * is subtler. Normal linear mappings are made by having an entry + * in the top-level table that points to itself (shadow linear) or + * to the guest top-level table (guest linear). For PAE, to set up + * a linear map requires us to copy the four top-level entries into + * level-2 entries. That means that every time we change a PAE l3e, + * we need to reflect the change into the copy. + * + * Linear pagetables in HVM guests + * ------------------------------- + * + * For HVM guests, the linear pagetables are installed in the monitor + * tables (since we can't put them in the shadow). Shadow linear + * pagetables, which map the shadows, are at SH_LINEAR_PT_VIRT_START, + * and we use the linear pagetable slot at LINEAR_PT_VIRT_START for + * a linear pagetable of the monitor tables themselves. We have + * the same issue of having to re-copy PAE l3 entries whenever we use + * PAE shadows. + * + * Because HVM guests run on the same monitor tables regardless of the + * shadow tables in use, the linear mapping of the shadow tables has to + * be updated every time v->arch.shadow_table changes. + */ + + /* Don't try to update the monitor table if it doesn't exist */ + if ( shadow_mode_external(d) + && pagetable_get_pfn(v->arch.monitor_table) == 0 ) + return; + +#if (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 4) + + /* For PV, one l4e points at the guest l4, one points at the shadow + * l4. No maintenance required. + * For HVM, just need to update the l4e that points to the shadow l4. */ + + if ( shadow_mode_external(d) ) + { + /* Use the linear map if we can; otherwise make a new mapping */ + if ( v == current ) + { + __linear_l4_table[l4_linear_offset(SH_LINEAR_PT_VIRT_START)] = + l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + } + else + { + l4_pgentry_t *ml4e; + ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ml4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = + l4e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + sh_unmap_domain_page(ml4e); + } + } + +#elif (CONFIG_PAGING_LEVELS == 4) && (SHADOW_PAGING_LEVELS == 3) + + /* This case only exists in HVM. To give ourselves a linear map of the + * shadows, we need to extend a PAE shadow to 4 levels. We do this by + * having a monitor l3 in slot 0 of the monitor l4 table, and + * copying the PAE l3 entries into it. Then, by having the monitor l4e + * for shadow pagetables also point to the monitor l4, we can use it + * to access the shadows. */ + + if ( shadow_mode_external(d) ) + { + /* Install copies of the shadow l3es into the monitor l3 table. 
+ * The monitor l3 table is hooked into slot 0 of the monitor + * l4 table, so we use l3 linear indices 0 to 3 */ + shadow_l3e_t *sl3e; + l3_pgentry_t *ml3e; + mfn_t l3mfn; + int i; + + /* Use linear mappings if we can; otherwise make new mappings */ + if ( v == current ) + { + ml3e = __linear_l3_table; + l3mfn = _mfn(l4e_get_pfn(__linear_l4_table[0])); +#if GUEST_PAGING_LEVELS == 2 + /* Shadow l3 tables are made up by update_cr3 */ + sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; +#else + sl3e = v->arch.shadow_vtable; +#endif + } + else + { + l4_pgentry_t *ml4e; + ml4e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l4e_get_flags(ml4e[0]) & _PAGE_PRESENT); + l3mfn = _mfn(l4e_get_pfn(ml4e[0])); + ml3e = sh_map_domain_page(l3mfn); + sh_unmap_domain_page(ml4e); +#if GUEST_PAGING_LEVELS == 2 + /* Shadow l3 tables are made up by update_cr3 */ + sl3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; +#else + sl3e = sh_map_domain_page(pagetable_get_mfn(v->arch.shadow_table)); +#endif + } + + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + { + ml3e[i] = + (shadow_l3e_get_flags(sl3e[i]) & _PAGE_PRESENT) + ? l3e_from_pfn(mfn_x(shadow_l3e_get_mfn(sl3e[i])), + __PAGE_HYPERVISOR) + : l3e_empty(); + } + + if ( v != current ) + { + sh_unmap_domain_page(ml3e); +#if GUEST_PAGING_LEVELS != 2 + sh_unmap_domain_page(sl3e); +#endif + } + } + +#elif CONFIG_PAGING_LEVELS == 3 + + /* PV: need to copy the guest's l3 entries into the guest-linear-map l2 + * entries in the shadow, and the shadow's l3 entries into the + * shadow-linear-map l2 entries in the shadow. This is safe to do + * because Xen does not let guests share high-slot l2 tables between l3s, + * so we know we're not treading on anyone's toes. + * + * HVM: need to copy the shadow's l3 entries into the + * shadow-linear-map l2 entries in the monitor table. This is safe + * because we have one monitor table for each vcpu. The monitor's + * own l3es don't need to be copied because they never change. + * XXX That might change if we start stuffing things into the rest + * of the monitor's virtual address space. 
+ */ + { + l2_pgentry_t *l2e, new_l2e; + shadow_l3e_t *guest_l3e = NULL, *shadow_l3e; + int i; + +#if GUEST_PAGING_LEVELS == 2 + /* Shadow l3 tables were built by update_cr3 */ + if ( shadow_mode_external(d) ) + shadow_l3e = v->arch.hvm_vcpu.hvm_lowmem_l3tab; + else + BUG(); /* PV 2-on-3 is not supported yet */ + +#else /* GUEST_PAGING_LEVELS == 3 */ + + /* Use local vcpu's mappings if we can; otherwise make new mappings */ + if ( v == current ) + { + shadow_l3e = v->arch.shadow_vtable; + if ( !shadow_mode_external(d) ) + guest_l3e = v->arch.guest_vtable; + } + else + { + mfn_t smfn; + int idx; + + /* Map the shadow l3 */ + smfn = pagetable_get_mfn(v->arch.shadow_table); + idx = shadow_l3_index(&smfn, guest_index(v->arch.shadow_vtable)); + shadow_l3e = sh_map_domain_page(smfn); + shadow_l3e += idx; + if ( !shadow_mode_external(d) ) + { + /* Also the guest l3 */ + mfn_t gmfn = pagetable_get_mfn(v->arch.guest_table); + guest_l3e = sh_map_domain_page(gmfn); + guest_l3e += guest_index(v->arch.guest_vtable); + } + } +#endif /* GUEST_PAGING_LEVELS */ + + /* Choose where to write the entries, using linear maps if possible */ + if ( v == current && shadow_mode_external(d) ) + { + /* From the monitor tables, it's safe to use linear maps to update + * monitor l2s */ + l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES); + } + else if ( shadow_mode_external(d) ) + { + /* Map the monitor table's high l2 */ + l3_pgentry_t *l3e; + l3e = sh_map_domain_page( + pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); + l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3]))); + sh_unmap_domain_page(l3e); + } + else + { + /* Map the shadow table's high l2 */ + ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT); + l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3])); + } + + + if ( !shadow_mode_external(d) ) + { + /* Write linear mapping of guest. */ + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + { + new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) + ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])), + __PAGE_HYPERVISOR) + : l2e_empty(); + safe_write_entry( + &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], + &new_l2e); + } + } + + /* Write linear mapping of shadow. */ + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + { + new_l2e = (shadow_l3e_get_flags(shadow_l3e[i]) & _PAGE_PRESENT) + ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(shadow_l3e[i])), + __PAGE_HYPERVISOR) + : l2e_empty(); + safe_write_entry( + &l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START) + i], + &new_l2e); + } + + if ( v != current || !shadow_mode_external(d) ) + sh_unmap_domain_page(l2e); + +#if GUEST_PAGING_LEVELS == 3 + if ( v != current) + { + sh_unmap_domain_page(shadow_l3e); + if ( !shadow_mode_external(d) ) + sh_unmap_domain_page(guest_l3e); + } +#endif + } + +#elif CONFIG_PAGING_LEVELS == 2 + + /* For PV, one l2e points at the guest l2, one points at the shadow + * l2. No maintenance required. + * For HVM, just need to update the l2e that points to the shadow l2. 
*/ + + if ( shadow_mode_external(d) ) + { + /* Use the linear map if we can; otherwise make a new mapping */ + if ( v == current ) + { + __linear_l2_table[l2_linear_offset(SH_LINEAR_PT_VIRT_START)] = + l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + } + else + { + l2_pgentry_t *ml2e; + ml2e = sh_map_domain_page(pagetable_get_mfn(v->arch.monitor_table)); + ml2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + l2e_from_pfn(pagetable_get_pfn(v->arch.shadow_table), + __PAGE_HYPERVISOR); + sh_unmap_domain_page(ml2e); + } + } + +#else +#error this should not happen +#endif +} + + +// XXX -- should this be in this file? +// Or should it be moved to shadow-common.c? +// +#if (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) +void sh_pae_recopy(struct domain *d) +/* Called whenever we write to the l3 entries of a PAE pagetable which + * is currently in use. Each vcpu that is using the table needs to + * resync its copies of the l3s in linear maps and any low-memory + * copies it might have made for fitting into 32bit CR3. + * Since linear maps are also resynced when we change CR3, we don't + * need to worry about changes to PAE l3es that are not currently in use.*/ +{ + struct vcpu *v; + cpumask_t flush_mask = CPU_MASK_NONE; + ASSERT(shadow_lock_is_acquired(d)); + + for_each_vcpu(d, v) + { + if ( !v->arch.shadow.pae_flip_pending ) + continue; + + cpu_set(v->processor, flush_mask); + + SHADOW_PRINTK("d=%u v=%u\n", v->domain->domain_id, v->vcpu_id); + + /* This vcpu has a copy in its linear maps */ + sh_update_linear_entries(v); + if ( hvm_guest(v) ) + { + /* This vcpu has a copy in its HVM PAE l3 */ + v->arch.hvm_vcpu.hw_cr3 = + hvm_pae_copy_root(v, v->arch.shadow_vtable, + !shadow_vcpu_mode_translate(v)); + } +#if CONFIG_PAGING_LEVELS == 3 + else + { + /* This vcpu might have copied the l3 to below 4GB */ + if ( v->arch.cr3 >> PAGE_SHIFT + != pagetable_get_pfn(v->arch.shadow_table) ) + { + /* Recopy to where that copy is. */ + int i; + l3_pgentry_t *dst, *src; + dst = __va(v->arch.cr3 & ~0x1f); /* Mask cache control bits */ + src = v->arch.shadow_vtable; + for ( i = 0 ; i < 4 ; i++ ) + safe_write_entry(dst + i, src + i); + } + } +#endif + v->arch.shadow.pae_flip_pending = 0; + } + + flush_tlb_mask(flush_mask); +} +#endif /* (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) */ + + +/* removes: + * vcpu->arch.guest_vtable + * vcpu->arch.shadow_table + * vcpu->arch.shadow_vtable + * Does all appropriate management/bookkeeping/refcounting/etc... + */ +static void +sh_detach_old_tables(struct vcpu *v) +{ + mfn_t smfn; + + //// + //// vcpu->arch.guest_vtable + //// + if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) && + v->arch.guest_vtable ) + { + // Q: why does this need to use (un)map_domain_page_*global* ? + sh_unmap_domain_page_global(v->arch.guest_vtable); + v->arch.guest_vtable = NULL; + } + + //// + //// vcpu->arch.shadow_table + //// + smfn = pagetable_get_mfn(v->arch.shadow_table); + if ( mfn_x(smfn) ) + { + ASSERT(v->arch.shadow_vtable); + +#if GUEST_PAGING_LEVELS == 3 + // PAE guests do not (necessarily) use an entire page for their + // 4-entry L3s, so we have to deal with them specially. 
+ // + sh_put_ref_l3_subshadow(v, v->arch.shadow_vtable, smfn); +#else + sh_put_ref(v, smfn, 0); +#endif + +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + { + struct pae_l3_bookkeeping *info = + sl3p_to_info(v->arch.shadow_vtable); + ASSERT(test_bit(v->vcpu_id, &info->vcpus)); + clear_bit(v->vcpu_id, &info->vcpus); + } +#endif + v->arch.shadow_table = pagetable_null(); + } + + //// + //// vcpu->arch.shadow_vtable + //// + if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) && + v->arch.shadow_vtable ) + { + // Q: why does this need to use (un)map_domain_page_*global* ? + // + sh_unmap_domain_page_global(v->arch.shadow_vtable); + v->arch.shadow_vtable = NULL; + } +} + +static void +sh_update_cr3(struct vcpu *v) +/* Updates vcpu->arch.shadow_table after the guest has changed CR3. + * Paravirtual guests should set v->arch.guest_table (and guest_table_user, + * if appropriate). + * HVM guests should also set hvm_get_guest_cntl_reg(v, 3)... + */ +{ + struct domain *d = v->domain; + mfn_t gmfn, smfn; +#if GUEST_PAGING_LEVELS == 3 + u32 guest_idx=0; +#endif + + ASSERT(shadow_lock_is_acquired(v->domain)); + ASSERT(v->arch.shadow.mode); + + //// + //// vcpu->arch.guest_table is already set + //// + +#ifndef NDEBUG + /* Double-check that the HVM code has sent us a sane guest_table */ + if ( hvm_guest(v) ) + { + gfn_t gfn; + + ASSERT(shadow_mode_external(d)); + + // Is paging enabled on this vcpu? + if ( shadow_vcpu_mode_translate(v) ) + { + gfn = _gfn(paddr_to_pfn(hvm_get_guest_ctrl_reg(v, 3))); + gmfn = vcpu_gfn_to_mfn(v, gfn); + ASSERT(valid_mfn(gmfn)); + ASSERT(pagetable_get_pfn(v->arch.guest_table) == mfn_x(gmfn)); + } + else + { + /* Paging disabled: guest_table points at (part of) p2m */ +#if SHADOW_PAGING_LEVELS != 3 /* in 3-on-4, guest-table is in slot 0 of p2m */ + /* For everything else, they sould be the same */ + ASSERT(v->arch.guest_table.pfn == d->arch.phys_table.pfn); +#endif + } + } +#endif + + SHADOW_PRINTK("d=%u v=%u guest_table=%05lx\n", + d->domain_id, v->vcpu_id, + (unsigned long)pagetable_get_pfn(v->arch.guest_table)); + +#if GUEST_PAGING_LEVELS == 4 + if ( !(v->arch.flags & TF_kernel_mode) ) + gmfn = pagetable_get_mfn(v->arch.guest_table_user); + else +#endif + gmfn = pagetable_get_mfn(v->arch.guest_table); + + sh_detach_old_tables(v); + + if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + { + ASSERT(v->arch.cr3 == 0); + return; + } + + //// + //// vcpu->arch.guest_vtable + //// + if ( shadow_mode_external(d) ) + { +#if GUEST_PAGING_LEVELS == 3 + if ( shadow_vcpu_mode_translate(v) ) + /* Paging enabled: find where in the page the l3 table is */ + guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3)); + else + /* Paging disabled: l3 is at the start of a page (in the p2m) */ + guest_idx = 0; + + // Ignore the low 2 bits of guest_idx -- they are really just + // cache control. + guest_idx &= ~3; + // XXX - why does this need a global map? + v->arch.guest_vtable = + (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx; +#else + // XXX - why does this need a global map? + v->arch.guest_vtable = sh_map_domain_page_global(gmfn); +#endif + } + else + { +#ifdef __x86_64__ + v->arch.guest_vtable = __linear_l4_table; +#elif GUEST_PAGING_LEVELS == 3 + // XXX - why does this need a global map? 
+ v->arch.guest_vtable = sh_map_domain_page_global(gmfn); +#else + v->arch.guest_vtable = __linear_l2_table; +#endif + } + +#if 0 + printk("%s %s %d gmfn=%05lx guest_vtable=%p\n", + __func__, __FILE__, __LINE__, gmfn, v->arch.guest_vtable); +#endif + + //// + //// vcpu->arch.shadow_table + //// + smfn = get_shadow_status(v, gmfn, PGC_SH_guest_root_type); + if ( valid_mfn(smfn) ) + { + /* Pull this root shadow to the front of the list of roots. */ + list_del(&mfn_to_page(smfn)->list); + list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows); + } + else + { + /* This guest MFN is a pagetable. Must revoke write access. */ + if ( shadow_remove_write_access(v, gmfn, GUEST_PAGING_LEVELS, 0) + != 0 ) + flush_tlb_mask(d->domain_dirty_cpumask); + /* Make sure there's enough free shadow memory. */ + shadow_prealloc(d, SHADOW_MAX_ORDER); + /* Shadow the page. */ + smfn = sh_make_shadow(v, gmfn, PGC_SH_guest_root_type); + list_add(&mfn_to_page(smfn)->list, &d->arch.shadow.toplevel_shadows); + } + ASSERT(valid_mfn(smfn)); + v->arch.shadow_table = pagetable_from_mfn(smfn); + +#if SHADOW_OPTIMIZATIONS & SHOPT_EARLY_UNSHADOW + /* Once again OK to unhook entries from this table if we see fork/exit */ + ASSERT(sh_mfn_is_a_page_table(gmfn)); + mfn_to_page(gmfn)->shadow_flags &= ~SHF_unhooked_mappings; +#endif + + + //// + //// vcpu->arch.shadow_vtable + //// + if ( shadow_mode_external(d) ) + { +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + mfn_t adjusted_smfn = smfn; + u32 shadow_idx = shadow_l3_index(&adjusted_smfn, guest_idx); + // Q: why does this need to use (un)map_domain_page_*global* ? + v->arch.shadow_vtable = + (shadow_l3e_t *)sh_map_domain_page_global(adjusted_smfn) + + shadow_idx; +#else + // Q: why does this need to use (un)map_domain_page_*global* ? + v->arch.shadow_vtable = sh_map_domain_page_global(smfn); +#endif + } + else + { +#if SHADOW_PAGING_LEVELS == 4 + v->arch.shadow_vtable = __sh_linear_l4_table; +#elif GUEST_PAGING_LEVELS == 3 + // XXX - why does this need a global map? + v->arch.shadow_vtable = sh_map_domain_page_global(smfn); +#else + v->arch.shadow_vtable = __sh_linear_l2_table; +#endif + } + + //// + //// Take a ref to the new shadow table, and pin it. + //// + // + // This ref is logically "held" by v->arch.shadow_table entry itself. + // Release the old ref. + // +#if GUEST_PAGING_LEVELS == 3 + // PAE guests do not (necessarily) use an entire page for their + // 4-entry L3s, so we have to deal with them specially. + // + // XXX - might want to revisit this if/when we do multiple compilation for + // HVM-vs-PV guests, as PAE PV guests could get away without doing + // subshadows. + // + sh_get_ref_l3_subshadow(v->arch.shadow_vtable, smfn); + sh_pin_l3_subshadow(v->arch.shadow_vtable, smfn); +#else + sh_get_ref(smfn, 0); + sh_pin(smfn); +#endif + +#if (SHADOW_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + // PAE 3-on-3 shadows have to keep track of which vcpu's are using + // which l3 subshadow, in order handle the SHADOW_SET_L3PAE_RECOPY + // case from validate_gl3e(). Search for SHADOW_SET_L3PAE_RECOPY + // in the code for more info. 
+ // + { + struct pae_l3_bookkeeping *info = + sl3p_to_info(v->arch.shadow_vtable); + ASSERT(!test_bit(v->vcpu_id, &info->vcpus)); + set_bit(v->vcpu_id, &info->vcpus); + } +#endif + + debugtrace_printk("%s cr3 gmfn=%05lx smfn=%05lx\n", + __func__, gmfn, smfn); + + /// + /// v->arch.cr3 and, if appropriate, v->arch.hvm_vcpu.hw_cr3 + /// + if ( shadow_mode_external(d) ) + { + ASSERT(hvm_guest(v)); + make_cr3(v, pagetable_get_pfn(v->arch.monitor_table)); + +#if (GUEST_PAGING_LEVELS == 2) && (SHADOW_PAGING_LEVELS != 2) +#if SHADOW_PAGING_LEVELS != 3 +#error unexpected combination of GUEST and SHADOW paging levels +#endif + /* 2-on-3: make a PAE l3 table that points at the four-page l2 */ + { + mfn_t smfn = pagetable_get_mfn(v->arch.shadow_table); + int i; + + ASSERT(v->arch.hvm_vcpu.hw_cr3 == + virt_to_maddr(v->arch.hvm_vcpu.hvm_lowmem_l3tab)); + for (i = 0; i < 4; i++) + { + v->arch.hvm_vcpu.hvm_lowmem_l3tab[i] = + shadow_l3e_from_mfn(_mfn(mfn_x(smfn)+i), _PAGE_PRESENT); + } + } +#elif (GUEST_PAGING_LEVELS == 3) && (SHADOW_PAGING_LEVELS == 3) + /* 3-on-3: copy the shadow l3 to slots that are below 4GB. + * If paging is disabled, clear l3e reserved bits; otherwise + * remove entries that have reserved bits set. */ + v->arch.hvm_vcpu.hw_cr3 = + hvm_pae_copy_root(v, v->arch.shadow_vtable, + !shadow_vcpu_mode_translate(v)); +#else + /* 2-on-2 or 4-on-4: just put the shadow top-level into cr3 */ + v->arch.hvm_vcpu.hw_cr3 = + pagetable_get_paddr(v->arch.shadow_table); +#endif + } + else // not shadow_mode_external... + { + /* We don't support PV except guest == shadow == config levels */ + BUG_ON(GUEST_PAGING_LEVELS != SHADOW_PAGING_LEVELS); + make_cr3(v, pagetable_get_pfn(v->arch.shadow_table)); + } + + /* Fix up the linear pagetable mappings */ + sh_update_linear_entries(v); +} + + +/**************************************************************************/ +/* Functions to revoke guest rights */ + +#if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC +static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn) +/* Look up this vaddr in the current shadow and see if it's a writeable + * mapping of this gmfn. If so, remove it. Returns 1 if it worked. */ +{ + shadow_l1e_t sl1e, *sl1p; + shadow_l2e_t *sl2p; +#if GUEST_PAGING_LEVELS >= 3 + shadow_l3e_t *sl3p; +#if GUEST_PAGING_LEVELS >= 4 + shadow_l4e_t *sl4p; +#endif +#endif + mfn_t sl1mfn; + + + /* Carefully look in the shadow linear map for the l1e we expect */ + if ( v->arch.shadow_vtable == NULL ) return 0; +#if GUEST_PAGING_LEVELS >= 4 + sl4p = sh_linear_l4_table(v) + shadow_l4_linear_offset(vaddr); + if ( !(shadow_l4e_get_flags(*sl4p) & _PAGE_PRESENT) ) + return 0; + sl3p = sh_linear_l3_table(v) + shadow_l3_linear_offset(vaddr); + if ( !(shadow_l3e_get_flags(*sl3p) & _PAGE_PRESENT) ) + return 0; +#elif GUEST_PAGING_LEVELS == 3 + sl3p = ((shadow_l3e_t *) v->arch.shadow_vtable) _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
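The comment blocks in sh_update_linear_entries() above describe why PAE needs extra maintenance: the four top-level l3 entries have to be copied into level-2 entries to build the linear map, and re-copied whenever an in-use l3e changes (which is what sh_pae_recopy() drives). A minimal, standalone sketch of just that copy step follows; the types and helper names (pae_l3e_t, pae_l2e_t, copy_l3_to_linear_l2) are simplified stand-ins for illustration, not the real Xen definitions.

/* Standalone sketch (not part of the changeset above) of the PAE
 * "copy the four top-level entries into four consecutive l2 slots"
 * step.  All names here are simplified stand-ins. */

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define PAE_L3_ENTRIES   4
#define PAGE_SHIFT       12
#define _PAGE_PRESENT    0x001u
#define PAGE_FLAG_MASK   0xfffu

typedef struct { uint64_t e; } pae_l3e_t;  /* stand-in for an l3 entry */
typedef struct { uint64_t e; } pae_l2e_t;  /* stand-in for an l2 entry */

static int l3e_present(pae_l3e_t l3e)
{
    return (l3e.e & _PAGE_PRESENT) != 0;
}

static unsigned long l3e_frame(pae_l3e_t l3e)
{
    return (unsigned long)(l3e.e >> PAGE_SHIFT);
}

static pae_l2e_t l2e_from_frame(unsigned long pfn, uint64_t flags)
{
    pae_l2e_t l2e = { ((uint64_t)pfn << PAGE_SHIFT) | (flags & PAGE_FLAG_MASK) };
    return l2e;
}

/* Mirror the four l3 entries into l2[slot..slot+3].  In the real code this
 * copy is repeated every time an in-use PAE l3e is written, which is what
 * the "re-copy PAE l3 entries" comments above are about. */
static void copy_l3_to_linear_l2(const pae_l3e_t l3[PAE_L3_ENTRIES],
                                 pae_l2e_t *l2, size_t slot, uint64_t flags)
{
    for ( size_t i = 0; i < PAE_L3_ENTRIES; i++ )
    {
        pae_l2e_t empty = { 0 };
        l2[slot + i] = l3e_present(l3[i])
            ? l2e_from_frame(l3e_frame(l3[i]), flags)
            : empty;
    }
}

int main(void)
{
    /* A made-up l3 with three present entries and one empty slot. */
    pae_l3e_t l3[PAE_L3_ENTRIES] = {
        { (0x1000ULL << PAGE_SHIFT) | _PAGE_PRESENT },
        { (0x1001ULL << PAGE_SHIFT) | _PAGE_PRESENT },
        { 0 },
        { (0x1003ULL << PAGE_SHIFT) | _PAGE_PRESENT },
    };
    pae_l2e_t l2[512] = { { 0 } };

    /* Copy into four consecutive slots near the top of the l2. */
    copy_l3_to_linear_l2(l3, l2, 508, _PAGE_PRESENT);

    for ( size_t i = 508; i < 512; i++ )
        printf("l2[%zu] = 0x%016llx\n", i, (unsigned long long)l2[i].e);
    return 0;
}

The hunks above additionally distinguish the v == current case, where the existing linear maps can be written directly, from the cross-vcpu case, where the monitor or shadow tables are mapped explicitly with sh_map_domain_page(); the sketch leaves that distinction out.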