[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [xen-unstable] merge with xen-unstable.hg (staging)
# HG changeset patch # User Alex Williamson <alex.williamson@xxxxxx> # Date 1190917336 21600 # Node ID ee498c9af856e8fc9d37d156a2123e1a26d83444 # Parent eae7b887e5acb4e087f4787be581985e21d0d40d # Parent 8817a53c030f9c2c5f39fafad72aa9502342e7b3 merge with xen-unstable.hg (staging) --- buildconfigs/mk.linux-2.6-xen | 9 config/NetBSD.mk | 3 docs/src/interface.tex | 16 docs/src/user.tex | 8 extras/mini-os/arch/x86/minios-x86_64.lds | 9 tools/check/check_crypto_lib | 8 tools/check/check_zlib_lib | 8 tools/console/client/main.c | 6 tools/console/daemon/utils.c | 1 tools/examples/blktap | 2 tools/examples/network-bridge | 1 tools/firmware/hvmloader/mkhex | 2 tools/firmware/rombios/32bit/Makefile | 2 tools/firmware/rombios/32bit/mkhex | 2 tools/firmware/rombios/32bit/tcgbios/tcgbios.c | 43 tools/firmware/vmxassist/vm86.c | 5 tools/include/NetBSD/evtchn.h | 89 tools/include/NetBSD/privcmd.h | 105 + tools/ioemu/Makefile.target | 10 tools/ioemu/audio/audio.c | 4 tools/ioemu/audio/mixeng.c | 4 tools/ioemu/audio/ossaudio.c | 7 tools/ioemu/block-raw.c | 35 tools/ioemu/block-vvfat.c | 2 tools/ioemu/bswap.h | 7 tools/ioemu/cutils.c | 6 tools/ioemu/hw/fdc.c | 9 tools/ioemu/hw/ne2000.c | 8 tools/ioemu/hw/pass-through.c | 454 ++++ tools/ioemu/hw/pass-through.h | 89 tools/ioemu/hw/pc.c | 27 tools/ioemu/hw/sb16.c | 6 tools/ioemu/monitor.c | 38 tools/ioemu/osdep.h | 4 tools/ioemu/target-i386-dm/exec-dm.c | 4 tools/ioemu/usb-linux.c | 2 tools/ioemu/vl.c | 96 - tools/ioemu/vl.h | 6 tools/ioemu/vnc.c | 3 tools/libaio/src/compat-0_1.c | 2 tools/libfsimage/ext2fs/fsys_ext2fs.c | 6 tools/libfsimage/fat/fsys_fat.c | 12 tools/libfsimage/iso9660/fsys_iso9660.c | 2 tools/libfsimage/reiserfs/fsys_reiserfs.c | 6 tools/libfsimage/ufs/fsys_ufs.c | 4 tools/libxc/Makefile | 1 tools/libxc/xc_core.c | 3 tools/libxc/xc_core_x86.c | 3 tools/libxc/xc_dom_boot.c | 9 tools/libxc/xc_dom_core.c | 4 tools/libxc/xc_domain.c | 108 + tools/libxc/xc_domain_restore.c | 244 +- tools/libxc/xc_domain_save.c | 252 +- 
tools/libxc/xc_netbsd.c | 271 ++ tools/libxc/xc_private.c | 97 - tools/libxc/xc_resume.c | 13 tools/libxc/xenctrl.h | 39 tools/libxc/xg_private.h | 27 tools/libxc/xg_save_restore.h | 83 tools/libxen/include/xen/api/xen_all.h | 1 tools/pygrub/src/GrubConf.py | 28 tools/pygrub/src/LiloConf.py | 18 tools/python/xen/lowlevel/xc/xc.c | 1 tools/python/xen/lowlevel/xs/xs.c | 1 tools/python/xen/util/bootloader.py | 156 + tools/python/xen/util/xsm/acm/acm.py | 28 tools/python/xen/util/xsm/dummy/dummy.py | 4 tools/python/xen/util/xsm/flask/flask.py | 4 tools/python/xen/xend/XendConfig.py | 23 tools/python/xen/xend/XendDomain.py | 15 tools/python/xen/xend/XendDomainInfo.py | 58 tools/python/xen/xend/balloon.py | 16 tools/python/xen/xend/image.py | 2 tools/python/xen/xend/server/netif.py | 8 tools/python/xen/xm/create.py | 2 tools/python/xen/xm/main.py | 8 tools/python/xen/xm/xenapi_create.py | 9 tools/vtpm/tpm_emulator.patch | 26 tools/vtpm_manager/tcs/tcs.c | 2 tools/xenmon/xenbaked.c | 703 +++---- tools/xenstat/libxenstat/Makefile | 1 tools/xenstat/libxenstat/src/xenstat_netbsd.c | 97 + tools/xenstat/xentop/xentop.c | 9 tools/xenstore/Makefile | 1 tools/xenstore/xenstored_netbsd.c | 73 tools/xentrace/xentrace.c | 144 + tools/xm-test/lib/XmTestLib/acm.py | 2 xen/arch/x86/acpi/boot.c | 6 xen/arch/x86/dmi_scan.c | 31 xen/arch/x86/domain.c | 36 xen/arch/x86/domctl.c | 172 + xen/arch/x86/e820.c | 78 xen/arch/x86/hvm/hvm.c | 412 ++++ xen/arch/x86/hvm/i8254.c | 8 xen/arch/x86/hvm/intercept.c | 4 xen/arch/x86/hvm/io.c | 123 + xen/arch/x86/hvm/irq.c | 11 xen/arch/x86/hvm/platform.c | 1 xen/arch/x86/hvm/svm/Makefile | 2 xen/arch/x86/hvm/svm/amd_iommu/Makefile | 4 xen/arch/x86/hvm/svm/amd_iommu/amd-iommu-detect.c | 211 ++ xen/arch/x86/hvm/svm/amd_iommu/amd-iommu-init.c | 145 + xen/arch/x86/hvm/svm/amd_iommu/amd-iommu-map.c | 419 ++++ xen/arch/x86/hvm/svm/amd_iommu/pci-amd-iommu.c | 389 ++++ xen/arch/x86/hvm/svm/amd_iommu/pci-direct.h | 48 xen/arch/x86/hvm/svm/amd_iommu/pci_regs.h | 
513 +++++ xen/arch/x86/hvm/svm/emulate.c | 19 xen/arch/x86/hvm/svm/svm.c | 251 +- xen/arch/x86/hvm/svm/vmcb.c | 3 xen/arch/x86/hvm/svm/x86_32/exits.S | 12 xen/arch/x86/hvm/svm/x86_64/exits.S | 12 xen/arch/x86/hvm/vioapic.c | 4 xen/arch/x86/hvm/vmx/Makefile | 2 xen/arch/x86/hvm/vmx/intr.c | 21 xen/arch/x86/hvm/vmx/vmcs.c | 168 + xen/arch/x86/hvm/vmx/vmx.c | 268 +- xen/arch/x86/hvm/vmx/vtd/Makefile | 4 xen/arch/x86/hvm/vmx/vtd/dmar.c | 512 +++++ xen/arch/x86/hvm/vmx/vtd/dmar.h | 92 + xen/arch/x86/hvm/vmx/vtd/intel-iommu.c | 2002 ++++++++++++++++++++++ xen/arch/x86/hvm/vmx/vtd/io.c | 150 + xen/arch/x86/hvm/vmx/vtd/msi.h | 127 + xen/arch/x86/hvm/vmx/vtd/pci-direct.h | 48 xen/arch/x86/hvm/vmx/vtd/pci_regs.h | 449 ++++ xen/arch/x86/hvm/vmx/vtd/utils.c | 333 +++ xen/arch/x86/hvm/vmx/x86_32/exits.S | 31 xen/arch/x86/hvm/vmx/x86_64/exits.S | 35 xen/arch/x86/hvm/vpic.c | 8 xen/arch/x86/hvm/vpt.c | 19 xen/arch/x86/io_apic.c | 76 xen/arch/x86/irq.c | 38 xen/arch/x86/mm/p2m.c | 81 xen/arch/x86/mm/shadow/common.c | 147 - xen/arch/x86/mm/shadow/multi.c | 10 xen/arch/x86/numa.c | 18 xen/arch/x86/oprofile/op_model_athlon.c | 1 xen/arch/x86/platform_hypercall.c | 14 xen/arch/x86/setup.c | 68 xen/arch/x86/time.c | 70 xen/arch/x86/traps.c | 15 xen/arch/x86/x86_32/asm-offsets.c | 3 xen/arch/x86/x86_32/traps.c | 27 xen/arch/x86/x86_64/asm-offsets.c | 3 xen/arch/x86/x86_64/traps.c | 27 xen/common/kernel.c | 5 xen/common/libelf/libelf-private.h | 14 xen/common/schedule.c | 11 xen/common/trace.c | 298 ++- xen/drivers/acpi/tables.c | 1 xen/drivers/char/console.c | 15 xen/drivers/char/ns16550.c | 18 xen/include/asm-x86/acpi.h | 2 xen/include/asm-x86/amd-iommu.h | 70 xen/include/asm-x86/config.h | 2 xen/include/asm-x86/e820.h | 1 xen/include/asm-x86/fixmap.h | 6 xen/include/asm-x86/hvm/domain.h | 5 xen/include/asm-x86/hvm/hvm.h | 52 xen/include/asm-x86/hvm/io.h | 1 xen/include/asm-x86/hvm/iommu.h | 52 xen/include/asm-x86/hvm/irq.h | 20 xen/include/asm-x86/hvm/svm/amd-iommu-defs.h | 419 ++++ 
xen/include/asm-x86/hvm/svm/amd-iommu-proto.h | 88 xen/include/asm-x86/hvm/svm/emulate.h | 10 xen/include/asm-x86/hvm/svm/svm.h | 16 xen/include/asm-x86/hvm/trace.h | 158 + xen/include/asm-x86/hvm/vcpu.h | 2 xen/include/asm-x86/hvm/vmx/intel-iommu.h | 401 ++++ xen/include/asm-x86/iommu.h | 84 xen/include/asm-x86/msr.h | 3 xen/include/asm-x86/p2m.h | 17 xen/include/asm-x86/system.h | 3 xen/include/asm-x86/time.h | 15 xen/include/public/arch-x86/xen-x86_32.h | 3 xen/include/public/domctl.h | 68 xen/include/public/platform.h | 11 xen/include/public/trace.h | 48 xen/include/xen/acpi.h | 70 xen/include/xen/console.h | 1 xen/include/xen/dmi.h | 2 xen/include/xen/irq.h | 7 xen/include/xen/sched.h | 4 xen/include/xen/time.h | 26 xen/include/xen/trace.h | 77 184 files changed, 11972 insertions(+), 1681 deletions(-) diff -r eae7b887e5ac -r ee498c9af856 buildconfigs/mk.linux-2.6-xen --- a/buildconfigs/mk.linux-2.6-xen Thu Sep 27 09:16:23 2007 -0600 +++ b/buildconfigs/mk.linux-2.6-xen Thu Sep 27 12:22:16 2007 -0600 @@ -29,6 +29,11 @@ LINUX_DIR = build-linux-$(LINUX_VER) LINUX_DIR = build-linux-$(LINUX_VER)$(EXTRAVERSION)_$(XEN_TARGET_ARCH) IMAGE_TARGET ?= vmlinuz +ifneq ($(XEN_TARGET_ARCH),ia64) +IMAGE_PATH ?= arch/$(LINUX_ARCH)/boot/$(firstword $(IMAGE_TARGET)) +else +IMAGE_PATH ?= arch/ia64/hp/sim/boot/vmlinux.gz +endif INSTALL_BOOT_PATH := $(DESTDIR)/boot LINUX_VER3 := $(LINUX_SERIES).$(word 3, $(subst ., ,$(LINUX_VER))) @@ -62,7 +67,9 @@ endif fi $(MAKE) -C $(LINUX_DIR) ARCH=$(LINUX_ARCH) INSTALL_PATH=$(DESTDIR) $(IMAGE_TARGET) mkdir -p $(INSTALL_BOOT_PATH) - $(MAKE) -C $(LINUX_DIR) ARCH=$(LINUX_ARCH) INSTALL_PATH=$(INSTALL_BOOT_PATH) install + cp $(LINUX_DIR)/$(IMAGE_PATH) $(INSTALL_BOOT_PATH)/vmlinuz-$(LINUX_VER)$(EXTRAVERSION) + cp $(LINUX_DIR)/.config $(INSTALL_BOOT_PATH)/config-$(LINUX_VER)$(EXTRAVERSION) + cp $(LINUX_DIR)/System.map $(INSTALL_BOOT_PATH)/System.map-$(LINUX_VER)$(EXTRAVERSION) $(LINUX_DIR)/include/linux/autoconf.h: 
CONFIG_FILE=$(CURDIR)/$(LINUX_DIR)/.config $(LINUX_DIR)/include/linux/autoconf.h: $(LINUX_SRCDIR)/.valid-src diff -r eae7b887e5ac -r ee498c9af856 config/NetBSD.mk --- a/config/NetBSD.mk Thu Sep 27 09:16:23 2007 -0600 +++ b/config/NetBSD.mk Thu Sep 27 12:22:16 2007 -0600 @@ -1,1 +1,4 @@ include $(XEN_ROOT)/config/StdGNU.mk include $(XEN_ROOT)/config/StdGNU.mk + +# Override settings for this OS +CURSES_LIBS = -lcurses diff -r eae7b887e5ac -r ee498c9af856 docs/src/interface.tex --- a/docs/src/interface.tex Thu Sep 27 09:16:23 2007 -0600 +++ b/docs/src/interface.tex Thu Sep 27 12:22:16 2007 -0600 @@ -1052,7 +1052,7 @@ This path contains: \end{description} \end{description} - \item[vtpm/] a directory containin vtpm backends + \item[vtpm/] a directory containing vtpm backends \begin{description} \item[$<$domid$>$/] a directory containing vtpm's for domid \begin{description} @@ -1287,7 +1287,7 @@ ring. \subsection{Network ring interface} The network device uses two shared memory rings for communication: one -for transmit, one for receieve. +for transmit, one for receive. Transmit requests are described by the following structure: @@ -1466,7 +1466,7 @@ The fields are as follows: interface \item[id] this value is echoed in the response message for this IO; the guest may use it to identify the original request -\item[sector\_number] start sector on the virtal device for this +\item[sector\_number] start sector on the virtual device for this request \item[frame\_and\_sects] This array contains structures encoding scatter-gather IO to be performed: @@ -1483,7 +1483,7 @@ The fields are as follows: Virtual TPM (VTPM) support provides TPM functionality to each virtual machine that requests this functionality in its configuration file. -The interface enables domains to access therr own private TPM like it +The interface enables domains to access their own private TPM like it was a hardware TPM built into the machine. 
The virtual TPM interface is implemented as a split driver, @@ -1504,11 +1504,11 @@ table mechanism. table mechanism. The backend driver has been implemented to only accept well-formed -TPM requests. To meet this requirement, the length inidicator in the +TPM requests. To meet this requirement, the length indicator in the TPM request must correctly indicate the length of the request. Otherwise an error message is automatically sent back by the device driver. -The virtual TPM implementation listenes for TPM request on /dev/vtpm. Since +The virtual TPM implementation listens for TPM request on /dev/vtpm. Since it must be able to apply the TPM request packet to the virtual TPM instance associated with the virtual machine, a 4-byte virtual TPM instance identifier is prepended to each packet by the backend driver (in network @@ -1536,7 +1536,7 @@ The fields are as follows: The fields are as follows: \begin{description} -\item[addr] The machine address of the page asscoiated with the TPM +\item[addr] The machine address of the page associated with the TPM request/response; a request/response may span multiple pages \item[ref] The grant table reference associated with the address. @@ -1982,7 +1982,7 @@ the value of {\bf op}). The available o stored. \item[XENMEM\_current\_reservation] Returns current memory reservation of the specified domain. -\item[XENMEM\_maximum\_reservation] Returns maximum memory resrevation +\item[XENMEM\_maximum\_reservation] Returns maximum memory reservation of the specified domain. \end{description} diff -r eae7b887e5ac -r ee498c9af856 docs/src/user.tex --- a/docs/src/user.tex Thu Sep 27 09:16:23 2007 -0600 +++ b/docs/src/user.tex Thu Sep 27 12:22:16 2007 -0600 @@ -1683,7 +1683,7 @@ to: section remains to detail a configuration that was used by older Xen versions.}} -Raw image file-backed VBDs amy also be attached to VMs using the +Raw image file-backed VBDs may also be attached to VMs using the Linux loopback driver. 
The only required change to the raw file instructions above are to specify the configuration entry as: \begin{quote} @@ -1694,7 +1694,7 @@ instructions above are to specify the co I/O-intensive domains.} This approach is known to experience substantial slowdowns under heavy I/O workloads, due to the I/O handling by the loopback block device used to support file-backed VBDs -in dom0. Loopbach support remains for old Xen installations, and users +in dom0. Loopback support remains for old Xen installations, and users are strongly encouraged to use the blktap-based file support (using ``{\tt{tap:aio}}'' as described above). @@ -4203,7 +4203,7 @@ on the vnet UDP port: # tcpdump udp port 1798 \end{verbatim} -If multicast is not being forwaded between machines you can configure +If multicast is not being forwarded between machines you can configure multicast forwarding using vn. Suppose we have machines hostA on 10.10.0.100 and hostB on 10.11.0.100 and that multicast is not forwarded between them. We use vn to configure each machine to forward to the other: @@ -4256,7 +4256,7 @@ as it will forward multicasts received f VMMs} to provide the illusion of contiguous physical memory, in Xen this is used during {\bf live migration}. -\item[Virtual Block Device] Persistant storage available to a virtual +\item[Virtual Block Device] Persistent storage available to a virtual machine, providing the abstraction of an actual block storage device. {\bf VBD}s may be actual block devices, filesystem images, or remote/network storage. diff -r eae7b887e5ac -r ee498c9af856 extras/mini-os/arch/x86/minios-x86_64.lds --- a/extras/mini-os/arch/x86/minios-x86_64.lds Thu Sep 27 09:16:23 2007 -0600 +++ b/extras/mini-os/arch/x86/minios-x86_64.lds Thu Sep 27 12:22:16 2007 -0600 @@ -21,15 +21,6 @@ SECTIONS _edata = .; /* End of data section */ - . = ALIGN(8192); /* init_task */ - .data.init_task : { *(.data.init_task) } - - . = ALIGN(4096); - .data.page_aligned : { *(.data.idt) } - - . 
= ALIGN(32); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } - __bss_start = .; /* BSS */ .bss : { *(.bss) diff -r eae7b887e5ac -r ee498c9af856 tools/check/check_crypto_lib --- a/tools/check/check_crypto_lib Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/check/check_crypto_lib Thu Sep 27 12:22:16 2007 -0600 @@ -2,6 +2,14 @@ # CHECK-BUILD CHECK-INSTALL RC=0 + +case $(uname -s) in +FreeBSD|NetBSD|OpenBSD) + exit 0 + ;; +*) + ;; +esac PATH=/sbin:$PATH set -e diff -r eae7b887e5ac -r ee498c9af856 tools/check/check_zlib_lib --- a/tools/check/check_zlib_lib Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/check/check_zlib_lib Thu Sep 27 12:22:16 2007 -0600 @@ -2,6 +2,14 @@ # CHECK-BUILD CHECK-INSTALL RC=0 + +case $(uname -s) in +FreeBSD|NetBSD|OpenBSD) + exit 0 + ;; +*) + ;; +esac PATH=/sbin:$PATH diff -r eae7b887e5ac -r ee498c9af856 tools/console/client/main.c --- a/tools/console/client/main.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/console/client/main.c Thu Sep 27 12:22:16 2007 -0600 @@ -34,7 +34,7 @@ #include <sys/select.h> #include <err.h> #include <errno.h> -#include <pty.h> +#include <string.h> #include "xs.h" @@ -113,9 +113,8 @@ static void init_term(int fd, struct ter { struct termios new_term; - if (tcgetattr(fd, old) == -1) { + if (tcgetattr(fd, old) == -1) return; - } new_term = *old; cfmakeraw(&new_term); @@ -289,6 +288,7 @@ int main(int argc, char **argv) err(errno, "Could not read tty from store"); } + init_term(spty, &attr); init_term(STDIN_FILENO, &attr); console_loop(spty, xs, path); restore_term(STDIN_FILENO, &attr); diff -r eae7b887e5ac -r ee498c9af856 tools/console/daemon/utils.c --- a/tools/console/daemon/utils.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/console/daemon/utils.c Thu Sep 27 12:22:16 2007 -0600 @@ -32,6 +32,7 @@ #include <sys/socket.h> #include <sys/un.h> #include <string.h> +#include <signal.h> #include "xenctrl.h" #include "utils.h" diff -r eae7b887e5ac -r ee498c9af856 tools/examples/blktap --- a/tools/examples/blktap Thu Sep 27 
09:16:23 2007 -0600 +++ b/tools/examples/blktap Thu Sep 27 12:22:16 2007 -0600 @@ -73,7 +73,7 @@ if [ -L "$p" ]; then if [ -L "$p" ]; then file=$(readlink -f "$p") || ebusy "$p link does not exist." else - [ -f "$p" ] || { ebusy "$p file does not exist." } + [ -f "$p" ] || { ebusy "$p file does not exist."; } file="$p" fi diff -r eae7b887e5ac -r ee498c9af856 tools/examples/network-bridge --- a/tools/examples/network-bridge Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/examples/network-bridge Thu Sep 27 12:22:16 2007 -0600 @@ -58,6 +58,7 @@ evalVariables "$@" netdev=${netdev:-$(ip route list | awk '/^default / { print $NF }' | sed 's/.* dev //')} +netdev=${netdev:-eth0} bridge=${bridge:-${netdev}} antispoof=${antispoof:-no} diff -r eae7b887e5ac -r ee498c9af856 tools/firmware/hvmloader/mkhex --- a/tools/firmware/hvmloader/mkhex Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/firmware/hvmloader/mkhex Thu Sep 27 12:22:16 2007 -0600 @@ -21,6 +21,6 @@ # echo "unsigned $1[] = {" -od -v -t x $2 | sed 's/^[0-9]* /0x/' | sed 's/ /, 0x/g' | sed 's/$/,/' +od -v -t x $2 | sed 's/^[0-9]* */0x/' | sed 's/ */, 0x/g' | sed 's/$/,/' | sed 's/0x,//' | sed 's/^[0-9]*,//' echo "};" diff -r eae7b887e5ac -r ee498c9af856 tools/firmware/rombios/32bit/Makefile --- a/tools/firmware/rombios/32bit/Makefile Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/firmware/rombios/32bit/Makefile Thu Sep 27 12:22:16 2007 -0600 @@ -39,4 +39,4 @@ clean: nm -u 32bitbios_all.o; \ exit 11; \ } || : - bash mkhex highbios_array 32bitbios_all.o > $@ + sh mkhex highbios_array 32bitbios_all.o > $@ diff -r eae7b887e5ac -r ee498c9af856 tools/firmware/rombios/32bit/mkhex --- a/tools/firmware/rombios/32bit/mkhex Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/firmware/rombios/32bit/mkhex Thu Sep 27 12:22:16 2007 -0600 @@ -21,6 +21,6 @@ # echo "unsigned $1[] = {" -od -v -t x $2 | sed 's/^[0-9]* /0x/' | sed 's/ /, 0x/g' | sed 's/$/,/' +od -v -t x $2 | sed 's/^[0-9]* */0x/' | sed 's/ */, 0x/g' | sed 's/$/,/' | sed 's/0x,//' | sed 
's/^[0-9]*,//' echo "};" diff -r eae7b887e5ac -r ee498c9af856 tools/firmware/rombios/32bit/tcgbios/tcgbios.c --- a/tools/firmware/rombios/32bit/tcgbios/tcgbios.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/firmware/rombios/32bit/tcgbios/tcgbios.c Thu Sep 27 12:22:16 2007 -0600 @@ -260,31 +260,44 @@ uint8_t acpi_validate_entry(struct acpi_ } +/* + * Search for the RSDP ACPI table in the memory starting at addr and + * ending at addr + len - 1. + */ +static struct acpi_20_rsdp *find_rsdp(const void *start, unsigned int len) +{ + char *rsdp = (char *)start; + char *end = rsdp + len; + /* scan memory in steps of 16 bytes */ + while (rsdp < end) { + /* check for expected string */ + if (!strncmp( rsdp, "RSD PTR ", 8)) + return (struct acpi_20_rsdp *)rsdp; + rsdp += 0x10; + } + return 0; +} + void tcpa_acpi_init(void) { struct acpi_20_rsdt *rsdt; + struct acpi_20_tcpa *tcpa = (void *)0; + struct acpi_20_rsdp *rsdp; uint32_t length; - struct acpi_20_tcpa *tcpa = (void *)0; - uint16_t found = 0; - uint16_t rsdp_off; uint16_t off; - struct acpi_20_rsdp *rsdp = (void *)0; + int found = 0; + uint16_t ebda_seg; if (MA_IsTPMPresent() == 0) { return; } - /* scan memory in steps of 16 bytes in the ACPI_SEGMENT segment */ - found = 0; - for (rsdp_off = 0; rsdp_off < 0xfff0; rsdp_off += 0x10) { - char *_rsdp = (char *)(ACPI_SEGMENT << 4); - /* check for expected string */ - if (!strncmp( &_rsdp[rsdp_off], "RSD PTR ", 8)) { - found = 1; - rsdp = (struct acpi_20_rsdp *)&_rsdp[rsdp_off]; - break; - } - } + /* RSDP in EBDA? 
*/ + ebda_seg = *(uint16_t *)ADDR_FROM_SEG_OFF(0x40, 0xe); + rsdp = find_rsdp((void *)(ebda_seg << 16), 1024); + + if (!rsdp) + rsdp = find_rsdp((void *)(ACPI_SEGMENT << 4), 0x20000); if (rsdp) { uint32_t ctr = 0; diff -r eae7b887e5ac -r ee498c9af856 tools/firmware/vmxassist/vm86.c --- a/tools/firmware/vmxassist/vm86.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/firmware/vmxassist/vm86.c Thu Sep 27 12:22:16 2007 -0600 @@ -1622,6 +1622,11 @@ opcode(struct regs *regs) TRACE((regs, regs->eip - eip, "lock")); continue; + case 0xF4: /* hlt */ + TRACE((regs, regs->eip - eip, "hlt")); + /* Do something power-saving here! */ + return OPC_EMULATED; + case 0xF6: /* addr32 testb $imm, r/m8 */ if (!(prefix & ADDR32)) goto invalid; diff -r eae7b887e5ac -r ee498c9af856 tools/include/NetBSD/evtchn.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/include/NetBSD/evtchn.h Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,89 @@ +/* $NetBSD: evtchn.h,v 1.1.1.1 2007/06/14 19:39:45 bouyer Exp $ */ +/****************************************************************************** + * evtchn.h + * + * Interface to /dev/xen/evtchn. + * + * Copyright (c) 2003-2005, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __NetBSD_EVTCHN_H__ +#define __NetBSD_EVTCHN_H__ + +/* + * Bind a fresh port to VIRQ @virq. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_VIRQ \ + _IOWR('E', 4, struct ioctl_evtchn_bind_virq) +struct ioctl_evtchn_bind_virq { + unsigned int virq; + unsigned int port; +}; + +/* + * Bind a fresh port to remote <@remote_domain, @remote_port>. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \ + _IOWR('E', 5, struct ioctl_evtchn_bind_interdomain) +struct ioctl_evtchn_bind_interdomain { + unsigned int remote_domain, remote_port; + unsigned int port; +}; + +/* + * Allocate a fresh port for binding to @remote_domain. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \ + _IOWR('E', 6, struct ioctl_evtchn_bind_unbound_port) +struct ioctl_evtchn_bind_unbound_port { + unsigned int remote_domain; + unsigned int port; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_UNBIND \ + _IOW('E', 7, struct ioctl_evtchn_unbind) +struct ioctl_evtchn_unbind { + unsigned int port; +}; + +/* + * Send event to previously allocated @port. + */ +#define IOCTL_EVTCHN_NOTIFY \ + _IOW('E', 8, struct ioctl_evtchn_notify) +struct ioctl_evtchn_notify { + unsigned int port; +}; + +/* Clear and reinitialise the event buffer. Clear error condition. 
*/ +#define IOCTL_EVTCHN_RESET \ + _IO('E', 9) + +#endif /* __NetBSD_EVTCHN_H__ */ diff -r eae7b887e5ac -r ee498c9af856 tools/include/NetBSD/privcmd.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/include/NetBSD/privcmd.h Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,105 @@ +/* NetBSD: xenio.h,v 1.3 2005/05/24 12:07:12 yamt Exp $ */ + +/****************************************************************************** + * privcmd.h + * + * Copyright (c) 2003-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __NetBSD_PRIVCMD_H__ +#define __NetBSD_PRIVCMD_H__ + +/* Interface to /proc/xen/privcmd */ + +typedef struct privcmd_hypercall +{ + unsigned long op; + unsigned long arg[5]; +} privcmd_hypercall_t; + +typedef struct privcmd_mmap_entry { + unsigned long va; + unsigned long mfn; + unsigned long npages; +} privcmd_mmap_entry_t; + +typedef struct privcmd_mmap { + int num; + domid_t dom; /* target domain */ + privcmd_mmap_entry_t *entry; +} privcmd_mmap_t; + +typedef struct privcmd_mmapbatch { + int num; /* number of pages to populate */ + domid_t dom; /* target domain */ + unsigned long addr; /* virtual address */ + unsigned long *arr; /* array of mfns - top nibble set on err */ +} privcmd_mmapbatch_t; + +typedef struct privcmd_blkmsg +{ + unsigned long op; + void *buf; + int buf_size; +} privcmd_blkmsg_t; + +/* + * @cmd: IOCTL_PRIVCMD_HYPERCALL + * @arg: &privcmd_hypercall_t + * Return: Value returned from execution of the specified hypercall. + */ +#define IOCTL_PRIVCMD_HYPERCALL \ + _IOWR('P', 0, privcmd_hypercall_t) + +#if defined(_KERNEL) +/* compat */ +#define IOCTL_PRIVCMD_INITDOMAIN_EVTCHN_OLD \ + _IO('P', 1) +#endif /* defined(_KERNEL) */ + +#define IOCTL_PRIVCMD_MMAP \ + _IOW('P', 2, privcmd_mmap_t) +#define IOCTL_PRIVCMD_MMAPBATCH \ + _IOW('P', 3, privcmd_mmapbatch_t) +#define IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN \ + _IOR('P', 4, unsigned long) + +/* + * @cmd: IOCTL_PRIVCMD_INITDOMAIN_EVTCHN + * @arg: n/a + * Return: Port associated with domain-controller end of control event channel + * for the initial domain. + */ +#define IOCTL_PRIVCMD_INITDOMAIN_EVTCHN \ + _IOR('P', 5, int) + +/* Interface to /dev/xenevt */ +/* EVTCHN_RESET: Clear and reinit the event buffer. Clear error condition. */ +#define EVTCHN_RESET _IO('E', 1) +/* EVTCHN_BIND: Bind to the specified event-channel port. */ +#define EVTCHN_BIND _IOW('E', 2, unsigned long) +/* EVTCHN_UNBIND: Unbind from the specified event-channel port. 
*/ +#define EVTCHN_UNBIND _IOW('E', 3, unsigned long) + +#endif /* __NetBSD_PRIVCMD_H__ */ diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/Makefile.target --- a/tools/ioemu/Makefile.target Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/Makefile.target Thu Sep 27 12:22:16 2007 -0600 @@ -348,6 +348,16 @@ VL_OBJS+=tap-win32.o VL_OBJS+=tap-win32.o endif +ifeq (,$(wildcard /usr/include/pci)) +$(warning *** pciutils-devl package not found - missing /usr/include/pci) +$(warning *** PCI passthrough capability has been disabled) +else +LIBS+=-lpci +VL_OBJS+= pass-through.o +CFLAGS += -DCONFIG_PASSTHROUGH +$(info *** PCI passthrough capability has been enabled ***) +endif + SOUND_HW = sb16.o es1370.o AUDIODRV = audio.o noaudio.o wavaudio.o ifdef CONFIG_SDL diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/audio/audio.c --- a/tools/ioemu/audio/audio.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/audio/audio.c Thu Sep 27 12:22:16 2007 -0600 @@ -207,7 +207,7 @@ static char *audio_alloc_prefix (const c strcat (r, s); for (i = 0; i < len; ++i) { - u[i] = toupper (u[i]); + u[i] = toupper ((uint8_t)u[i]); } } return r; @@ -446,7 +446,7 @@ static void audio_process_options (const /* copy while upper-casing, including trailing zero */ for (i = 0; i <= preflen; ++i) { - optname[i + sizeof (qemu_prefix) - 1] = toupper (prefix[i]); + optname[i + sizeof (qemu_prefix) - 1] = toupper ((uint8_t)prefix[i]); } strcat (optname, "_"); strcat (optname, opt->name); diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/audio/mixeng.c --- a/tools/ioemu/audio/mixeng.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/audio/mixeng.c Thu Sep 27 12:22:16 2007 -0600 @@ -102,6 +102,7 @@ #undef SHIFT t_sample *mixeng_conv[2][2][2][2] = { +#ifndef _BSD { { { @@ -146,9 +147,11 @@ t_sample *mixeng_conv[2][2][2][2] = { } } } +#endif /* !_BSD */ }; f_sample *mixeng_clip[2][2][2][2] = { +#ifndef _BSD { { { @@ -193,6 +196,7 @@ f_sample *mixeng_clip[2][2][2][2] = { } } } +#endif /* !_BSD */ }; /* diff -r 
eae7b887e5ac -r ee498c9af856 tools/ioemu/audio/ossaudio.c --- a/tools/ioemu/audio/ossaudio.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/audio/ossaudio.c Thu Sep 27 12:22:16 2007 -0600 @@ -21,10 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include <stdlib.h> #include <sys/mman.h> #include <sys/types.h> #include <sys/ioctl.h> +#if defined(__OpenBSD__) +#include <soundcard.h> +#else #include <sys/soundcard.h> +#endif #include "vl.h" #define AUDIO_CAP "oss" @@ -231,7 +236,7 @@ static int oss_open (int in, struct oss_ goto err; } - if (ioctl (fd, SNDCTL_DSP_NONBLOCK)) { + if (ioctl (fd, SNDCTL_DSP_NONBLOCK, NULL)) { oss_logerr2 (errno, typ, "Failed to set non-blocking mode\n"); goto err; } diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/block-raw.c --- a/tools/ioemu/block-raw.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/block-raw.c Thu Sep 27 12:22:16 2007 -0600 @@ -53,8 +53,13 @@ #include <linux/cdrom.h> #include <linux/fd.h> #endif -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) #include <sys/disk.h> +#endif +#if defined(__OpenBSD__) +#include <sys/ioctl.h> +#include <sys/disklabel.h> +#include <sys/dkio.h> #endif //#define DEBUG_FLOPPY @@ -150,7 +155,7 @@ static int raw_pread(BlockDriverState *b if (lseek(s->fd, offset, SEEK_SET) == (off_t)-1) { ++(s->lseek_err_cnt); if(s->lseek_err_cnt <= 10) { - DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %ld, %p, %d) [%ld] lseek failed : %d = %s\n", + DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 "] lseek failed : %d = %s\n", s->fd, bs->filename, offset, @@ -166,7 +171,7 @@ static int raw_pread(BlockDriverState *b if (ret == count) goto label__raw_read__success; - DEBUG_BLOCK_PRINT("raw_read(%d:%s, %ld, %p, %d) [%ld] read failed %d : %d = %s\n", + DEBUG_BLOCK_PRINT("raw_read(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 "] read failed %d : %d = %s\n", s->fd, bs->filename, offset, @@ -185,7 +190,7 @@ static int raw_pread(BlockDriverState *b if (ret == 
count) goto label__raw_read__success; - DEBUG_BLOCK_PRINT("raw_read(%d:%s, %ld, %p, %d) [%ld] retry read failed %d : %d = %s\n", + DEBUG_BLOCK_PRINT("raw_read(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 "] retry read failed %d : %d = %s\n", s->fd, bs->filename, offset, @@ -215,7 +220,7 @@ static int raw_pwrite(BlockDriverState * if (lseek(s->fd, offset, SEEK_SET) == (off_t)-1) { ++(s->lseek_err_cnt); if(s->lseek_err_cnt) { - DEBUG_BLOCK_PRINT("raw_write(%d:%s, %ld, %p, %d) [%ld] lseek failed : %d = %s\n", + DEBUG_BLOCK_PRINT("raw_write(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 "] lseek failed : %d = %s\n", s->fd, bs->filename, offset, @@ -231,7 +236,7 @@ static int raw_pwrite(BlockDriverState * if (ret == count) goto label__raw_write__success; - DEBUG_BLOCK_PRINT("raw_write(%d:%s, %ld, %p, %d) [%ld] write failed %d : %d = %s\n", + DEBUG_BLOCK_PRINT("raw_write(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 "] write failed %d : %d = %s\n", s->fd, bs->filename, offset, @@ -496,6 +501,23 @@ static int raw_truncate(BlockDriverState return 0; } +#ifdef __OpenBSD__ +static int64_t raw_getlength(BlockDriverState *bs) +{ + int fd = ((BDRVRawState*)bs->opaque)->fd; + struct stat st; + if(fstat(fd, &st)) + return -1; + if(S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)){ + struct disklabel dl; + if(ioctl(fd, DIOCGDINFO, &dl)) + return -1; + return (uint64_t)dl.d_secsize * + dl.d_partitions[DISKPART(st.st_rdev)].p_size; + }else + return st.st_size; +} +#else /* !__OpenBSD__ */ static int64_t raw_getlength(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; @@ -542,6 +564,7 @@ static int64_t raw_getlength(BlockDrive } return size; } +#endif static int raw_create(const char *filename, int64_t total_size, const char *backing_file, int flags) diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/block-vvfat.c --- a/tools/ioemu/block-vvfat.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/block-vvfat.c Thu Sep 27 12:22:16 2007 -0600 @@ -1017,7 +1017,7 @@ DLOG(if (stderr == NULL) { i = strrchr(dirname, 
':') - dirname; assert(i >= 3); - if (dirname[i-2] == ':' && isalpha(dirname[i-1])) + if (dirname[i-2] == ':' && isalpha((uint8_t)dirname[i-1])) /* workaround for DOS drive names */ dirname += i-1; else diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/bswap.h --- a/tools/ioemu/bswap.h Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/bswap.h Thu Sep 27 12:22:16 2007 -0600 @@ -4,6 +4,11 @@ #include "config-host.h" #include <inttypes.h> + +#ifdef _BSD +#include <sys/endian.h> +#include <sys/types.h> +#else #ifdef HAVE_BYTESWAP_H #include <byteswap.h> @@ -73,6 +78,8 @@ static inline void bswap64s(uint64_t *s) *s = bswap64(*s); } +#endif /* _BSD */ + #if defined(WORDS_BIGENDIAN) #define be_bswap(v, size) (v) #define le_bswap(v, size) bswap ## size(v) diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/cutils.c --- a/tools/ioemu/cutils.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/cutils.c Thu Sep 27 12:22:16 2007 -0600 @@ -23,7 +23,7 @@ */ #include "vl.h" -void pstrcpy(char *buf, int buf_size, const char *str) +void pstrcpy(char *buf, size_t buf_size, const char *str) { int c; char *q = buf; @@ -41,7 +41,7 @@ void pstrcpy(char *buf, int buf_size, co } /* strcat and truncate. 
*/ -char *pstrcat(char *buf, int buf_size, const char *s) +char *pstrcat(char *buf, size_t buf_size, const char *s) { int len; len = strlen(buf); @@ -72,7 +72,7 @@ int stristart(const char *str, const cha p = str; q = val; while (*q != '\0') { - if (toupper(*p) != toupper(*q)) + if (toupper((uint8_t)*p) != toupper((uint8_t)*q)) return 0; p++; q++; diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/hw/fdc.c --- a/tools/ioemu/hw/fdc.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/hw/fdc.c Thu Sep 27 12:22:16 2007 -0600 @@ -1100,8 +1100,13 @@ static uint32_t fdctrl_read_data (fdctrl len = fdctrl->data_len - fdctrl->data_pos; if (len > FD_SECTOR_LEN) len = FD_SECTOR_LEN; - bdrv_read(cur_drv->bs, fd_sector(cur_drv), - fdctrl->fifo, len); + if (cur_drv->bs) { + bdrv_read(cur_drv->bs, fd_sector(cur_drv), + fdctrl->fifo, len); + } else { + FLOPPY_ERROR("can't read data from drive\n"); + return 0; + } } } retval = fdctrl->fifo[pos]; diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/hw/ne2000.c --- a/tools/ioemu/hw/ne2000.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/hw/ne2000.c Thu Sep 27 12:22:16 2007 -0600 @@ -252,7 +252,7 @@ static void ne2000_receive(void *opaque, { NE2000State *s = opaque; uint8_t *p; - int total_len, next, avail, len, index, mcast_idx; + unsigned int total_len, next, avail, len, index, mcast_idx; uint8_t buf1[60]; static const uint8_t broadcast_macaddr[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; @@ -327,7 +327,11 @@ static void ne2000_receive(void *opaque, /* write packet data */ while (size > 0) { - avail = s->stop - index; + /* taviso: this can wrap, so check its okay. */ + if (index <= s->stop) + avail = s->stop - index; + else + avail = 0; len = size; if (len > avail) len = avail; diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/hw/pass-through.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ioemu/hw/pass-through.c Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2007, Neocleus Corporation. 
+ * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Alex Novik <alex@xxxxxxxxxxxx> + * Allen Kay <allen.m.kay@xxxxxxxxx> + * Guy Zana <guy@xxxxxxxxxxxx> + * + * This file implements direct PCI assignment to a HVM guest + * + */ +#include "vl.h" +#include "pass-through.h" +#include "pci/header.h" +#include "pci/pci.h" + +extern FILE *logfile; +char *token; + +int pci_devs(const char *direct_pci) +{ + int count = 0; + const char *c; + + /* skip first "[" character */ + c = direct_pci + 1; + while ((c = strchr(c, '[')) != NULL) { + c++; + count++; + } + return (count); +} + +int next_token(char *direct_pci) +{ + if (token == NULL) + token = strtok(direct_pci, ","); + else + token = strtok(NULL, ","); + token = strchr(token, 'x'); + token = token + 1; + return ((int) strtol(token, NULL, 16)); +} + +void next_bdf(char *direct_pci, int *seg, + int *bus, int *dev, int *func) +{ + *seg = next_token(direct_pci); + *bus = next_token(direct_pci); + *dev = next_token(direct_pci); + *func = next_token(direct_pci); +} + +uint8_t find_cap_offset(struct pci_dev *pci_dev, uint8_t cap) +{ + int id; + int max_cap = 48; + int pos = PCI_CAPABILITY_LIST; + int status; + + status = pci_read_byte(pci_dev, PCI_STATUS); + if ( (status & PCI_STATUS_CAP_LIST) == 0 ) + return 0; + + while ( max_cap-- ) + { + pos = 
pci_read_byte(pci_dev, pos); + if ( pos < 0x40 ) + break; + + pos &= ~3; + id = pci_read_byte(pci_dev, pos + PCI_CAP_LIST_ID); + + if ( id == 0xff ) + break; + if ( id == cap ) + return pos; + + pos += PCI_CAP_LIST_NEXT; + } + return 0; +} + +void pdev_flr(struct pci_dev *pci_dev) +{ + int pos; + int dev_cap; + int dev_status; + + pos = find_cap_offset(pci_dev, PCI_CAP_ID_EXP); + if ( pos ) + { + dev_cap = pci_read_long(pci_dev, pos + PCI_EXP_DEVCAP); + if ( dev_cap & PCI_EXP_DEVCAP_FLR ) + { + pci_write_word(pci_dev, pos + PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_FLR); + do { + dev_status = pci_read_long(pci_dev, pos + PCI_EXP_DEVSTA); + } while (dev_status & PCI_EXP_DEVSTA_TRPND); + } + } +} + +/* Being called each time a mmio region has been updated */ +void pt_iomem_map(PCIDevice *d, int i, uint32_t e_phys, uint32_t e_size, + int type) +{ + struct pt_dev *assigned_device = (struct pt_dev *)d; + uint32_t old_ebase = assigned_device->bases[i].e_physbase; + int first_map = ( assigned_device->bases[i].e_size == 0 ); + int ret = 0; + + assigned_device->bases[i].e_physbase = e_phys; + assigned_device->bases[i].e_size= e_size; + + PT_LOG("e_phys=%08x maddr=%08x type=%d len=%08x index=%d\n", + e_phys, assigned_device->bases[i].access.maddr, type, e_size, i); + + if ( e_size == 0 ) + return; + + if ( !first_map ) + { + /* Remove old mapping */ + ret = xc_domain_memory_mapping(xc_handle, domid, old_ebase >> 12, + assigned_device->bases[i].access.maddr >> 12, + (e_size+0xFFF) >> 12, + DPCI_REMOVE_MAPPING); + if ( ret != 0 ) + { + PT_LOG("Error: remove old mapping failed!\n"); + return; + } + } + + /* Create new mapping */ + ret = xc_domain_memory_mapping(xc_handle, domid, + assigned_device->bases[i].e_physbase >> 12, + assigned_device->bases[i].access.maddr >> 12, + (e_size+0xFFF) >> 12, + DPCI_ADD_MAPPING); + if ( ret != 0 ) + PT_LOG("Error: create new mapping failed!\n"); + +} + +/* Being called each time a pio region has been updated */ +void pt_ioport_map(PCIDevice *d, int i, + 
uint32_t e_phys, uint32_t e_size, int type) +{ + struct pt_dev *assigned_device = (struct pt_dev *)d; + uint32_t old_ebase = assigned_device->bases[i].e_physbase; + int first_map = ( assigned_device->bases[i].e_size == 0 ); + int ret = 0; + + assigned_device->bases[i].e_physbase = e_phys; + assigned_device->bases[i].e_size= e_size; + + PT_LOG("e_phys=%04x pio_base=%04x len=%04x index=%d\n", + (uint16_t)e_phys, (uint16_t)assigned_device->bases[i].access.pio_base, + (uint16_t)e_size, i); + + if ( e_size == 0 ) + return; + + if ( !first_map ) + { + /* Remove old mapping */ + ret = xc_domain_ioport_mapping(xc_handle, domid, old_ebase, + assigned_device->bases[i].access.pio_base, e_size, + DPCI_REMOVE_MAPPING); + if ( ret != 0 ) + { + PT_LOG("Error: remove old mapping failed!\n"); + return; + } + } + + /* Create new mapping */ + ret = xc_domain_ioport_mapping(xc_handle, domid, e_phys, + assigned_device->bases[i].access.pio_base, e_size, + DPCI_ADD_MAPPING); + if ( ret != 0 ) + PT_LOG("Error: create new mapping failed!\n"); + +} + +static void pt_pci_write_config(PCIDevice *d, uint32_t address, uint32_t val, + int len) +{ + struct pt_dev *assigned_device = (struct pt_dev *)d; + struct pci_dev *pci_dev = assigned_device->pci_dev; + +#ifdef PT_DEBUG_PCI_CONFIG_ACCESS + PT_LOG("(%x.%x): address=%04x val=0x%08x len=%d\n", + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); +#endif + + /* Pre-write hooking */ + switch ( address ) { + case 0x0C ... 
0x3F: + pci_default_write_config(d, address, val, len); + return; + } + + /* PCI config pass-through */ + if (address == 0x4) { + switch (len){ + case 1: + pci_write_byte(pci_dev, address, val); + break; + case 2: + pci_write_word(pci_dev, address, val); + break; + case 4: + pci_write_long(pci_dev, address, val); + break; + } + } + + if (address == 0x4) { + /* Post-write hooking */ + pci_default_write_config(d, address, val, len); + } +} + +static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len) +{ + struct pt_dev *assigned_device = (struct pt_dev *)d; + struct pci_dev *pci_dev = assigned_device->pci_dev; + uint32_t val = 0xFF; + + /* Pre-hooking */ + switch ( address ) { + case 0x0C ... 0x3F: + val = pci_default_read_config(d, address, len); + goto exit; + } + + switch ( len ) { + case 1: + val = pci_read_byte(pci_dev, address); + break; + case 2: + val = pci_read_word(pci_dev, address); + break; + case 4: + val = pci_read_long(pci_dev, address); + break; + } + +exit: + +#ifdef PT_DEBUG_PCI_CONFIG_ACCESS + PT_LOG("(%x.%x): address=%04x val=0x%08x len=%d\n", + (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len); +#endif + + return val; +} + +static int pt_register_regions(struct pt_dev *assigned_device) +{ + int i = 0; + uint32_t bar_data = 0; + struct pci_dev *pci_dev = assigned_device->pci_dev; + PCIDevice *d = &assigned_device->dev; + + /* Register PIO/MMIO BARs */ + for ( i=0; i < PCI_BAR_ENTRIES; i++ ) + { + if ( pci_dev->base_addr[i] ) + { + assigned_device->bases[i].e_physbase = pci_dev->base_addr[i]; + assigned_device->bases[i].access.u = pci_dev->base_addr[i]; + + /* Register current region */ + bar_data = *((uint32_t*)(d->config + PCI_BASE_ADDRESS_0) + i); + if ( bar_data & PCI_ADDRESS_SPACE_IO ) + pci_register_io_region((PCIDevice *)assigned_device, i, + (uint32_t)pci_dev->size[i], PCI_ADDRESS_SPACE_IO, + pt_ioport_map); + else if ( bar_data & PCI_ADDRESS_SPACE_MEM_PREFETCH ) + pci_register_io_region((PCIDevice 
*)assigned_device, i, + (uint32_t)pci_dev->size[i], PCI_ADDRESS_SPACE_MEM_PREFETCH, + pt_iomem_map); + else + pci_register_io_region((PCIDevice *)assigned_device, i, + (uint32_t)pci_dev->size[i], PCI_ADDRESS_SPACE_MEM, + pt_iomem_map); + + PT_LOG("IO region registered (size=0x%08x base_addr=0x%08x)\n", + (uint32_t)(pci_dev->size[i]), + (uint32_t)(pci_dev->base_addr[i])); + } + } + + /* Register expansion ROM address */ + if ( pci_dev->rom_base_addr && pci_dev->rom_size ) + { + assigned_device->bases[PCI_ROM_SLOT].e_physbase = + pci_dev->rom_base_addr; + assigned_device->bases[PCI_ROM_SLOT].access.maddr = + pci_dev->rom_base_addr; + pci_register_io_region((PCIDevice *)assigned_device, PCI_ROM_SLOT, + pci_dev->rom_size, PCI_ADDRESS_SPACE_MEM_PREFETCH, + pt_iomem_map); + + PT_LOG("Expansion ROM registered (size=0x%08x base_addr=0x%08x)\n", + (uint32_t)(pci_dev->rom_size), (uint32_t)(pci_dev->rom_base_addr)); + } + + return 0; +} + +struct pt_dev * register_real_device(PCIBus *e_bus, + const char *e_dev_name, int e_devfn, uint8_t r_bus, uint8_t r_dev, + uint8_t r_func, uint32_t machine_irq, struct pci_access *pci_access) +{ + int rc, i; + struct pt_dev *assigned_device = NULL; + struct pci_dev *pci_dev; + struct pci_config_cf8 machine_bdf; + uint8_t e_device, e_intx; + + PT_LOG("Assigning real physical device %02x:%02x.%x ...\n", + r_bus, r_dev, r_func); + + /* Find real device structure */ + for (pci_dev = pci_access->devices; pci_dev != NULL; + pci_dev = pci_dev->next) + { + if ((r_bus == pci_dev->bus) && (r_dev == pci_dev->dev) + && (r_func == pci_dev->func)) + break; + } + if ( pci_dev == NULL ) + { + PT_LOG("Error: couldn't locate device in libpci structures\n"); + return NULL; + } + + /* Register device */ + assigned_device = (struct pt_dev *) pci_register_device(e_bus, e_dev_name, + sizeof(struct pt_dev), e_devfn, + pt_pci_read_config, pt_pci_write_config); + if ( assigned_device == NULL ) + { + PT_LOG("Error: couldn't register real device\n"); + return NULL; + 
} + + assigned_device->pci_dev = pci_dev; + + /* Issue PCIe FLR */ + pdev_flr(pci_dev); + + /* Tell XEN vmm to change iommu settings */ + machine_bdf.reg = 0; + machine_bdf.bus = r_bus; + machine_bdf.dev = r_dev; + machine_bdf.func = r_func; + rc = xc_assign_device(xc_handle, domid, machine_bdf.value); + if ( rc < 0 ) + PT_LOG("Error: xc_domain_assign_device error %d\n", rc); + + /* Initialize virtualized PCI configuration (Extended 256 Bytes) */ + for ( i = 0; i < PCI_CONFIG_SIZE; i++ ) + assigned_device->dev.config[i] = pci_read_byte(pci_dev, i); + + /* Handle real device's MMIO/PIO BARs */ + pt_register_regions(assigned_device); + + /* Bind interrupt */ + e_device = (assigned_device->dev.devfn >> 3) & 0x1f; + e_intx = assigned_device->dev.config[0x3d]-1; + + if ( PT_MACHINE_IRQ_AUTO == machine_irq ) + machine_irq = pci_dev->irq; + + /* bind machine_irq to device */ + if ( 0 != machine_irq ) + { + rc = xc_domain_bind_pt_pci_irq(xc_handle, domid, machine_irq, 0, + e_device, e_intx); + if ( rc < 0 ) + { + /* TBD: unregister device in case of an error */ + PT_LOG("Error: Binding of interrupt failed! 
rc=%d\n", rc); + } + } + else { + /* Disable PCI intx assertion (turn on bit10 of devctl) */ + assigned_device->dev.config[0x05] |= 0x04; + pci_write_word(pci_dev, 0x04, + *(uint16_t *)(&assigned_device->dev.config[0x04])); + } + + PT_LOG("Real physical device %02x:%02x.%x registered successfuly!\n", + r_bus, r_dev, r_func); + + return assigned_device; +} + +int pt_init(PCIBus *e_bus, char *direct_pci) +{ + int i; + int seg, b, d, f; + struct pt_dev *pt_dev; + struct pci_access *pci_access; + int dev_count = pci_devs(direct_pci); + + /* Initialize libpci */ + pci_access = pci_alloc(); + if ( pci_access == NULL ) + { + PT_LOG("pci_access is NULL\n"); + return -1; + } + pci_init(pci_access); + pci_scan_bus(pci_access); + + /* Assign given devices to guest */ + for ( i = 0; i < dev_count; i++ ) + { + /* Get next device bdf (bus, device, function) */ + next_bdf(direct_pci, &seg, &b, &d, &f); + + /* Register real device with the emulated bus */ + pt_dev = register_real_device(e_bus, "DIRECT PCI", PT_VIRT_DEVFN_AUTO, + b, d, f, PT_MACHINE_IRQ_AUTO, pci_access); + if ( pt_dev == NULL ) + { + PT_LOG("Error: Registration failed (%02x:%02x.%x)\n", b, d, f); + return -1; + } + } + + /* Success */ + return 0; +} diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/hw/pass-through.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ioemu/hw/pass-through.h Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2007, Neocleus Corporation. + * Copyright (c) 2007, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + */ +#ifndef __PASSTHROUGH_H__ +#define __PASSTHROUGH_H__ + +#include "vl.h" +#include "pci/header.h" +#include "pci/pci.h" + +/* Log acesss */ +#define PT_LOGGING_ENABLED + +#ifdef PT_LOGGING_ENABLED +#define PT_LOG(_f, _a...) fprintf(logfile, "%s: " _f, __func__, ##_a) +#else +#define PT_LOG(_f, _a...) +#endif + +/* Some compilation flags */ +// #define PT_DEBUG_PCI_CONFIG_ACCESS + +#define PT_MACHINE_IRQ_AUTO (0xFFFFFFFF) +#define PT_VIRT_DEVFN_AUTO (-1) + +/* Misc PCI constants that should be moved to a separate library :) */ +#define PCI_CONFIG_SIZE (256) +#define PCI_EXP_DEVCAP_FLR (1 << 28) +#define PCI_EXP_DEVCTL_FLR (0x1b) +#define PCI_BAR_ENTRIES (6) + +struct pt_region { + /* Virtual phys base & size */ + uint32_t e_physbase; + uint32_t e_size; + /* Index of region in qemu */ + uint32_t memory_index; + /* Translation of the emulated address */ + union { + uint32_t maddr; + uint32_t pio_base; + uint32_t u; + } access; +}; + +/* + This structure holds the context of the mapping functions + and data that is relevant for qemu device management. 
+*/ +struct pt_dev { + PCIDevice dev; + struct pci_dev *pci_dev; /* libpci struct */ + struct pt_region bases[PCI_NUM_REGIONS]; /* Access regions */ +}; + +/* Used for formatting PCI BDF into cf8 format */ +struct pci_config_cf8 { + union { + unsigned int value; + struct { + unsigned int reserved1:2; + unsigned int reg:6; + unsigned int func:3; + unsigned int dev:5; + unsigned int bus:8; + unsigned int reserved2:7; + unsigned int enable:1; + }; + }; +}; + +int pt_init(PCIBus * e_bus, char * direct_pci); + +#endif /* __PASSTHROUGH_H__ */ + diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/hw/pc.c --- a/tools/ioemu/hw/pc.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/hw/pc.c Thu Sep 27 12:22:16 2007 -0600 @@ -465,7 +465,7 @@ static void pc_init1(uint64_t ram_size, DisplayState *ds, const char **fd_filename, int snapshot, const char *kernel_filename, const char *kernel_cmdline, const char *initrd_filename, - int pci_enabled) + int pci_enabled, const char *direct_pci) { #ifndef NOBIOS char buf[1024]; @@ -480,6 +480,7 @@ static void pc_init1(uint64_t ram_size, int piix3_devfn = -1; CPUState *env; NICInfo *nd; + int rc; linux_boot = (kernel_filename != NULL); @@ -665,6 +666,19 @@ static void pc_init1(uint64_t ram_size, } } +#ifdef CONFIG_PASSTHROUGH + /* Pass-through Initialization */ + if ( pci_enabled && direct_pci ) + { + rc = pt_init(pci_bus, direct_pci); + if ( rc < 0 ) + { + fprintf(logfile, "Error: Initialization failed for pass-through devices\n"); + exit(1); + } + } +#endif + rtc_state = rtc_init(0x70, 8); register_ioport_read(0x92, 1, 1, ioport92_read, NULL); @@ -801,12 +815,14 @@ static void pc_init_pci(uint64_t ram_siz int snapshot, const char *kernel_filename, const char *kernel_cmdline, - const char *initrd_filename) + const char *initrd_filename, + const char *direct_pci) { pc_init1(ram_size, vga_ram_size, boot_device, ds, fd_filename, snapshot, kernel_filename, kernel_cmdline, - initrd_filename, 1); + initrd_filename, 1, + direct_pci); } static void 
pc_init_isa(uint64_t ram_size, int vga_ram_size, char *boot_device, @@ -814,12 +830,13 @@ static void pc_init_isa(uint64_t ram_siz int snapshot, const char *kernel_filename, const char *kernel_cmdline, - const char *initrd_filename) + const char *initrd_filename, + const char *unused) { pc_init1(ram_size, vga_ram_size, boot_device, ds, fd_filename, snapshot, kernel_filename, kernel_cmdline, - initrd_filename, 0); + initrd_filename, 0, NULL); } QEMUMachine pc_machine = { diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/hw/sb16.c --- a/tools/ioemu/hw/sb16.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/hw/sb16.c Thu Sep 27 12:22:16 2007 -0600 @@ -1235,8 +1235,10 @@ static int SB_read_DMA (void *opaque, in s->block_size); #endif - while (s->left_till_irq <= 0) { - s->left_till_irq = s->block_size + s->left_till_irq; + if (s->block_size) { + while (s->left_till_irq <= 0) { + s->left_till_irq = s->block_size + s->left_till_irq; + } } return dma_pos; diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/monitor.c --- a/tools/ioemu/monitor.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/monitor.c Thu Sep 27 12:22:16 2007 -0600 @@ -1698,7 +1698,7 @@ static void next(void) { if (pch != '\0') { pch++; - while (isspace(*pch)) + while (isspace((uint8_t)*pch)) pch++; } } @@ -1756,7 +1756,7 @@ static target_long expr_unary(void) *q++ = *pch; pch++; } - while (isspace(*pch)) + while (isspace((uint8_t)*pch)) pch++; *q = 0; ret = get_monitor_def(&n, buf); @@ -1780,7 +1780,7 @@ static target_long expr_unary(void) expr_error("invalid char in expression"); } pch = p; - while (isspace(*pch)) + while (isspace((uint8_t)*pch)) pch++; break; } @@ -1874,7 +1874,7 @@ static int get_expr(target_long *pval, c *pp = pch; return -1; } - while (isspace(*pch)) + while (isspace((uint8_t)*pch)) pch++; *pval = expr_sum(); *pp = pch; @@ -1890,7 +1890,7 @@ static int get_str(char *buf, int buf_si q = buf; p = *pp; - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; if (*p == '\0') { fail: @@ 
-1935,7 +1935,7 @@ static int get_str(char *buf, int buf_si } p++; } else { - while (*p != '\0' && !isspace(*p)) { + while (*p != '\0' && !isspace((uint8_t)*p)) { if ((q - buf) < buf_size - 1) { *q++ = *p; } @@ -1975,12 +1975,12 @@ static void monitor_handle_command(const /* extract the command name */ p = cmdline; q = cmdname; - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; if (*p == '\0') return; pstart = p; - while (*p != '\0' && *p != '/' && !isspace(*p)) + while (*p != '\0' && *p != '/' && !isspace((uint8_t)*p)) p++; len = p - pstart; if (len > sizeof(cmdname) - 1) @@ -2016,7 +2016,7 @@ static void monitor_handle_command(const int ret; char *str; - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; if (*typestr == '?') { typestr++; @@ -2058,15 +2058,15 @@ static void monitor_handle_command(const { int count, format, size; - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; if (*p == '/') { /* format found */ p++; count = 1; - if (isdigit(*p)) { + if (isdigit((uint8_t)*p)) { count = 0; - while (isdigit(*p)) { + while (isdigit((uint8_t)*p)) { count = count * 10 + (*p - '0'); p++; } @@ -2105,7 +2105,7 @@ static void monitor_handle_command(const } } next: - if (*p != '\0' && !isspace(*p)) { + if (*p != '\0' && !isspace((uint8_t)*p)) { term_printf("invalid char in format: '%c'\n", *p); goto fail; } @@ -2138,7 +2138,7 @@ static void monitor_handle_command(const case 'l': { target_long val; - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; if (*typestr == '?' 
|| *typestr == '.') { if (*typestr == '?') { @@ -2149,7 +2149,7 @@ static void monitor_handle_command(const } else { if (*p == '.') { p++; - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; has_arg = 1; } else { @@ -2195,7 +2195,7 @@ static void monitor_handle_command(const c = *typestr++; if (c == '\0') goto bad_type; - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; has_option = 0; if (*p == '-') { @@ -2225,7 +2225,7 @@ static void monitor_handle_command(const } } /* check that all arguments were parsed */ - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; if (*p != '\0') { term_printf("%s: extraneous characters at the end of line\n", @@ -2364,7 +2364,7 @@ static void parse_cmdline(const char *cm p = cmdline; nb_args = 0; for(;;) { - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; if (*p == '\0') break; @@ -2398,7 +2398,7 @@ void readline_find_completion(const char /* if the line ends with a space, it means we want to complete the next arg */ len = strlen(cmdline); - if (len > 0 && isspace(cmdline[len - 1])) { + if (len > 0 && isspace((uint8_t)cmdline[len - 1])) { if (nb_args >= MAX_ARGS) return; args[nb_args++] = qemu_strdup(""); diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/osdep.h --- a/tools/ioemu/osdep.h Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/osdep.h Thu Sep 27 12:22:16 2007 -0600 @@ -2,6 +2,10 @@ #define QEMU_OSDEP_H #include <stdarg.h> +#ifdef __OpenBSD__ +#include <sys/types.h> +#include <sys/signal.h> +#endif #define qemu_printf printf diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/target-i386-dm/exec-dm.c --- a/tools/ioemu/target-i386-dm/exec-dm.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/target-i386-dm/exec-dm.c Thu Sep 27 12:22:16 2007 -0600 @@ -168,8 +168,8 @@ void cpu_set_log_filename(const char *fi #else setvbuf(logfile, NULL, _IOLBF, 0); #endif - stdout = logfile; - stderr = logfile; + dup2(fileno(logfile), 1); + dup2(fileno(logfile), 2); } /* mask must never be zero, except for A20 change call 
*/ diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/usb-linux.c --- a/tools/ioemu/usb-linux.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/usb-linux.c Thu Sep 27 12:22:16 2007 -0600 @@ -268,7 +268,7 @@ static int get_tag_value(char *buf, int if (!p) return -1; p += strlen(tag); - while (isspace(*p)) + while (isspace((uint8_t)*p)) p++; q = buf; while (*p != '\0' && !strchr(stopchars, *p)) { diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/vl.c --- a/tools/ioemu/vl.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/vl.c Thu Sep 27 12:22:16 2007 -0600 @@ -24,6 +24,7 @@ #include "vl.h" #include <unistd.h> +#include <stdlib.h> #include <fcntl.h> #include <signal.h> #include <time.h> @@ -38,22 +39,29 @@ #include <sys/poll.h> #include <sys/mman.h> #include <sys/ioctl.h> +#include <sys/resource.h> #include <sys/socket.h> #include <netinet/in.h> +#include <net/if.h> +#if defined(__NetBSD__) +#include <net/if_tap.h> +#endif +#if defined(__linux__) || defined(__Linux__) +#include <linux/if_tun.h> +#endif #include <arpa/inet.h> #include <dirent.h> #include <netdb.h> #ifdef _BSD #include <sys/stat.h> -#ifndef __APPLE__ +#ifndef _BSD #include <libutil.h> +#else +#include <util.h> #endif #else #ifndef __sun__ -#include <linux/if.h> -#include <linux/if_tun.h> #include <pty.h> -#include <malloc.h> #include <linux/rtc.h> #include <linux/ppdev.h> #endif @@ -65,7 +73,6 @@ #endif #ifdef _WIN32 -#include <malloc.h> #include <sys/timeb.h> #include <windows.h> #define getopt_long_only getopt_long @@ -91,7 +98,11 @@ #include <xen/hvm/params.h> #define DEFAULT_NETWORK_SCRIPT "/etc/xen/qemu-ifup" +#ifdef _BSD +#define DEFAULT_BRIDGE "bridge0" +#else #define DEFAULT_BRIDGE "xenbr0" +#endif #ifdef __sun__ #define SMBD_COMMAND "/usr/sfw/sbin/smbd" #else @@ -1794,7 +1805,7 @@ static int store_dev_info(char *devName, return 0; } -#if defined(__linux__) +#if defined(__linux__) || defined(__NetBSD__) || defined(__OpenBSD__) static CharDriverState *qemu_chr_open_pty(void) { struct termios tty; 
@@ -1949,6 +1960,7 @@ static CharDriverState *qemu_chr_open_tt return chr; } +#if defined(__linux__) static int pp_ioctl(CharDriverState *chr, int cmd, void *arg) { int fd = (int)chr->opaque; @@ -2013,13 +2025,14 @@ static CharDriverState *qemu_chr_open_pp return chr; } +#endif /* __linux__ */ #else static CharDriverState *qemu_chr_open_pty(void) { return NULL; } -#endif +#endif /* __linux__ || __NetBSD__ || __OpenBSD__ */ #endif /* !defined(_WIN32) */ @@ -2958,7 +2971,7 @@ static int parse_macaddr(uint8_t *macadd return 0; } -static int get_str_sep(char *buf, int buf_size, const char **pp, int sep) +static int get_str_sep(char *buf, size_t buf_size, const char **pp, int sep) { const char *p, *p1; int len; @@ -3031,7 +3044,7 @@ int parse_host_port(struct sockaddr_in * if (buf[0] == '\0') { saddr->sin_addr.s_addr = 0; } else { - if (isdigit(buf[0])) { + if (isdigit((uint8_t)buf[0])) { if (!inet_aton(buf, &saddr->sin_addr)) return -1; } else { @@ -3373,18 +3386,30 @@ static int tap_open(char *ifname, int if static int tap_open(char *ifname, int ifname_size) { int fd; +#ifndef TAPGIFNAME char *dev; struct stat s; +#endif + struct ifreq ifr; fd = open("/dev/tap", O_RDWR); if (fd < 0) { - fprintf(stderr, "warning: could not open /dev/tap: no virtual network emulation\n"); + fprintf(stderr, "warning: could not open /dev/tap: no virtual network emulation %s\n", strerror(errno)); return -1; } +#ifdef TAPGIFNAME + if (ioctl (fd, TAPGIFNAME, (void*)&ifr) < 0) { + fprintf(stderr, "warning: could not open get tap name: %s\n", + strerror(errno)); + return -1; + } + pstrcpy(ifname, ifname_size, ifr.ifr_name); +#else fstat(fd, &s); dev = devname(s.st_rdev, S_IFCHR); pstrcpy(ifname, ifname_size, dev); +#endif fcntl(fd, F_SETFL, O_NONBLOCK); return fd; @@ -3434,6 +3459,8 @@ static int net_tap_init(VLANState *vlan, char *args[4]; char **parg; char ifname[128]; + + memset(ifname, 0, sizeof(ifname)); if (ifname1 != NULL) pstrcpy(ifname, sizeof(ifname), ifname1); @@ -3611,7 +3638,7 @@ 
static int net_socket_mcast_create(struc val = 1; ret=setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, - (const char *)&val, sizeof(val)); + (const char *)&val, sizeof(val)); if (ret < 0) { perror("setsockopt(SOL_SOCKET, SO_REUSEADDR)"); goto fail; } @@ -3893,7 +3920,7 @@ static int net_socket_mcast_init(VLANSta } -static int get_param_value(char *buf, int buf_size, +static int get_param_value(char *buf, size_t buf_size, const char *tag, const char *str) { const char *p; @@ -4019,6 +4046,10 @@ static int net_client_init(const char *s char setup_script[1024]; char bridge[16]; int fd; + + memset(ifname, 0, sizeof(ifname)); + memset(setup_script, 0, sizeof(setup_script)); + if (get_param_value(buf, sizeof(buf), "fd", p) > 0) { fd = strtol(buf, NULL, 0); ret = -1; @@ -6513,6 +6544,7 @@ enum { QEMU_OPTION_acpi, QEMU_OPTION_vncviewer, QEMU_OPTION_vncunused, + QEMU_OPTION_pci, }; typedef struct QEMUOption { @@ -6610,6 +6642,7 @@ const QEMUOption qemu_options[] = { { "d", HAS_ARG, QEMU_OPTION_d }, { "vcpus", 1, QEMU_OPTION_vcpus }, { "acpi", 0, QEMU_OPTION_acpi }, + { "pci", HAS_ARG, QEMU_OPTION_pci}, { NULL }, }; @@ -6912,7 +6945,6 @@ static int qemu_map_cache_init(void) nr_buckets = (((MAX_MCACHE_SIZE >> PAGE_SHIFT) + (1UL << (MCACHE_BUCKET_SHIFT - PAGE_SHIFT)) - 1) >> (MCACHE_BUCKET_SHIFT - PAGE_SHIFT)); - fprintf(logfile, "qemu_map_cache_init nr_buckets = %lx\n", nr_buckets); /* * Use mmap() directly: lets us allocate a big hash table with no up-front @@ -6921,8 +6953,9 @@ static int qemu_map_cache_init(void) */ size = nr_buckets * sizeof(struct map_cache); size = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); + fprintf(logfile, "qemu_map_cache_init nr_buckets = %lx size %lu\n", nr_buckets, size); mapcache_entry = mmap(NULL, size, PROT_READ|PROT_WRITE, - MAP_SHARED|MAP_ANONYMOUS, 0, 0); + MAP_SHARED|MAP_ANON, -1, 0); if (mapcache_entry == MAP_FAILED) { errno = ENOMEM; return -1; @@ -7059,15 +7092,40 @@ int main(int argc, char **argv) unsigned long ioreq_pfn; extern void 
*shared_page; extern void *buffered_io_page; + struct rlimit rl; #ifdef __ia64__ unsigned long nr_pages; xen_pfn_t *page_array; extern void *buffered_pio_page; #endif sigset_t set; - char qemu_dm_logfilename[128]; - + const char *direct_pci = NULL; + + /* Maximise rlimits. Needed where default constraints are tight (*BSD). */ + if (getrlimit(RLIMIT_STACK, &rl) != 0) { + perror("getrlimit(RLIMIT_STACK)"); + exit(1); + } + rl.rlim_cur = rl.rlim_max; + if (setrlimit(RLIMIT_STACK, &rl) != 0) + perror("setrlimit(RLIMIT_STACK)"); + if (getrlimit(RLIMIT_DATA, &rl) != 0) { + perror("getrlimit(RLIMIT_DATA)"); + exit(1); + } + rl.rlim_cur = rl.rlim_max; + if (setrlimit(RLIMIT_DATA, &rl) != 0) + perror("setrlimit(RLIMIT_DATA)"); + rl.rlim_cur = RLIM_INFINITY; + rl.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_RSS, &rl) != 0) + perror("setrlimit(RLIMIT_RSS)"); + rl.rlim_cur = RLIM_INFINITY; + rl.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_MEMLOCK, &rl) != 0) + perror("setrlimit(RLIMIT_MEMLOCK)"); + /* Ensure that SIGUSR2 is blocked by default when a new thread is created, then only the threads that use the signal unblock it -- this fixes a race condition in Qcow support where the AIO signal is misdelivered. 
*/ @@ -7560,6 +7618,9 @@ int main(int argc, char **argv) case QEMU_OPTION_vncunused: vncunused++; break; + case QEMU_OPTION_pci: + direct_pci = optarg; + break; } } } @@ -7926,7 +7987,8 @@ int main(int argc, char **argv) machine->init(ram_size, vga_ram_size, boot_device, ds, fd_filename, snapshot, - kernel_filename, kernel_cmdline, initrd_filename); + kernel_filename, kernel_cmdline, initrd_filename, + direct_pci); free(boot_device); /* init USB devices */ diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/vl.h --- a/tools/ioemu/vl.h Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/vl.h Thu Sep 27 12:22:16 2007 -0600 @@ -103,8 +103,8 @@ static inline char *realpath(const char #endif /* cutils.c */ -void pstrcpy(char *buf, int buf_size, const char *str); -char *pstrcat(char *buf, int buf_size, const char *s); +void pstrcpy(char *buf, size_t buf_size, const char *str); +char *pstrcat(char *buf, size_t buf_size, const char *s); int strstart(const char *str, const char *val, const char **ptr); int stristart(const char *str, const char *val, const char **ptr); @@ -717,7 +717,7 @@ typedef void QEMUMachineInitFunc(uint64_ char *boot_device, DisplayState *ds, const char **fd_filename, int snapshot, const char *kernel_filename, const char *kernel_cmdline, - const char *initrd_filename); + const char *initrd_filename, const char *direct_pci); typedef struct QEMUMachine { const char *name; diff -r eae7b887e5ac -r ee498c9af856 tools/ioemu/vnc.c --- a/tools/ioemu/vnc.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/ioemu/vnc.c Thu Sep 27 12:22:16 2007 -0600 @@ -24,6 +24,9 @@ * THE SOFTWARE. 
*/ +#include <sys/stat.h> +#include <sys/socket.h> +#include <netinet/in.h> #include "vl.h" #include "qemu_socket.h" #include <assert.h> diff -r eae7b887e5ac -r ee498c9af856 tools/libaio/src/compat-0_1.c --- a/tools/libaio/src/compat-0_1.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libaio/src/compat-0_1.c Thu Sep 27 12:22:16 2007 -0600 @@ -19,7 +19,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <stdlib.h> -#include <asm/errno.h> +#include <sys/time.h> #include "libaio.h" #include "vsys_def.h" diff -r eae7b887e5ac -r ee498c9af856 tools/libfsimage/ext2fs/fsys_ext2fs.c --- a/tools/libfsimage/ext2fs/fsys_ext2fs.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libfsimage/ext2fs/fsys_ext2fs.c Thu Sep 27 12:22:16 2007 -0600 @@ -594,7 +594,7 @@ ext2fs_dir (fsi_file_t *ffi, char *dirna /* Find out how long our remaining name is. */ len = 0; - while (dirname[len] && !isspace (dirname[len])) + while (dirname[len] && !isspace ((uint8_t)dirname[len])) len++; /* Get the symlink size. 
*/ @@ -651,7 +651,7 @@ ext2fs_dir (fsi_file_t *ffi, char *dirna } /* if end of filename, INODE points to the file's inode */ - if (!*dirname || isspace (*dirname)) + if (!*dirname || isspace ((uint8_t)*dirname)) { if (!S_ISREG (INODE->i_mode)) { @@ -678,7 +678,7 @@ ext2fs_dir (fsi_file_t *ffi, char *dirna } /* skip to next slash or end of filename (space) */ - for (rest = dirname; (ch = *rest) && !isspace (ch) && ch != '/'; + for (rest = dirname; (ch = *rest) && !isspace ((uint8_t)ch) && ch != '/'; rest++); /* look through this directory and find the next filename component */ diff -r eae7b887e5ac -r ee498c9af856 tools/libfsimage/fat/fsys_fat.c --- a/tools/libfsimage/fat/fsys_fat.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libfsimage/fat/fsys_fat.c Thu Sep 27 12:22:16 2007 -0600 @@ -301,7 +301,7 @@ fat_dir (fsi_file_t *ffi, char *dirname) /* if we have a real file (and we're not just printing possibilities), then this is where we want to exit */ - if (!*dirname || isspace (*dirname)) + if (!*dirname || isspace ((uint8_t)*dirname)) { if (attrib & FAT_ATTRIB_DIR) { @@ -325,7 +325,7 @@ fat_dir (fsi_file_t *ffi, char *dirname) /* Directories don't have a file size */ filemax = INT_MAX; - for (rest = dirname; (ch = *rest) && !isspace (ch) && ch != '/'; rest++); + for (rest = dirname; (ch = *rest) && !isspace ((uint8_t)ch) && ch != '/'; rest++); *rest = 0; @@ -426,13 +426,13 @@ fat_dir (fsi_file_t *ffi, char *dirname) { int i, j, c; - for (i = 0; i < 8 && (c = filename[i] = tolower (dir_buf[i])) - && !isspace (c); i++); + for (i = 0; i < 8 && (c = filename[i] = tolower ((uint8_t)dir_buf[i])) + && !isspace ((uint8_t)c); i++); filename[i++] = '.'; - for (j = 0; j < 3 && (c = filename[i + j] = tolower (dir_buf[8 + j])) - && !isspace (c); j++); + for (j = 0; j < 3 && (c = filename[i + j] = tolower ((uint8_t)dir_buf[8 + j])) + && !isspace ((uint8_t)c); j++); if (j == 0) i--; diff -r eae7b887e5ac -r ee498c9af856 tools/libfsimage/iso9660/fsys_iso9660.c --- 
a/tools/libfsimage/iso9660/fsys_iso9660.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libfsimage/iso9660/fsys_iso9660.c Thu Sep 27 12:22:16 2007 -0600 @@ -164,7 +164,7 @@ iso9660_dir (fsi_file_t *ffi, char *dirn /* pathlen = strcspn(dirname, "/\n\t "); */ for (pathlen = 0 ; dirname[pathlen] - && !isspace(dirname[pathlen]) && dirname[pathlen] != '/' ; + && !isspace((uint8_t)dirname[pathlen]) && dirname[pathlen] != '/' ; pathlen++) ; diff -r eae7b887e5ac -r ee498c9af856 tools/libfsimage/reiserfs/fsys_reiserfs.c --- a/tools/libfsimage/reiserfs/fsys_reiserfs.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libfsimage/reiserfs/fsys_reiserfs.c Thu Sep 27 12:22:16 2007 -0600 @@ -1029,7 +1029,7 @@ reiserfs_dir (fsi_file_t *ffi, char *dir /* Find out how long our remaining name is. */ len = 0; - while (dirname[len] && !isspace (dirname[len])) + while (dirname[len] && !isspace ((uint8_t)dirname[len])) len++; if (filemax + len > sizeof (linkbuf) - 1) @@ -1078,7 +1078,7 @@ reiserfs_dir (fsi_file_t *ffi, char *dir /* if we have a real file (and we're not just printing possibilities), then this is where we want to exit */ - if (! *dirname || isspace (*dirname)) + if (! *dirname || isspace ((uint8_t)*dirname)) { if (! S_ISREG (mode)) { @@ -1109,7 +1109,7 @@ reiserfs_dir (fsi_file_t *ffi, char *dir errnum = ERR_BAD_FILETYPE; return 0; } - for (rest = dirname; (ch = *rest) && ! isspace (ch) && ch != '/'; rest++); + for (rest = dirname; (ch = *rest) && ! 
isspace ((uint8_t)ch) && ch != '/'; rest++); *rest = 0; # ifndef STAGE1_5 diff -r eae7b887e5ac -r ee498c9af856 tools/libfsimage/ufs/fsys_ufs.c --- a/tools/libfsimage/ufs/fsys_ufs.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libfsimage/ufs/fsys_ufs.c Thu Sep 27 12:22:16 2007 -0600 @@ -72,13 +72,13 @@ ufs_dir(fsi_file_t *ffi, char *dirname) while (*dirname == '/') dirname++; - while (inode && *dirname && !isspace(*dirname)) { + while (inode && *dirname && !isspace((uint8_t)*dirname)) { if (!openi(ffi, inode)) return 0; /* parse for next path component */ fname = dirname; - while (*dirname && !isspace(*dirname) && *dirname != '/') + while (*dirname && !isspace((uint8_t)*dirname) && *dirname != '/') dirname++; ch = *dirname; *dirname = 0; /* ensure null termination */ diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/Makefile --- a/tools/libxc/Makefile Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/Makefile Thu Sep 27 12:22:16 2007 -0600 @@ -23,6 +23,7 @@ CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c CTRL_SRCS-$(CONFIG_Linux) += xc_linux.c CTRL_SRCS-$(CONFIG_SunOS) += xc_solaris.c CTRL_SRCS-$(CONFIG_X86_Linux) += xc_ptrace.c xc_ptrace_core.c +CTRL_SRCS-$(CONFIG_NetBSD) += xc_netbsd.c GUEST_SRCS-y := GUEST_SRCS-y += xg_private.c diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_core.c --- a/tools/libxc/xc_core.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xc_core.c Thu Sep 27 12:22:16 2007 -0600 @@ -57,6 +57,9 @@ /* number of pages to write at a time */ #define DUMP_INCREMENT (4 * 1024) + +/* Don't yet support cross-address-size core dump */ +#define guest_width (sizeof (unsigned long)) /* string table */ struct xc_core_strtab { diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_core_x86.c --- a/tools/libxc/xc_core_x86.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xc_core_x86.c Thu Sep 27 12:22:16 2007 -0600 @@ -20,6 +20,9 @@ #include "xg_private.h" #include "xc_core.h" + +/* Don't yet support cross-address-size core dump */ +#define guest_width (sizeof 
(unsigned long)) static int nr_gpfns(int xc_handle, domid_t domid) { diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_dom_boot.c --- a/tools/libxc/xc_dom_boot.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xc_dom_boot.c Thu Sep 27 12:22:16 2007 -0600 @@ -152,6 +152,7 @@ void *xc_dom_boot_domU_map(struct xc_dom privcmd_mmap_entry_t *entries; void *ptr; int i, rc; + int err; entries = xc_dom_malloc(dom, count * sizeof(privcmd_mmap_entry_t)); if ( entries == NULL ) @@ -166,9 +167,11 @@ void *xc_dom_boot_domU_map(struct xc_dom MAP_SHARED, dom->guest_xc, 0); if ( ptr == MAP_FAILED ) { - xc_dom_panic(XC_INTERNAL_ERROR, - "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn - " [mmap]\n", __FUNCTION__, pfn, count); + err = errno; + xc_dom_panic(XC_INTERNAL_ERROR, + "%s: failed to mmap domU pages 0x%" PRIpfn "+0x%" PRIpfn + " [mmap, errno=%i (%s)]\n", __FUNCTION__, pfn, count, + err, strerror(err)); return NULL; } diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_dom_core.c --- a/tools/libxc/xc_dom_core.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xc_dom_core.c Thu Sep 27 12:22:16 2007 -0600 @@ -122,7 +122,7 @@ void *xc_dom_malloc_page_aligned(struct memset(block, 0, sizeof(*block)); block->mmap_len = size; block->mmap_ptr = mmap(NULL, block->mmap_len, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); if ( block->mmap_ptr == MAP_FAILED ) { @@ -354,7 +354,7 @@ void *xc_dom_pfn_to_ptr(struct xc_dom_im { mode = "anonymous memory"; phys->ptr = mmap(NULL, phys->count << page_shift, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); if ( phys->ptr == MAP_FAILED ) { diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xc_domain.c Thu Sep 27 12:22:16 2007 -0600 @@ -734,6 +734,114 @@ int xc_domain_setdebugging(int xc_handle return do_domctl(xc_handle, 
&domctl); } +int xc_assign_device( + int xc_handle, + uint32_t domid, + uint32_t machine_bdf) +{ + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_assign_device; + domctl.domain = domid; + domctl.u.assign_device.machine_bdf = machine_bdf; + + return do_domctl(xc_handle, &domctl); +} + +/* Pass-through: binds machine irq to guests irq */ +int xc_domain_bind_pt_irq( + int xc_handle, + uint32_t domid, + uint8_t machine_irq, + uint8_t irq_type, + uint8_t bus, + uint8_t device, + uint8_t intx, + uint8_t isa_irq) +{ + int rc; + xen_domctl_bind_pt_irq_t * bind; + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_bind_pt_irq; + domctl.domain = (domid_t)domid; + + bind = &(domctl.u.bind_pt_irq); + bind->hvm_domid = domid; + bind->irq_type = irq_type; + bind->machine_irq = machine_irq; + bind->u.pci.bus = bus; + bind->u.pci.device = device; + bind->u.pci.intx = intx; + bind->u.isa.isa_irq = isa_irq; + + rc = do_domctl(xc_handle, &domctl); + return rc; +} + +int xc_domain_bind_pt_pci_irq( + int xc_handle, + uint32_t domid, + uint8_t machine_irq, + uint8_t bus, + uint8_t device, + uint8_t intx) +{ + + return (xc_domain_bind_pt_irq(xc_handle, domid, machine_irq, + PT_IRQ_TYPE_PCI, bus, device, intx, 0)); +} + +int xc_domain_bind_pt_isa_irq( + int xc_handle, + uint32_t domid, + uint8_t machine_irq) +{ + + return (xc_domain_bind_pt_irq(xc_handle, domid, machine_irq, + PT_IRQ_TYPE_ISA, 0, 0, 0, machine_irq)); +} + +int xc_domain_memory_mapping( + int xc_handle, + uint32_t domid, + unsigned long first_gfn, + unsigned long first_mfn, + unsigned long nr_mfns, + uint32_t add_mapping) +{ + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_memory_mapping; + domctl.domain = domid; + domctl.u.memory_mapping.first_gfn = first_gfn; + domctl.u.memory_mapping.first_mfn = first_mfn; + domctl.u.memory_mapping.nr_mfns = nr_mfns; + domctl.u.memory_mapping.add_mapping = add_mapping; + + return do_domctl(xc_handle, &domctl); +} + +int xc_domain_ioport_mapping( + int xc_handle, + uint32_t domid, + uint32_t 
first_gport, + uint32_t first_mport, + uint32_t nr_ports, + uint32_t add_mapping) +{ + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_ioport_mapping; + domctl.domain = domid; + domctl.u.ioport_mapping.first_gport = first_gport; + domctl.u.ioport_mapping.first_mport = first_mport; + domctl.u.ioport_mapping.nr_ports = nr_ports; + domctl.u.ioport_mapping.add_mapping = add_mapping; + + return do_domctl(xc_handle, &domctl); +} + /* * Local variables: * mode: C diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_domain_restore.c --- a/tools/libxc/xc_domain_restore.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xc_domain_restore.c Thu Sep 27 12:22:16 2007 -0600 @@ -56,6 +56,10 @@ static xen_pfn_t *p2m = NULL; /* A table of P2M mappings in the current region */ static xen_pfn_t *p2m_batch = NULL; +/* Address size of the guest, in bytes */ +unsigned int guest_width; + + static ssize_t read_exact(int fd, void *buf, size_t count) { @@ -168,22 +172,17 @@ static xen_pfn_t *load_p2m_frame_list(in static xen_pfn_t *load_p2m_frame_list(int io_fd, int *pae_extended_cr3) { xen_pfn_t *p2m_frame_list; - vcpu_guest_context_t ctxt; - - if ( (p2m_frame_list = malloc(P2M_FL_SIZE)) == NULL ) - { - ERROR("Couldn't allocate p2m_frame_list array"); - return NULL; - } - + vcpu_guest_context_either_t ctxt; + xen_pfn_t p2m_fl_zero; + /* Read first entry of P2M list, or extended-info signature (~0UL). */ - if ( !read_exact(io_fd, p2m_frame_list, sizeof(long)) ) + if ( !read_exact(io_fd, &p2m_fl_zero, sizeof(long)) ) { ERROR("read extended-info signature failed"); return NULL; } - if ( p2m_frame_list[0] == ~0UL ) + if ( p2m_fl_zero == ~0UL ) { uint32_t tot_bytes; @@ -211,25 +210,42 @@ static xen_pfn_t *load_p2m_frame_list(in /* VCPU context structure? 
*/ if ( !strncmp(chunk_sig, "vcpu", 4) ) { - if ( !read_exact(io_fd, &ctxt, sizeof(ctxt)) ) + /* Pick a guest word-size and PT depth from the ctxt size */ + if ( chunk_bytes == sizeof (ctxt.x32) ) + { + guest_width = 4; + if ( pt_levels > 2 ) + pt_levels = 3; + } + else if ( chunk_bytes == sizeof (ctxt.x64) ) + { + guest_width = 8; + pt_levels = 4; + } + else + { + ERROR("bad extended-info context size %d", chunk_bytes); + return NULL; + } + + if ( !read_exact(io_fd, &ctxt, chunk_bytes) ) { ERROR("read extended-info vcpu context failed"); return NULL; } - tot_bytes -= sizeof(struct vcpu_guest_context); - chunk_bytes -= sizeof(struct vcpu_guest_context); - - if ( ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3) ) + tot_bytes -= chunk_bytes; + chunk_bytes = 0; + + if ( GET_FIELD(&ctxt, vm_assist) + & (1UL << VMASST_TYPE_pae_extended_cr3) ) *pae_extended_cr3 = 1; } /* Any remaining bytes of this chunk: read and discard. */ while ( chunk_bytes ) { - unsigned long sz = chunk_bytes; - if ( sz > P2M_FL_SIZE ) - sz = P2M_FL_SIZE; - if ( !read_exact(io_fd, p2m_frame_list, sz) ) + unsigned long sz = MIN(chunk_bytes, sizeof(xen_pfn_t)); + if ( !read_exact(io_fd, &p2m_fl_zero, sz) ) { ERROR("read-and-discard extended-info chunk bytes failed"); return NULL; @@ -240,15 +256,25 @@ static xen_pfn_t *load_p2m_frame_list(in } /* Now read the real first entry of P2M list. */ - if ( !read_exact(io_fd, p2m_frame_list, sizeof(long)) ) + if ( !read_exact(io_fd, &p2m_fl_zero, sizeof(xen_pfn_t)) ) { ERROR("read first entry of p2m_frame_list failed"); return NULL; } } - /* First entry is already read into the p2m array. */ - if ( !read_exact(io_fd, &p2m_frame_list[1], P2M_FL_SIZE - sizeof(long)) ) + /* Now that we know the guest's word-size, can safely allocate + * the p2m frame list */ + if ( (p2m_frame_list = malloc(P2M_FL_SIZE)) == NULL ) + { + ERROR("Couldn't allocate p2m_frame_list array"); + return NULL; + } + + /* First entry has already been read. 
*/ + p2m_frame_list[0] = p2m_fl_zero; + if ( !read_exact(io_fd, &p2m_frame_list[1], + (P2M_FL_ENTRIES - 1) * sizeof(xen_pfn_t)) ) { ERROR("read p2m_frame_list failed"); return NULL; @@ -272,11 +298,11 @@ int xc_domain_restore(int xc_handle, int /* The new domain's shared-info frame number. */ unsigned long shared_info_frame; unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */ - shared_info_t *old_shared_info = (shared_info_t *)shared_info_page; - shared_info_t *new_shared_info; + shared_info_either_t *old_shared_info = (shared_info_either_t *)shared_info_page; + shared_info_either_t *new_shared_info; /* A copy of the CPU context of the guest. */ - vcpu_guest_context_t ctxt; + vcpu_guest_context_either_t ctxt; /* A table containing the type of each PFN (/not/ MFN!). */ unsigned long *pfn_type = NULL; @@ -291,7 +317,7 @@ int xc_domain_restore(int xc_handle, int xen_pfn_t *p2m_frame_list = NULL; /* A temporary mapping of the guest's start_info page. */ - start_info_t *start_info; + start_info_either_t *start_info; /* Our mapping of the current region (batch) */ char *region_base; @@ -324,16 +350,38 @@ int xc_domain_restore(int xc_handle, int } DPRINTF("xc_domain_restore start: p2m_size = %lx\n", p2m_size); - if ( !hvm ) - { - /* - * XXX For now, 32bit dom0's can only save/restore 32bit domUs - * on 64bit hypervisors. - */ + if ( !get_platform_info(xc_handle, dom, + &max_mfn, &hvirt_start, &pt_levels, &guest_width) ) + { + ERROR("Unable to get platform info."); + return 1; + } + + /* The *current* word size of the guest isn't very interesting; for now + * assume the guest will be the same as we are. We'll fix that later + * if we discover otherwise. */ + guest_width = sizeof(unsigned long); + pt_levels = (guest_width == 8) ? 4 : (pt_levels == 2) ? 
2 : 3; + + if ( lock_pages(&ctxt, sizeof(ctxt)) ) + { + /* needed for build domctl, but might as well do early */ + ERROR("Unable to lock ctxt"); + return 1; + } + + if ( !hvm ) + { + /* Load the p2m frame list, plus potential extended info chunk */ + p2m_frame_list = load_p2m_frame_list(io_fd, &pae_extended_cr3); + if ( !p2m_frame_list ) + goto out; + + /* Now that we know the word size, tell Xen about it */ memset(&domctl, 0, sizeof(domctl)); domctl.domain = dom; domctl.cmd = XEN_DOMCTL_set_address_size; - domctl.u.address_size.size = sizeof(unsigned long) * 8; + domctl.u.address_size.size = guest_width * 8; rc = do_domctl(xc_handle, &domctl); if ( rc != 0 ) { @@ -343,30 +391,8 @@ int xc_domain_restore(int xc_handle, int rc = 1; } - if ( !get_platform_info(xc_handle, dom, - &max_mfn, &hvirt_start, &pt_levels) ) - { - ERROR("Unable to get platform info."); - return 1; - } - - if ( lock_pages(&ctxt, sizeof(ctxt)) ) - { - /* needed for build domctl, but might as well do early */ - ERROR("Unable to lock ctxt"); - return 1; - } - - /* Load the p2m frame list, plus potential extended info chunk */ - if ( !hvm ) - { - p2m_frame_list = load_p2m_frame_list(io_fd, &pae_extended_cr3); - if ( !p2m_frame_list ) - goto out; - } - /* We want zeroed memory so use calloc rather than malloc. */ - p2m = calloc(p2m_size, sizeof(xen_pfn_t)); + p2m = calloc(p2m_size, MAX(guest_width, sizeof (xen_pfn_t))); pfn_type = calloc(p2m_size, sizeof(unsigned long)); region_mfn = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t)); p2m_batch = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t)); @@ -963,14 +989,16 @@ int xc_domain_restore(int xc_handle, int if ( !(vcpumap & (1ULL << i)) ) continue; - if ( !read_exact(io_fd, &ctxt, sizeof(ctxt)) ) + if ( !read_exact(io_fd, &ctxt, ((guest_width == 8) + ? 
sizeof(ctxt.x64) + : sizeof(ctxt.x32))) ) { ERROR("Error when reading ctxt %d", i); goto out; } if ( !new_ctxt_format ) - ctxt.flags |= VGCF_online; + SET_FIELD(&ctxt, flags, GET_FIELD(&ctxt, flags) | VGCF_online); if ( i == 0 ) { @@ -978,48 +1006,49 @@ int xc_domain_restore(int xc_handle, int * Uncanonicalise the suspend-record frame number and poke * resume record. */ - pfn = ctxt.user_regs.edx; + pfn = GET_FIELD(&ctxt, user_regs.edx); if ( (pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) { ERROR("Suspend record frame number is bad"); goto out; } - ctxt.user_regs.edx = mfn = p2m[pfn]; + mfn = p2m[pfn]; + SET_FIELD(&ctxt, user_regs.edx, mfn); start_info = xc_map_foreign_range( xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn); - start_info->nr_pages = p2m_size; - start_info->shared_info = shared_info_frame << PAGE_SHIFT; - start_info->flags = 0; - *store_mfn = start_info->store_mfn = p2m[start_info->store_mfn]; - start_info->store_evtchn = store_evtchn; - start_info->console.domU.mfn = p2m[start_info->console.domU.mfn]; - start_info->console.domU.evtchn = console_evtchn; - *console_mfn = start_info->console.domU.mfn; + SET_FIELD(start_info, nr_pages, p2m_size); + SET_FIELD(start_info, shared_info, shared_info_frame<<PAGE_SHIFT); + SET_FIELD(start_info, flags, 0); + *store_mfn = p2m[GET_FIELD(start_info, store_mfn)]; + SET_FIELD(start_info, store_mfn, *store_mfn); + SET_FIELD(start_info, store_evtchn, store_evtchn); + *console_mfn = p2m[GET_FIELD(start_info, console.domU.mfn)]; + SET_FIELD(start_info, console.domU.mfn, *console_mfn); + SET_FIELD(start_info, console.domU.evtchn, console_evtchn); munmap(start_info, PAGE_SIZE); } - /* Uncanonicalise each GDT frame number. 
*/ - if ( ctxt.gdt_ents > 8192 ) + if ( GET_FIELD(&ctxt, gdt_ents) > 8192 ) { ERROR("GDT entry count out of range"); goto out; } - for ( j = 0; (512*j) < ctxt.gdt_ents; j++ ) - { - pfn = ctxt.gdt_frames[j]; + for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ ) + { + pfn = GET_FIELD(&ctxt, gdt_frames[j]); if ( (pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) { - ERROR("GDT frame number is bad"); + ERROR("GDT frame number %i (0x%lx) is bad", + j, (unsigned long)pfn); goto out; } - ctxt.gdt_frames[j] = p2m[pfn]; - } - + SET_FIELD(&ctxt, gdt_frames[j], p2m[pfn]); + } /* Uncanonicalise the page table base pointer. */ - pfn = xen_cr3_to_pfn(ctxt.ctrlreg[3]); + pfn = xen_cr3_to_pfn(GET_FIELD(&ctxt, ctrlreg[3])); if ( pfn >= p2m_size ) { @@ -1036,21 +1065,18 @@ int xc_domain_restore(int xc_handle, int (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT); goto out; } - - ctxt.ctrlreg[3] = xen_pfn_to_cr3(p2m[pfn]); + SET_FIELD(&ctxt, ctrlreg[3], xen_pfn_to_cr3(p2m[pfn])); /* Guest pagetable (x86/64) stored in otherwise-unused CR1. 
*/ - if ( (pt_levels == 4) && ctxt.ctrlreg[1] ) - { - pfn = xen_cr3_to_pfn(ctxt.ctrlreg[1]); - + if ( (pt_levels == 4) && (ctxt.x64.ctrlreg[1] & 1) ) + { + pfn = xen_cr3_to_pfn(ctxt.x64.ctrlreg[1] & ~1); if ( pfn >= p2m_size ) { - ERROR("User PT base is bad: pfn=%lu p2m_size=%lu type=%08lx", - pfn, p2m_size, pfn_type[pfn]); + ERROR("User PT base is bad: pfn=%lu p2m_size=%lu", + pfn, p2m_size); goto out; } - if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) != ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) { @@ -1059,14 +1085,12 @@ int xc_domain_restore(int xc_handle, int (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT); goto out; } - - ctxt.ctrlreg[1] = xen_pfn_to_cr3(p2m[pfn]); - } - + ctxt.x64.ctrlreg[1] = xen_pfn_to_cr3(p2m[pfn]); + } domctl.cmd = XEN_DOMCTL_setvcpucontext; domctl.domain = (domid_t)dom; domctl.u.vcpucontext.vcpu = i; - set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt); + set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt.c); rc = xc_domctl(xc_handle, &domctl); if ( rc != 0 ) { @@ -1087,22 +1111,16 @@ int xc_domain_restore(int xc_handle, int xc_handle, dom, PAGE_SIZE, PROT_WRITE, shared_info_frame); /* restore saved vcpu_info and arch specific info */ - memcpy(&new_shared_info->vcpu_info, - &old_shared_info->vcpu_info, - sizeof(new_shared_info->vcpu_info)); - memcpy(&new_shared_info->arch, - &old_shared_info->arch, - sizeof(new_shared_info->arch)); + MEMCPY_FIELD(new_shared_info, old_shared_info, vcpu_info); + MEMCPY_FIELD(new_shared_info, old_shared_info, arch); /* clear any pending events and the selector */ - memset(&(new_shared_info->evtchn_pending[0]), 0, - sizeof (new_shared_info->evtchn_pending)); + MEMSET_ARRAY_FIELD(new_shared_info, evtchn_pending, 0); for ( i = 0; i < MAX_VIRT_CPUS; i++ ) - new_shared_info->vcpu_info[i].evtchn_pending_sel = 0; + SET_FIELD(new_shared_info, vcpu_info[i].evtchn_pending_sel, 0); /* mask event channels */ - memset(&(new_shared_info->evtchn_mask[0]), 0xff, - sizeof 
(new_shared_info->evtchn_mask)); + MEMSET_ARRAY_FIELD(new_shared_info, evtchn_mask, 0xff); /* leave wallclock time. set by hypervisor */ munmap(new_shared_info, PAGE_SIZE); @@ -1113,10 +1131,9 @@ int xc_domain_restore(int xc_handle, int pfn = p2m_frame_list[i]; if ( (pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB) ) { - ERROR("PFN-to-MFN frame number is bad"); - goto out; - } - + ERROR("PFN-to-MFN frame number %i (%#lx) is bad", i, pfn); + goto out; + } p2m_frame_list[i] = p2m[pfn]; } @@ -1128,8 +1145,17 @@ int xc_domain_restore(int xc_handle, int goto out; } - memcpy(live_p2m, p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT)); - munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT)); + /* If the domain we're restoring has a different word size to ours, + * we need to repack the p2m appropriately */ + if ( guest_width > sizeof (xen_pfn_t) ) + for ( i = p2m_size - 1; i >= 0; i-- ) + ((uint64_t *)p2m)[i] = p2m[i]; + else if ( guest_width < sizeof (xen_pfn_t) ) + for ( i = 0; i < p2m_size; i++ ) + ((uint32_t *)p2m)[i] = p2m[i]; + + memcpy(live_p2m, p2m, ROUNDUP(p2m_size * guest_width, PAGE_SHIFT)); + munmap(live_p2m, ROUNDUP(p2m_size * guest_width, PAGE_SHIFT)); DPRINTF("Domain ready to be built.\n"); rc = 0; diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_domain_save.c --- a/tools/libxc/xc_domain_save.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xc_domain_save.c Thu Sep 27 12:22:16 2007 -0600 @@ -54,9 +54,17 @@ static xen_pfn_t *live_m2p = NULL; static xen_pfn_t *live_m2p = NULL; static unsigned long m2p_mfn0; +/* Address size of the guest */ +unsigned int guest_width; + /* grep fodder: machine_to_phys */ -#define mfn_to_pfn(_mfn) live_m2p[(_mfn)] +#define mfn_to_pfn(_mfn) (live_m2p[(_mfn)]) + +#define pfn_to_mfn(_pfn) \ + ((xen_pfn_t) ((guest_width==8) \ + ?
(((uint64_t *)live_p2m)[(_pfn)]) \ + : (((uint32_t *)live_p2m)[(_pfn)]))) /* * Returns TRUE if the given machine frame number has a unique mapping @@ -65,19 +73,7 @@ static unsigned long m2p_mfn0; #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \ (((_mfn) < (max_mfn)) && \ ((mfn_to_pfn(_mfn) < (p2m_size)) && \ - (live_p2m[mfn_to_pfn(_mfn)] == (_mfn)))) - -/* Returns TRUE if MFN is successfully converted to a PFN. */ -#define translate_mfn_to_pfn(_pmfn) \ -({ \ - unsigned long mfn = *(_pmfn); \ - int _res = 1; \ - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) \ - _res = 0; \ - else \ - *(_pmfn) = mfn_to_pfn(mfn); \ - _res; \ -}) + (pfn_to_mfn(mfn_to_pfn(_mfn)) == (_mfn)))) /* ** During (live) save/migrate, we maintain a number of bitmaps to track @@ -451,22 +447,25 @@ static int suspend_and_state(int (*suspe ** it to update the MFN to a reasonable value. */ static void *map_frame_list_list(int xc_handle, uint32_t dom, - shared_info_t *shinfo) + shared_info_either_t *shinfo) { int count = 100; void *p; - - while ( count-- && (shinfo->arch.pfn_to_mfn_frame_list_list == 0) ) + uint64_t fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list); + + while ( count-- && (fll == 0) ) + { usleep(10000); - - if ( shinfo->arch.pfn_to_mfn_frame_list_list == 0 ) + fll = GET_FIELD(shinfo, arch.pfn_to_mfn_frame_list_list); + } + + if ( fll == 0 ) { ERROR("Timed out waiting for frame list updated."); return NULL; } - p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, - shinfo->arch.pfn_to_mfn_frame_list_list); + p = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ, fll); if ( p == NULL ) ERROR("Couldn't map p2m_frame_list_list (errno %d)", errno); @@ -659,15 +658,16 @@ static xen_pfn_t *map_and_save_p2m_table int io_fd, uint32_t dom, unsigned long p2m_size, - shared_info_t *live_shinfo) -{ - vcpu_guest_context_t ctxt; + shared_info_either_t *live_shinfo) +{ + vcpu_guest_context_either_t ctxt; /* Double and single indirect references to the live P2M table */ - xen_pfn_t 
*live_p2m_frame_list_list = NULL; - xen_pfn_t *live_p2m_frame_list = NULL; - - /* A copy of the pfn-to-mfn table frame list. */ + void *live_p2m_frame_list_list = NULL; + void *live_p2m_frame_list = NULL; + + /* Copies of the above. */ + xen_pfn_t *p2m_frame_list_list = NULL; xen_pfn_t *p2m_frame_list = NULL; /* The mapping of the live p2m table itself */ @@ -680,15 +680,50 @@ static xen_pfn_t *map_and_save_p2m_table if ( !live_p2m_frame_list_list ) goto out; + /* Get a local copy of the live_P2M_frame_list_list */ + if ( !(p2m_frame_list_list = malloc(PAGE_SIZE)) ) + { + ERROR("Couldn't allocate p2m_frame_list_list array"); + goto out; + } + memcpy(p2m_frame_list_list, live_p2m_frame_list_list, PAGE_SIZE); + + /* Canonicalize guest's unsigned long vs ours */ + if ( guest_width > sizeof(unsigned long) ) + for ( i = 0; i < PAGE_SIZE/sizeof(unsigned long); i++ ) + if ( i < PAGE_SIZE/guest_width ) + p2m_frame_list_list[i] = ((uint64_t *)p2m_frame_list_list)[i]; + else + p2m_frame_list_list[i] = 0; + else if ( guest_width < sizeof(unsigned long) ) + for ( i = PAGE_SIZE/sizeof(unsigned long) - 1; i >= 0; i-- ) + p2m_frame_list_list[i] = ((uint32_t *)p2m_frame_list_list)[i]; + live_p2m_frame_list = xc_map_foreign_batch(xc_handle, dom, PROT_READ, - live_p2m_frame_list_list, + p2m_frame_list_list, P2M_FLL_ENTRIES); if ( !live_p2m_frame_list ) { ERROR("Couldn't map p2m_frame_list"); goto out; } + + /* Get a local copy of the live_P2M_frame_list */ + if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) ) + { + ERROR("Couldn't allocate p2m_frame_list array"); + goto out; + } + memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE); + + /* Canonicalize guest's unsigned long vs ours */ + if ( guest_width > sizeof(unsigned long) ) + for ( i = 0; i < P2M_FL_ENTRIES; i++ ) + p2m_frame_list[i] = ((uint64_t *)p2m_frame_list)[i]; + else if ( guest_width < sizeof(unsigned long) ) + for ( i = P2M_FL_ENTRIES - 1; i >= 0; i-- ) + p2m_frame_list[i] = ((uint32_t *)p2m_frame_list)[i]; /* Map all 
the frames of the pfn->mfn table. For migrate to succeed, @@ -697,7 +732,7 @@ static xen_pfn_t *map_and_save_p2m_table from a safety POV anyhow. */ p2m = xc_map_foreign_batch(xc_handle, dom, PROT_READ, - live_p2m_frame_list, + p2m_frame_list, P2M_FL_ENTRIES); if ( !p2m ) { @@ -706,27 +741,30 @@ static xen_pfn_t *map_and_save_p2m_table } live_p2m = p2m; /* So that translation macros will work */ - /* Get a local copy of the live_P2M_frame_list */ - if ( !(p2m_frame_list = malloc(P2M_FL_SIZE)) ) - { - ERROR("Couldn't allocate p2m_frame_list array"); - goto out; - } - memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE); - /* Canonicalise the pfn-to-mfn table frame-number list. */ - for ( i = 0; i < p2m_size; i += fpp ) - { - if ( !translate_mfn_to_pfn(&p2m_frame_list[i/fpp]) ) + for ( i = 0; i < p2m_size; i += FPP ) + { + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(p2m_frame_list[i/FPP]) ) { ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys"); - ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp, - (uint64_t)p2m_frame_list[i/fpp]); - goto out; - } - } - - if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) ) + ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64", max 0x%lx", + i, i/FPP, (uint64_t)p2m_frame_list[i/FPP], max_mfn); + if ( p2m_frame_list[i/FPP] < max_mfn ) + { + ERROR("m2p[0x%"PRIx64"] = 0x%"PRIx64, + (uint64_t)p2m_frame_list[i/FPP], + (uint64_t)live_m2p[p2m_frame_list[i/FPP]]); + ERROR("p2m[0x%"PRIx64"] = 0x%"PRIx64, + (uint64_t)live_m2p[p2m_frame_list[i/FPP]], + (uint64_t)p2m[live_m2p[p2m_frame_list[i/FPP]]]); + + } + goto out; + } + p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]); + } + + if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) ) { ERROR("Could not get vcpu context"); goto out; @@ -737,25 +775,26 @@ static xen_pfn_t *map_and_save_p2m_table * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off * slow paths in the restore code. 
*/ - if ( (pt_levels == 3) && - (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) ) { unsigned long signature = ~0UL; - uint32_t tot_sz = sizeof(struct vcpu_guest_context) + 8; - uint32_t chunk_sz = sizeof(struct vcpu_guest_context); + uint32_t chunk_sz = ((guest_width==8) + ? sizeof(ctxt.x64) + : sizeof(ctxt.x32)); + uint32_t tot_sz = chunk_sz + 8; char chunk_sig[] = "vcpu"; if ( !write_exact(io_fd, &signature, sizeof(signature)) || !write_exact(io_fd, &tot_sz, sizeof(tot_sz)) || !write_exact(io_fd, &chunk_sig, 4) || !write_exact(io_fd, &chunk_sz, sizeof(chunk_sz)) || - !write_exact(io_fd, &ctxt, sizeof(ctxt)) ) + !write_exact(io_fd, &ctxt, chunk_sz) ) { ERROR("write: extended info"); goto out; } } - if ( !write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE) ) + if ( !write_exact(io_fd, p2m_frame_list, + P2M_FL_ENTRIES * sizeof(xen_pfn_t)) ) { ERROR("write: p2m_frame_list"); goto out; @@ -774,6 +813,9 @@ static xen_pfn_t *map_and_save_p2m_table if ( live_p2m_frame_list ) munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); + if ( p2m_frame_list_list ) + free(p2m_frame_list_list); + if ( p2m_frame_list ) free(p2m_frame_list); @@ -789,7 +831,7 @@ int xc_domain_save(int xc_handle, int io { xc_dominfo_t info; - int rc = 1, i, j, last_iter, iter = 0; + int rc = 1, frc, i, j, last_iter, iter = 0; int live = (flags & XCFLAGS_LIVE); int debug = (flags & XCFLAGS_DEBUG); int race = 0, sent_last_iter, skip_this_iter; @@ -798,7 +840,7 @@ int xc_domain_save(int xc_handle, int io unsigned long shared_info_frame; /* A copy of the CPU context of the guest. */ - vcpu_guest_context_t ctxt; + vcpu_guest_context_either_t ctxt; /* A table containing the type of each PFN (/not/ MFN!). 
*/ unsigned long *pfn_type = NULL; @@ -808,7 +850,7 @@ int xc_domain_save(int xc_handle, int io char page[PAGE_SIZE]; /* Live mapping of shared info structure */ - shared_info_t *live_shinfo = NULL; + shared_info_either_t *live_shinfo = NULL; /* base of the region in which domain memory is mapped */ unsigned char *region_base = NULL; @@ -836,6 +878,8 @@ int xc_domain_save(int xc_handle, int io /* HVM: magic frames for ioreqs and xenstore comms. */ uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */ + unsigned long mfn; + /* If no explicit control parameters given, use defaults */ max_iters = max_iters ? : DEF_MAX_ITERS; max_factor = max_factor ? : DEF_MAX_FACTOR; @@ -843,7 +887,7 @@ int xc_domain_save(int xc_handle, int io initialize_mbit_rate(); if ( !get_platform_info(xc_handle, dom, - &max_mfn, &hvirt_start, &pt_levels) ) + &max_mfn, &hvirt_start, &pt_levels, &guest_width) ) { ERROR("Unable to get platform info."); return 1; @@ -882,13 +926,18 @@ int xc_domain_save(int xc_handle, int io { /* log-dirty already enabled? 
There's no test op, so attempt to disable then reenable it */ - if ( !(xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF, - NULL, 0, NULL, 0, NULL) >= 0 && - xc_shadow_control(xc_handle, dom, - XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, - NULL, 0, NULL, 0, NULL) >= 0) ) - { - ERROR("Couldn't enable shadow mode"); + frc = xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_OFF, + NULL, 0, NULL, 0, NULL); + if ( frc >= 0 ) + { + frc = xc_shadow_control(xc_handle, dom, + XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, + NULL, 0, NULL, 0, NULL); + } + + if ( frc < 0 ) + { + ERROR("Couldn't enable shadow mode (rc %d) (errno %d)", frc, errno ); goto out; } } @@ -1001,7 +1050,6 @@ int xc_domain_save(int xc_handle, int io if ( !hvm ) { int err = 0; - unsigned long mfn; /* Map the P2M table, and write the list of P2M frames */ live_p2m = map_and_save_p2m_table(xc_handle, io_fd, dom, @@ -1018,7 +1066,7 @@ int xc_domain_save(int xc_handle, int io for ( i = 0; i < p2m_size; i++ ) { - mfn = live_p2m[i]; + mfn = pfn_to_mfn(i); if( (mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i) ) { DPRINTF("i=0x%x mfn=%lx live_m2p=%lx\n", i, @@ -1078,11 +1126,16 @@ int xc_domain_save(int xc_handle, int io int n = permute(N, p2m_size, order_nr); if ( debug ) - DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n", - iter, (unsigned long)n, hvm ? 0 : live_p2m[n], - test_bit(n, to_send), - hvm ? 0 : mfn_to_pfn(live_p2m[n]&0xFFFFF)); - + { + DPRINTF("%d pfn= %08lx mfn= %08lx %d", + iter, (unsigned long)n, + hvm ? 
0 : pfn_to_mfn(n), + test_bit(n, to_send)); + if ( !hvm && is_mapped(pfn_to_mfn(n)) ) + DPRINTF(" [mfn]= %08lx", + mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF)); + DPRINTF("\n"); + } if ( !last_iter && test_bit(n, to_send) && test_bit(n, to_skip) ) @@ -1113,7 +1166,7 @@ int xc_domain_save(int xc_handle, int io if ( hvm ) pfn_type[batch] = n; else - pfn_type[batch] = live_p2m[n]; + pfn_type[batch] = pfn_to_mfn(n); if ( !is_mapped(pfn_type[batch]) ) { @@ -1446,7 +1499,7 @@ int xc_domain_save(int xc_handle, int io for ( i = 0, j = 0; i < p2m_size; i++ ) { - if ( !is_mapped(live_p2m[i]) ) + if ( !is_mapped(pfn_to_mfn(i)) ) j++; } @@ -1458,7 +1511,7 @@ int xc_domain_save(int xc_handle, int io for ( i = 0, j = 0; i < p2m_size; ) { - if ( !is_mapped(live_p2m[i]) ) + if ( !is_mapped(pfn_to_mfn(i)) ) pfntab[j++] = i; i++; @@ -1475,63 +1528,75 @@ int xc_domain_save(int xc_handle, int io } } - if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) ) + if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) ) { ERROR("Could not get vcpu context"); goto out; } /* Canonicalise the suspend-record frame number. */ - if ( !translate_mfn_to_pfn(&ctxt.user_regs.edx) ) + mfn = GET_FIELD(&ctxt, user_regs.edx); + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) { ERROR("Suspend record is not in range of pseudophys map"); goto out; } + SET_FIELD(&ctxt, user_regs.edx, mfn_to_pfn(mfn)); for ( i = 0; i <= info.max_vcpu_id; i++ ) { if ( !(vcpumap & (1ULL << i)) ) continue; - if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) ) + if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) ) { ERROR("No context for VCPU%d", i); goto out; } /* Canonicalise each GDT frame number. 
*/ - for ( j = 0; (512*j) < ctxt.gdt_ents; j++ ) - { - if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[j]) ) + for ( j = 0; (512*j) < GET_FIELD(&ctxt, gdt_ents); j++ ) + { + mfn = GET_FIELD(&ctxt, gdt_frames[j]); + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) ) { ERROR("GDT frame is not in range of pseudophys map"); goto out; } + SET_FIELD(&ctxt, gdt_frames[j], mfn_to_pfn(mfn)); } /* Canonicalise the page table base pointer. */ - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[3])) ) + if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn( + GET_FIELD(&ctxt, ctrlreg[3]))) ) { ERROR("PT base is not in range of pseudophys map"); goto out; } - ctxt.ctrlreg[3] = - xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[3]))); + SET_FIELD(&ctxt, ctrlreg[3], + xen_pfn_to_cr3( + mfn_to_pfn( + xen_cr3_to_pfn( + GET_FIELD(&ctxt, ctrlreg[3]))))); /* Guest pagetable (x86/64) stored in otherwise-unused CR1. */ - if ( (pt_levels == 4) && ctxt.ctrlreg[1] ) - { - if ( !MFN_IS_IN_PSEUDOPHYS_MAP(xen_cr3_to_pfn(ctxt.ctrlreg[1])) ) + if ( (pt_levels == 4) && ctxt.x64.ctrlreg[1] ) + { + if ( !MFN_IS_IN_PSEUDOPHYS_MAP( + xen_cr3_to_pfn(ctxt.x64.ctrlreg[1])) ) { ERROR("PT base is not in range of pseudophys map"); goto out; } /* Least-significant bit means 'valid PFN'. */ - ctxt.ctrlreg[1] = 1 | - xen_pfn_to_cr3(mfn_to_pfn(xen_cr3_to_pfn(ctxt.ctrlreg[1]))); - } - - if ( !write_exact(io_fd, &ctxt, sizeof(ctxt)) ) + ctxt.x64.ctrlreg[1] = 1 | + xen_pfn_to_cr3( + mfn_to_pfn(xen_cr3_to_pfn(ctxt.x64.ctrlreg[1]))); + } + + if ( !write_exact(io_fd, &ctxt, ((guest_width==8) + ? sizeof(ctxt.x64) + : sizeof(ctxt.x32))) ) { ERROR("Error when writing to state file (1) (errno %d)", errno); goto out; @@ -1542,7 +1607,8 @@ int xc_domain_save(int xc_handle, int io * Reset the MFN to be a known-invalid value. See map_frame_list_list(). 
*/ memcpy(page, live_shinfo, PAGE_SIZE); - ((shared_info_t *)page)->arch.pfn_to_mfn_frame_list_list = 0; + SET_FIELD(((shared_info_either_t *)page), + arch.pfn_to_mfn_frame_list_list, 0); if ( !write_exact(io_fd, page, PAGE_SIZE) ) { ERROR("Error when writing to state file (1) (errno %d)", errno); diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_netbsd.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/libxc/xc_netbsd.c Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,271 @@ +/****************************************************************************** + * + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include "xc_private.h" + +#include <xen/memory.h> +#include <xen/sys/evtchn.h> +#include <unistd.h> +#include <fcntl.h> + +int xc_interface_open(void) +{ + int flags, saved_errno; + int fd = open("/kern/xen/privcmd", O_RDWR); + + if ( fd == -1 ) + { + PERROR("Could not obtain handle on privileged command interface"); + return -1; + } + + /* Although we return the file handle as the 'xc handle' the API + does not specify / guarentee that this integer is in fact + a file handle. 
Thus we must take responsiblity to ensure + it doesn't propagate (ie leak) outside the process */ + if ( (flags = fcntl(fd, F_GETFD)) < 0 ) + { + PERROR("Could not get file handle flags"); + goto error; + } + flags |= FD_CLOEXEC; + if ( fcntl(fd, F_SETFD, flags) < 0 ) + { + PERROR("Could not set file handle flags"); + goto error; + } + + return fd; + + error: + saved_errno = errno; + close(fd); + errno = saved_errno; + return -1; +} + +int xc_interface_close(int xc_handle) +{ + return close(xc_handle); +} + +void *xc_map_foreign_batch(int xc_handle, uint32_t dom, int prot, + xen_pfn_t *arr, int num) +{ + privcmd_mmapbatch_t ioctlx; + void *addr; + addr = mmap(NULL, num*PAGE_SIZE, prot, MAP_ANON | MAP_SHARED, -1, 0); + if ( addr == MAP_FAILED ) + return NULL; + + ioctlx.num=num; + ioctlx.dom=dom; + ioctlx.addr=(unsigned long)addr; + ioctlx.arr=arr; + if ( ioctl(xc_handle, IOCTL_PRIVCMD_MMAPBATCH, &ioctlx) < 0 ) + { + int saved_errno = errno; + perror("XXXXXXXX"); + (void)munmap(addr, num*PAGE_SIZE); + errno = saved_errno; + return NULL; + } + return addr; + +} + +void *xc_map_foreign_range(int xc_handle, uint32_t dom, + int size, int prot, + unsigned long mfn) +{ + privcmd_mmap_t ioctlx; + privcmd_mmap_entry_t entry; + void *addr; + addr = mmap(NULL, size, prot, MAP_ANON | MAP_SHARED, -1, 0); + if ( addr == MAP_FAILED ) + return NULL; + + ioctlx.num=1; + ioctlx.dom=dom; + ioctlx.entry=&entry; + entry.va=(unsigned long) addr; + entry.mfn=mfn; + entry.npages=(size+PAGE_SIZE-1)>>PAGE_SHIFT; + if ( ioctl(xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx) < 0 ) + { + int saved_errno = errno; + (void)munmap(addr, size); + errno = saved_errno; + return NULL; + } + return addr; +} + +int xc_map_foreign_ranges(int xc_handle, uint32_t dom, + privcmd_mmap_entry_t *entries, int nr) +{ + privcmd_mmap_t ioctlx; + int err; + + ioctlx.num = nr; + ioctlx.dom = dom; + ioctlx.entry = entries; + + err = ioctl(xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx); + if (err == 0) + return 0; + else + return 
-errno; +} + +static int do_privcmd(int xc_handle, unsigned int cmd, unsigned long data) +{ + int err = ioctl(xc_handle, cmd, data); + if (err == 0) + return 0; + else + return -errno; +} + +int do_xen_hypercall(int xc_handle, privcmd_hypercall_t *hypercall) +{ + return do_privcmd(xc_handle, + IOCTL_PRIVCMD_HYPERCALL, + (unsigned long)hypercall); +} + +#define EVTCHN_DEV_NAME "/dev/xenevt" + +int xc_evtchn_open(void) +{ + return open(EVTCHN_DEV_NAME, O_NONBLOCK|O_RDWR); +} + +int xc_evtchn_close(int xce_handle) +{ + return close(xce_handle); +} + +int xc_evtchn_fd(int xce_handle) +{ + return xce_handle; +} + +int xc_evtchn_notify(int xce_handle, evtchn_port_t port) +{ + struct ioctl_evtchn_notify notify; + + notify.port = port; + + return ioctl(xce_handle, IOCTL_EVTCHN_NOTIFY, ¬ify); +} + +evtchn_port_or_error_t +xc_evtchn_bind_interdomain(int xce_handle, int domid, + evtchn_port_t remote_port) +{ + struct ioctl_evtchn_bind_interdomain bind; + int ret; + + bind.remote_domain = domid; + bind.remote_port = remote_port; + + ret = ioctl(xce_handle, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); + if (ret == 0) + return bind.port; + else + return -1; +} + +int xc_evtchn_unbind(int xce_handle, evtchn_port_t port) +{ + struct ioctl_evtchn_unbind unbind; + + unbind.port = port; + + return ioctl(xce_handle, IOCTL_EVTCHN_UNBIND, &unbind); +} + +evtchn_port_or_error_t +xc_evtchn_bind_virq(int xce_handle, unsigned int virq) +{ + struct ioctl_evtchn_bind_virq bind; + int err; + + bind.virq = virq; + + err = ioctl(xce_handle, IOCTL_EVTCHN_BIND_VIRQ, &bind); + if (err) + return -1; + else + return bind.port; +} + +static int dorw(int fd, char *data, size_t size, int do_write) +{ + size_t offset = 0; + ssize_t len; + + while ( offset < size ) + { + if (do_write) + len = write(fd, data + offset, size - offset); + else + len = read(fd, data + offset, size - offset); + + if ( len == -1 ) + { + if ( errno == EINTR ) + continue; + return -1; + } + + offset += len; + } + + return 0; +} + 
+evtchn_port_or_error_t +xc_evtchn_pending(int xce_handle) +{ + evtchn_port_t port; + + if ( dorw(xce_handle, (char *)&port, sizeof(port), 0) == -1 ) + return -1; + + return port; +} + +int xc_evtchn_unmask(int xce_handle, evtchn_port_t port) +{ + return dorw(xce_handle, (char *)&port, sizeof(port), 1); +} + +/* Optionally flush file to disk and discard page cache */ +void discard_file_cache(int fd, int flush) +{ + + if ( flush && (fsync(fd) < 0) ) + { + /*PERROR("Failed to flush file: %s", strerror(errno));*/ + } +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_private.c --- a/tools/libxc/xc_private.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xc_private.c Thu Sep 27 12:22:16 2007 -0600 @@ -10,7 +10,12 @@ #include <stdarg.h> #include <pthread.h> -static __thread xc_error last_error = { XC_ERROR_NONE, ""}; +static pthread_key_t last_error_pkey; +static pthread_once_t last_error_pkey_once = PTHREAD_ONCE_INIT; + +static pthread_key_t errbuf_pkey; +static pthread_once_t errbuf_pkey_once = PTHREAD_ONCE_INIT; + #if DEBUG static xc_error_handler error_handler = xc_default_error_handler; #else @@ -23,15 +28,45 @@ void xc_default_error_handler(const xc_e fprintf(stderr, "ERROR %s: %s\n", desc, err->message); } +static void +_xc_clean_last_error(void *m) +{ + free(m); + pthread_setspecific(last_error_pkey, NULL); +} + +static void +_xc_init_last_error(void) +{ + pthread_key_create(&last_error_pkey, _xc_clean_last_error); +} + +static xc_error * +_xc_get_last_error(void) +{ + xc_error *last_error; + + pthread_once(&last_error_pkey_once, _xc_init_last_error); + + last_error = pthread_getspecific(last_error_pkey); + if (last_error == NULL) { + last_error = malloc(sizeof(xc_error)); + pthread_setspecific(last_error_pkey, last_error); + } + + return last_error; +} + const xc_error *xc_get_last_error(void) { - return &last_error; 
+ return _xc_get_last_error(); } void xc_clear_last_error(void) { - last_error.code = XC_ERROR_NONE; - last_error.message[0] = '\0'; + xc_error *last_error = _xc_get_last_error(); + last_error->code = XC_ERROR_NONE; + last_error->message[0] = '\0'; } const char *xc_error_code_to_desc(int code) @@ -61,12 +96,12 @@ xc_error_handler xc_set_error_handler(xc return old; } - static void _xc_set_error(int code, const char *msg) { - last_error.code = code; - strncpy(last_error.message, msg, XC_MAX_ERROR_MSG_LEN - 1); - last_error.message[XC_MAX_ERROR_MSG_LEN-1] = '\0'; + xc_error *last_error = _xc_get_last_error(); + last_error->code = code; + strncpy(last_error->message, msg, XC_MAX_ERROR_MSG_LEN - 1); + last_error->message[XC_MAX_ERROR_MSG_LEN-1] = '\0'; } void xc_set_error(int code, const char *fmt, ...) @@ -84,23 +119,29 @@ void xc_set_error(int code, const char * errno = saved_errno; - if ( error_handler != NULL ) - error_handler(&last_error); + if ( error_handler != NULL ) { + xc_error *last_error = _xc_get_last_error(); + error_handler(last_error); + } } int lock_pages(void *addr, size_t len) { int e = 0; #ifndef __sun__ - e = mlock(addr, len); -#endif - return (e); + void *laddr = (void *)((unsigned long)addr & PAGE_MASK); + size_t llen = (len + PAGE_SIZE - 1) & PAGE_MASK; + e = mlock(laddr, llen); +#endif + return e; } void unlock_pages(void *addr, size_t len) { #ifndef __sun__ - safe_munlock(addr, len); + void *laddr = (void *)((unsigned long)addr & PAGE_MASK); + size_t llen = (len + PAGE_SIZE - 1) & PAGE_MASK; + safe_munlock(laddr, llen); #endif } @@ -466,11 +507,33 @@ unsigned long xc_make_page_below_4G( return new_mfn; } +static void +_xc_clean_errbuf(void * m) +{ + free(m); + pthread_setspecific(errbuf_pkey, NULL); +} + +static void +_xc_init_errbuf(void) +{ + pthread_key_create(&errbuf_pkey, _xc_clean_errbuf); +} + char *safe_strerror(int errcode) { - static __thread char errbuf[32]; +#define XS_BUFSIZE 32 + char *errbuf; static pthread_mutex_t mutex = 
PTHREAD_MUTEX_INITIALIZER; char *strerror_str; + + pthread_once(&errbuf_pkey_once, _xc_init_errbuf); + + errbuf = pthread_getspecific(errbuf_pkey); + if (errbuf == NULL) { + errbuf = malloc(XS_BUFSIZE); + pthread_setspecific(errbuf_pkey, errbuf); + } /* * Thread-unsafe strerror() is protected by a local mutex. We copy @@ -478,8 +541,8 @@ char *safe_strerror(int errcode) */ pthread_mutex_lock(&mutex); strerror_str = strerror(errcode); - strncpy(errbuf, strerror_str, sizeof(errbuf)); - errbuf[sizeof(errbuf)-1] = '\0'; + strncpy(errbuf, strerror_str, XS_BUFSIZE); + errbuf[XS_BUFSIZE-1] = '\0'; pthread_mutex_unlock(&mutex); return errbuf; diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xc_resume.c --- a/tools/libxc/xc_resume.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xc_resume.c Thu Sep 27 12:22:16 2007 -0600 @@ -8,13 +8,8 @@ #include <xen/foreign/x86_64.h> #include <xen/hvm/params.h> -/* Need to provide the right flavour of vcpu context for Xen */ -typedef union -{ - vcpu_guest_context_x86_64_t c64; - vcpu_guest_context_x86_32_t c32; - vcpu_guest_context_t c; -} vcpu_guest_context_either_t; +/* Don't yet support cross-address-size uncooperative resume */ +#define guest_width (sizeof (unsigned long)) static int modify_returncode(int xc_handle, uint32_t domid) { @@ -50,9 +45,9 @@ static int modify_returncode(int xc_hand if ( !info.hvm ) ctxt.c.user_regs.eax = 1; else if ( strstr(caps, "x86_64") ) - ctxt.c64.user_regs.eax = 1; + ctxt.x64.user_regs.eax = 1; else - ctxt.c32.user_regs.eax = 1; + ctxt.x32.user_regs.eax = 1; if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt.c)) != 0 ) return rc; diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xenctrl.h Thu Sep 27 12:22:16 2007 -0600 @@ -897,4 +897,43 @@ int xc_ia64_save_to_nvram(int xc_handle, /* IA64 specific, nvram init */ int xc_ia64_nvram_init(int xc_handle, char *dom_name, uint32_t dom); +/* HVM guest pass-through */ 
+int xc_assign_device(int xc_handle, + uint32_t domid, + uint32_t machine_bdf); + +int xc_domain_memory_mapping(int xc_handle, + uint32_t domid, + unsigned long first_gfn, + unsigned long first_mfn, + unsigned long nr_mfns, + uint32_t add_mapping); + +int xc_domain_ioport_mapping(int xc_handle, + uint32_t domid, + uint32_t first_gport, + uint32_t first_mport, + uint32_t nr_ports, + uint32_t add_mapping); + +int xc_domain_bind_pt_irq(int xc_handle, + uint32_t domid, + uint8_t machine_irq, + uint8_t irq_type, + uint8_t bus, + uint8_t device, + uint8_t intx, + uint8_t isa_irq); + +int xc_domain_bind_pt_pci_irq(int xc_handle, + uint32_t domid, + uint8_t machine_irq, + uint8_t bus, + uint8_t device, + uint8_t intx); + +int xc_domain_bind_pt_isa_irq(int xc_handle, + uint32_t domid, + uint8_t machine_irq); + #endif /* XENCTRL_H */ diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xg_private.h --- a/tools/libxc/xg_private.h Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xg_private.h Thu Sep 27 12:22:16 2007 -0600 @@ -15,7 +15,6 @@ #include "xenguest.h" #include "xc_private.h" -#include <xen/sys/privcmd.h> #include <xen/memory.h> #include <xen/elfnote.h> @@ -134,13 +133,6 @@ typedef l4_pgentry_64_t l4_pgentry_t; #define PAGE_SHIFT_X86 12 #define PAGE_SIZE_X86 (1UL << PAGE_SHIFT_X86) #define PAGE_MASK_X86 (~(PAGE_SIZE_X86-1)) -#if defined(__i386__) -#define MADDR_BITS_X86 44 -#elif defined(__x86_64__) -#define MADDR_BITS_X86 52 -#endif -#define MFN_MASK_X86 ((1ULL << (MADDR_BITS_X86 - PAGE_SHIFT_X86)) - 1) -#define MADDR_MASK_X86 (MFN_MASK_X86 << PAGE_SHIFT_X86) #define PAGE_SHIFT_IA64 14 #define PAGE_SIZE_IA64 (1UL << PAGE_SHIFT_IA64) @@ -148,19 +140,28 @@ typedef l4_pgentry_64_t l4_pgentry_t; #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1)) + +/* XXX SMH: following skanky macros rely on variable p2m_size being set */ +/* XXX TJD: also, "guest_width" should be the guest's sizeof(unsigned long) */ + /* Number of xen_pfn_t in a page */ -#define 
fpp (PAGE_SIZE/sizeof(xen_pfn_t)) -/* XXX SMH: following 3 skanky macros rely on variable p2m_size being set */ +#define FPP (PAGE_SIZE/(guest_width)) /* Number of entries in the pfn_to_mfn_frame_list_list */ -#define P2M_FLL_ENTRIES (((p2m_size)+(fpp*fpp)-1)/(fpp*fpp)) +#define P2M_FLL_ENTRIES (((p2m_size)+(FPP*FPP)-1)/(FPP*FPP)) /* Number of entries in the pfn_to_mfn_frame_list */ -#define P2M_FL_ENTRIES (((p2m_size)+fpp-1)/fpp) +#define P2M_FL_ENTRIES (((p2m_size)+FPP-1)/FPP) /* Size in bytes of the pfn_to_mfn_frame_list */ -#define P2M_FL_SIZE ((P2M_FL_ENTRIES)*sizeof(unsigned long)) +#define P2M_FL_SIZE ((P2M_FL_ENTRIES)*(guest_width)) + +/* Masks for PTE<->PFN conversions */ +#define MADDR_BITS_X86 ((guest_width == 8) ? 52 : 44) +#define MFN_MASK_X86 ((1ULL << (MADDR_BITS_X86 - PAGE_SHIFT_X86)) - 1) +#define MADDR_MASK_X86 (MFN_MASK_X86 << PAGE_SHIFT_X86) + #define PAEKERN_no 0 #define PAEKERN_yes 1 diff -r eae7b887e5ac -r ee498c9af856 tools/libxc/xg_save_restore.h --- a/tools/libxc/xg_save_restore.h Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxc/xg_save_restore.h Thu Sep 27 12:22:16 2007 -0600 @@ -5,6 +5,9 @@ */ #include "xc_private.h" + +#include <xen/foreign/x86_32.h> +#include <xen/foreign/x86_64.h> /* ** We process save/restore/migrate in batches of pages; the below @@ -32,15 +35,19 @@ ** be a property of the domain, but for the moment we just read it ** from the hypervisor. ** +** - The width of a guest word (unsigned long), in bytes. +** ** Returns 1 on success, 0 on failure. 
*/ static inline int get_platform_info(int xc_handle, uint32_t dom, /* OUT */ unsigned long *max_mfn, /* OUT */ unsigned long *hvirt_start, - /* OUT */ unsigned int *pt_levels) + /* OUT */ unsigned int *pt_levels, + /* OUT */ unsigned int *guest_width) { xen_capabilities_info_t xen_caps = ""; xen_platform_parameters_t xen_params; + DECLARE_DOMCTL; if (xc_version(xc_handle, XENVER_platform_parameters, &xen_params) != 0) return 0; @@ -52,17 +59,18 @@ static inline int get_platform_info(int *hvirt_start = xen_params.virt_start; - /* - * XXX For now, 32bit dom0's can only save/restore 32bit domUs - * on 64bit hypervisors, so no need to check which type of domain - * we're dealing with. - */ + memset(&domctl, 0, sizeof(domctl)); + domctl.domain = dom; + domctl.cmd = XEN_DOMCTL_get_address_size; + + if ( do_domctl(xc_handle, &domctl) != 0 ) + return 0; + + *guest_width = domctl.u.address_size.size / 8; + if (strstr(xen_caps, "xen-3.0-x86_64")) -#if defined(__i386__) - *pt_levels = 3; -#else - *pt_levels = 4; -#endif + /* Depends on whether it's a compat 32-on-64 guest */ + *pt_levels = ( (*guest_width == 8) ? 4 : 3 ); else if (strstr(xen_caps, "xen-3.0-x86_32p")) *pt_levels = 3; else if (strstr(xen_caps, "xen-3.0-x86_32")) @@ -95,3 +103,56 @@ static inline int get_platform_info(int /* Returns TRUE if the PFN is currently mapped */ #define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL)) + + +/* 32-on-64 support: saving 32bit guests from 64bit tools and vice versa */ +typedef union +{ + vcpu_guest_context_x86_64_t x64; + vcpu_guest_context_x86_32_t x32; + vcpu_guest_context_t c; +} vcpu_guest_context_either_t; + +typedef union +{ + shared_info_x86_64_t x64; + shared_info_x86_32_t x32; + shared_info_t s; +} shared_info_either_t; + +typedef union +{ + start_info_x86_64_t x64; + start_info_x86_32_t x32; + start_info_t s; +} start_info_either_t; + +#define GET_FIELD(_p, _f) ((guest_width==8) ? 
((_p)->x64._f) : ((_p)->x32._f)) + +#define SET_FIELD(_p, _f, _v) do { \ + if (guest_width == 8) \ + (_p)->x64._f = (_v); \ + else \ + (_p)->x32._f = (_v); \ +} while (0) + +#define MEMCPY_FIELD(_d, _s, _f) do { \ + if (guest_width == 8) \ + memcpy(&(_d)->x64._f, &(_s)->x64._f,sizeof((_d)->x64._f)); \ + else \ + memcpy(&(_d)->x32._f, &(_s)->x32._f,sizeof((_d)->x32._f)); \ +} while (0) + +#define MEMSET_ARRAY_FIELD(_p, _f, _v) do { \ + if (guest_width == 8) \ + memset(&(_p)->x64._f[0], (_v), sizeof((_p)->x64._f)); \ + else \ + memset(&(_p)->x32._f[0], (_v), sizeof((_p)->x32._f)); \ +} while (0) + +#ifndef MAX +#define MAX(_a, _b) ((_a) >= (_b) ? (_a) : (_b)) +#endif +#ifndef MIN +#define MIN(_a, _b) ((_a) <= (_b) ? (_a) : (_b)) +#endif diff -r eae7b887e5ac -r ee498c9af856 tools/libxen/include/xen/api/xen_all.h --- a/tools/libxen/include/xen/api/xen_all.h Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/libxen/include/xen/api/xen_all.h Thu Sep 27 12:22:16 2007 -0600 @@ -36,4 +36,5 @@ #include <xen/api/xen_vm_metrics.h> #include <xen/api/xen_vm_power_state.h> #include <xen/api/xen_vtpm.h> +#include <xen/api/xen_xspolicy.h> #endif diff -r eae7b887e5ac -r ee498c9af856 tools/pygrub/src/GrubConf.py --- a/tools/pygrub/src/GrubConf.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/pygrub/src/GrubConf.py Thu Sep 27 12:22:16 2007 -0600 @@ -101,7 +101,7 @@ class GrubImage(object): if self.commands.has_key(com): if self.commands[com] is not None: - exec("%s = r\"%s\"" %(self.commands[com], arg.strip())) + setattr(self, self.commands[com], arg.strip()) else: logging.info("Ignored image directive %s" %(com,)) else: @@ -142,11 +142,11 @@ class GrubImage(object): initrd = property(get_initrd, set_initrd) # set up command handlers - commands = { "title": "self.title", - "root": "self.root", - "rootnoverify": "self.root", - "kernel": "self.kernel", - "initrd": "self.initrd", + commands = { "title": "title", + "root": "root", + "rootnoverify": "root", + "kernel": "kernel", + "initrd": "initrd", 
"chainloader": None, "module": None} @@ -195,7 +195,7 @@ class GrubConfigFile(object): (com, arg) = grub_exact_split(l, 2) if self.commands.has_key(com): if self.commands[com] is not None: - exec("%s = r\"%s\"" %(self.commands[com], arg.strip())) + setattr(self, self.commands[com], arg.strip()) else: logging.info("Ignored directive %s" %(com,)) else: @@ -208,7 +208,7 @@ class GrubConfigFile(object): (com, arg) = grub_exact_split(line, 2) if self.commands.has_key(com): if self.commands[com] is not None: - exec("%s = r\"%s\"" %(self.commands[com], arg.strip())) + setattr(self, self.commands[com], arg.strip()) else: logging.info("Ignored directive %s" %(com,)) else: @@ -236,12 +236,12 @@ class GrubConfigFile(object): splash = property(get_splash, set_splash) # set up command handlers - commands = { "default": "self.default", - "timeout": "self.timeout", - "fallback": "self.fallback", - "hiddenmenu": "self.hiddenmenu", - "splashimage": "self.splash", - "password": "self.password" } + commands = { "default": "default", + "timeout": "timeout", + "fallback": "fallback", + "hiddenmenu": "hiddenmenu", + "splashimage": "splash", + "password": "password" } for c in ("bootp", "color", "device", "dhcp", "hide", "ifconfig", "pager", "partnew", "parttype", "rarp", "serial", "setkey", "terminal", "terminfo", "tftpserver", "unhide"): diff -r eae7b887e5ac -r ee498c9af856 tools/pygrub/src/LiloConf.py --- a/tools/pygrub/src/LiloConf.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/pygrub/src/LiloConf.py Thu Sep 27 12:22:16 2007 -0600 @@ -31,7 +31,7 @@ class LiloImage(object): if self.commands.has_key(com): if self.commands[com] is not None: - exec("%s = r\'%s\'" %(self.commands[com], re.sub('^"(.+)"$', r"\1", arg.strip()))) + setattr(self, self.commands[com], re.sub('^"(.+)"$', r"\1", arg.strip())) else: logging.info("Ignored image directive %s" %(com,)) else: @@ -74,13 +74,13 @@ class LiloImage(object): readonly = property(get_readonly, set_readonly) # set up command handlers - commands 
= { "label": "self.title", - "root": "self.root", - "rootnoverify": "self.root", - "image": "self.kernel", - "initrd": "self.initrd", - "append": "self.args", - "read-only": "self.readonly", + commands = { "label": "title", + "root": "root", + "rootnoverify": "root", + "image": "kernel", + "initrd": "initrd", + "append": "args", + "read-only": "readonly", "chainloader": None, "module": None} @@ -129,7 +129,7 @@ class LiloConfigFile(object): (com, arg) = GrubConf.grub_exact_split(l, 2) if self.commands.has_key(com): if self.commands[com] is not None: - exec("%s = r\"%s\"" %(self.commands[com], arg.strip())) + setattr(self, self.commands[com], arg.strip()) else: logging.info("Ignored directive %s" %(com,)) else: diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/lowlevel/xc/xc.c Thu Sep 27 12:22:16 2007 -0600 @@ -346,6 +346,7 @@ static PyObject *pyxc_domain_getinfo(XcO Py_DECREF(list); if ( pyhandle != NULL ) { Py_DECREF(pyhandle); } if ( info_dict != NULL ) { Py_DECREF(info_dict); } + free(info); return NULL; } for ( j = 0; j < sizeof(xen_domain_handle_t); j++ ) diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/lowlevel/xs/xs.c --- a/tools/python/xen/lowlevel/xs/xs.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/lowlevel/xs/xs.c Thu Sep 27 12:22:16 2007 -0600 @@ -365,6 +365,7 @@ static PyObject *xspy_set_permissions(Xs goto exit; } + free(xsperms); Py_INCREF(Py_None); return Py_None; diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/util/bootloader.py --- a/tools/python/xen/util/bootloader.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/util/bootloader.py Thu Sep 27 12:22:16 2007 -0600 @@ -21,7 +21,10 @@ import tempfile import tempfile import shutil import threading + from xen.xend.XendLogging import log +from xen.util import mkdir +import xen.util.xsm.xsm as security __bootloader = None @@ -70,8 +73,9 @@ def 
set_boot_policy(title_idx, filename) def loads_default_policy(filename): """ Determine whether the given policy is loaded by the default boot title """ - polfile = get_default_policy() - if polfile != None: + policy = get_default_policy() + if policy: + polfile = policy + ".bin" if polfile == filename or \ "/"+polfile == filename: return True @@ -220,28 +224,6 @@ class Grub(Bootloader): return boot_file - def __get_titles(self): - """ Get the names of all boot titles in the grub config file - @rtype: list - @return: list of names of available boot titles - """ - titles = [] - try: - boot_file = self.__get_bootfile() - except: - return [] - try: - self.__bootfile_lock.acquire() - grub_fd = open(boot_file) - for line in grub_fd: - if self.title_re.match(line): - line = line.rstrip().lstrip() - titles.append(line.lstrip('title').lstrip()) - finally: - self.__bootfile_lock.release() - return titles - - def get_default_title(self): """ Get the index (starting with 0) of the default boot title This number is read from the grub configuration file. 
@@ -261,8 +243,8 @@ class Grub(Bootloader): for line in grub_fd: line = line.rstrip() if def_re.match(line): - line = line.rstrip() - line = line.lstrip("default=") + #remove 'default=' + line = line.lstrip()[8:] default = int(line) break finally: @@ -295,11 +277,13 @@ class Grub(Bootloader): if self.policy_re.match(line): start = line.find("module") pol = line[start+6:] - pol = pol.lstrip().rstrip() + pol = pol.strip() if pol[0] == '/': pol = pol[1:] if pol[0:5] == "boot/": pol = pol[5:] + if pol.endswith(".bin"): + pol = pol[:-4] policies[idx] = pol finally: self.__bootfile_lock.release() @@ -399,7 +383,7 @@ class Grub(Bootloader): if self.policy_re.match(line): start = line.find("module") pol = line[start+6:len(line)] - pol = pol.lstrip().rstrip() + pol = pol.strip() if pol in namelist: omit_line = True found = True @@ -499,7 +483,7 @@ class Grub(Bootloader): within_title = 0 ctr = ctr + 1 if within_title and self.kernel_re.match(line): - line = line.rstrip().lstrip() + line = line.strip() items = line.split(" ") i = 0 while i < len(items): @@ -513,9 +497,123 @@ class Grub(Bootloader): self.__bootfile_lock.release() return None # Not found +class LatePolicyLoader(Bootloader): + """ A fake bootloader file that holds the policy to load automatically + once xend has started up and the Domain-0 label to set. 
""" + def __init__(self): + self.__bootfile_lock = threading.RLock() + self.PATH = security.security_dir_prefix + self.FILENAME = self.PATH + "/xen_boot_policy" + self.DEFAULT_TITLE = "ANY" + self.POLICY_ATTR = "POLICY" + Bootloader.__init__(self) + + def probe(self): + _dir=os.path.dirname(self.FILENAME) + mkdir.parents(_dir, stat.S_IRWXU) + return True + + def get_default_title(self): + return self.DEFAULT_TITLE + + def get_boot_policies(self): + policies = {} + try: + self.__bootfile_lock.acquire() + + res = self.__loadcontent() + + pol = res.get( self.POLICY_ATTR ) + if pol: + policies.update({ self.DEFAULT_TITLE : pol }) + + finally: + self.__bootfile_lock.release() + + return policies + + def add_boot_policy(self, index, binpolname): + try: + self.__bootfile_lock.acquire() + + res = self.__loadcontent() + if binpolname.endswith(".bin"): + binpolname = binpolname[0:-4] + res[ self.POLICY_ATTR ] = binpolname + self.__writecontent(res) + finally: + self.__bootfile_lock.release() + + return True + + def rm_policy_from_boottitle(self, index, unamelist): + try: + self.__bootfile_lock.acquire() + + res = self.__loadcontent() + if self.POLICY_ATTR in res: + del(res[self.POLICY_ATTR]) + self.__writecontent(res) + finally: + self.__bootfile_lock.release() + + return True + + def set_kernel_attval(self, index, att, val): + try: + self.__bootfile_lock.acquire() + + res = self.__loadcontent() + res[att] = val + self.__writecontent(res) + finally: + self.__bootfile_lock.release() + + return True + + def get_kernel_val(self, index, att): + try: + self.__bootfile_lock.acquire() + + res = self.__loadcontent() + return res.get(att) + finally: + self.__bootfile_lock.release() + + def __loadcontent(self): + res={} + try: + file = open(self.FILENAME) + for line in file: + tmp = line.split("=",1) + if len(tmp) == 2: + res[tmp[0]] = tmp[1].strip() + file.close() + except: + pass + + return res + + def __writecontent(self, items): + rc = True + try: + file = open(self.FILENAME,"w") 
+ if file: + for key, value in items.items(): + file.write("%s=%s\n" % (str(key),str(value))) + file.close() + except: + rc = False + + return rc + __bootloader = Bootloader() grub = Grub() if grub.probe() == True: __bootloader = grub +else: + late = LatePolicyLoader() + if late.probe() == True: + __bootloader = late diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/util/xsm/acm/acm.py --- a/tools/python/xen/util/xsm/acm/acm.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/util/xsm/acm/acm.py Thu Sep 27 12:22:16 2007 -0600 @@ -33,7 +33,8 @@ from xen.xend.XendConstants import * from xen.xend.XendConstants import * #global directories and tools for security management -policy_dir_prefix = "/etc/xen/acm-security/policies" +security_dir_prefix = "/etc/xen/acm-security" +policy_dir_prefix = security_dir_prefix + "/policies" res_label_filename = policy_dir_prefix + "/resource_labels" boot_filename = "/boot/grub/menu.lst" altboot_filename = "/boot/grub/grub.conf" @@ -1308,12 +1309,33 @@ def parse_security_label(security_label) return security_label def set_security_label(policy, label): - policytype = xsconstants.ACM_POLICY_ID if label != "" and policy != "": - return "%s:%s:%s" % (policytype, policy, label) + return "%s:%s:%s" % (xsconstants.ACM_POLICY_ID, policy, label) else: return "" def ssidref2security_label(ssidref): from xen.xend.XendXSPolicyAdmin import XSPolicyAdminInstance return XSPolicyAdminInstance().ssidref_to_vmlabel(ssidref) + +def get_security_label(self, xspol=None): + """ + Get the security label of a domain + @param xspol The policy to use when converting the ssid into + a label; only to be passed during the updating + of the policy + """ + domid = self.getDomid() + + if not xspol: + from xen.xend.XendXSPolicyAdmin import XSPolicyAdminInstance + xspol = XSPolicyAdminInstance().get_loaded_policy() + + if domid == 0: + if xspol: + label = xspol.policy_get_domain_label_formatted(domid) + else: + label = "" + else: + label = 
self.info.get('security_label', '') + return label diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/util/xsm/dummy/dummy.py --- a/tools/python/xen/util/xsm/dummy/dummy.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/util/xsm/dummy/dummy.py Thu Sep 27 12:22:16 2007 -0600 @@ -6,6 +6,7 @@ class XSMError(Exception): def __str__(self): return repr(self.value) +security_dir_prefix = ""; policy_dir_prefix = ""; active_policy = ""; NULL_SSIDREF = 0; @@ -51,3 +52,6 @@ def ssidref2security_label(ssidref): def has_authorization(ssidref): return True + +def get_security_label(self, xspol=None): + return "" diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/util/xsm/flask/flask.py --- a/tools/python/xen/util/xsm/flask/flask.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/util/xsm/flask/flask.py Thu Sep 27 12:22:16 2007 -0600 @@ -35,3 +35,7 @@ def set_security_label(policy, label): def ssidref2security_label(ssidref): return ssidref2label(ssidref) + +def get_security_label(self, xspol=None): + label = self.info.get('security_label', '') + return label diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/xend/XendConfig.py --- a/tools/python/xen/xend/XendConfig.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/xend/XendConfig.py Thu Sep 27 12:22:16 2007 -0600 @@ -127,7 +127,7 @@ XENAPI_PLATFORM_CFG = [ 'acpi', 'apic', 'nographic', 'pae', 'rtc_timeoffset', 'serial', 'sdl', 'soundhw','stdvga', 'usb', 'usbdevice', 'vnc', 'vncconsole', 'vncdisplay', 'vnclisten', - 'vncpasswd', 'vncunused', 'xauthority'] + 'vncpasswd', 'vncunused', 'xauthority', 'pci'] # Xen API console 'other_config' keys. 
XENAPI_CONSOLE_OTHER_CFG = ['vncunused', 'vncdisplay', 'vnclisten', @@ -168,6 +168,7 @@ XENAPI_CFG_TYPES = { 'tools_version': dict, 'other_config': dict, 'security_label': str, + 'pci': str, } # List of legacy configuration keys that have no equivalent in the @@ -177,8 +178,6 @@ LEGACY_UNSUPPORTED_BY_XENAPI_CFG = [ # roundtripped (dynamic, unmodified) 'shadow_memory', 'vcpu_avail', - 'cpu_weight', - 'cpu_cap', 'features', # read/write 'on_xend_start', @@ -202,8 +201,6 @@ LEGACY_CFG_TYPES = { 'shadow_memory': int, 'maxmem': int, 'start_time': float, - 'cpu_cap': int, - 'cpu_weight': int, 'cpu_time': float, 'features': str, 'localtime': int, @@ -329,8 +326,6 @@ class XendConfig(dict): 'on_xend_start': 'ignore', 'on_xend_stop': 'ignore', 'cpus': [], - 'cpu_weight': 256, - 'cpu_cap': 0, 'VCPUs_max': 1, 'VCPUs_live': 1, 'VCPUs_at_startup': 1, @@ -495,6 +490,14 @@ class XendConfig(dict): if sxp.child_value(sxp_cfg, "maxmem") != None: cfg["maxmem"] = int(sxp.child_value(sxp_cfg, "maxmem")) + # Convert scheduling parameters to vcpus_params + if 'vcpus_params' not in cfg: + cfg['vcpus_params'] = {} + cfg["vcpus_params"]["weight"] = \ + int(sxp.child_value(sxp_cfg, "cpu_weight", 256)) + cfg["vcpus_params"]["cap"] = \ + int(sxp.child_value(sxp_cfg, "cpu_cap", 0)) + # Only extract options we know about. 
extract_keys = LEGACY_UNSUPPORTED_BY_XENAPI_CFG extract_keys += XENAPI_CFG_TO_LEGACY_CFG.values() @@ -811,8 +814,6 @@ class XendConfig(dict): _set_cfg_if_exists('on_xend_stop') _set_cfg_if_exists('on_xend_start') _set_cfg_if_exists('vcpu_avail') - _set_cfg_if_exists('cpu_weight') - _set_cfg_if_exists('cpu_cap') # Parse and store runtime configuration _set_cfg_if_exists('start_time') @@ -864,6 +865,10 @@ class XendConfig(dict): self[key] = type_conv(val) else: self[key] = val + + self['vcpus_params']['weight'] = \ + int(self['vcpus_params'].get('weight', 256)) + self['vcpus_params']['cap'] = int(self['vcpus_params'].get('cap', 0)) def to_sxp(self, domain = None, ignore_devices = False, ignore = [], legacy_only = True): diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/xend/XendDomain.py --- a/tools/python/xen/xend/XendDomain.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/xend/XendDomain.py Thu Sep 27 12:22:16 2007 -0600 @@ -1176,12 +1176,16 @@ class XendDomain: log.exception("domain_unpause") raise XendError(str(ex)) - def domain_pause(self, domid): + def domain_pause(self, domid, state=False): """Pause domain execution. @param domid: Domain ID or Name @type domid: int or string. 
- @rtype: None + @keyword state: If True, will return the domain state before pause + @type state: bool + @rtype: int if state is True + @return: Domain state (DOM_STATE_*) + @rtype: None if state is False @raise XendError: Failed to pause @raise XendInvalidDomain: Domain is not valid """ @@ -1191,13 +1195,16 @@ class XendDomain: raise XendInvalidDomain(str(domid)) if dominfo.getDomid() == DOM0_ID: raise XendError("Cannot pause privileged domain %s" % domid) - if dominfo._stateGet() not in (DOM_STATE_RUNNING, DOM_STATE_PAUSED): + ds = dominfo._stateGet() + if ds not in (DOM_STATE_RUNNING, DOM_STATE_PAUSED): raise VMBadState("Domain '%s' is not started" % domid, POWER_STATE_NAMES[DOM_STATE_RUNNING], - POWER_STATE_NAMES[dominfo._stateGet()]) + POWER_STATE_NAMES[ds]) log.info("Domain %s (%d) paused.", dominfo.getName(), int(dominfo.getDomid())) dominfo.pause() + if state: + return ds except XendInvalidDomain: log.exception("domain_pause") raise diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/xend/XendDomainInfo.py Thu Sep 27 12:22:16 2007 -0600 @@ -174,7 +174,8 @@ def recreate(info, priv): except XendError: pass # our best shot at 'goto' in python :) - vm = XendDomainInfo(xeninfo, domid, dompath, augment = True, priv = priv) + vm = XendDomainInfo(xeninfo, domid, dompath, augment = True, priv = priv, + vmpath = vmpath) if needs_reinitialising: vm._recreateDom() @@ -321,7 +322,7 @@ class XendDomainInfo: """ def __init__(self, info, domid = None, dompath = None, augment = False, - priv = False, resume = False): + priv = False, resume = False, vmpath = None): """Constructor for a domain @param info: parsed configuration @@ -348,7 +349,22 @@ class XendDomainInfo: #if not self._infoIsSet('uuid'): # self.info['uuid'] = uuid.toString(uuid.create()) - self.vmpath = XS_VMROOT + self.info['uuid'] + # Find a unique /vm/<uuid>/<integer> path if not 
specified. + # This avoids conflict between pre-/post-migrate domains when doing + # localhost relocation. + self.vmpath = vmpath + i = 0 + while self.vmpath == None: + self.vmpath = XS_VMROOT + self.info['uuid'] + if i != 0: + self.vmpath = self.vmpath + '-' + str(i) + try: + if self._readVm("uuid"): + self.vmpath = None + i = i + 1 + except: + pass + self.dompath = dompath self.image = None @@ -1101,16 +1117,16 @@ class XendDomainInfo: return str(self._resume) def getCap(self): - return self.info.get('cpu_cap', 0) + return self.info['vcpus_params']['cap'] def setCap(self, cpu_cap): - self.info['cpu_cap'] = cpu_cap + self.info['vcpus_params']['cap'] = cpu_cap def getWeight(self): - return self.info.get('cpu_weight', 256) + return self.info['vcpus_params']['weight'] def setWeight(self, cpu_weight): - self.info['cpu_weight'] = cpu_weight + self.info['vcpus_params']['weight'] = cpu_weight def setResume(self, state): self._resume = state @@ -1582,7 +1598,7 @@ class XendDomainInfo: def _initDomain(self): log.debug('XendDomainInfo.initDomain: %s %s', self.domid, - self.info['cpu_weight']) + self.info['vcpus_params']['weight']) self._configureBootloader() @@ -1592,7 +1608,8 @@ class XendDomainInfo: if self.info['platform'].get('localtime', 0): xc.domain_set_time_offset(self.domid) - xc.domain_setcpuweight(self.domid, self.info['cpu_weight']) + xc.domain_setcpuweight(self.domid, \ + self.info['vcpus_params']['weight']) # repin domain vcpus if a restricted cpus list is provided # this is done prior to memory allocation to aide in memory @@ -2167,7 +2184,7 @@ class XendDomainInfo: raise VmError('Invalid VM Name') dom = XendDomain.instance().domain_lookup_nr(name) - if dom and dom.domid != self.domid: + if dom and dom.info['uuid'] != self.info['uuid']: raise VmError("VM name '%s' already exists%s" % (name, dom.domid is not None and @@ -2275,25 +2292,8 @@ class XendDomainInfo: def get_security_label(self, xspol=None): - """ - Get the security label of a domain - @param xspol 
The policy to use when converting the ssid into - a label; only to be passed during the updating - of the policy - """ - domid = self.getDomid() - - if not xspol: - from xen.xend.XendXSPolicyAdmin import XSPolicyAdminInstance - xspol = XSPolicyAdminInstance().get_loaded_policy() - - if domid == 0: - if xspol: - label = xspol.policy_get_domain_label_formatted(domid) - else: - label = "" - else: - label = self.info.get('security_label', '') + import xen.util.xsm.xsm as security + label = security.get_security_label(self, xspol) return label def set_security_label(self, seclab, old_seclab, xspol=None, diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/xend/balloon.py --- a/tools/python/xen/xend/balloon.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/xend/balloon.py Thu Sep 27 12:22:16 2007 -0600 @@ -100,12 +100,28 @@ def free(need_mem): try: dom0_min_mem = xoptions.get_dom0_min_mem() * 1024 + dom0_alloc = get_dom0_current_alloc() retries = 0 sleep_time = SLEEP_TIME_GROWTH + new_alloc = 0 last_new_alloc = None last_free = None rlimit = RETRY_LIMIT + + # If unreasonable memory size is required, we give up waiting + # for ballooning or scrubbing, as if had retried. 
+ physinfo = xc.physinfo() + free_mem = physinfo['free_memory'] + scrub_mem = physinfo['scrub_memory'] + total_mem = physinfo['total_memory'] + if dom0_min_mem > 0: + max_free_mem = total_mem - dom0_min_mem + else: + max_free_mem = total_mem - dom0_alloc + if need_mem >= max_free_mem: + retries = rlimit + while retries < rlimit: physinfo = xc.physinfo() free_mem = physinfo['free_memory'] diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/xend/image.py Thu Sep 27 12:22:16 2007 -0600 @@ -309,7 +309,7 @@ class HVMImageHandler(ImageHandler): def parseDeviceModelArgs(self, vmConfig): dmargs = [ 'boot', 'fda', 'fdb', 'soundhw', 'localtime', 'serial', 'stdvga', 'isa', - 'acpi', 'usb', 'usbdevice', 'keymap' ] + 'acpi', 'usb', 'usbdevice', 'keymap', 'pci' ] ret = ['-vcpus', str(self.vm.getVCpuCount())] diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/xend/server/netif.py --- a/tools/python/xen/xend/server/netif.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/xend/server/netif.py Thu Sep 27 12:22:16 2007 -0600 @@ -115,17 +115,15 @@ class NetifController(DevController): accel = config.get('accel') sec_lab = config.get('security_label') - if not typ: - typ = xoptions.netback_type - if not mac: raise VmError("MAC address not specified or generated.") devid = self.allocateDeviceID() back = { 'script' : script, - 'mac' : mac, - 'type' : typ } + 'mac' : mac } + if typ: + back['type'] = typ if ipaddr: back['ip'] = ipaddr if bridge: diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/xm/create.py Thu Sep 27 12:22:16 2007 -0600 @@ -721,7 +721,7 @@ def configure_hvm(config_image, vals): 'localtime', 'serial', 'stdvga', 'isa', 'nographic', 'soundhw', 'vnc', 'vncdisplay', 'vncunused', 'vncconsole', 'vnclisten', 'sdl', 'display', 'xauthority', 'rtc_timeoffset', 
'monitor', - 'acpi', 'apic', 'usb', 'usbdevice', 'keymap' ] + 'acpi', 'apic', 'usb', 'usbdevice', 'keymap', 'pci' ] for a in args: if a in vals.__dict__ and vals.__dict__[a] is not None: config_image.append([a, vals.__dict__[a]]) diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/xm/main.py Thu Sep 27 12:22:16 2007 -0600 @@ -931,11 +931,11 @@ def xm_brief_list(doms): print format % d def xm_label_list(doms): - print '%-40s %3s %5s %5s %10s %9s %-10s' % \ + print '%-40s %5s %5s %5s %10s %9s %-10s' % \ ('Name', 'ID', 'Mem', 'VCPUs', 'State', 'Time(s)', 'Label') output = [] - format = '%(name)-40s %(domid)3s %(mem)5d %(vcpus)5d %(state)10s ' \ + format = '%(name)-40s %(domid)5s %(mem)5d %(vcpus)5d %(state)10s ' \ '%(cpu_time)8.1f %(seclabel)10s' import xen.util.xsm.xsm as security @@ -1287,13 +1287,13 @@ def xm_dump_core(args): filename = None if not live: - server.xend.domain.pause(dom) + ds = server.xend.domain.pause(dom, True) try: print "Dumping core of domain: %s ..." 
% str(dom) server.xend.domain.dump(dom, filename, live, crash) finally: - if not live: + if not live and ds == DOM_STATE_RUNNING: server.xend.domain.unpause(dom) if crash: diff -r eae7b887e5ac -r ee498c9af856 tools/python/xen/xm/xenapi_create.py --- a/tools/python/xen/xm/xenapi_create.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/python/xen/xm/xenapi_create.py Thu Sep 27 12:22:16 2007 -0600 @@ -26,6 +26,7 @@ from xen.xend.XendAPIConstants import XE XEN_API_ON_CRASH_BEHAVIOUR from xen.xm.opts import OptionError from xen.util import xsconstants +import xen.util.xsm.xsm as security import sys import os @@ -569,7 +570,7 @@ class sxp2xml: if sec_data: try : vm.attributes['security_label'] = \ - "%s:%s:%s" % (xsconstants.ACM_POLICY_ID, sec_data[0][1][1],sec_data[0][2][1]) + security.set_security_label(sec_data[0][1][1],sec_data[0][2][1]) except Exception, e: raise "Invalid security data format: %s" % str(sec_data) @@ -753,11 +754,7 @@ class sxp2xml: policy = get_child_by_name(vif_sxp, "policy") label = get_child_by_name(vif_sxp, "label") - if label and policy: - vif.attributes["security_label"] \ - = "%s:%s:%s" % (xsconstants.ACM_POLICY_ID, policy, label) - else: - vif.attributes["security_label"] = "" + vif.attributes["security_label"] = security.set_security_label(policy, label) if get_child_by_name(vif_sxp, "bridge") is not None: vif.attributes["network"] \ diff -r eae7b887e5ac -r ee498c9af856 tools/vtpm/tpm_emulator.patch --- a/tools/vtpm/tpm_emulator.patch Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/vtpm/tpm_emulator.patch Thu Sep 27 12:22:16 2007 -0600 @@ -547,10 +547,31 @@ diff -uprN orig/tpm_emulator-0.4/README Installation -------------------------------------------------------------------------- The compilation and installation process uses the build environment for +diff -uprN orig/tpm_emulator-0.4/tpm/tpm_cmd_handler.c tpm_emulator/tpm/tpm_cmd_handler.c +--- orig/tpm_emulator-0.4/tpm/tpm_cmd_handler.c 2006-06-23 19:37:07.000000000 +0900 ++++ 
tpm_emulator/tpm/tpm_cmd_handler.c 2007-09-12 20:23:00.000000000 +0900 +@@ -565,7 +565,7 @@ static TPM_RESULT execute_TPM_Seal(TPM_R + if (tpm_unmarshal_TPM_KEY_HANDLE(&ptr, &len, &keyHandle) + || tpm_unmarshal_TPM_ENCAUTH(&ptr, &len, &encAuth) + || tpm_unmarshal_UINT32(&ptr, &len, &pcrInfoSize) +- || tpm_unmarshal_TPM_PCR_INFO(&ptr, &len, &pcrInfo) ++ || (pcrInfoSize >0 && tpm_unmarshal_TPM_PCR_INFO(&ptr, &len, &pcrInfo)) + || tpm_unmarshal_UINT32(&ptr, &len, &inDataSize) + || tpm_unmarshal_BLOB(&ptr, &len, &inData, inDataSize) + || len != 0) return TPM_BAD_PARAMETER; +@@ -798,7 +798,7 @@ static TPM_RESULT execute_TPM_Sealx(TPM_ + if (tpm_unmarshal_TPM_KEY_HANDLE(&ptr, &len, &keyHandle) + || tpm_unmarshal_TPM_ENCAUTH(&ptr, &len, &encAuth) + || tpm_unmarshal_UINT32(&ptr, &len, &pcrInfoSize) +- || tpm_unmarshal_TPM_PCR_INFO(&ptr, &len, &pcrInfo) ++ || (pcrInfoSize > 0 && tpm_unmarshal_TPM_PCR_INFO(&ptr, &len, &pcrInfo)) + || tpm_unmarshal_UINT32(&ptr, &len, &inDataSize) + || tpm_unmarshal_BLOB(&ptr, &len, &inData, inDataSize) + || len != 0) return TPM_BAD_PARAMETER; diff -uprN orig/tpm_emulator-0.4/tpm/tpm_credentials.c tpm_emulator/tpm/tpm_credentials.c ---- orig/tpm_emulator-0.4/tpm/tpm_credentials.c 2006-06-23 03:37:07.000000000 -0700 -+++ tpm_emulator/tpm/tpm_credentials.c 2006-07-24 14:35:35.000000000 -0700 -@@ -47,16 +47,16 @@ int tpm_compute_pubkey_checksum(TPM_NONC +--- orig/tpm_emulator-0.4/tpm/tpm_credentials.c 2006-06-23 19:37:07.000000000 +0900 ++++ tpm_emulator/tpm/tpm_credentials.c 2007-09-12 20:23:30.000000000 +0900 +@@ -47,20 +47,20 @@ int tpm_compute_pubkey_checksum(TPM_NONC TPM_RESULT tpm_get_pubek(TPM_PUBKEY *pubEndorsementKey) { @@ -572,6 +593,11 @@ diff -uprN orig/tpm_emulator-0.4/tpm/tpm pubEndorsementKey->algorithmParms.algorithmID = TPM_ALG_RSA; pubEndorsementKey->algorithmParms.encScheme = TPM_ES_RSAESOAEP_SHA1_MGF1; pubEndorsementKey->algorithmParms.sigScheme = TPM_SS_NONE; +- pubEndorsementKey->algorithmParms.parms.rsa.keyLength = 
key_length; ++ pubEndorsementKey->algorithmParms.parms.rsa.keyLength = key_length << 3; + pubEndorsementKey->algorithmParms.parms.rsa.numPrimes = 2; + pubEndorsementKey->algorithmParms.parms.rsa.exponentSize = 0; + pubEndorsementKey->algorithmParms.parms.rsa.exponent = NULL; @@ -175,6 +175,7 @@ TPM_RESULT TPM_OwnerReadInternalPub(TPM_ { TPM_RESULT res; diff -r eae7b887e5ac -r ee498c9af856 tools/vtpm_manager/tcs/tcs.c --- a/tools/vtpm_manager/tcs/tcs.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/vtpm_manager/tcs/tcs.c Thu Sep 27 12:22:16 2007 -0600 @@ -775,7 +775,7 @@ TPM_RESULT TCSP_UnBind(TCS_CONTEXT_HANDL TDDL_UINT32 OutLength = TCPA_MAX_BUFFER_LENGTH; // check input params - if (inData == NULL || privAuth == NULL || outDataSize == NULL || *outData == NULL) + if (inData == NULL || privAuth == NULL || outDataSize == NULL || outData == NULL) return TPM_BAD_PARAMETER; // Convert Byte Input parameter in the input byte stream InBuf diff -r eae7b887e5ac -r ee498c9af856 tools/xenmon/xenbaked.c --- a/tools/xenmon/xenbaked.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/xenmon/xenbaked.c Thu Sep 27 12:22:16 2007 -0600 @@ -12,7 +12,8 @@ * Authors: Diwaker Gupta, diwaker.gupta@xxxxxx * Rob Gardner, rob.gardner@xxxxxx * Lucy Cherkasova, lucy.cherkasova.hp.com - * Much code based on xentrace, authored by Mark Williamson, mark.a.williamson@xxxxxxxxx + * Much code based on xentrace, authored by Mark Williamson, + * mark.a.williamson@xxxxxxxxx * Date: November, 2005 * * This program is free software; you can redistribute it and/or modify @@ -107,31 +108,31 @@ int NCPU = 0; void init_current(int ncpu) { - running = calloc(ncpu, sizeof(int)); - NCPU = ncpu; - printf("Initialized with %d %s\n", ncpu, (ncpu == 1) ? "cpu" : "cpu's"); + running = calloc(ncpu, sizeof(int)); + NCPU = ncpu; + printf("Initialized with %d %s\n", ncpu, (ncpu == 1) ? 
"cpu" : "cpu's"); } int is_current(int domain, int cpu) { - // int i; - - // for (i=0; i<NCPU; i++) + // int i; + + // for (i=0; i<NCPU; i++) if (running[cpu] == domain) - return 1; - return 0; + return 1; + return 0; } // return the domain that's currently running on the given cpu int current(int cpu) { - return running[cpu]; + return running[cpu]; } void set_current(int cpu, int domain) { - running[cpu] = domain; + running[cpu] = domain; } @@ -145,7 +146,7 @@ void dump_record(int cpu, struct t_rec * void dump_record(int cpu, struct t_rec *x) { printf("record: cpu=%x, tsc=%lx, event=%x, d1=%lx\n", - cpu, x->cycles, x->event, x->data[0]); + cpu, x->cycles, x->event, x->data[0]); } #endif @@ -198,15 +199,15 @@ void check_gotten_sum(void) int i; for (i=0; i<NCPU; i++) { - new_qos = cpu_qos_data[i]; - ns = billion; - sum = total_ns_gotten(&ns); - - printf("[cpu%d] ns_gotten over all domains = %lldns, over %lldns\n", - i, sum, ns); - percent = (double) sum; - percent = (100.0*percent) / (double)ns; - printf(" ==> ns_gotten = %7.3f%%\n", percent); + new_qos = cpu_qos_data[i]; + ns = billion; + sum = total_ns_gotten(&ns); + + printf("[cpu%d] ns_gotten over all domains = %lldns, over %lldns\n", + i, sum, ns); + percent = (double) sum; + percent = (100.0*percent) / (double)ns; + printf(" ==> ns_gotten = %7.3f%%\n", percent); } #endif } @@ -229,7 +230,7 @@ void dump_stats(void) } printf("processed %d total records in %d seconds (%ld per second)\n", - rec_count, (int)run_time, rec_count/run_time); + rec_count, (int)run_time, rec_count/run_time); printf("woke up %d times in %d seconds (%ld per second)\n", wakeups, (int) run_time, wakeups/run_time); @@ -261,56 +262,56 @@ int xce_handle = -1; /* Stolen from xenstore code */ int eventchn_init(void) { - int rc; - - // to revert to old way: - if (0) - return -1; - - xce_handle = xc_evtchn_open(); - - if (xce_handle < 0) - perror("Failed to open evtchn device"); - - if ((rc = xc_evtchn_bind_virq(xce_handle, VIRQ_TBUF)) == -1) - 
perror("Failed to bind to domain exception virq port"); - virq_port = rc; - - return xce_handle; + int rc; + + // to revert to old way: + if (0) + return -1; + + xce_handle = xc_evtchn_open(); + + if (xce_handle < 0) + perror("Failed to open evtchn device"); + + if ((rc = xc_evtchn_bind_virq(xce_handle, VIRQ_TBUF)) == -1) + perror("Failed to bind to domain exception virq port"); + virq_port = rc; + + return xce_handle; } void wait_for_event(void) { - int ret; - fd_set inset; - evtchn_port_t port; - struct timeval tv; - int evtchn_fd; - - if (xce_handle < 0) { - nanosleep(&opts.poll_sleep, NULL); - return; - } - - evtchn_fd = xc_evtchn_fd(xce_handle); - - FD_ZERO(&inset); - FD_SET(evtchn_fd, &inset); - tv.tv_sec = 1; - tv.tv_usec = 0; - // tv = millis_to_timespec(&opts.poll_sleep); - ret = select(evtchn_fd+1, &inset, NULL, NULL, &tv); - - if ( (ret == 1) && FD_ISSET(evtchn_fd, &inset)) { - if ((port = xc_evtchn_pending(xce_handle)) == -1) - perror("Failed to read from event fd"); - - // if (port == virq_port) - // printf("got the event I was looking for\r\n"); - - if (xc_evtchn_unmask(xce_handle, port) == -1) - perror("Failed to write to event fd"); - } + int ret; + fd_set inset; + evtchn_port_t port; + struct timeval tv; + int evtchn_fd; + + if (xce_handle < 0) { + nanosleep(&opts.poll_sleep, NULL); + return; + } + + evtchn_fd = xc_evtchn_fd(xce_handle); + + FD_ZERO(&inset); + FD_SET(evtchn_fd, &inset); + tv.tv_sec = 1; + tv.tv_usec = 0; + // tv = millis_to_timespec(&opts.poll_sleep); + ret = select(evtchn_fd+1, &inset, NULL, NULL, &tv); + + if ( (ret == 1) && FD_ISSET(evtchn_fd, &inset)) { + if ((port = xc_evtchn_pending(xce_handle)) == -1) + perror("Failed to read from event fd"); + + // if (port == virq_port) + // printf("got the event I was looking for\r\n"); + + if (xc_evtchn_unmask(xce_handle, port) == -1) + perror("Failed to write to event fd"); + } } static void get_tbufs(unsigned long *mfn, unsigned long *size) @@ -336,9 +337,9 @@ static void 
get_tbufs(unsigned long *mfn void disable_tracing(void) { - int xc_handle = xc_interface_open(); - xc_tbuf_disable(xc_handle); - xc_interface_close(xc_handle); + int xc_handle = xc_interface_open(); + xc_tbuf_disable(xc_handle); + xc_interface_close(xc_handle); } /** @@ -387,7 +388,7 @@ struct t_buf *map_tbufs(unsigned long tb * mapped region containing all trace buffers. */ struct t_buf **init_bufs_ptrs(void *bufs_mapped, unsigned int num, - unsigned long size) + unsigned long size) { int i; struct t_buf **user_ptrs; @@ -472,7 +473,7 @@ int monitor_tbufs(void) int monitor_tbufs(void) { int i; - extern void process_record(int, struct t_rec *); + extern int process_record(int, struct t_rec *); extern void alloc_qos_data(int ncpu); void *tbufs_mapped; /* pointer to where the tbufs are mapped */ @@ -483,7 +484,7 @@ int monitor_tbufs(void) unsigned int num; /* number of trace buffers / logical CPUS */ unsigned long size; /* size of a single trace buffer */ - int size_in_recs; + unsigned long data_size, rec_size; /* get number of logical CPUs (and therefore number of trace buffers) */ num = get_num_cpus(); @@ -496,34 +497,32 @@ int monitor_tbufs(void) /* setup access to trace buffers */ get_tbufs(&tbufs_mfn, &size); - // printf("from dom0op: %ld, t_buf: %d, t_rec: %d\n", - // size, sizeof(struct t_buf), sizeof(struct t_rec)); - tbufs_mapped = map_tbufs(tbufs_mfn, num, size); - size_in_recs = (size - sizeof(struct t_buf)) / sizeof(struct t_rec); - // fprintf(stderr, "size_in_recs = %d\n", size_in_recs); + data_size = size - sizeof(struct t_buf); /* build arrays of convenience ptrs */ meta = init_bufs_ptrs (tbufs_mapped, num, size); data = init_rec_ptrs(meta, num); - // Set up event channel for select() - if (eventchn_init() < 0) { - fprintf(stderr, "Failed to initialize event channel; Using POLL method\r\n"); - } + if ( eventchn_init() < 0 ) + fprintf(stderr, "Failed to initialize event channel; " + "Using POLL method\r\n"); /* now, scan buffers for events */ while ( 
!interrupted ) { - for ( i = 0; ( i < num ) && !interrupted; i++ ) + for ( i = 0; (i < num) && !interrupted; i++ ) + { while ( meta[i]->cons != meta[i]->prod ) { rmb(); /* read prod, then read item. */ - process_record(i, data[i] + meta[i]->cons % size_in_recs); + rec_size = process_record( + i, data[i] + meta[i]->cons % data_size); mb(); /* read item, then update cons. */ - meta[i]->cons++; + meta[i]->cons += rec_size; } + } wait_for_event(); wakeups++; @@ -550,44 +549,44 @@ error_t cmd_parser(int key, char *arg, s switch ( key ) { - case 't': /* set new records threshold for logging */ - { - char *inval; - setup->new_data_thresh = strtol(arg, &inval, 0); - if ( inval == arg ) - argp_usage(state); - } - break; - - case 's': /* set sleep time (given in milliseconds) */ - { - char *inval; - setup->poll_sleep = millis_to_timespec(strtol(arg, &inval, 0)); - if ( inval == arg ) - argp_usage(state); - } - break; - - case 'm': /* set ms_per_sample */ - { - char *inval; - setup->ms_per_sample = strtol(arg, &inval, 0); - if ( inval == arg ) - argp_usage(state); - } - break; - - case ARGP_KEY_ARG: - { - if ( state->arg_num == 0 ) - setup->outfile = arg; - else - argp_usage(state); - } - break; - - default: - return ARGP_ERR_UNKNOWN; + case 't': /* set new records threshold for logging */ + { + char *inval; + setup->new_data_thresh = strtol(arg, &inval, 0); + if ( inval == arg ) + argp_usage(state); + } + break; + + case 's': /* set sleep time (given in milliseconds) */ + { + char *inval; + setup->poll_sleep = millis_to_timespec(strtol(arg, &inval, 0)); + if ( inval == arg ) + argp_usage(state); + } + break; + + case 'm': /* set ms_per_sample */ + { + char *inval; + setup->ms_per_sample = strtol(arg, &inval, 0); + if ( inval == arg ) + argp_usage(state); + } + break; + + case ARGP_KEY_ARG: + { + if ( state->arg_num == 0 ) + setup->outfile = arg; + else + argp_usage(state); + } + break; + + default: + return ARGP_ERR_UNKNOWN; } return 0; @@ -614,27 +613,27 @@ void 
alloc_qos_data(int ncpu) for (n=0; n<ncpu; n++) { - for (i=0; i<sizeof(_new_qos_data); i=i+pgsize) - if ((write(qos_fd, dummy, pgsize)) != pgsize) { - PERROR(SHARED_MEM_FILE); - exit(2); - } - - new_qos = (_new_qos_data *) mmap(0, sizeof(_new_qos_data), PROT_READ|PROT_WRITE, - MAP_SHARED, qos_fd, off); - off += i; - if (new_qos == NULL) { - PERROR("mmap"); - exit(3); - } - // printf("new_qos = %p\n", new_qos); - memset(new_qos, 0, sizeof(_new_qos_data)); - new_qos->next_datapoint = 0; - advance_next_datapoint(0); - new_qos->structlen = i; - new_qos->ncpu = ncpu; - // printf("structlen = 0x%x\n", i); - cpu_qos_data[n] = new_qos; + for (i=0; i<sizeof(_new_qos_data); i=i+pgsize) + if ((write(qos_fd, dummy, pgsize)) != pgsize) { + PERROR(SHARED_MEM_FILE); + exit(2); + } + + new_qos = (_new_qos_data *) mmap(0, sizeof(_new_qos_data), PROT_READ|PROT_WRITE, + MAP_SHARED, qos_fd, off); + off += i; + if (new_qos == NULL) { + PERROR("mmap"); + exit(3); + } + // printf("new_qos = %p\n", new_qos); + memset(new_qos, 0, sizeof(_new_qos_data)); + new_qos->next_datapoint = 0; + advance_next_datapoint(0); + new_qos->structlen = i; + new_qos->ncpu = ncpu; + // printf("structlen = 0x%x\n", i); + cpu_qos_data[n] = new_qos; } free(dummy); new_qos = NULL; @@ -647,19 +646,19 @@ const struct argp_option cmd_opts[] = const struct argp_option cmd_opts[] = { { .name = "log-thresh", .key='t', .arg="l", - .doc = - "Set number, l, of new records required to trigger a write to output " - "(default " xstr(NEW_DATA_THRESH) ")." }, + .doc = + "Set number, l, of new records required to trigger a write to output " + "(default " xstr(NEW_DATA_THRESH) ")." }, { .name = "poll-sleep", .key='s', .arg="p", - .doc = - "Set sleep time, p, in milliseconds between polling the trace buffer " - "for new data (default " xstr(POLL_SLEEP_MILLIS) ")." }, + .doc = + "Set sleep time, p, in milliseconds between polling the trace buffer " + "for new data (default " xstr(POLL_SLEEP_MILLIS) ")." 
}, { .name = "ms_per_sample", .key='m', .arg="MS", - .doc = - "Specify the number of milliseconds per sample " - " (default " xstr(MS_PER_SAMPLE) ")." }, + .doc = + "Specify the number of milliseconds per sample " + " (default " xstr(MS_PER_SAMPLE) ")." }, {0} }; @@ -670,10 +669,10 @@ const struct argp parser_def = .parser = cmd_parser, // .args_doc = "[output file]", .doc = - "Tool to capture and partially process Xen trace buffer data" - "\v" - "This tool is used to capture trace buffer data from Xen. The data is " - "saved in a shared memory structure to be further processed by xenmon." + "Tool to capture and partially process Xen trace buffer data" + "\v" + "This tool is used to capture trace buffer data from Xen. The data is " + "saved in a shared memory structure to be further processed by xenmon." }; @@ -716,101 +715,101 @@ int main(int argc, char **argv) void qos_init_domain(int domid, int idx) { - int i; - - memset(&new_qos->domain_info[idx], 0, sizeof(_domain_info)); - new_qos->domain_info[idx].last_update_time = global_now; - // runnable_start_time[idx] = 0; - new_qos->domain_info[idx].runnable_start_time = 0; // invalidate - new_qos->domain_info[idx].in_use = 1; - new_qos->domain_info[idx].blocked_start_time = 0; - new_qos->domain_info[idx].id = domid; - if (domid == IDLE_DOMAIN_ID) - sprintf(new_qos->domain_info[idx].name, "Idle Task%d", global_cpu); - else - sprintf(new_qos->domain_info[idx].name, "Domain#%d", domid); - - for (i=0; i<NSAMPLES; i++) { - new_qos->qdata[i].ns_gotten[idx] = 0; - new_qos->qdata[i].ns_allocated[idx] = 0; - new_qos->qdata[i].ns_waiting[idx] = 0; - new_qos->qdata[i].ns_blocked[idx] = 0; - new_qos->qdata[i].switchin_count[idx] = 0; - new_qos->qdata[i].io_count[idx] = 0; - } + int i; + + memset(&new_qos->domain_info[idx], 0, sizeof(_domain_info)); + new_qos->domain_info[idx].last_update_time = global_now; + // runnable_start_time[idx] = 0; + new_qos->domain_info[idx].runnable_start_time = 0; // invalidate + 
new_qos->domain_info[idx].in_use = 1; + new_qos->domain_info[idx].blocked_start_time = 0; + new_qos->domain_info[idx].id = domid; + if (domid == IDLE_DOMAIN_ID) + sprintf(new_qos->domain_info[idx].name, "Idle Task%d", global_cpu); + else + sprintf(new_qos->domain_info[idx].name, "Domain#%d", domid); + + for (i=0; i<NSAMPLES; i++) { + new_qos->qdata[i].ns_gotten[idx] = 0; + new_qos->qdata[i].ns_allocated[idx] = 0; + new_qos->qdata[i].ns_waiting[idx] = 0; + new_qos->qdata[i].ns_blocked[idx] = 0; + new_qos->qdata[i].switchin_count[idx] = 0; + new_qos->qdata[i].io_count[idx] = 0; + } } void global_init_domain(int domid, int idx) { - int cpu; - _new_qos_data *saved_qos; - - saved_qos = new_qos; - - for (cpu=0; cpu<NCPU; cpu++) { - new_qos = cpu_qos_data[cpu]; - qos_init_domain(domid, idx); - } - new_qos = saved_qos; + int cpu; + _new_qos_data *saved_qos; + + saved_qos = new_qos; + + for (cpu=0; cpu<NCPU; cpu++) { + new_qos = cpu_qos_data[cpu]; + qos_init_domain(domid, idx); + } + new_qos = saved_qos; } // give index of this domain in the qos data array int indexof(int domid) { - int idx; - xc_dominfo_t dominfo[NDOMAINS]; - int xc_handle, ndomains; - extern void qos_kill_thread(int domid); - - if (domid < 0) { // shouldn't happen - printf("bad domain id: %d\r\n", domid); - return 0; - } - - for (idx=0; idx<NDOMAINS; idx++) - if ( (new_qos->domain_info[idx].id == domid) && new_qos->domain_info[idx].in_use) - return idx; - - // not found, make a new entry - for (idx=0; idx<NDOMAINS; idx++) - if (new_qos->domain_info[idx].in_use == 0) { - global_init_domain(domid, idx); - return idx; - } - - // call domaininfo hypercall to try and garbage collect unused entries - xc_handle = xc_interface_open(); - ndomains = xc_domain_getinfo(xc_handle, 0, NDOMAINS, dominfo); - xc_interface_close(xc_handle); - - // for each domain in our data, look for it in the system dominfo structure - // and purge the domain's data from our state if it does not exist in the - // dominfo structure - for 
(idx=0; idx<NDOMAINS; idx++) { - int domid = new_qos->domain_info[idx].id; - int jdx; - - for (jdx=0; jdx<ndomains; jdx++) { - if (dominfo[jdx].domid == domid) - break; - } - if (jdx == ndomains) // we didn't find domid in the dominfo struct - if (domid != IDLE_DOMAIN_ID) // exception for idle domain, which is not - // contained in dominfo - qos_kill_thread(domid); // purge our stale data - } - - // look again for a free slot - for (idx=0; idx<NDOMAINS; idx++) - if (new_qos->domain_info[idx].in_use == 0) { - global_init_domain(domid, idx); - return idx; - } - - // still no space found, so bail - fprintf(stderr, "out of space in domain table, increase NDOMAINS\r\n"); - exit(2); + int idx; + xc_dominfo_t dominfo[NDOMAINS]; + int xc_handle, ndomains; + extern void qos_kill_thread(int domid); + + if (domid < 0) { // shouldn't happen + printf("bad domain id: %d\r\n", domid); + return 0; + } + + for (idx=0; idx<NDOMAINS; idx++) + if ( (new_qos->domain_info[idx].id == domid) && new_qos->domain_info[idx].in_use) + return idx; + + // not found, make a new entry + for (idx=0; idx<NDOMAINS; idx++) + if (new_qos->domain_info[idx].in_use == 0) { + global_init_domain(domid, idx); + return idx; + } + + // call domaininfo hypercall to try and garbage collect unused entries + xc_handle = xc_interface_open(); + ndomains = xc_domain_getinfo(xc_handle, 0, NDOMAINS, dominfo); + xc_interface_close(xc_handle); + + // for each domain in our data, look for it in the system dominfo structure + // and purge the domain's data from our state if it does not exist in the + // dominfo structure + for (idx=0; idx<NDOMAINS; idx++) { + int domid = new_qos->domain_info[idx].id; + int jdx; + + for (jdx=0; jdx<ndomains; jdx++) { + if (dominfo[jdx].domid == domid) + break; + } + if (jdx == ndomains) // we didn't find domid in the dominfo struct + if (domid != IDLE_DOMAIN_ID) // exception for idle domain, which is not + // contained in dominfo + qos_kill_thread(domid); // purge our stale data + } + + // 
look again for a free slot + for (idx=0; idx<NDOMAINS; idx++) + if (new_qos->domain_info[idx].in_use == 0) { + global_init_domain(domid, idx); + return idx; + } + + // still no space found, so bail + fprintf(stderr, "out of space in domain table, increase NDOMAINS\r\n"); + exit(2); } int domain_runnable(int domid) @@ -879,25 +878,25 @@ void qos_update_thread(int cpu, int domi time_since_update = now - last_update_time; if (time_since_update < 0) { - // what happened here? either a timestamp wraparound, or more likely, - // a slight inconsistency among timestamps from various cpu's - if (-time_since_update < billion) { - // fairly small difference, let's just adjust 'now' to be a little - // beyond last_update_time - time_since_update = -time_since_update; - } - else if ( ((~0ULL - last_update_time) < billion) && (now < billion) ) { - // difference is huge, must be a wraparound - // last_update time should be "near" ~0ULL, - // and now should be "near" 0 - time_since_update = now + (~0ULL - last_update_time); - printf("time wraparound\n"); - } - else { - // none of the above, may be an out of order record - // no good solution, just ignore and update again later - return; - } + // what happened here? 
either a timestamp wraparound, or more likely, + // a slight inconsistency among timestamps from various cpu's + if (-time_since_update < billion) { + // fairly small difference, let's just adjust 'now' to be a little + // beyond last_update_time + time_since_update = -time_since_update; + } + else if ( ((~0ULL - last_update_time) < billion) && (now < billion) ) { + // difference is huge, must be a wraparound + // last_update time should be "near" ~0ULL, + // and now should be "near" 0 + time_since_update = now + (~0ULL - last_update_time); + printf("time wraparound\n"); + } + else { + // none of the above, may be an out of order record + // no good solution, just ignore and update again later + return; + } } new_qos->domain_info[id].last_update_time = now; @@ -985,7 +984,7 @@ void qos_switch_in(int cpu, int domid, u // count up page flips for dom0 execution if (domid == 0) - dom0_flips = 0; + dom0_flips = 0; } // called when the current thread is taken off the cpu @@ -1011,8 +1010,8 @@ void qos_switch_out(int cpu, int domid, #if 0 new_qos->qdata[n].ns_gotten[idx] += gotten; if (gotten > new_qos->qdata[n].ns_passed) - printf("inconsistency #257, diff = %lld\n", - gotten - new_qos->qdata[n].ns_passed ); + printf("inconsistency #257, diff = %lld\n", + gotten - new_qos->qdata[n].ns_passed ); #endif new_qos->domain_info[idx].ns_oncpu_since_boot += gotten; new_qos->domain_info[idx].runnable_start_time = now; @@ -1021,8 +1020,8 @@ void qos_switch_out(int cpu, int domid, // process dom0 page flips if (domid == 0) - if (dom0_flips == 0) - new_qos->qdata[n].flip_free_periods++; + if (dom0_flips == 0) + new_qos->qdata[n].flip_free_periods++; } // called when domain is put to sleep, may also be called @@ -1047,11 +1046,11 @@ void qos_state_sleeping(int cpu, int dom // domain died, presume it's dead on all cpu's, not just mostly dead void qos_kill_thread(int domid) { - int cpu; - - for (cpu=0; cpu<NCPU; cpu++) { - cpu_qos_data[cpu]->domain_info[indexof(domid)].in_use = 0; - } 
+ int cpu; + + for (cpu=0; cpu<NCPU; cpu++) { + cpu_qos_data[cpu]->domain_info[indexof(domid)].in_use = 0; + } } @@ -1060,7 +1059,7 @@ void qos_kill_thread(int domid) // when thread is already runnable void qos_state_runnable(int cpu, int domid, uint64_t now) { - int idx; + int idx; qos_update_thread_stats(cpu, domid, now); @@ -1080,79 +1079,85 @@ void qos_state_runnable(int cpu, int dom void qos_count_packets(domid_t domid, uint64_t now) { - int i, idx = indexof(domid); - _new_qos_data *cpu_data; - - for (i=0; i<NCPU; i++) { - cpu_data = cpu_qos_data[i]; - if (cpu_data->domain_info[idx].in_use) { - cpu_data->qdata[cpu_data->next_datapoint].io_count[idx]++; - } - } - - new_qos->qdata[new_qos->next_datapoint].io_count[0]++; - dom0_flips++; -} - - -void process_record(int cpu, struct t_rec *r) -{ - uint64_t now; - - new_qos = cpu_qos_data[cpu]; - - rec_count++; - - now = ((double)r->cycles) / (opts.cpu_freq / 1000.0); - - global_now = now; - global_cpu = cpu; - - log_event(r->event); - - switch (r->event) { - - case TRC_SCHED_SWITCH_INFPREV: - // domain data[0] just switched out and received data[1] ns of cpu time - qos_switch_out(cpu, r->data[0], now, r->data[1]); - // printf("ns_gotten %ld\n", r->data[1]); - break; - - case TRC_SCHED_SWITCH_INFNEXT: - // domain data[0] just switched in and - // waited data[1] ns, and was allocated data[2] ns of cpu time - qos_switch_in(cpu, r->data[0], now, r->data[2], r->data[1]); - break; - - case TRC_SCHED_DOM_ADD: - (void) indexof(r->data[0]); - break; - - case TRC_SCHED_DOM_REM: - qos_kill_thread(r->data[0]); - break; - - case TRC_SCHED_SLEEP: - qos_state_sleeping(cpu, r->data[0], now); - break; - - case TRC_SCHED_WAKE: - qos_state_runnable(cpu, r->data[0], now); - break; - - case TRC_SCHED_BLOCK: - qos_state_sleeping(cpu, r->data[0], now); - break; - - case TRC_MEM_PAGE_GRANT_TRANSFER: - qos_count_packets(r->data[0], now); - break; - - default: - break; - } - new_qos = NULL; -} - - - + int i, idx = indexof(domid); + 
_new_qos_data *cpu_data; + + for (i=0; i<NCPU; i++) { + cpu_data = cpu_qos_data[i]; + if (cpu_data->domain_info[idx].in_use) { + cpu_data->qdata[cpu_data->next_datapoint].io_count[idx]++; + } + } + + new_qos->qdata[new_qos->next_datapoint].io_count[0]++; + dom0_flips++; +} + + +int process_record(int cpu, struct t_rec *r) +{ + uint64_t now = 0; + uint32_t *extra_u32 = r->u.nocycles.extra_u32; + + new_qos = cpu_qos_data[cpu]; + + rec_count++; + + if ( r->cycles_included ) + { + now = ((uint64_t)r->u.cycles.cycles_hi << 32) | r->u.cycles.cycles_lo; + now = ((double)now) / (opts.cpu_freq / 1000.0); + extra_u32 = r->u.cycles.extra_u32; + } + + global_now = now; + global_cpu = cpu; + + log_event(r->event); + + switch (r->event) { + + case TRC_SCHED_SWITCH_INFPREV: + // domain data[0] just switched out and received data[1] ns of cpu time + qos_switch_out(cpu, extra_u32[0], now, extra_u32[1]); + // printf("ns_gotten %ld\n", extra_u32[1]); + break; + + case TRC_SCHED_SWITCH_INFNEXT: + // domain data[0] just switched in and + // waited data[1] ns, and was allocated data[2] ns of cpu time + qos_switch_in(cpu, extra_u32[0], now, extra_u32[2], extra_u32[1]); + break; + + case TRC_SCHED_DOM_ADD: + (void) indexof(extra_u32[0]); + break; + + case TRC_SCHED_DOM_REM: + qos_kill_thread(extra_u32[0]); + break; + + case TRC_SCHED_SLEEP: + qos_state_sleeping(cpu, extra_u32[0], now); + break; + + case TRC_SCHED_WAKE: + qos_state_runnable(cpu, extra_u32[0], now); + break; + + case TRC_SCHED_BLOCK: + qos_state_sleeping(cpu, extra_u32[0], now); + break; + + case TRC_MEM_PAGE_GRANT_TRANSFER: + qos_count_packets(extra_u32[0], now); + break; + + default: + break; + } + + new_qos = NULL; + + return 4 + (r->cycles_included ? 
8 : 0) + (r->extra_u32 * 4); +} diff -r eae7b887e5ac -r ee498c9af856 tools/xenstat/libxenstat/Makefile --- a/tools/xenstat/libxenstat/Makefile Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/xenstat/libxenstat/Makefile Thu Sep 27 12:22:16 2007 -0600 @@ -31,6 +31,7 @@ OBJECTS-y=src/xenstat.o OBJECTS-y=src/xenstat.o OBJECTS-$(CONFIG_Linux) += src/xenstat_linux.o OBJECTS-$(CONFIG_SunOS) += src/xenstat_solaris.o +OBJECTS-$(CONFIG_NetBSD) += src/xenstat_netbsd.o SONAME_FLAGS=-Wl,$(SONAME_LDFLAG) -Wl,libxenstat.so.$(MAJOR) WARN_FLAGS=-Wall -Werror diff -r eae7b887e5ac -r ee498c9af856 tools/xenstat/libxenstat/src/xenstat_netbsd.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xenstat/libxenstat/src/xenstat_netbsd.c Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,97 @@ +/* libxenstat: statistics-collection library for Xen + * Copyright (C) International Business Machines Corp., 2005 + * Authors: Josh Triplett <josht@xxxxxxxxxx> + * Judy Fischbach <jfisch@xxxxxxxxxx> + * David Hendricks <dhendrix@xxxxxxxxxx> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <fcntl.h> +#include <dirent.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "xenstat_priv.h" + +#define SYSFS_VBD_PATH "/sys/devices/xen-backend/" + +struct priv_data { + FILE *procnetdev; + DIR *sysfsvbd; +}; + +static struct priv_data * +get_priv_data(xenstat_handle *handle) +{ + if (handle->priv != NULL) + return handle->priv; + + handle->priv = malloc(sizeof(struct priv_data)); + if (handle->priv == NULL) + return (NULL); + + ((struct priv_data *)handle->priv)->procnetdev = NULL; + ((struct priv_data *)handle->priv)->sysfsvbd = NULL; + + return handle->priv; +} + +/* Expected format of /proc/net/dev */ +static const char PROCNETDEV_HEADER[] = + "Inter-| Receive |" + " Transmit\n" + " face |bytes packets errs drop fifo frame compressed multicast|" + "bytes packets errs drop fifo colls carrier compressed\n"; + +/* Collect information about networks */ +int xenstat_collect_networks(xenstat_node * node) +{ + /* XXX fixme: implement code to get stats from libkvm ! 
*/ + return 1; +} + +/* Free network information in handle */ +void xenstat_uninit_networks(xenstat_handle * handle) +{ + struct priv_data *priv = get_priv_data(handle); + if (priv != NULL && priv->procnetdev != NULL) + fclose(priv->procnetdev); +} + +static int read_attributes_vbd(const char *vbd_directory, const char *what, char *ret, int cap) +{ + /* XXX implement */ + return 0; +} + +/* Collect information about VBDs */ +int xenstat_collect_vbds(xenstat_node * node) +{ + return 1; +} + +/* Free VBD information in handle */ +void xenstat_uninit_vbds(xenstat_handle * handle) +{ + struct priv_data *priv = get_priv_data(handle); + if (priv != NULL && priv->sysfsvbd != NULL) + closedir(priv->sysfsvbd); +} diff -r eae7b887e5ac -r ee498c9af856 tools/xenstat/xentop/xentop.c --- a/tools/xenstat/xentop/xentop.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/xenstat/xentop/xentop.c Thu Sep 27 12:22:16 2007 -0600 @@ -28,7 +28,9 @@ #include <sys/time.h> #include <time.h> #include <unistd.h> +#if defined(__linux__) #include <linux/kdev_t.h> +#endif #include <xenstat.h> @@ -938,11 +940,12 @@ void do_vbd(xenstat_domain *domain) vbd = xenstat_domain_vbd(domain,i); -#ifdef __sun__ +#if !defined(__linux__) details[0] = '\0'; #else - snprintf(details, 20, "[%2x:%2x] ", MAJOR(xenstat_vbd_dev(vbd)), - MINOR(xenstat_vbd_dev(vbd))); + snprintf(details, 20, "[%2x:%2x] ", + MAJOR(xenstat_vbd_dev(vbd)), + MINOR(xenstat_vbd_dev(vbd))); #endif print("VBD %s %4d %s OO: %8llu RD: %8llu WR: %8llu\n", diff -r eae7b887e5ac -r ee498c9af856 tools/xenstore/Makefile --- a/tools/xenstore/Makefile Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/xenstore/Makefile Thu Sep 27 12:22:16 2007 -0600 @@ -25,6 +25,7 @@ XENSTORED_OBJS = xenstored_core.o xensto XENSTORED_OBJS_$(CONFIG_Linux) = xenstored_linux.o XENSTORED_OBJS_$(CONFIG_SunOS) = xenstored_solaris.o +XENSTORED_OBJS_$(CONFIG_NetBSD) = xenstored_netbsd.o XENSTORED_OBJS += $(XENSTORED_OBJS_y) diff -r eae7b887e5ac -r ee498c9af856 
tools/xenstore/xenstored_netbsd.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xenstore/xenstored_netbsd.c Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,73 @@ +/****************************************************************************** + * + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (C) 2005 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/mman.h> + +#include "xenstored_core.h" + +#define XENSTORED_PROC_KVA "/dev/xsd_kva" +#define XENSTORED_PROC_PORT "/kern/xen/xsd_port" + +evtchn_port_t xenbus_evtchn(void) +{ + int fd; + int rc; + evtchn_port_t port; + char str[20]; + + fd = open(XENSTORED_PROC_PORT, O_RDONLY); + if (fd == -1) + return -1; + + rc = read(fd, str, sizeof(str)); + if (rc == -1) + { + int err = errno; + close(fd); + errno = err; + return -1; + } + + str[rc] = '\0'; + port = strtoul(str, NULL, 0); + + close(fd); + return port; +} + +void *xenbus_map(void) +{ + int fd; + void *addr; + + fd = open(XENSTORED_PROC_KVA, O_RDWR); + if (fd == -1) + return NULL; + + addr = mmap(NULL, getpagesize(), PROT_READ|PROT_WRITE, + MAP_SHARED, fd, 0); + + if (addr == MAP_FAILED) + addr = NULL; + + close(fd); + + return addr; +} + +void xenbus_notify_running(void) +{ +} diff -r eae7b887e5ac -r ee498c9af856 tools/xentrace/xentrace.c --- a/tools/xentrace/xentrace.c Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/xentrace/xentrace.c Thu Sep 27 12:22:16 2007 -0600 @@ -22,6 +22,7 @@ #include <signal.h> #include <inttypes.h> #include <string.h> +#include <assert.h> #include <xen/xen.h> #include <xen/trace.h> @@ -83,24 +84,62 @@ struct timespec millis_to_timespec(unsig } /** - * write_rec - output a trace record in binary 
format + * write_buffer - write a section of the trace buffer * @cpu - source buffer CPU ID - * @rec - trace record to output + * @start + * @size - size of write (may be less than total window size) + * @total_size - total size of the window (0 on 2nd write of wrapped windows) * @out - output stream * - * Outputs the trace record to a filestream, prepending the CPU ID of the - * source trace buffer. - */ -void write_rec(unsigned int cpu, struct t_rec *rec, FILE *out) + * Outputs the trace buffer to a filestream, prepending the CPU and size + * of the buffer write. + */ +void write_buffer(unsigned int cpu, unsigned char *start, int size, + int total_size, int outfd) { size_t written = 0; - written += fwrite(&cpu, sizeof(cpu), 1, out); - written += fwrite(rec, sizeof(*rec), 1, out); - if ( written != 2 ) - { - PERROR("Failed to write trace record"); - exit(EXIT_FAILURE); - } + + /* Write a CPU_BUF record on each buffer "window" written. Wrapped + * windows may involve two writes, so only write the record on the + * first write. */ + if ( total_size != 0 ) + { + struct { + uint32_t header; + struct { + unsigned cpu; + unsigned byte_count; + } extra; + } rec; + + rec.header = TRC_TRACE_CPU_CHANGE + | ((sizeof(rec.extra)/sizeof(uint32_t)) << TRACE_EXTRA_SHIFT); + rec.extra.cpu = cpu; + rec.extra.byte_count = total_size; + + written = write(outfd, &rec, sizeof(rec)); + + if ( written != sizeof(rec) ) + { + fprintf(stderr, "Cannot write cpu change (write returned %zd)\n", + written); + goto fail; + } + } + + written = write(outfd, start, size); + if ( written != size ) + { + fprintf(stderr, "Write failed! (size %d, returned %zd)\n", + size, written); + goto fail; + } + + return; + + fail: + PERROR("Failed to write trace data"); + exit(EXIT_FAILURE); } static void get_tbufs(unsigned long *mfn, unsigned long *size) @@ -233,12 +272,12 @@ struct t_buf **init_bufs_ptrs(void *bufs * mapped in user space. 
Note that the trace buffer metadata contains machine * pointers - the array returned allows more convenient access to them. */ -struct t_rec **init_rec_ptrs(struct t_buf **meta, unsigned int num) +unsigned char **init_rec_ptrs(struct t_buf **meta, unsigned int num) { int i; - struct t_rec **data; - - data = calloc(num, sizeof(struct t_rec *)); + unsigned char **data; + + data = calloc(num, sizeof(unsigned char *)); if ( data == NULL ) { PERROR("Failed to allocate memory for data pointers\n"); @@ -246,7 +285,7 @@ struct t_rec **init_rec_ptrs(struct t_bu } for ( i = 0; i < num; i++ ) - data[i] = (struct t_rec *)(meta[i] + 1); + data[i] = (unsigned char *)(meta[i] + 1); return data; } @@ -281,19 +320,19 @@ unsigned int get_num_cpus(void) * monitor_tbufs - monitor the contents of tbufs and output to a file * @logfile: the FILE * representing the file to log to */ -int monitor_tbufs(FILE *logfile) +int monitor_tbufs(int outfd) { int i; void *tbufs_mapped; /* pointer to where the tbufs are mapped */ struct t_buf **meta; /* pointers to the trace buffer metadata */ - struct t_rec **data; /* pointers to the trace buffer data areas + unsigned char **data; /* pointers to the trace buffer data areas * where they are mapped into user space. 
*/ unsigned long tbufs_mfn; /* mfn of the tbufs */ unsigned int num; /* number of trace buffers / logical CPUS */ unsigned long size; /* size of a single trace buffer */ - int size_in_recs; + unsigned long data_size; /* get number of logical CPUs (and therefore number of trace buffers) */ num = get_num_cpus(); @@ -302,7 +341,7 @@ int monitor_tbufs(FILE *logfile) get_tbufs(&tbufs_mfn, &size); tbufs_mapped = map_tbufs(tbufs_mfn, num, size); - size_in_recs = (size - sizeof(struct t_buf)) / sizeof(struct t_rec); + data_size = size - sizeof(struct t_buf); /* build arrays of convenience ptrs */ meta = init_bufs_ptrs(tbufs_mapped, num, size); @@ -317,13 +356,48 @@ int monitor_tbufs(FILE *logfile) { for ( i = 0; (i < num) && !interrupted; i++ ) { - while ( meta[i]->cons != meta[i]->prod ) + unsigned long start_offset, end_offset, window_size, cons, prod; + + /* Read window information only once. */ + cons = meta[i]->cons; + prod = meta[i]->prod; + rmb(); /* read prod, then read item. */ + + if ( cons == prod ) + continue; + + assert(prod > cons); + + window_size = prod - cons; + start_offset = cons % data_size; + end_offset = prod % data_size; + + if ( end_offset > start_offset ) { - rmb(); /* read prod, then read item. */ - write_rec(i, data[i] + meta[i]->cons % size_in_recs, logfile); - mb(); /* read item, then update cons. */ - meta[i]->cons++; + /* If window does not wrap, write in one big chunk */ + write_buffer(i, data[i]+start_offset, + window_size, + window_size, + outfd); } + else + { + /* If wrapped, write in two chunks: + * - first, start to the end of the buffer + * - second, start of buffer to end of window + */ + write_buffer(i, data[i] + start_offset, + data_size - start_offset, + window_size, + outfd); + write_buffer(i, data[i], + end_offset, + 0, + outfd); + } + + mb(); /* read buffer, then update cons. 
*/ + meta[i]->cons = meta[i]->prod; } nanosleep(&opts.poll_sleep, NULL); @@ -333,7 +407,7 @@ int monitor_tbufs(FILE *logfile) free(meta); free(data); /* don't need to munmap - cleanup is automatic */ - fclose(logfile); + close(outfd); return 0; } @@ -503,7 +577,6 @@ int main(int argc, char **argv) int main(int argc, char **argv) { int outfd = 1, ret; - FILE *logfile; struct sigaction act; opts.outfile = 0; @@ -537,8 +610,6 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } - logfile = fdopen(outfd, "w"); - /* ensure that if we get a signal, we'll do cleanup, then exit */ act.sa_handler = close_handler; act.sa_flags = 0; @@ -547,7 +618,16 @@ int main(int argc, char **argv) sigaction(SIGTERM, &act, NULL); sigaction(SIGINT, &act, NULL); - ret = monitor_tbufs(logfile); + ret = monitor_tbufs(outfd); return ret; } +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r eae7b887e5ac -r ee498c9af856 tools/xm-test/lib/XmTestLib/acm.py --- a/tools/xm-test/lib/XmTestLib/acm.py Thu Sep 27 09:16:23 2007 -0600 +++ b/tools/xm-test/lib/XmTestLib/acm.py Thu Sep 27 12:22:16 2007 -0600 @@ -49,7 +49,7 @@ def ACMLoadPolicy_XenAPI(policy='xm-test if polname != policy: # Try it, maybe it's not activated traceCommand("xm setpolicy %s %s" % - (xsconstants.XS_POLICY_ACM, policy)) + (xsconstants.ACM_POLICY_ID, policy)) polname = getSystemPolicyName() if polname != policy: FAIL("Need to have a system with no or policy '%s' active, " diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/acpi/boot.c --- a/xen/arch/x86/acpi/boot.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/acpi/boot.c Thu Sep 27 12:22:16 2007 -0600 @@ -1017,5 +1017,7 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_HPET, acpi_parse_hpet); - return 0; -} + acpi_dmar_init(); + + return 0; +} diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/dmi_scan.c --- a/xen/arch/x86/dmi_scan.c Thu Sep 27 09:16:23 2007 -0600 +++ 
b/xen/arch/x86/dmi_scan.c Thu Sep 27 12:22:16 2007 -0600 @@ -100,23 +100,32 @@ inline static int __init dmi_checksum(u8 return (sum==0); } -static int __init dmi_iterate(void (*decode)(struct dmi_header *)) +int __init dmi_get_table(u32 *base, u32 *len) { u8 buf[15]; char __iomem *p, *q; - /* - * no iounmap() for that ioremap(); it would be a no-op, but it's - * so early in setup that sucker gets confused into doing what - * it shouldn't if we actually call it. - */ - p = ioremap(0xF0000, 0x10000); - if (p == NULL) - return -1; + p = maddr_to_virt(0xF0000); for (q = p; q < p + 0x10000; q += 16) { memcpy_fromio(buf, q, 15); - if(memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) - { + if (memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) { + *base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8]; + *len=buf[7]<<8|buf[6]; + return 0; + } + } + return -1; +} + +static int __init dmi_iterate(void (*decode)(struct dmi_header *)) +{ + u8 buf[15]; + char __iomem *p, *q; + + p = maddr_to_virt(0xF0000); + for (q = p; q < p + 0x10000; q += 16) { + memcpy_fromio(buf, q, 15); + if (memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) { u16 num=buf[13]<<8|buf[12]; u16 len=buf[7]<<8|buf[6]; u32 base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8]; diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/domain.c Thu Sep 27 12:22:16 2007 -0600 @@ -44,6 +44,7 @@ #include <asm/hvm/support.h> #include <asm/msr.h> #include <asm/nmi.h> +#include <asm/iommu.h> #ifdef CONFIG_COMPAT #include <compat/vcpu.h> #endif @@ -505,10 +506,16 @@ int arch_domain_create(struct domain *d) virt_to_page(d->shared_info), d, XENSHARE_writable); } + if ( (rc = iommu_domain_init(d)) != 0 ) + goto fail; + if ( is_hvm_domain(d) ) { if ( (rc = hvm_domain_initialise(d)) != 0 ) + { + iommu_domain_destroy(d); goto fail; + } } else { @@ -538,6 +545,8 @@ void arch_domain_destroy(struct domain * if ( is_hvm_domain(d) ) hvm_domain_destroy(d); + 
iommu_domain_destroy(d); + paging_final_teardown(d); free_xenheap_pages( @@ -631,10 +640,10 @@ int arch_set_info_guest( memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat)); #ifdef CONFIG_COMPAT else - { XLAT_vcpu_guest_context(&v->arch.guest_context, c.cmp); - } -#endif +#endif + + v->arch.guest_context.user_regs.eflags |= 2; /* Only CR0.TS is modifiable by guest or admin. */ v->arch.guest_context.ctrlreg[0] &= X86_CR0_TS; @@ -650,10 +659,6 @@ int arch_set_info_guest( /* Ensure real hardware interrupts are enabled. */ v->arch.guest_context.user_regs.eflags |= EF_IE; - } - else - { - hvm_load_cpu_guest_regs(v, &v->arch.guest_context.user_regs); } if ( v->is_initialised ) @@ -1382,10 +1387,9 @@ static void continue_hypercall_on_cpu_he regs->eax = info->func(info->data); v->arch.schedule_tail = info->saved_schedule_tail; - v->cpu_affinity = info->saved_affinity; + v->arch.continue_info = NULL; xfree(info); - v->arch.continue_info = NULL; vcpu_set_affinity(v, &v->cpu_affinity); schedule_tail(v); @@ -1396,6 +1400,7 @@ int continue_hypercall_on_cpu(int cpu, l struct vcpu *v = current; struct migrate_info *info; cpumask_t mask = cpumask_of_cpu(cpu); + int rc; if ( cpu == smp_processor_id() ) return func(data); @@ -1407,12 +1412,19 @@ int continue_hypercall_on_cpu(int cpu, l info->func = func; info->data = data; info->saved_schedule_tail = v->arch.schedule_tail; + info->saved_affinity = v->cpu_affinity; + v->arch.schedule_tail = continue_hypercall_on_cpu_helper; - - info->saved_affinity = v->cpu_affinity; v->arch.continue_info = info; - vcpu_set_affinity(v, &mask); + rc = vcpu_set_affinity(v, &mask); + if ( rc ) + { + v->arch.schedule_tail = info->saved_schedule_tail; + v->arch.continue_info = NULL; + xfree(info); + return rc; + } /* Dummy return value will be overwritten by new schedule_tail. 
*/ BUG_ON(!test_bit(SCHEDULE_SOFTIRQ, &softirq_pending(smp_processor_id()))); diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/domctl.c --- a/xen/arch/x86/domctl.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/domctl.c Thu Sep 27 12:22:16 2007 -0600 @@ -25,6 +25,8 @@ #include <asm/hvm/support.h> #include <asm/processor.h> #include <xsm/xsm.h> +#include <xen/list.h> +#include <asm/iommu.h> long arch_do_domctl( struct xen_domctl *domctl, @@ -523,6 +525,155 @@ long arch_do_domctl( } break; + case XEN_DOMCTL_assign_device: + { + struct domain *d; + struct hvm_iommu *hd; + u8 bus, devfn; + + if (!vtd_enabled) + break; + + ret = -EINVAL; + if ( unlikely((d = get_domain_by_id(domctl->domain)) == NULL) ) { + gdprintk(XENLOG_ERR, + "XEN_DOMCTL_assign_device: get_domain_by_id() failed\n"); + break; + } + hd = domain_hvm_iommu(d); + bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff; + devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff; + ret = assign_device(d, bus, devfn); + gdprintk(XENLOG_ERR, "XEN_DOMCTL_assign_device: bdf = %x:%x:%x\n", + bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + put_domain(d); + } + break; + + case XEN_DOMCTL_bind_pt_irq: + { + struct domain * d; + xen_domctl_bind_pt_irq_t * bind; + + ret = -ESRCH; + if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) + break; + bind = &(domctl->u.bind_pt_irq); + if (vtd_enabled) + ret = pt_irq_create_bind_vtd(d, bind); + if (ret < 0) + gdprintk(XENLOG_ERR, "pt_irq_create_bind failed!\n"); + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_memory_mapping: + { + struct domain *d; + unsigned long gfn = domctl->u.memory_mapping.first_gfn; + unsigned long mfn = domctl->u.memory_mapping.first_mfn; + unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns; + int i; + + ret = -EINVAL; + if ( (mfn + nr_mfns - 1) < mfn ) /* wrap? 
*/ + break; + + ret = -ESRCH; + if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) ) + break; + + ret=0; + if ( domctl->u.memory_mapping.add_mapping ) + { + gdprintk(XENLOG_INFO, + "memory_map:add: gfn=%lx mfn=%lx nr_mfns=%lx\n", + gfn, mfn, nr_mfns); + + ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); + for ( i = 0; i < nr_mfns; i++ ) + set_mmio_p2m_entry(d, gfn+i, _mfn(mfn+i)); + } + else + { + gdprintk(XENLOG_INFO, + "memory_map:remove: gfn=%lx mfn=%lx nr_mfns=%lx\n", + gfn, mfn, nr_mfns); + + for ( i = 0; i < nr_mfns; i++ ) + clear_mmio_p2m_entry(d, gfn+i); + ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); + } + + rcu_unlock_domain(d); + } + break; + + case XEN_DOMCTL_ioport_mapping: + { +#define MAX_IOPORTS 0x10000 + struct domain *d; + struct hvm_iommu *hd; + unsigned int fgp = domctl->u.ioport_mapping.first_gport; + unsigned int fmp = domctl->u.ioport_mapping.first_mport; + unsigned int np = domctl->u.ioport_mapping.nr_ports; + struct g2m_ioport *g2m_ioport; + int found = 0; + + ret = -EINVAL; + if ( (np == 0) || (fgp > MAX_IOPORTS) || (fmp > MAX_IOPORTS) || + ((fgp + np) > MAX_IOPORTS) || ((fmp + np) > MAX_IOPORTS) ) + { + gdprintk(XENLOG_ERR, + "ioport_map:invalid:gport=%x mport=%x nr_ports=%x\n", + fgp, fmp, np); + break; + } + + ret = -ESRCH; + if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) ) + break; + + hd = domain_hvm_iommu(d); + if ( domctl->u.ioport_mapping.add_mapping ) + { + gdprintk(XENLOG_INFO, + "ioport_map:add f_gport=%x f_mport=%x np=%x\n", + fgp, fmp, np); + + list_for_each_entry(g2m_ioport, &hd->g2m_ioport_list, list) + if (g2m_ioport->mport == fmp ) { + g2m_ioport->gport = fgp; + g2m_ioport->np = np; + found = 1; + break; + } + if ( !found ) + { + g2m_ioport = xmalloc(struct g2m_ioport); + g2m_ioport->gport = fgp; + g2m_ioport->mport = fmp; + g2m_ioport->np = np; + list_add_tail(&g2m_ioport->list, &hd->g2m_ioport_list); + } + ret = ioports_permit_access(d, fmp, fmp + np - 1); + + } + else { + 
gdprintk(XENLOG_INFO, + "ioport_map:remove f_gport=%x f_mport=%x np=%x\n", + fgp, fmp, np); + list_for_each_entry(g2m_ioport, &hd->g2m_ioport_list, list) + if ( g2m_ioport->mport == fmp ) { + list_del(&g2m_ioport->list); + break; + } + ret = ioports_deny_access(d, fmp, fmp + np - 1); + } + rcu_unlock_domain(d); + } + break; + default: ret = -ENOSYS; break; @@ -555,18 +706,21 @@ void arch_get_info_guest(struct vcpu *v, if ( is_hvm_vcpu(v) ) { if ( !is_pv_32on64_domain(v->domain) ) - hvm_store_cpu_guest_regs(v, &c.nat->user_regs, c.nat->ctrlreg); + { + memset(c.nat->ctrlreg, 0, sizeof(c.nat->ctrlreg)); + c.nat->ctrlreg[0] = v->arch.hvm_vcpu.guest_cr[0]; + c.nat->ctrlreg[2] = v->arch.hvm_vcpu.guest_cr[2]; + c.nat->ctrlreg[3] = v->arch.hvm_vcpu.guest_cr[3]; + c.nat->ctrlreg[4] = v->arch.hvm_vcpu.guest_cr[4]; + } #ifdef CONFIG_COMPAT else { - struct cpu_user_regs user_regs; - typeof(c.nat->ctrlreg) ctrlreg; - unsigned i; - - hvm_store_cpu_guest_regs(v, &user_regs, ctrlreg); - XLAT_cpu_user_regs(&c.cmp->user_regs, &user_regs); - for ( i = 0; i < ARRAY_SIZE(c.cmp->ctrlreg); ++i ) - c.cmp->ctrlreg[i] = ctrlreg[i]; + memset(c.cmp->ctrlreg, 0, sizeof(c.cmp->ctrlreg)); + c.cmp->ctrlreg[0] = v->arch.hvm_vcpu.guest_cr[0]; + c.cmp->ctrlreg[2] = v->arch.hvm_vcpu.guest_cr[2]; + c.cmp->ctrlreg[3] = v->arch.hvm_vcpu.guest_cr[3]; + c.cmp->ctrlreg[4] = v->arch.hvm_vcpu.guest_cr[4]; } #endif } diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/e820.c --- a/xen/arch/x86/e820.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/e820.c Thu Sep 27 12:22:16 2007 -0600 @@ -2,6 +2,7 @@ #include <xen/init.h> #include <xen/lib.h> #include <xen/compat.h> +#include <xen/dmi.h> #include <asm/e820.h> #include <asm/page.h> @@ -343,6 +344,15 @@ static void __init clip_to_limit(uint64_ } } +static void __init reserve_dmi_region(void) +{ + u32 base, len; + if ( (dmi_get_table(&base, &len) == 0) && ((base + len) > base) && + reserve_e820_ram(&e820, base, base + len) ) + printk("WARNING: DMI table located 
in E820 RAM %08x-%08x. Fixed.\n", + base, base+len); +} + static void __init machine_specific_memory_setup( struct e820entry *raw, int *raw_nr) { @@ -366,6 +376,74 @@ static void __init machine_specific_memo "Only the first %u GB of the physical memory map " "can be accessed by 32-on-64 guests."); #endif + + reserve_dmi_region(); +} + +/* Reserve RAM area (@s,@e) in the specified e820 map. */ +int __init reserve_e820_ram(struct e820map *e820, uint64_t s, uint64_t e) +{ + uint64_t rs = 0, re = 0; + int i; + + for ( i = 0; i < e820->nr_map; i++ ) + { + /* Have we found the e820 region that includes the specified range? */ + rs = e820->map[i].addr; + re = rs + e820->map[i].size; + if ( (s >= rs) && (e <= re) ) + break; + } + + if ( (i == e820->nr_map) || (e820->map[i].type != E820_RAM) ) + return 0; + + if ( (s == rs) && (e == re) ) + { + /* Complete excision. */ + memmove(&e820->map[i], &e820->map[i+1], + (e820->nr_map-i-1) * sizeof(e820->map[0])); + e820->nr_map--; + } + else if ( s == rs ) + { + /* Truncate start. */ + e820->map[i].addr += e - s; + e820->map[i].size -= e - s; + } + else if ( e == re ) + { + /* Truncate end. */ + e820->map[i].size -= e - s; + } + else if ( e820->nr_map < ARRAY_SIZE(e820->map) ) + { + /* Split in two. */ + memmove(&e820->map[i+1], &e820->map[i], + (e820->nr_map-i) * sizeof(e820->map[0])); + e820->nr_map++; + e820->map[i].size = s - rs; + i++; + e820->map[i].addr = e; + e820->map[i].size = re - e; + } + else + { + /* e820map is at maximum size. We have to leak some space. 
*/ + if ( (s - rs) > (re - e) ) + { + printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", e, re); + e820->map[i].size = s - rs; + } + else + { + printk("e820 overflow: leaking RAM %"PRIx64"-%"PRIx64"\n", rs, s); + e820->map[i].addr = e; + e820->map[i].size = re - e; + } + } + + return 1; } unsigned long __init init_e820( diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/hvm/hvm.c Thu Sep 27 12:22:16 2007 -0600 @@ -273,8 +273,10 @@ static int hvm_save_cpu_ctxt(struct doma ctxt.rbp = vc->user_regs.ebp; ctxt.rsi = vc->user_regs.esi; ctxt.rdi = vc->user_regs.edi; - /* %rsp handled by arch-specific call above */ -#ifdef __x86_64__ + ctxt.rsp = vc->user_regs.esp; + ctxt.rip = vc->user_regs.eip; + ctxt.rflags = vc->user_regs.eflags; +#ifdef __x86_64__ ctxt.r8 = vc->user_regs.r8; ctxt.r9 = vc->user_regs.r9; ctxt.r10 = vc->user_regs.r10; @@ -337,6 +339,8 @@ static int hvm_load_cpu_ctxt(struct doma vc->user_regs.esi = ctxt.rsi; vc->user_regs.edi = ctxt.rdi; vc->user_regs.esp = ctxt.rsp; + vc->user_regs.eip = ctxt.rip; + vc->user_regs.eflags = ctxt.rflags | 2; #ifdef __x86_64__ vc->user_regs.r8 = ctxt.r8; vc->user_regs.r9 = ctxt.r9; @@ -672,6 +676,409 @@ int hvm_set_cr4(unsigned long value) return 0; } +int hvm_virtual_to_linear_addr( + enum x86_segment seg, + struct segment_register *reg, + unsigned long offset, + unsigned int bytes, + enum hvm_access_type access_type, + unsigned int addr_size, + unsigned long *linear_addr) +{ + unsigned long addr = offset; + uint32_t last_byte; + + if ( addr_size != 64 ) + { + /* + * COMPATIBILITY MODE: Apply segment checks and add base. 
+ */ + + switch ( access_type ) + { + case hvm_access_read: + if ( (reg->attr.fields.type & 0xa) == 0x8 ) + goto gpf; /* execute-only code segment */ + break; + case hvm_access_write: + if ( (reg->attr.fields.type & 0xa) != 0x2 ) + goto gpf; /* not a writable data segment */ + break; + default: + break; + } + + last_byte = offset + bytes - 1; + + /* Is this a grows-down data segment? Special limit check if so. */ + if ( (reg->attr.fields.type & 0xc) == 0x4 ) + { + /* Is upper limit 0xFFFF or 0xFFFFFFFF? */ + if ( !reg->attr.fields.db ) + last_byte = (uint16_t)last_byte; + + /* Check first byte and last byte against respective bounds. */ + if ( (offset <= reg->limit) || (last_byte < offset) ) + goto gpf; + } + else if ( (last_byte > reg->limit) || (last_byte < offset) ) + goto gpf; /* last byte is beyond limit or wraps 0xFFFFFFFF */ + + /* + * Hardware truncates to 32 bits in compatibility mode. + * It does not truncate to 16 bits in 16-bit address-size mode. + */ + addr = (uint32_t)(addr + reg->base); + } + else + { + /* + * LONG MODE: FS and GS add segment base. Addresses must be canonical. 
+ */ + + if ( (seg == x86_seg_fs) || (seg == x86_seg_gs) ) + addr += reg->base; + + if ( !is_canonical_address(addr) ) + goto gpf; + } + + *linear_addr = addr; + return 1; + + gpf: + return 0; +} + +static void *hvm_map(unsigned long va, int size) +{ + unsigned long gfn, mfn; + p2m_type_t p2mt; + + if ( ((va & ~PAGE_MASK) + size) > PAGE_SIZE ) + { + hvm_inject_exception(TRAP_page_fault, PFEC_write_access, + (va + PAGE_SIZE - 1) & PAGE_MASK); + return NULL; + } + + gfn = paging_gva_to_gfn(current, va); + mfn = mfn_x(gfn_to_mfn_current(gfn, &p2mt)); + if ( !p2m_is_ram(p2mt) ) + { + hvm_inject_exception(TRAP_page_fault, PFEC_write_access, va); + return NULL; + } + + ASSERT(mfn_valid(mfn)); + + paging_mark_dirty(current->domain, mfn); + + return (char *)map_domain_page(mfn) + (va & ~PAGE_MASK); +} + +static void hvm_unmap(void *p) +{ + if ( p ) + unmap_domain_page(p); +} + +static int hvm_load_segment_selector( + struct vcpu *v, enum x86_segment seg, uint16_t sel) +{ + struct segment_register desctab, cs, segr; + struct desc_struct *pdesc, desc; + u8 dpl, rpl, cpl; + int fault_type = TRAP_invalid_tss; + + /* NULL selector? */ + if ( (sel & 0xfffc) == 0 ) + { + if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) ) + goto fail; + memset(&segr, 0, sizeof(segr)); + hvm_set_segment_register(v, seg, &segr); + return 0; + } + + /* LDT descriptor must be in the GDT. */ + if ( (seg == x86_seg_ldtr) && (sel & 4) ) + goto fail; + + hvm_get_segment_register(v, x86_seg_cs, &cs); + hvm_get_segment_register( + v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab); + + /* Check against descriptor table limit. */ + if ( ((sel & 0xfff8) + 7) > desctab.limit ) + goto fail; + + pdesc = hvm_map(desctab.base + (sel & 0xfff8), 8); + if ( pdesc == NULL ) + goto hvm_map_fail; + + do { + desc = *pdesc; + + /* Segment present in memory? */ + if ( !(desc.b & (1u<<15)) ) + { + fault_type = TRAP_no_segment; + goto unmap_and_fail; + } + + /* LDT descriptor is a system segment. All others are code/data. 
*/ + if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) ) + goto unmap_and_fail; + + dpl = (desc.b >> 13) & 3; + rpl = sel & 3; + cpl = cs.sel & 3; + + switch ( seg ) + { + case x86_seg_cs: + /* Code segment? */ + if ( !(desc.b & (1u<<11)) ) + goto unmap_and_fail; + /* Non-conforming segment: check DPL against RPL. */ + if ( ((desc.b & (6u<<9)) != 6) && (dpl != rpl) ) + goto unmap_and_fail; + break; + case x86_seg_ss: + /* Writable data segment? */ + if ( (desc.b & (5u<<9)) != (1u<<9) ) + goto unmap_and_fail; + if ( (dpl != cpl) || (dpl != rpl) ) + goto unmap_and_fail; + break; + case x86_seg_ldtr: + /* LDT system segment? */ + if ( (desc.b & (15u<<8)) != (2u<<8) ) + goto unmap_and_fail; + goto skip_accessed_flag; + default: + /* Readable code or data segment? */ + if ( (desc.b & (5u<<9)) == (4u<<9) ) + goto unmap_and_fail; + /* Non-conforming segment: check DPL against RPL and CPL. */ + if ( ((desc.b & (6u<<9)) != 6) && ((dpl < cpl) || (dpl < rpl)) ) + goto unmap_and_fail; + break; + } + } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */ + (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) ); + + /* Force the Accessed flag in our local copy. 
*/ + desc.b |= 0x100; + + skip_accessed_flag: + hvm_unmap(pdesc); + + segr.base = (((desc.b << 0) & 0xff000000u) | + ((desc.b << 16) & 0x00ff0000u) | + ((desc.a >> 16) & 0x0000ffffu)); + segr.attr.bytes = (((desc.b >> 8) & 0x00ffu) | + ((desc.b >> 12) & 0x0f00u)); + segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu); + if ( segr.attr.fields.g ) + segr.limit = (segr.limit << 12) | 0xfffu; + segr.sel = sel; + hvm_set_segment_register(v, seg, &segr); + + return 0; + + unmap_and_fail: + hvm_unmap(pdesc); + fail: + hvm_inject_exception(fault_type, sel & 0xfffc, 0); + hvm_map_fail: + return 1; +} + +void hvm_task_switch( + uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason, + int32_t errcode) +{ + struct vcpu *v = current; + struct cpu_user_regs *regs = guest_cpu_user_regs(); + struct segment_register gdt, tr, prev_tr, segr; + struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc; + unsigned long eflags; + int exn_raised; + struct { + u16 back_link,__blh; + u32 esp0; + u16 ss0, _0; + u32 esp1; + u16 ss1, _1; + u32 esp2; + u16 ss2, _2; + u32 cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi; + u16 es, _3, cs, _4, ss, _5, ds, _6, fs, _7, gs, _8, ldt, _9; + u16 trace, iomap; + } *ptss, tss; + + hvm_get_segment_register(v, x86_seg_gdtr, &gdt); + hvm_get_segment_register(v, x86_seg_tr, &prev_tr); + + if ( ((tss_sel & 0xfff8) + 7) > gdt.limit ) + { + hvm_inject_exception((taskswitch_reason == TSW_iret) ? 
+ TRAP_invalid_tss : TRAP_gp_fault, + tss_sel & 0xfff8, 0); + goto out; + } + + optss_desc = hvm_map(gdt.base + (prev_tr.sel & 0xfff8), 8); + if ( optss_desc == NULL ) + goto out; + + nptss_desc = hvm_map(gdt.base + (tss_sel & 0xfff8), 8); + if ( nptss_desc == NULL ) + goto out; + + tss_desc = *nptss_desc; + tr.sel = tss_sel; + tr.base = (((tss_desc.b << 0) & 0xff000000u) | + ((tss_desc.b << 16) & 0x00ff0000u) | + ((tss_desc.a >> 16) & 0x0000ffffu)); + tr.attr.bytes = (((tss_desc.b >> 8) & 0x00ffu) | + ((tss_desc.b >> 12) & 0x0f00u)); + tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu); + if ( tr.attr.fields.g ) + tr.limit = (tr.limit << 12) | 0xfffu; + + if ( !tr.attr.fields.p ) + { + hvm_inject_exception(TRAP_no_segment, tss_sel & 0xfff8, 0); + goto out; + } + + if ( tr.attr.fields.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) ) + { + hvm_inject_exception( + (taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault, + tss_sel & 0xfff8, 0); + goto out; + } + + if ( !tr.attr.fields.g && (tr.limit < (sizeof(tss)-1)) ) + { + hvm_inject_exception(TRAP_invalid_tss, tss_sel & 0xfff8, 0); + goto out; + } + + ptss = hvm_map(prev_tr.base, sizeof(tss)); + if ( ptss == NULL ) + goto out; + + eflags = regs->eflags; + if ( taskswitch_reason == TSW_iret ) + eflags &= ~X86_EFLAGS_NT; + + ptss->cr3 = v->arch.hvm_vcpu.guest_cr[3]; + ptss->eip = regs->eip; + ptss->eflags = eflags; + ptss->eax = regs->eax; + ptss->ecx = regs->ecx; + ptss->edx = regs->edx; + ptss->ebx = regs->ebx; + ptss->esp = regs->esp; + ptss->ebp = regs->ebp; + ptss->esi = regs->esi; + ptss->edi = regs->edi; + + hvm_get_segment_register(v, x86_seg_es, &segr); + ptss->es = segr.sel; + hvm_get_segment_register(v, x86_seg_cs, &segr); + ptss->cs = segr.sel; + hvm_get_segment_register(v, x86_seg_ss, &segr); + ptss->ss = segr.sel; + hvm_get_segment_register(v, x86_seg_ds, &segr); + ptss->ds = segr.sel; + hvm_get_segment_register(v, x86_seg_fs, &segr); + ptss->fs = segr.sel; + 
hvm_get_segment_register(v, x86_seg_gs, &segr); + ptss->gs = segr.sel; + hvm_get_segment_register(v, x86_seg_ldtr, &segr); + ptss->ldt = segr.sel; + + hvm_unmap(ptss); + + ptss = hvm_map(tr.base, sizeof(tss)); + if ( ptss == NULL ) + goto out; + + if ( !hvm_set_cr3(ptss->cr3) ) + { + hvm_unmap(ptss); + goto out; + } + + regs->eip = ptss->eip; + regs->eflags = ptss->eflags | 2; + regs->eax = ptss->eax; + regs->ecx = ptss->ecx; + regs->edx = ptss->edx; + regs->ebx = ptss->ebx; + regs->esp = ptss->esp; + regs->ebp = ptss->ebp; + regs->esi = ptss->esi; + regs->edi = ptss->edi; + + if ( (taskswitch_reason == TSW_call_or_int) ) + { + regs->eflags |= X86_EFLAGS_NT; + ptss->back_link = prev_tr.sel; + } + + exn_raised = 0; + if ( hvm_load_segment_selector(v, x86_seg_es, ptss->es) || + hvm_load_segment_selector(v, x86_seg_cs, ptss->cs) || + hvm_load_segment_selector(v, x86_seg_ss, ptss->ss) || + hvm_load_segment_selector(v, x86_seg_ds, ptss->ds) || + hvm_load_segment_selector(v, x86_seg_fs, ptss->fs) || + hvm_load_segment_selector(v, x86_seg_gs, ptss->gs) || + hvm_load_segment_selector(v, x86_seg_ldtr, ptss->ldt) ) + exn_raised = 1; + + if ( (ptss->trace & 1) && !exn_raised ) + hvm_inject_exception(TRAP_debug, tss_sel & 0xfff8, 0); + + hvm_unmap(ptss); + + tr.attr.fields.type = 0xb; /* busy 32-bit tss */ + hvm_set_segment_register(v, x86_seg_tr, &tr); + + v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS; + hvm_update_guest_cr(v, 0); + + if ( (taskswitch_reason == TSW_iret) || + (taskswitch_reason == TSW_jmp) ) + clear_bit(41, optss_desc); /* clear B flag of old task */ + + if ( taskswitch_reason != TSW_iret ) + set_bit(41, nptss_desc); /* set B flag of new task */ + + if ( errcode >= 0 ) + { + struct segment_register reg; + unsigned long linear_addr; + regs->esp -= 4; + hvm_get_segment_register(current, x86_seg_ss, ®); + /* Todo: do not ignore access faults here. 
*/ + if ( hvm_virtual_to_linear_addr(x86_seg_ss, ®, regs->esp, + 4, hvm_access_write, 32, + &linear_addr) ) + hvm_copy_to_guest_virt(linear_addr, &errcode, 4); + } + + out: + hvm_unmap(optss_desc); + hvm_unmap(nptss_desc); +} + /* * __hvm_copy(): * @buf = hypervisor buffer @@ -906,7 +1313,6 @@ int hvm_do_hypercall(struct cpu_user_reg #endif case 4: case 2: - hvm_store_cpu_guest_regs(current, regs, NULL); if ( unlikely(ring_3(regs)) ) { default: diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/i8254.c --- a/xen/arch/x86/hvm/i8254.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/hvm/i8254.c Thu Sep 27 12:22:16 2007 -0600 @@ -598,11 +598,13 @@ int pv_pit_handler(int port, int data, i .size = 1, .type = IOREQ_TYPE_PIO, .addr = port, - .dir = write ? 0 : 1, - .data = write ? data : 0, + .dir = write ? IOREQ_WRITE : IOREQ_READ, + .data = data }; - if ( port == 0x61 ) + if ( (current->domain->domain_id == 0) && dom0_pit_access(&ioreq) ) + /* nothing to do */; + else if ( port == 0x61 ) handle_speaker_io(&ioreq); else handle_pit_io(&ioreq); diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/intercept.c --- a/xen/arch/x86/hvm/intercept.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/hvm/intercept.c Thu Sep 27 12:22:16 2007 -0600 @@ -29,6 +29,7 @@ #include <asm/current.h> #include <io_ports.h> #include <xen/event.h> +#include <asm/iommu.h> extern struct hvm_mmio_handler hpet_mmio_handler; @@ -242,6 +243,9 @@ int hvm_io_intercept(ioreq_t *p, int typ &(v->domain->arch.hvm_domain.io_handler); int i; unsigned long addr, size; + + if ( (type == HVM_PORTIO) && (dpci_ioport_intercept(p)) ) + return 1; for (i = 0; i < handler->num_slot; i++) { if( type != handler->hdl_list[i].type) diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/io.c --- a/xen/arch/x86/hvm/io.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/hvm/io.c Thu Sep 27 12:22:16 2007 -0600 @@ -40,8 +40,10 @@ #include <asm/hvm/vpt.h> #include <asm/hvm/vpic.h> #include <asm/hvm/vlapic.h> +#include 
<asm/hvm/trace.h> #include <public/sched.h> +#include <xen/iocap.h> #include <public/hvm/ioreq.h> #if defined (__i386__) @@ -475,6 +477,7 @@ static void hvm_pio_assist(struct cpu_us printk("Error: %s unknown port size\n", __FUNCTION__); domain_crash_synchronous(); } + HVMTRACE_1D(IO_ASSIST, current, p->data); } } @@ -490,6 +493,8 @@ static void hvm_mmio_assist(struct cpu_u dst = mmio_opp->operand[1]; size = operand_size(src); + HVMTRACE_1D(MMIO_ASSIST, current, p->data); + switch (mmio_opp->instr) { case INSTR_MOV: if (dst & REGISTER) { @@ -857,11 +862,127 @@ void hvm_io_assist(void) /* Copy register changes back into current guest state. */ regs->eflags &= ~X86_EFLAGS_RF; - hvm_load_cpu_guest_regs(v, regs); memcpy(guest_cpu_user_regs(), regs, HVM_CONTEXT_STACK_BYTES); out: vcpu_end_shutdown_deferral(v); +} + +void dpci_ioport_read(uint32_t mport, ioreq_t *p) +{ + uint64_t i; + uint64_t z_data; + uint64_t length = (p->count * p->size); + + for ( i = 0; i < length; i += p->size ) + { + z_data = ~0ULL; + + switch ( p->size ) + { + case BYTE: + z_data = (uint64_t)inb(mport); + break; + case WORD: + z_data = (uint64_t)inw(mport); + break; + case LONG: + z_data = (uint64_t)inl(mport); + break; + default: + gdprintk(XENLOG_ERR, "Error: unable to handle size: %" + PRId64 "\n", p->size); + return; + } + + p->data = z_data; + if ( p->data_is_ptr && + hvm_copy_to_guest_phys(p->data + i, (void *)&z_data, + (int)p->size) ) + { + gdprintk(XENLOG_ERR, "Error: couldn't copy to hvm phys\n"); + return; + } + } +} + +void dpci_ioport_write(uint32_t mport, ioreq_t *p) +{ + uint64_t i; + uint64_t z_data = 0; + uint64_t length = (p->count * p->size); + + for ( i = 0; i < length; i += p->size ) + { + z_data = p->data; + if ( p->data_is_ptr && + hvm_copy_from_guest_phys((void *)&z_data, + p->data + i, (int)p->size) ) + { + gdprintk(XENLOG_ERR, "Error: couldn't copy from hvm phys\n"); + return; + } + + switch ( p->size ) + { + case BYTE: + outb((uint8_t) z_data, mport); + break; + case 
WORD: + outw((uint16_t) z_data, mport); + break; + case LONG: + outl((uint32_t) z_data, mport); + break; + default: + gdprintk(XENLOG_ERR, "Error: unable to handle size: %" + PRId64 "\n", p->size); + break; + } + } +} + +int dpci_ioport_intercept(ioreq_t *p) +{ + struct domain *d = current->domain; + struct hvm_iommu *hd = domain_hvm_iommu(d); + struct g2m_ioport *g2m_ioport; + unsigned int mport, gport = p->addr; + unsigned int s = 0, e = 0; + + list_for_each_entry( g2m_ioport, &hd->g2m_ioport_list, list ) + { + s = g2m_ioport->gport; + e = s + g2m_ioport->np; + if ( (gport >= s) && (gport < e) ) + goto found; + } + + return 0; + + found: + mport = (gport - s) + g2m_ioport->mport; + + if ( !ioports_access_permitted(d, mport, mport + p->size - 1) ) + { + gdprintk(XENLOG_ERR, "Error: access to gport=0x%x denied!\n", + (uint32_t)p->addr); + return 0; + } + + switch ( p->dir ) + { + case IOREQ_READ: + dpci_ioport_read(mport, p); + break; + case IOREQ_WRITE: + dpci_ioport_write(mport, p); + break; + default: + gdprintk(XENLOG_ERR, "Error: couldn't handle p->dir = %d", p->dir); + } + + return 1; } /* diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/irq.c --- a/xen/arch/x86/hvm/irq.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/hvm/irq.c Thu Sep 27 12:22:16 2007 -0600 @@ -26,7 +26,7 @@ #include <asm/hvm/domain.h> #include <asm/hvm/support.h> -static void __hvm_pci_intx_assert( +void __hvm_pci_intx_assert( struct domain *d, unsigned int device, unsigned int intx) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; @@ -59,7 +59,7 @@ void hvm_pci_intx_assert( spin_unlock(&d->arch.hvm_domain.irq_lock); } -static void __hvm_pci_intx_deassert( +void __hvm_pci_intx_deassert( struct domain *d, unsigned int device, unsigned int intx) { struct hvm_irq *hvm_irq = &d->arch.hvm_domain.irq; @@ -306,14 +306,7 @@ int hvm_vcpu_ack_pending_irq(struct vcpu switch ( type ) { case hvm_intack_nmi: -#if 0 return test_and_clear_bool(v->nmi_pending); -#else - if ( 
test_and_clear_bool(v->nmi_pending) ) - gdprintk(XENLOG_WARNING, "Dropping NMI delivery to %d:%d\n", - v->domain->domain_id, v->vcpu_id); - break; -#endif case hvm_intack_lapic: return ((*vector = cpu_get_apic_interrupt(v)) != -1); case hvm_intack_pic: diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/platform.c --- a/xen/arch/x86/hvm/platform.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/hvm/platform.c Thu Sep 27 12:22:16 2007 -0600 @@ -1032,7 +1032,6 @@ void handle_mmio(unsigned long gpa) /* Copy current guest state into io instruction state structure. */ memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES); - hvm_store_cpu_guest_regs(v, regs, NULL); df = regs->eflags & X86_EFLAGS_DF ? 1 : 0; diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/Makefile --- a/xen/arch/x86/hvm/svm/Makefile Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/hvm/svm/Makefile Thu Sep 27 12:22:16 2007 -0600 @@ -1,5 +1,7 @@ subdir-$(x86_32) += x86_32 subdir-$(x86_32) += x86_32 subdir-$(x86_64) += x86_64 + +subdir-y += amd_iommu obj-y += asid.o obj-y += emulate.o diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/amd_iommu/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/hvm/svm/amd_iommu/Makefile Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,4 @@ +obj-y += amd-iommu-detect.o +obj-y += amd-iommu-init.o +obj-y += amd-iommu-map.o +obj-y += pci-amd-iommu.o diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/amd_iommu/amd-iommu-detect.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/hvm/svm/amd_iommu/amd-iommu-detect.c Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. 
+ * Author: Leo Duran <leo.duran@xxxxxxx> + * Author: Wei Wang <wei.wang2@xxxxxxx> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/iommu.h> +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> +#include "pci-direct.h" +#include "pci_regs.h" + +static int __init valid_bridge_bus_config(int bus, int dev, int func, + int *sec_bus, int *sub_bus) +{ + int pri_bus; + + pri_bus = read_pci_config_byte(bus, dev, func, PCI_PRIMARY_BUS); + *sec_bus = read_pci_config_byte(bus, dev, func, PCI_SECONDARY_BUS); + *sub_bus = read_pci_config_byte(bus, dev, func, PCI_SUBORDINATE_BUS); + + return ( pri_bus == bus && *sec_bus > bus && *sub_bus >= *sec_bus ); +} + +int __init get_iommu_last_downstream_bus(struct amd_iommu *iommu) +{ + int bus, dev, func; + int devfn, hdr_type; + int sec_bus, sub_bus; + int multi_func; + + bus = iommu->last_downstream_bus = iommu->root_bus; + iommu->downstream_bus_present[bus] = 1; + dev = PCI_SLOT(iommu->first_devfn); + multi_func = PCI_FUNC(iommu->first_devfn) > 0; + for ( devfn = iommu->first_devfn; devfn <= iommu->last_devfn; ++devfn ) { + /* skipping to next device#? 
*/ + if ( dev != PCI_SLOT(devfn) ) { + dev = PCI_SLOT(devfn); + multi_func = 0; + } + func = PCI_FUNC(devfn); + + if ( !VALID_PCI_VENDOR_ID( + read_pci_config_16(bus, dev, func, PCI_VENDOR_ID)) ) + continue; + + hdr_type = read_pci_config_byte(bus, dev, func, + PCI_HEADER_TYPE); + if ( func == 0 ) + multi_func = IS_PCI_MULTI_FUNCTION(hdr_type); + + if ( (func == 0 || multi_func) && + IS_PCI_TYPE1_HEADER(hdr_type) ) { + if (!valid_bridge_bus_config(bus, dev, func, + &sec_bus, &sub_bus)) + return -ENODEV; + + if ( sub_bus > iommu->last_downstream_bus ) + iommu->last_downstream_bus = sub_bus; + do { + iommu->downstream_bus_present[sec_bus] = 1; + } while ( sec_bus++ < sub_bus ); + } + } + + return 0; +} + +int __init get_iommu_capabilities(u8 bus, u8 dev, u8 func, u8 cap_ptr, + struct amd_iommu *iommu) +{ + u32 cap_header, cap_range; + u64 mmio_bar; + + /* remove it when BIOS available */ + write_pci_config(bus, dev, func, + cap_ptr + PCI_CAP_MMIO_BAR_HIGH_OFFSET, 0x00000000); + write_pci_config(bus, dev, func, + cap_ptr + PCI_CAP_MMIO_BAR_LOW_OFFSET, 0x40000001); + /* remove it when BIOS available */ + + mmio_bar = (u64)read_pci_config(bus, dev, func, + cap_ptr + PCI_CAP_MMIO_BAR_HIGH_OFFSET) << 32; + mmio_bar |= read_pci_config(bus, dev, func, + cap_ptr + PCI_CAP_MMIO_BAR_LOW_OFFSET) & + PCI_CAP_MMIO_BAR_LOW_MASK; + iommu->mmio_base_phys = (unsigned long)mmio_bar; + + if ( (mmio_bar == 0) || ( (mmio_bar & 0x3FFF) != 0 ) ) { + dprintk(XENLOG_ERR , + "AMD IOMMU: Invalid MMIO_BAR = 0x%"PRIx64"\n", mmio_bar); + return -ENODEV; + } + + cap_header = read_pci_config(bus, dev, func, cap_ptr); + iommu->revision = get_field_from_reg_u32(cap_header, + PCI_CAP_REV_MASK, PCI_CAP_REV_SHIFT); + iommu->iotlb_support = get_field_from_reg_u32(cap_header, + PCI_CAP_IOTLB_MASK, PCI_CAP_IOTLB_SHIFT); + iommu->ht_tunnel_support = get_field_from_reg_u32(cap_header, + PCI_CAP_HT_TUNNEL_MASK, + PCI_CAP_HT_TUNNEL_SHIFT); + iommu->not_present_cached = get_field_from_reg_u32(cap_header, + 
PCI_CAP_NP_CACHE_MASK, + PCI_CAP_NP_CACHE_SHIFT); + + cap_range = read_pci_config(bus, dev, func, + cap_ptr + PCI_CAP_RANGE_OFFSET); + iommu->root_bus = get_field_from_reg_u32(cap_range, + PCI_CAP_BUS_NUMBER_MASK, + PCI_CAP_BUS_NUMBER_SHIFT); + iommu->first_devfn = get_field_from_reg_u32(cap_range, + PCI_CAP_FIRST_DEVICE_MASK, + PCI_CAP_FIRST_DEVICE_SHIFT); + iommu->last_devfn = get_field_from_reg_u32(cap_range, + PCI_CAP_LAST_DEVICE_MASK, + PCI_CAP_LAST_DEVICE_SHIFT); + + return 0; +} + +static int __init scan_caps_for_iommu(int bus, int dev, int func, + iommu_detect_callback_ptr_t iommu_detect_callback) +{ + int cap_ptr, cap_id, cap_type; + u32 cap_header; + int count, error = 0; + + count = 0; + cap_ptr = read_pci_config_byte(bus, dev, func, + PCI_CAPABILITY_LIST); + while ( cap_ptr >= PCI_MIN_CAP_OFFSET && + count < PCI_MAX_CAP_BLOCKS && !error ) { + cap_ptr &= PCI_CAP_PTR_MASK; + cap_header = read_pci_config(bus, dev, func, cap_ptr); + cap_id = get_field_from_reg_u32(cap_header, + PCI_CAP_ID_MASK, PCI_CAP_ID_SHIFT); + + if ( cap_id == PCI_CAP_ID_SECURE_DEVICE ) { + cap_type = get_field_from_reg_u32(cap_header, + PCI_CAP_TYPE_MASK, PCI_CAP_TYPE_SHIFT); + if ( cap_type == PCI_CAP_TYPE_IOMMU ) { + error = iommu_detect_callback( + bus, dev, func, cap_ptr); + } + } + + cap_ptr = get_field_from_reg_u32(cap_header, + PCI_CAP_NEXT_PTR_MASK, PCI_CAP_NEXT_PTR_SHIFT); + ++count; } + + return error; +} + +static int __init scan_functions_for_iommu(int bus, int dev, + iommu_detect_callback_ptr_t iommu_detect_callback) +{ + int func, hdr_type; + int count, error = 0; + + func = 0; + count = 1; + while ( VALID_PCI_VENDOR_ID(read_pci_config_16(bus, dev, func, + PCI_VENDOR_ID)) && !error && func < count ) { + hdr_type = read_pci_config_byte(bus, dev, func, + PCI_HEADER_TYPE); + + if ( func == 0 && IS_PCI_MULTI_FUNCTION(hdr_type) ) + count = PCI_MAX_FUNC_COUNT; + + if ( IS_PCI_TYPE0_HEADER(hdr_type) || + IS_PCI_TYPE1_HEADER(hdr_type) ) { + error = scan_caps_for_iommu(bus, dev, 
func, + iommu_detect_callback); + } + ++func; + } + + return error; +} + + +int __init scan_for_iommu(iommu_detect_callback_ptr_t iommu_detect_callback) +{ + int bus, dev, error = 0; + + for ( bus = 0; bus < PCI_MAX_BUS_COUNT && !error; ++bus ) { + for ( dev = 0; dev < PCI_MAX_DEV_COUNT && !error; ++dev ) { + error = scan_functions_for_iommu(bus, dev, + iommu_detect_callback); + } + } + + return error; +} + diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/amd_iommu/amd-iommu-init.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/hvm/svm/amd_iommu/amd-iommu-init.c Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,145 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@xxxxxxx> + * Author: Wei Wang <wei.wang2@xxxxxxx> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> +#include <asm-x86/fixmap.h> +#include "pci-direct.h" +#include "pci_regs.h" + +extern int nr_amd_iommus; + +int __init map_iommu_mmio_region(struct amd_iommu *iommu) +{ + unsigned long mfn; + + if ( nr_amd_iommus > MAX_AMD_IOMMUS ) { + gdprintk(XENLOG_ERR, + "IOMMU: nr_amd_iommus %d > MAX_IOMMUS\n", nr_amd_iommus); + return -ENOMEM; + } + + iommu->mmio_base = (void *) fix_to_virt(FIX_IOMMU_MMIO_BASE_0 + + nr_amd_iommus * MMIO_PAGES_PER_IOMMU); + mfn = (unsigned long)iommu->mmio_base_phys >> PAGE_SHIFT; + map_pages_to_xen((unsigned long)iommu->mmio_base, mfn, + MMIO_PAGES_PER_IOMMU, PAGE_HYPERVISOR_NOCACHE); + + memset((u8*)iommu->mmio_base, 0, IOMMU_MMIO_REGION_LENGTH); + + return 0; +} + +void __init unmap_iommu_mmio_region(struct amd_iommu *iommu) +{ + if ( iommu->mmio_base ) { + iounmap(iommu->mmio_base); + iommu->mmio_base = NULL; + } +} + +void __init register_iommu_dev_table_in_mmio_space(struct amd_iommu *iommu) +{ + u64 addr_64, addr_lo, addr_hi; + u32 entry; + + addr_64 = (u64)virt_to_maddr(iommu->dev_table.buffer); + addr_lo = addr_64 & DMA_32BIT_MASK; + addr_hi = addr_64 >> 32; + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_DEV_TABLE_BASE_LOW_MASK, + IOMMU_DEV_TABLE_BASE_LOW_SHIFT, &entry); + set_field_in_reg_u32((iommu->dev_table.alloc_size / PAGE_SIZE) - 1, + entry, IOMMU_DEV_TABLE_SIZE_MASK, + IOMMU_DEV_TABLE_SIZE_SHIFT, &entry); + writel(entry, iommu->mmio_base + IOMMU_DEV_TABLE_BASE_LOW_OFFSET); + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_DEV_TABLE_BASE_HIGH_MASK, + IOMMU_DEV_TABLE_BASE_HIGH_SHIFT, &entry); + writel(entry, iommu->mmio_base + IOMMU_DEV_TABLE_BASE_HIGH_OFFSET); +} + +void __init register_iommu_cmd_buffer_in_mmio_space(struct 
amd_iommu *iommu) +{ + u64 addr_64, addr_lo, addr_hi; + u32 power_of2_entries; + u32 entry; + + addr_64 = (u64)virt_to_maddr(iommu->cmd_buffer.buffer); + addr_lo = addr_64 & DMA_32BIT_MASK; + addr_hi = addr_64 >> 32; + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_CMD_BUFFER_BASE_LOW_MASK, + IOMMU_CMD_BUFFER_BASE_LOW_SHIFT, &entry); + writel(entry, iommu->mmio_base + IOMMU_CMD_BUFFER_BASE_LOW_OFFSET); + + power_of2_entries = get_order_from_bytes(iommu->cmd_buffer.alloc_size) + + IOMMU_CMD_BUFFER_POWER_OF2_ENTRIES_PER_PAGE; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_CMD_BUFFER_BASE_HIGH_MASK, + IOMMU_CMD_BUFFER_BASE_HIGH_SHIFT, &entry); + set_field_in_reg_u32(power_of2_entries, entry, + IOMMU_CMD_BUFFER_LENGTH_MASK, + IOMMU_CMD_BUFFER_LENGTH_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_CMD_BUFFER_BASE_HIGH_OFFSET); +} + +static void __init set_iommu_translation_control(struct amd_iommu *iommu, + int enable) +{ + u32 entry; + + entry = readl(iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); + set_field_in_reg_u32(iommu->ht_tunnel_support ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_ENABLED, entry, + IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_MASK, + IOMMU_CONTROL_HT_TUNNEL_TRANSLATION_SHIFT, &entry); + set_field_in_reg_u32(enable ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_ENABLED, entry, + IOMMU_CONTROL_TRANSLATION_ENABLE_MASK, + IOMMU_CONTROL_TRANSLATION_ENABLE_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); +} + +static void __init set_iommu_command_buffer_control(struct amd_iommu *iommu, + int enable) +{ + u32 entry; + + entry = readl(iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); + set_field_in_reg_u32(enable ? 
IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_ENABLED, entry, + IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_MASK, + IOMMU_CONTROL_COMMAND_BUFFER_ENABLE_SHIFT, &entry); + writel(entry, iommu->mmio_base+IOMMU_CONTROL_MMIO_OFFSET); +} + +void __init enable_iommu(struct amd_iommu *iommu) +{ + set_iommu_command_buffer_control(iommu, IOMMU_CONTROL_ENABLED); + set_iommu_translation_control(iommu, IOMMU_CONTROL_ENABLED); + printk("AMD IOMMU %d: Enabled\n", nr_amd_iommus); +} + + diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/amd_iommu/amd-iommu-map.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/hvm/svm/amd_iommu/amd-iommu-map.c Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,419 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@xxxxxxx> + * Author: Wei Wang <wei.wang2@xxxxxxx> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/hvm/iommu.h> +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> +#include <xen/sched.h> + +extern long amd_iommu_poll_comp_wait; + +static int queue_iommu_command(struct amd_iommu *iommu, u32 cmd[]) +{ + u32 tail, head, *cmd_buffer; + int i; + + BUG_ON( !iommu || !cmd ); + + tail = iommu->cmd_buffer_tail; + if ( ++tail == iommu->cmd_buffer.entries ) { + tail = 0; + } + head = get_field_from_reg_u32( + readl(iommu->mmio_base+IOMMU_CMD_BUFFER_HEAD_OFFSET), + IOMMU_CMD_BUFFER_HEAD_MASK, + IOMMU_CMD_BUFFER_HEAD_SHIFT); + if ( head != tail ) { + cmd_buffer = (u32 *)(iommu->cmd_buffer.buffer + + (iommu->cmd_buffer_tail * IOMMU_CMD_BUFFER_ENTRY_SIZE)); + for ( i = 0; i < IOMMU_CMD_BUFFER_U32_PER_ENTRY; ++i ) { + cmd_buffer[i] = cmd[i]; + } + + iommu->cmd_buffer_tail = tail; + return 1; + } + + return 0; +} + +static void commit_iommu_command_buffer(struct amd_iommu *iommu) +{ + u32 tail; + + BUG_ON( !iommu ); + + set_field_in_reg_u32(iommu->cmd_buffer_tail, 0, + IOMMU_CMD_BUFFER_TAIL_MASK, + IOMMU_CMD_BUFFER_TAIL_SHIFT, &tail); + writel(tail, iommu->mmio_base+IOMMU_CMD_BUFFER_TAIL_OFFSET); +} + +int send_iommu_command(struct amd_iommu *iommu, u32 cmd[]) +{ + BUG_ON( !iommu || !cmd ); + + if ( queue_iommu_command(iommu, cmd) ) { + commit_iommu_command_buffer(iommu); + return 1; + } + return 0; +} + +static void invalidate_iommu_page(struct amd_iommu *iommu, + u64 io_addr, u16 domain_id) +{ + u64 addr_lo, addr_hi; + u32 cmd[4], entry; + + addr_lo = io_addr & DMA_32BIT_MASK; + addr_hi = io_addr >> 32; + + set_field_in_reg_u32(domain_id, 0, + IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_MASK, + IOMMU_INV_IOMMU_PAGES_DOMAIN_ID_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CMD_INVALIDATE_IOMMU_PAGES, entry, + IOMMU_CMD_OPCODE_MASK, 
IOMMU_CMD_OPCODE_SHIFT, &entry); + cmd[1] = entry; + + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, 0, + IOMMU_INV_IOMMU_PAGES_S_FLAG_MASK, + IOMMU_INV_IOMMU_PAGES_S_FLAG_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, entry, + IOMMU_INV_IOMMU_PAGES_PDE_FLAG_MASK, + IOMMU_INV_IOMMU_PAGES_PDE_FLAG_SHIFT, &entry); + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, entry, + IOMMU_INV_IOMMU_PAGES_ADDR_LOW_MASK, + IOMMU_INV_IOMMU_PAGES_ADDR_LOW_SHIFT, &entry); + cmd[2] = entry; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_MASK, + IOMMU_INV_IOMMU_PAGES_ADDR_HIGH_SHIFT, &entry); + cmd[3] = entry; + + cmd[0] = 0; + send_iommu_command(iommu, cmd); +} + +static void flush_command_buffer(struct amd_iommu *iommu) +{ + u32 cmd[4], status; + int loop_count, comp_wait; + + /* clear 'ComWaitInt' in status register (WIC) */ + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0, + IOMMU_STATUS_COMP_WAIT_INT_MASK, + IOMMU_STATUS_COMP_WAIT_INT_SHIFT, &status); + writel(status, iommu->mmio_base + IOMMU_STATUS_MMIO_OFFSET); + + /* send an empty COMPLETION_WAIT command to flush command buffer */ + cmd[3] = cmd[2] = 0; + set_field_in_reg_u32(IOMMU_CMD_COMPLETION_WAIT, 0, + IOMMU_CMD_OPCODE_MASK, + IOMMU_CMD_OPCODE_SHIFT, &cmd[1]); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, 0, + IOMMU_COMP_WAIT_I_FLAG_MASK, + IOMMU_COMP_WAIT_I_FLAG_SHIFT, &cmd[0]); + send_iommu_command(iommu, cmd); + + /* wait for 'ComWaitInt' to signal completion? 
*/ + if ( amd_iommu_poll_comp_wait ) { + loop_count = amd_iommu_poll_comp_wait; + do { + status = readl(iommu->mmio_base + + IOMMU_STATUS_MMIO_OFFSET); + comp_wait = get_field_from_reg_u32(status, + IOMMU_STATUS_COMP_WAIT_INT_MASK, + IOMMU_STATUS_COMP_WAIT_INT_SHIFT); + --loop_count; + } while ( loop_count && !comp_wait ); + + if ( comp_wait ) { + /* clear 'ComWaitInt' in status register (WIC) */ + status &= IOMMU_STATUS_COMP_WAIT_INT_MASK; + writel(status, iommu->mmio_base + + IOMMU_STATUS_MMIO_OFFSET); + } else + dprintk(XENLOG_WARNING, "AMD IOMMU: %s(): Warning:" + " ComWaitInt bit did not assert!\n", + __FUNCTION__); + } +} + +static void clear_page_table_entry_present(u32 *pte) +{ + set_field_in_reg_u32(IOMMU_CONTROL_DISABLED, pte[0], + IOMMU_PTE_PRESENT_MASK, + IOMMU_PTE_PRESENT_SHIFT, &pte[0]); +} + +static void set_page_table_entry_present(u32 *pte, u64 page_addr, + int iw, int ir) +{ + u64 addr_lo, addr_hi; + u32 entry; + + addr_lo = page_addr & DMA_32BIT_MASK; + addr_hi = page_addr >> 32; + + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_PTE_ADDR_HIGH_MASK, + IOMMU_PTE_ADDR_HIGH_SHIFT, &entry); + set_field_in_reg_u32(iw ? IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_PTE_IO_WRITE_PERMISSION_MASK, + IOMMU_PTE_IO_WRITE_PERMISSION_SHIFT, &entry); + set_field_in_reg_u32(ir ? 
IOMMU_CONTROL_ENABLED : + IOMMU_CONTROL_DISABLED, entry, + IOMMU_PTE_IO_READ_PERMISSION_MASK, + IOMMU_PTE_IO_READ_PERMISSION_SHIFT, &entry); + pte[1] = entry; + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_PTE_ADDR_LOW_MASK, + IOMMU_PTE_ADDR_LOW_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_PAGING_MODE_LEVEL_0, entry, + IOMMU_PTE_NEXT_LEVEL_MASK, + IOMMU_PTE_NEXT_LEVEL_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_PTE_PRESENT_MASK, + IOMMU_PTE_PRESENT_SHIFT, &entry); + pte[0] = entry; +} + + +static void amd_iommu_set_page_directory_entry(u32 *pde, + u64 next_ptr, u8 next_level) +{ + u64 addr_lo, addr_hi; + u32 entry; + + addr_lo = next_ptr & DMA_32BIT_MASK; + addr_hi = next_ptr >> 32; + + /* enable read/write permissions,which will be enforced at the PTE */ + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_PDE_ADDR_HIGH_MASK, IOMMU_PDE_ADDR_HIGH_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_PDE_IO_WRITE_PERMISSION_MASK, + IOMMU_PDE_IO_WRITE_PERMISSION_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_PDE_IO_READ_PERMISSION_MASK, + IOMMU_PDE_IO_READ_PERMISSION_SHIFT, &entry); + pde[1] = entry; + + /* mark next level as 'present' */ + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_PDE_ADDR_LOW_MASK, IOMMU_PDE_ADDR_LOW_SHIFT, &entry); + set_field_in_reg_u32(next_level, entry, + IOMMU_PDE_NEXT_LEVEL_MASK, + IOMMU_PDE_NEXT_LEVEL_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_PDE_PRESENT_MASK, + IOMMU_PDE_PRESENT_SHIFT, &entry); + pde[0] = entry; +} + +void amd_iommu_set_dev_table_entry(u32 *dte, u64 root_ptr, u16 domain_id, + u8 paging_mode) +{ + u64 addr_hi, addr_lo; + u32 entry; + + dte[6] = dte[5] = dte[4] = 0; + + set_field_in_reg_u32(IOMMU_DEV_TABLE_SYS_MGT_MSG_FORWARDED, 0, + IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_MASK, + IOMMU_DEV_TABLE_SYS_MGT_MSG_ENABLE_SHIFT, &entry); + dte[3] = entry; + + 
set_field_in_reg_u32(domain_id, 0, + IOMMU_DEV_TABLE_DOMAIN_ID_MASK, + IOMMU_DEV_TABLE_DOMAIN_ID_SHIFT, &entry); + dte[2] = entry; + + addr_lo = root_ptr & DMA_32BIT_MASK; + addr_hi = root_ptr >> 32; + set_field_in_reg_u32((u32)addr_hi, 0, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_MASK, + IOMMU_DEV_TABLE_IO_WRITE_PERMISSION_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_DEV_TABLE_IO_READ_PERMISSION_MASK, + IOMMU_DEV_TABLE_IO_READ_PERMISSION_SHIFT, &entry); + dte[1] = entry; + + set_field_in_reg_u32((u32)addr_lo >> PAGE_SHIFT, 0, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT, &entry); + set_field_in_reg_u32(paging_mode, entry, + IOMMU_DEV_TABLE_PAGING_MODE_MASK, + IOMMU_DEV_TABLE_PAGING_MODE_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_DEV_TABLE_TRANSLATION_VALID_MASK, + IOMMU_DEV_TABLE_TRANSLATION_VALID_SHIFT, &entry); + set_field_in_reg_u32(IOMMU_CONTROL_ENABLED, entry, + IOMMU_DEV_TABLE_VALID_MASK, + IOMMU_DEV_TABLE_VALID_SHIFT, &entry); + dte[0] = entry; +} + +static void *amd_iommu_get_vptr_from_page_table_entry(u32 *entry) +{ + u64 addr_lo, addr_hi, ptr; + + addr_lo = get_field_from_reg_u32(entry[0], + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_MASK, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_LOW_SHIFT); + + addr_hi = get_field_from_reg_u32(entry[1], + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_MASK, + IOMMU_DEV_TABLE_PAGE_TABLE_PTR_HIGH_SHIFT); + + ptr = (addr_hi << 32) | (addr_lo << PAGE_SHIFT); + return ptr ? 
maddr_to_virt((unsigned long)ptr) : NULL; +} + +static int amd_iommu_is_pte_present(u32 *entry) +{ + return (get_field_from_reg_u32(entry[0], + IOMMU_PDE_PRESENT_MASK, + IOMMU_PDE_PRESENT_SHIFT)); +} + +static void *get_pte_from_page_tables(void *table, int level, + unsigned long io_pfn) +{ + unsigned long offset; + void *pde = 0; + + BUG_ON( !table ); + + while ( level > 0 ) + { + void *next_table = 0; + unsigned long next_ptr; + offset = io_pfn >> ((PTE_PER_TABLE_SHIFT * + (level - IOMMU_PAGING_MODE_LEVEL_1))); + offset &= ~PTE_PER_TABLE_MASK; + pde = table + (offset * IOMMU_PAGE_TABLE_ENTRY_SIZE); + + if ( level == 1 ) + break; + if ( !pde ) + return NULL; + if ( !amd_iommu_is_pte_present(pde) ) { + next_table = alloc_xenheap_page(); + if ( next_table == NULL ) + return NULL; + memset(next_table, 0, PAGE_SIZE); + if ( *(u64*)(pde) == 0 ) { + next_ptr = (u64)virt_to_maddr(next_table); + amd_iommu_set_page_directory_entry((u32 *)pde, + next_ptr, level - 1); + } else + free_xenheap_page(next_table); + } + table = amd_iommu_get_vptr_from_page_table_entry(pde); + level--; + } + + return pde; +} + +int amd_iommu_map_page(struct domain *d, unsigned long gfn, + unsigned long mfn) +{ + void *pte; + unsigned long flags; + u64 maddr; + struct hvm_iommu *hd = domain_hvm_iommu(d); + int iw, ir; + + BUG_ON( !hd->root_table ); + + maddr = (u64)(mfn << PAGE_SHIFT); + + iw = IOMMU_IO_WRITE_ENABLED; + ir = IOMMU_IO_READ_ENABLED; + + spin_lock_irqsave(&hd->mapping_lock, flags); + + pte = get_pte_from_page_tables(hd->root_table, hd->paging_mode, gfn); + + if ( pte != 0 ) { + set_page_table_entry_present((u32 *)pte, maddr, iw, ir); + spin_unlock_irqrestore(&hd->mapping_lock, flags); + return 0; + } else { + dprintk(XENLOG_ERR, + "%s() AMD IOMMU: Invalid IO pagetable entry gfn = %lx\n", + __FUNCTION__, gfn); + spin_unlock_irqrestore(&hd->mapping_lock, flags); + return -EIO; + } +} + +int amd_iommu_unmap_page(struct domain *d, unsigned long gfn) +{ + void *pte; + unsigned long flags; 
+ u64 io_addr = gfn; + int requestor_id; + struct amd_iommu *iommu; + struct hvm_iommu *hd = domain_hvm_iommu(d); + + BUG_ON( !hd->root_table ); + + requestor_id = hd->domain_id; + io_addr = (u64)(gfn << PAGE_SHIFT); + + spin_lock_irqsave(&hd->mapping_lock, flags); + + pte = get_pte_from_page_tables(hd->root_table, hd->paging_mode, gfn); + + if ( pte != 0 ) { + /* mark PTE as 'page not present' */ + clear_page_table_entry_present((u32 *)pte); + spin_unlock_irqrestore(&hd->mapping_lock, flags); + + /* send INVALIDATE_IOMMU_PAGES command */ + for_each_amd_iommu(iommu) { + + spin_lock_irqsave(&iommu->lock, flags); + + invalidate_iommu_page(iommu, io_addr, requestor_id); + flush_command_buffer(iommu); + + spin_unlock_irqrestore(&iommu->lock, flags); + } + + return 0; + } else { + dprintk(XENLOG_ERR, + "%s() AMD IOMMU: Invalid IO pagetable entry gfn = %lx\n", + __FUNCTION__, gfn); + spin_unlock_irqrestore(&hd->mapping_lock, flags); + return -EIO; + } +} diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/amd_iommu/pci-amd-iommu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/hvm/svm/amd_iommu/pci-amd-iommu.c Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,389 @@ +/* + * Copyright (C) 2007 Advanced Micro Devices, Inc. + * Author: Leo Duran <leo.duran@xxxxxxx> + * Author: Wei Wang <wei.wang2@xxxxxxx> - adapted to xen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/amd-iommu.h> +#include <asm/hvm/svm/amd-iommu-proto.h> +#include <xen/sched.h> +#include <asm/mm.h> +#include "pci-direct.h" +#include "pci_regs.h" + +struct list_head amd_iommu_head; +long amd_iommu_poll_comp_wait = COMPLETION_WAIT_DEFAULT_POLLING_COUNT; +static long amd_iommu_cmd_buffer_entries = IOMMU_CMD_BUFFER_DEFAULT_ENTRIES; +int nr_amd_iommus = 0; + +/* will set if amd-iommu HW is found */ +int amd_iommu_enabled = 0; + +static int enable_amd_iommu = 0; +boolean_param("enable_amd_iommu", enable_amd_iommu); + +static void deallocate_domain_page_tables(struct hvm_iommu *hd) +{ + if ( hd->root_table ) + free_xenheap_page(hd->root_table); +} + +static void deallocate_domain_resources(struct hvm_iommu *hd) +{ + deallocate_domain_page_tables(hd); +} + +static void __init init_cleanup(void) +{ + struct amd_iommu *iommu; + + dprintk(XENLOG_ERR, "AMD IOMMU: %s()\n", __FUNCTION__); + + for_each_amd_iommu(iommu) { + unmap_iommu_mmio_region(iommu); + } +} + +static void __init deallocate_iommu_table_struct( + struct table_struct *table) +{ + if (table->buffer) { + free_xenheap_pages(table->buffer, + get_order_from_bytes(table->alloc_size)); + table->buffer = NULL; + } +} + +static void __init deallocate_iommu_resources(struct amd_iommu *iommu) +{ + deallocate_iommu_table_struct(&iommu->dev_table); + deallocate_iommu_table_struct(&iommu->cmd_buffer);; +} + +static void __init detect_cleanup(void) +{ + struct amd_iommu *iommu; + + dprintk(XENLOG_ERR, "AMD IOMMU: %s()\n", __FUNCTION__); + + for_each_amd_iommu(iommu) { + list_del(&iommu->list); + deallocate_iommu_resources(iommu); + xfree(iommu); + } +} + +static int requestor_id_from_bdf(int bdf) +{ + /* HACK - HACK */ + /* account for possible 'aliasing' by parent device */ + return bdf; +} 
+ +static int __init allocate_iommu_table_struct(struct table_struct *table, + const char *name) +{ + table->buffer = (void *) alloc_xenheap_pages( + get_order_from_bytes(table->alloc_size)); + + if ( !table->buffer ) { + dprintk(XENLOG_ERR, "AMD IOMMU: Error allocating %s\n", name); + return -ENOMEM; + } + memset(table->buffer, 0, table->alloc_size); + + return 0; +} + +static int __init allocate_iommu_resources(struct amd_iommu *iommu) +{ + /* allocate 'device table' on a 4K boundary */ + iommu->dev_table.alloc_size = + PAGE_ALIGN(((iommu->last_downstream_bus + 1) * + IOMMU_DEV_TABLE_ENTRIES_PER_BUS) * + IOMMU_DEV_TABLE_ENTRY_SIZE); + iommu->dev_table.entries = + iommu->dev_table.alloc_size / IOMMU_DEV_TABLE_ENTRY_SIZE; + + if (allocate_iommu_table_struct(&iommu->dev_table, + "Device Table") != 0) + goto error_out; + + /* allocate 'command buffer' in power of 2 increments of 4K */ + iommu->cmd_buffer_tail = 0; + iommu->cmd_buffer.alloc_size = + PAGE_SIZE << get_order_from_bytes( + PAGE_ALIGN(amd_iommu_cmd_buffer_entries * + IOMMU_CMD_BUFFER_ENTRY_SIZE)); + + iommu->cmd_buffer.entries = + iommu->cmd_buffer.alloc_size / IOMMU_CMD_BUFFER_ENTRY_SIZE; + + if ( allocate_iommu_table_struct(&iommu->cmd_buffer, + "Command Buffer") != 0 ) + goto error_out; + + return 0; + +error_out: + deallocate_iommu_resources(iommu); + return -ENOMEM; +} + +int iommu_detect_callback(u8 bus, u8 dev, u8 func, u8 cap_ptr) +{ + struct amd_iommu *iommu; + + iommu = (struct amd_iommu *) xmalloc(struct amd_iommu); + if ( !iommu ) { + dprintk(XENLOG_ERR, "AMD IOMMU: Error allocating amd_iommu\n"); + return -ENOMEM; + } + memset(iommu, 0, sizeof(struct amd_iommu)); + spin_lock_init(&iommu->lock); + + /* get capability and topology information */ + if ( get_iommu_capabilities(bus, dev, func, cap_ptr, iommu) != 0 ) + goto error_out; + if ( get_iommu_last_downstream_bus(iommu) != 0 ) + goto error_out; + + list_add_tail(&iommu->list, &amd_iommu_head); + + /* allocate resources for this IOMMU */ + if 
(allocate_iommu_resources(iommu) != 0) + goto error_out; + + return 0; + +error_out: + xfree(iommu); + return -ENODEV; +} + +static int __init amd_iommu_init(void) +{ + struct amd_iommu *iommu; + unsigned long flags; + + for_each_amd_iommu(iommu) { + spin_lock_irqsave(&iommu->lock, flags); + + /* register IOMMU data strucures in MMIO space */ + if (map_iommu_mmio_region(iommu) != 0) + goto error_out; + register_iommu_dev_table_in_mmio_space(iommu); + register_iommu_cmd_buffer_in_mmio_space(iommu); + + /* enable IOMMU translation services */ + enable_iommu(iommu); + nr_amd_iommus++; + + spin_unlock_irqrestore(&iommu->lock, flags); + } + + amd_iommu_enabled = 1; + + return 0; + +error_out: + init_cleanup(); + return -ENODEV; +} + +struct amd_iommu *find_iommu_for_device(int bus, int devfn) +{ + struct amd_iommu *iommu; + + for_each_amd_iommu(iommu) { + if ( bus == iommu->root_bus ) { + if ( devfn >= iommu->first_devfn && + devfn <= iommu->last_devfn ) + return iommu; + } + else if ( bus <= iommu->last_downstream_bus ) { + if ( iommu->downstream_bus_present[bus] ) + return iommu; + } + } + + return NULL; +} + +void amd_iommu_setup_domain_device( + struct domain *domain, struct amd_iommu *iommu, int requestor_id) +{ + void *dte; + u64 root_ptr; + unsigned long flags; + struct hvm_iommu *hd = domain_hvm_iommu(domain); + + BUG_ON( !hd->root_table||!hd->paging_mode ); + + root_ptr = (u64)virt_to_maddr(hd->root_table); + dte = iommu->dev_table.buffer + + (requestor_id * IOMMU_DEV_TABLE_ENTRY_SIZE); + + spin_lock_irqsave(&iommu->lock, flags); + + amd_iommu_set_dev_table_entry((u32 *)dte, + root_ptr, hd->domain_id, hd->paging_mode); + + dprintk(XENLOG_INFO, "AMD IOMMU: Set DTE req_id:%x, " + "root_ptr:%"PRIx64", domain_id:%d, paging_mode:%d\n", + requestor_id, root_ptr, hd->domain_id, hd->paging_mode); + + spin_unlock_irqrestore(&iommu->lock, flags); +} + +void __init amd_iommu_setup_dom0_devices(void) +{ + struct hvm_iommu *hd = domain_hvm_iommu(dom0); + struct amd_iommu 
*iommu; + struct pci_dev *pdev; + int bus, dev, func; + u32 l; + int req_id, bdf; + + for ( bus = 0; bus < 256; bus++ ) { + for ( dev = 0; dev < 32; dev++ ) { + for ( func = 0; func < 8; func++ ) { + l = read_pci_config(bus, dev, func, PCI_VENDOR_ID); + /* some broken boards return 0 or ~0 if a slot is empty: */ + if ( l == 0xffffffff || l == 0x00000000 || + l == 0x0000ffff || l == 0xffff0000 ) + continue; + + pdev = xmalloc(struct pci_dev); + pdev->bus = bus; + pdev->devfn = PCI_DEVFN(dev, func); + list_add_tail(&pdev->list, &hd->pdev_list); + + bdf = (bus << 8) | pdev->devfn; + req_id = requestor_id_from_bdf(bdf); + iommu = find_iommu_for_device(bus, pdev->devfn); + + if ( iommu ) + amd_iommu_setup_domain_device(dom0, iommu, req_id); + } + } + } +} + +int amd_iommu_detect(void) +{ + unsigned long i; + + if ( !enable_amd_iommu ) { + printk("AMD IOMMU: Disabled\n"); + return 0; + } + + INIT_LIST_HEAD(&amd_iommu_head); + + if ( scan_for_iommu(iommu_detect_callback) != 0 ) { + dprintk(XENLOG_ERR, "AMD IOMMU: Error detection\n"); + goto error_out; + } + + if ( !iommu_found() ) { + printk("AMD IOMMU: Not found!\n"); + return 0; + } + + if ( amd_iommu_init() != 0 ) { + dprintk(XENLOG_ERR, "AMD IOMMU: Error initialization\n"); + goto error_out; + } + + if ( amd_iommu_domain_init(dom0) != 0 ) + goto error_out; + + /* setup 1:1 page table for dom0 */ + for ( i = 0; i < max_page; i++ ) + amd_iommu_map_page(dom0, i, i); + + amd_iommu_setup_dom0_devices(); + return 0; + +error_out: + detect_cleanup(); + return -ENODEV; + +} + +static int allocate_domain_resources(struct hvm_iommu *hd) +{ + /* allocate root table */ + hd->root_table = (void *)alloc_xenheap_page(); + if ( !hd->root_table ) + return -ENOMEM; + memset((u8*)hd->root_table, 0, PAGE_SIZE); + + return 0; +} + +static int get_paging_mode(unsigned long entries) +{ + int level = 1; + + BUG_ON ( !max_page ); + + if ( entries > max_page ) + entries = max_page; + + while ( entries > PTE_PER_TABLE_SIZE ) { + entries = 
PTE_PER_TABLE_ALIGN(entries) >> PTE_PER_TABLE_SHIFT; + ++level; + if ( level > 6 ) + return -ENOMEM; + } + + dprintk(XENLOG_INFO, "AMD IOMMU: paging mode = %d\n", level); + + return level; +} + +int amd_iommu_domain_init(struct domain *domain) +{ + struct hvm_iommu *hd = domain_hvm_iommu(domain); + + spin_lock_init(&hd->mapping_lock); + spin_lock_init(&hd->iommu_list_lock); + INIT_LIST_HEAD(&hd->pdev_list); + + /* allocate page directroy */ + if ( allocate_domain_resources(hd) != 0 ) { + dprintk(XENLOG_ERR, "AMD IOMMU: %s()\n", __FUNCTION__); + goto error_out; + } + + if ( is_hvm_domain(domain) ) + hd->paging_mode = IOMMU_PAGE_TABLE_LEVEL_4; + else + hd->paging_mode = get_paging_mode(max_page); + + hd->domain_id = domain->domain_id; + + return 0; + +error_out: + deallocate_domain_resources(hd); + return -ENOMEM; +} + + diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/amd_iommu/pci-direct.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/hvm/svm/amd_iommu/pci-direct.h Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,48 @@ +#ifndef ASM_PCI_DIRECT_H +#define ASM_PCI_DIRECT_H 1 + +#include <xen/types.h> +#include <asm/io.h> + +/* Direct PCI access. This is used for PCI accesses in early boot before + the PCI subsystem works. */ + +#define PDprintk(x...) 
+ +static inline u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) +{ + u32 v; + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + v = inl(0xcfc); + if (v != 0xffffffff) + PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); + return v; +} + +static inline u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset) +{ + u8 v; + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + v = inb(0xcfc + (offset&3)); + PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); + return v; +} + +static inline u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset) +{ + u16 v; + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + v = inw(0xcfc + (offset&2)); + PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); + return v; +} + +static inline void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, + u32 val) +{ + PDprintk("%x writing to %x: %x\n", slot, offset, val); + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + outl(val, 0xcfc); +} + +#endif diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/amd_iommu/pci_regs.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/hvm/svm/amd_iommu/pci_regs.h Thu Sep 27 12:22:16 2007 -0600 @@ -0,0 +1,513 @@ +/* + * pci_regs.h + * + * PCI standard defines + * Copyright 1994, Drew Eckhardt + * Copyright 1997--1999 Martin Mares <mj@xxxxxx> + * + * For more information, please consult the following manuals (look at + * http://www.pcisig.com/ for how to get them): + * + * PCI BIOS Specification + * PCI Local Bus Specification + * PCI to PCI Bridge Specification + * PCI System Design Guide + * + * For hypertransport information, please consult the following manuals + * from http://www.hypertransport.org + * + * The Hypertransport I/O Link Specification + */ + +#ifndef LINUX_PCI_REGS_H +#define LINUX_PCI_REGS_H + +/* + * Under PCI, each device has 256 bytes of configuration address space, + * of which the first 64 
bytes are standardized as follows: + */ +#define PCI_VENDOR_ID 0x00 /* 16 bits */ +#define PCI_DEVICE_ID 0x02 /* 16 bits */ +#define PCI_COMMAND 0x04 /* 16 bits */ +#define PCI_COMMAND_IO 0x1 /* Enable response in I/O space */ +#define PCI_COMMAND_MEMORY 0x2 /* Enable response in Memory space */ +#define PCI_COMMAND_MASTER 0x4 /* Enable bus mastering */ +#define PCI_COMMAND_SPECIAL 0x8 /* Enable response to special cycles */ +#define PCI_COMMAND_INVALIDATE 0x10 /* Use memory write and invalidate */ +#define PCI_COMMAND_VGA_PALETTE 0x20 /* Enable palette snooping */ +#define PCI_COMMAND_PARITY 0x40 /* Enable parity checking */ +#define PCI_COMMAND_WAIT 0x80 /* Enable address/data stepping */ +#define PCI_COMMAND_SERR 0x100 /* Enable SERR */ +#define PCI_COMMAND_FAST_BACK 0x200 /* Enable back-to-back writes */ +#define PCI_COMMAND_INTX_DISABLE 0x400 /* INTx Emulation Disable */ + +#define PCI_STATUS 0x06 /* 16 bits */ +#define PCI_STATUS_CAP_LIST 0x10 /* Support Capability List */ +#define PCI_STATUS_66MHZ 0x20 /* Support 66 Mhz PCI 2.1 bus */ +#define PCI_STATUS_UDF 0x40 /* Support User Definable Features [obsolete] */ +#define PCI_STATUS_FAST_BACK 0x80 /* Accept fast-back to back */ +#define PCI_STATUS_PARITY 0x100 /* Detected parity error */ +#define PCI_STATUS_DEVSEL_MASK 0x600 /* DEVSEL timing */ +#define PCI_STATUS_DEVSEL_FAST 0x000 +#define PCI_STATUS_DEVSEL_MEDIUM 0x200 +#define PCI_STATUS_DEVSEL_SLOW 0x400 +#define PCI_STATUS_SIG_TARGET_ABORT 0x800 /* Set on target abort */ +#define PCI_STATUS_REC_TARGET_ABORT 0x1000 /* Master ack of " */ +#define PCI_STATUS_REC_MASTER_ABORT 0x2000 /* Set on master abort */ +#define PCI_STATUS_SIG_SYSTEM_ERROR 0x4000 /* Set when we drive SERR */ +#define PCI_STATUS_DETECTED_PARITY 0x8000 /* Set on parity error */ + +#define PCI_CLASS_REVISION 0x08 /* High 24 bits are class, low 8 revision */ +#define PCI_REVISION_ID 0x08 /* Revision ID */ +#define PCI_CLASS_PROG 0x09 /* Reg. 
Level Programming Interface */ +#define PCI_CLASS_DEVICE 0x0a /* Device class */ + +#define PCI_CACHE_LINE_SIZE 0x0c /* 8 bits */ +#define PCI_LATENCY_TIMER 0x0d /* 8 bits */ +#define PCI_HEADER_TYPE 0x0e /* 8 bits */ +#define PCI_HEADER_TYPE_NORMAL 0 +#define PCI_HEADER_TYPE_BRIDGE 1 +#define PCI_HEADER_TYPE_CARDBUS 2 + +#define PCI_BIST 0x0f /* 8 bits */ +#define PCI_BIST_CODE_MASK 0x0f /* Return result */ +#define PCI_BIST_START 0x40 /* 1 to start BIST, 2 secs or less */ +#define PCI_BIST_CAPABLE 0x80 /* 1 if BIST capable */ + +/* + * Base addresses specify locations in memory or I/O space. + * Decoded size can be determined by writing a value of + * 0xffffffff to the register, and reading it back. Only + * 1 bits are decoded. + */ +#define PCI_BASE_ADDRESS_0 0x10 /* 32 bits */ +#define PCI_BASE_ADDRESS_1 0x14 /* 32 bits [htype 0,1 only] */ +#define PCI_BASE_ADDRESS_2 0x18 /* 32 bits [htype 0 only] */ +#define PCI_BASE_ADDRESS_3 0x1c /* 32 bits */ +#define PCI_BASE_ADDRESS_4 0x20 /* 32 bits */ +#define PCI_BASE_ADDRESS_5 0x24 /* 32 bits */ +#define PCI_BASE_ADDRESS_SPACE 0x01 /* 0 = memory, 1 = I/O */ +#define PCI_BASE_ADDRESS_SPACE_IO 0x01 +#define PCI_BASE_ADDRESS_SPACE_MEMORY 0x00 +#define PCI_BASE_ADDRESS_MEM_TYPE_MASK 0x06 +#define PCI_BASE_ADDRESS_MEM_TYPE_32 0x00 /* 32 bit address */ +#define PCI_BASE_ADDRESS_MEM_TYPE_1M 0x02 /* Below 1M [obsolete] */ +#define PCI_BASE_ADDRESS_MEM_TYPE_64 0x04 /* 64 bit address */ +#define PCI_BASE_ADDRESS_MEM_PREFETCH 0x08 /* prefetchable? 
*/ +#define PCI_BASE_ADDRESS_MEM_MASK (~0x0fUL) +#define PCI_BASE_ADDRESS_IO_MASK (~0x03UL) +/* bit 1 is reserved if address_space = 1 */ + +/* Header type 0 (normal devices) */ +#define PCI_CARDBUS_CIS 0x28 +#define PCI_SUBSYSTEM_VENDOR_ID 0x2c +#define PCI_SUBSYSTEM_ID 0x2e +#define PCI_ROM_ADDRESS 0x30 /* Bits 31..11 are address, 10..1 reserved */ +#define PCI_ROM_ADDRESS_ENABLE 0x01 +#define PCI_ROM_ADDRESS_MASK (~0x7ffUL) + +#define PCI_CAPABILITY_LIST 0x34 /* Offset of first capability list entry */ + +/* 0x35-0x3b are reserved */ +#define PCI_INTERRUPT_LINE 0x3c /* 8 bits */ +#define PCI_INTERRUPT_PIN 0x3d /* 8 bits */ +#define PCI_MIN_GNT 0x3e /* 8 bits */ +#define PCI_MAX_LAT 0x3f /* 8 bits */ + +/* Header type 1 (PCI-to-PCI bridges) */ +#define PCI_PRIMARY_BUS 0x18 /* Primary bus number */ +#define PCI_SECONDARY_BUS 0x19 /* Secondary bus number */ +#define PCI_SUBORDINATE_BUS 0x1a /* Highest bus number behind the bridge */ +#define PCI_SEC_LATENCY_TIMER 0x1b /* Latency timer for secondary interface */ +#define PCI_IO_BASE 0x1c /* I/O range behind the bridge */ +#define PCI_IO_LIMIT 0x1d +#define PCI_IO_RANGE_TYPE_MASK 0x0fUL /* I/O bridging type */ +#define PCI_IO_RANGE_TYPE_16 0x00 +#define PCI_IO_RANGE_TYPE_32 0x01 +#define PCI_IO_RANGE_MASK (~0x0fUL) +#define PCI_SEC_STATUS 0x1e /* Secondary status register, only bit 14 used */ +#define PCI_MEMORY_BASE 0x20 /* Memory range behind */ +#define PCI_MEMORY_LIMIT 0x22 +#define PCI_MEMORY_RANGE_TYPE_MASK 0x0fUL +#define PCI_MEMORY_RANGE_MASK (~0x0fUL) +#define PCI_PREF_MEMORY_BASE 0x24 /* Prefetchable memory range behind */ +#define PCI_PREF_MEMORY_LIMIT 0x26 +#define PCI_PREF_RANGE_TYPE_MASK 0x0fUL +#define PCI_PREF_RANGE_TYPE_32 0x00 +#define PCI_PREF_RANGE_TYPE_64 0x01 +#define PCI_PREF_RANGE_MASK (~0x0fUL) +#define PCI_PREF_BASE_UPPER32 0x28 /* Upper half of prefetchable memory range */ +#define PCI_PREF_LIMIT_UPPER32 0x2c +#define PCI_IO_BASE_UPPER16 0x30 /* Upper half of I/O addresses */ +#define 
PCI_IO_LIMIT_UPPER16 0x32 +/* 0x34 same as for htype 0 */ +/* 0x35-0x3b is reserved */ +#define PCI_ROM_ADDRESS1 0x38 /* Same as PCI_ROM_ADDRESS, but for htype 1 */ +/* 0x3c-0x3d are same as for htype 0 */ +#define PCI_BRIDGE_CONTROL 0x3e +#define PCI_BRIDGE_CTL_PARITY 0x01 /* Enable parity detection on secondary interface */ +#define PCI_BRIDGE_CTL_SERR 0x02 /* The same for SERR forwarding */ +#define PCI_BRIDGE_CTL_NO_ISA 0x04 /* Disable bridging of ISA ports */ +#define PCI_BRIDGE_CTL_VGA 0x08 /* Forward VGA addresses */ +#define PCI_BRIDGE_CTL_MASTER_ABORT 0x20 /* Report master aborts */ +#define PCI_BRIDGE_CTL_BUS_RESET 0x40 /* Secondary bus reset */ +#define PCI_BRIDGE_CTL_FAST_BACK 0x80 /* Fast Back2Back enabled on secondary interface */ + +/* Header type 2 (CardBus bridges) */ +#define PCI_CB_CAPABILITY_LIST 0x14 +/* 0x15 reserved */ +#define PCI_CB_SEC_STATUS 0x16 /* Secondary status */ +#define PCI_CB_PRIMARY_BUS 0x18 /* PCI bus number */ +#define PCI_CB_CARD_BUS 0x19 /* CardBus bus number */ +#define PCI_CB_SUBORDINATE_BUS 0x1a /* Subordinate bus number */ +#define PCI_CB_LATENCY_TIMER 0x1b /* CardBus latency timer */ +#define PCI_CB_MEMORY_BASE_0 0x1c +#define PCI_CB_MEMORY_LIMIT_0 0x20 +#define PCI_CB_MEMORY_BASE_1 0x24 +#define PCI_CB_MEMORY_LIMIT_1 0x28 +#define PCI_CB_IO_BASE_0 0x2c +#define PCI_CB_IO_BASE_0_HI 0x2e +#define PCI_CB_IO_LIMIT_0 0x30 +#define PCI_CB_IO_LIMIT_0_HI 0x32 +#define PCI_CB_IO_BASE_1 0x34 +#define PCI_CB_IO_BASE_1_HI 0x36 +#define PCI_CB_IO_LIMIT_1 0x38 +#define PCI_CB_IO_LIMIT_1_HI 0x3a +#define PCI_CB_IO_RANGE_MASK (~0x03UL) +/* 0x3c-0x3d are same as for htype 0 */ +#define PCI_CB_BRIDGE_CONTROL 0x3e +#define PCI_CB_BRIDGE_CTL_PARITY 0x01 /* Similar to standard bridge control register */ +#define PCI_CB_BRIDGE_CTL_SERR 0x02 +#define PCI_CB_BRIDGE_CTL_ISA 0x04 +#define PCI_CB_BRIDGE_CTL_VGA 0x08 +#define PCI_CB_BRIDGE_CTL_MASTER_ABORT 0x20 +#define PCI_CB_BRIDGE_CTL_CB_RESET 0x40 /* CardBus reset */ +#define 
PCI_CB_BRIDGE_CTL_16BIT_INT 0x80 /* Enable interrupt for 16-bit cards */ +#define PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100 /* Prefetch enable for both memory regions */ +#define PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200 +#define PCI_CB_BRIDGE_CTL_POST_WRITES 0x400 +#define PCI_CB_SUBSYSTEM_VENDOR_ID 0x40 +#define PCI_CB_SUBSYSTEM_ID 0x42 +#define PCI_CB_LEGACY_MODE_BASE 0x44 /* 16-bit PC Card legacy mode base address (ExCa) */ +/* 0x48-0x7f reserved */ + +/* Capability lists */ + +#define PCI_CAP_LIST_ID 0 /* Capability ID */ +#define PCI_CAP_ID_PM 0x01 /* Power Management */ +#define PCI_CAP_ID_AGP 0x02 /* Accelerated Graphics Port */ +#define PCI_CAP_ID_VPD 0x03 /* Vital Product Data */ +#define PCI_CAP_ID_SLOTID 0x04 /* Slot Identification */ +#define PCI_CAP_ID_MSI 0x05 /* Message Signalled Interrupts */ +#define PCI_CAP_ID_CHSWP 0x06 /* CompactPCI HotSwap */ +#define PCI_CAP_ID_PCIX 0x07 /* PCI-X */ +#define PCI_CAP_ID_HT 0x08 /* HyperTransport */ +#define PCI_CAP_ID_VNDR 0x09 /* Vendor specific capability */ +#define PCI_CAP_ID_SHPC 0x0C /* PCI Standard Hot-Plug Controller */ +#define PCI_CAP_ID_EXP 0x10 /* PCI Express */ +#define PCI_CAP_ID_MSIX 0x11 /* MSI-X */ +#define PCI_CAP_LIST_NEXT 1 /* Next capability in the list */ +#define PCI_CAP_FLAGS 2 /* Capability defined flags (16 bits) */ +#define PCI_CAP_SIZEOF 4 + +/* Power Management Registers */ + +#define PCI_PM_PMC 2 /* PM Capabilities Register */ +#define PCI_PM_CAP_VER_MASK 0x0007 /* Version */ +#define PCI_PM_CAP_PME_CLOCK 0x0008 /* PME clock required */ +#define PCI_PM_CAP_RESERVED 0x0010 /* Reserved field */ +#define PCI_PM_CAP_DSI 0x0020 /* Device specific initialization */ +#define PCI_PM_CAP_AUX_POWER 0x01C0 /* Auxilliary power support mask */ +#define PCI_PM_CAP_D1 0x0200 /* D1 power state support */ +#define PCI_PM_CAP_D2 0x0400 /* D2 power state support */ +#define PCI_PM_CAP_PME 0x0800 /* PME pin supported */ +#define PCI_PM_CAP_PME_MASK 0xF800 /* PME Mask of all supported states */ +#define 
PCI_PM_CAP_PME_D0 0x0800 /* PME# from D0 */ +#define PCI_PM_CAP_PME_D1 0x1000 /* PME# from D1 */ +#define PCI_PM_CAP_PME_D2 0x2000 /* PME# from D2 */ +#define PCI_PM_CAP_PME_D3 0x4000 /* PME# from D3 (hot) */ +#define PCI_PM_CAP_PME_D3cold 0x8000 /* PME# from D3 (cold) */ +#define PCI_PM_CTRL 4 /* PM control and status register */ +#define PCI_PM_CTRL_STATE_MASK 0x0003 /* Current power state (D0 to D3) */ +#define PCI_PM_CTRL_NO_SOFT_RESET 0x0004 /* No reset for D3hot->D0 */ +#define PCI_PM_CTRL_PME_ENABLE 0x0100 /* PME pin enable */ +#define PCI_PM_CTRL_DATA_SEL_MASK 0x1e00 /* Data select (??) */ +#define PCI_PM_CTRL_DATA_SCALE_MASK 0x6000 /* Data scale (??) */ +#define PCI_PM_CTRL_PME_STATUS 0x8000 /* PME pin status */ +#define PCI_PM_PPB_EXTENSIONS 6 /* PPB support extensions (??) */ +#define PCI_PM_PPB_B2_B3 0x40 /* Stop clock when in D3hot (??) */ +#define PCI_PM_BPCC_ENABLE 0x80 /* Bus power/clock control enable (??) */ +#define PCI_PM_DATA_REGISTER 7 /* (??) */ +#define PCI_PM_SIZEOF 8 + +/* AGP registers */ + +#define PCI_AGP_VERSION 2 /* BCD version number */ +#define PCI_AGP_RFU 3 /* Rest of capability flags */ +#define PCI_AGP_STATUS 4 /* Status register */ +#define PCI_AGP_STATUS_RQ_MASK 0xff000000 /* Maximum number of requests - 1 */ +#define PCI_AGP_STATUS_SBA 0x0200 /* Sideband addressing supported */ +#define PCI_AGP_STATUS_64BIT 0x0020 /* 64-bit addressing supported */ +#define PCI_AGP_STATUS_FW 0x0010 /* FW transfers supported */ +#define PCI_AGP_STATUS_RATE4 0x0004 /* 4x transfer rate supported */ +#define PCI_AGP_STATUS_RATE2 0x0002 /* 2x transfer rate supported */ +#define PCI_AGP_STATUS_RATE1 0x0001 /* 1x transfer rate supported */ +#define PCI_AGP_COMMAND 8 /* Control register */ +#define PCI_AGP_COMMAND_RQ_MASK 0xff000000 /* Master: Maximum number of requests */ +#define PCI_AGP_COMMAND_SBA 0x0200 /* Sideband addressing enabled */ +#define PCI_AGP_COMMAND_AGP 0x0100 /* Allow processing of AGP transactions */ +#define PCI_AGP_COMMAND_64BIT 
0x0020 /* Allow processing of 64-bit addresses */ +#define PCI_AGP_COMMAND_FW 0x0010 /* Force FW transfers */ +#define PCI_AGP_COMMAND_RATE4 0x0004 /* Use 4x rate */ +#define PCI_AGP_COMMAND_RATE2 0x0002 /* Use 2x rate */ +#define PCI_AGP_COMMAND_RATE1 0x0001 /* Use 1x rate */ +#define PCI_AGP_SIZEOF 12 + +/* Vital Product Data */ + +#define PCI_VPD_ADDR 2 /* Address to access (15 bits!) */ +#define PCI_VPD_ADDR_MASK 0x7fff /* Address mask */ +#define PCI_VPD_ADDR_F 0x8000 /* Write 0, 1 indicates completion */ +#define PCI_VPD_DATA 4 /* 32-bits of data returned here */ + +/* Slot Identification */ + +#define PCI_SID_ESR 2 /* Expansion Slot Register */ +#define PCI_SID_ESR_NSLOTS 0x1f /* Number of expansion slots available */ +#define PCI_SID_ESR_FIC 0x20 /* First In Chassis Flag */ +#define PCI_SID_CHASSIS_NR 3 /* Chassis Number */ + +/* Message Signalled Interrupts registers */ + +#define PCI_MSI_FLAGS 2 /* Various flags */ +#define PCI_MSI_FLAGS_64BIT 0x80 /* 64-bit addresses allowed */ +#define PCI_MSI_FLAGS_QSIZE 0x70 /* Message queue size configured */ +#define PCI_MSI_FLAGS_QMASK 0x0e /* Maximum queue size available */ +#define PCI_MSI_FLAGS_ENABLE 0x01 /* MSI feature enabled */ +#define PCI_MSI_FLAGS_MASKBIT 0x100 /* 64-bit mask bits allowed */ +#define PCI_MSI_RFU 3 /* Rest of capability flags */ +#define PCI_MSI_ADDRESS_LO 4 /* Lower 32 bits */ +#define PCI_MSI_ADDRESS_HI 8 /* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */ +#define PCI_MSI_DATA_32 8 /* 16 bits of data for 32-bit devices */ +#define PCI_MSI_DATA_64 12 /* 16 bits of data for 64-bit devices */ +#define PCI_MSI_MASK_BIT 16 /* Mask bits register */ + +/* MSI-X registers (these are at offset PCI_MSIX_FLAGS) */ +#define PCI_MSIX_FLAGS 2 +#define PCI_MSIX_FLAGS_QSIZE 0x7FF +#define PCI_MSIX_FLAGS_ENABLE (1 << 15) +#define PCI_MSIX_FLAGS_MASKALL (1 << 14) +#define PCI_MSIX_FLAGS_BIRMASK (7 << 0) +#define PCI_MSIX_FLAGS_BITMASK (1 << 0) + +/* CompactPCI Hotswap Register */ + +#define PCI_CHSWP_CSR 2 
/* Control and Status Register */ +#define PCI_CHSWP_DHA 0x01 /* Device Hiding Arm */ +#define PCI_CHSWP_EIM 0x02 /* ENUM# Signal Mask */ +#define PCI_CHSWP_PIE 0x04 /* Pending Insert or Extract */ +#define PCI_CHSWP_LOO 0x08 /* LED On / Off */ +#define PCI_CHSWP_PI 0x30 /* Programming Interface */ +#define PCI_CHSWP_EXT 0x40 /* ENUM# status - extraction */ +#define PCI_CHSWP_INS 0x80 /* ENUM# status - insertion */ + +/* PCI-X registers */ + +#define PCI_X_CMD 2 /* Modes & Features */ +#define PCI_X_CMD_DPERR_E 0x0001 /* Data Parity Error Recovery Enable */ +#define PCI_X_CMD_ERO 0x0002 /* Enable Relaxed Ordering */ +#define PCI_X_CMD_MAX_READ 0x000c /* Max Memory Read Byte Count */ +#define PCI_X_CMD_MAX_SPLIT 0x0070 /* Max Outstanding Split Transactions */ +#define PCI_X_CMD_VERSION(x) (((x) >> 12) & 3) /* Version */ +#define PCI_X_STATUS 4 /* PCI-X capabilities */ +#define PCI_X_STATUS_DEVFN 0x000000ff /* A copy of devfn */ +#define PCI_X_STATUS_BUS 0x0000ff00 /* A copy of bus nr */ +#define PCI_X_STATUS_64BIT 0x00010000 /* 64-bit device */ +#define PCI_X_STATUS_133MHZ 0x00020000 /* 133 MHz capable */ +#define PCI_X_STATUS_SPL_DISC 0x00040000 /* Split Completion Discarded */ +#define PCI_X_STATUS_UNX_SPL 0x00080000 /* Unexpected Split Completion */ +#define PCI_X_STATUS_COMPLEX 0x00100000 /* Device Complexity */ +#define PCI_X_STATUS_MAX_READ 0x00600000 /* Designed Max Memory Read Count */ +#define PCI_X_STATUS_MAX_SPLIT 0x03800000 /* Designed Max Outstanding Split Transactions */ +#define PCI_X_STATUS_MAX_CUM 0x1c000000 /* Designed Max Cumulative Read Size */ +#define PCI_X_STATUS_SPL_ERR 0x20000000 /* Rcvd Split Completion Error Msg */ +#define PCI_X_STATUS_266MHZ 0x40000000 /* 266 MHz capable */ +#define PCI_X_STATUS_533MHZ 0x80000000 /* 533 MHz capable */ + +/* PCI Express capability registers */ + +#define PCI_EXP_FLAGS 2 /* Capabilities register */ +#define PCI_EXP_FLAGS_VERS 0x000f /* Capability version */ +#define PCI_EXP_FLAGS_TYPE 0x00f0 /* Device/Port 
type */ +#define PCI_EXP_TYPE_ENDPOINT 0x0 /* Express Endpoint */ +#define PCI_EXP_TYPE_LEG_END 0x1 /* Legacy Endpoint */ +#define PCI_EXP_TYPE_ROOT_PORT 0x4 /* Root Port */ +#define PCI_EXP_TYPE_UPSTREAM 0x5 /* Upstream Port */ +#define PCI_EXP_TYPE_DOWNSTREAM 0x6 /* Downstream Port */ +#define PCI_EXP_TYPE_PCI_BRIDGE 0x7 /* PCI/PCI-X Bridge */ +#define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ +#define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ +#define PCI_EXP_DEVCAP 4 /* Device capabilities */ +#define PCI_EXP_DEVCAP_PAYLOAD 0x07 /* Max_Payload_Size */ +#define PCI_EXP_DEVCAP_PHANTOM 0x18 /* Phantom functions */ +#define PCI_EXP_DEVCAP_EXT_TAG 0x20 /* Extended tags */ +#define PCI_EXP_DEVCAP_L0S 0x1c0 /* L0s Acceptable Latency */ +#define PCI_EXP_DEVCAP_L1 0xe00 /* L1 Acceptable Latency */ +#define PCI_EXP_DEVCAP_ATN_BUT 0x1000 /* Attention Button Present */ +#define PCI_EXP_DEVCAP_ATN_IND 0x2000 /* Attention Indicator Present */ +#define PCI_EXP_DEVCAP_PWR_IND 0x4000 /* Power Indicator Present */ +#define PCI_EXP_DEVCAP_PWR_VAL 0x3fc0000 /* Slot Power Limit Value */ +#define PCI_EXP_DEVCAP_PWR_SCL 0xc000000 /* Slot Power Limit Scale */ +#define PCI_EXP_DEVCTL 8 /* Device Control */ +#define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */ +#define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */ +#define PCI_EXP_DEVCTL_FERE 0x0004 /* Fatal Error Reporting Enable */ +#define PCI_EXP_DEVCTL_URRE 0x0008 /* Unsupported Request Reporting En. 
*/ +#define PCI_EXP_DEVCTL_RELAX_EN 0x0010 /* Enable relaxed ordering */ +#define PCI_EXP_DEVCTL_PAYLOAD 0x00e0 /* Max_Payload_Size */ +#define PCI_EXP_DEVCTL_EXT_TAG 0x0100 /* Extended Tag Field Enable */ +#define PCI_EXP_DEVCTL_PHANTOM 0x0200 /* Phantom Functions Enable */ +#define PCI_EXP_DEVCTL_AUX_PME 0x0400 /* Auxiliary Power PM Enable */ +#define PCI_EXP_DEVCTL_NOSNOOP_EN 0x0800 /* Enable No Snoop */ +#define PCI_EXP_DEVCTL_READRQ 0x7000 /* Max_Read_Request_Size */ +#define PCI_EXP_DEVSTA 10 /* Device Status */ +#define PCI_EXP_DEVSTA_CED 0x01 /* Correctable Error Detected */ +#define PCI_EXP_DEVSTA_NFED 0x02 /* Non-Fatal Error Detected */ +#define PCI_EXP_DEVSTA_FED 0x04 /* Fatal Error Detected */ +#define PCI_EXP_DEVSTA_URD 0x08 /* Unsupported Request Detected */ +#define PCI_EXP_DEVSTA_AUXPD 0x10 /* AUX Power Detected */ +#define PCI_EXP_DEVSTA_TRPND 0x20 /* Transactions Pending */ +#define PCI_EXP_LNKCAP 12 /* Link Capabilities */ +#define PCI_EXP_LNKCTL 16 /* Link Control */ +#define PCI_EXP_LNKCTL_CLKREQ_EN 0x100 /* Enable clkreq */ +#define PCI_EXP_LNKSTA 18 /* Link Status */ +#define PCI_EXP_SLTCAP 20 /* Slot Capabilities */ +#define PCI_EXP_SLTCTL 24 /* Slot Control */ +#define PCI_EXP_SLTSTA 26 /* Slot Status */ +#define PCI_EXP_RTCTL 28 /* Root Control */ +#define PCI_EXP_RTCTL_SECEE 0x01 /* System Error on Correctable Error */ +#define PCI_EXP_RTCTL_SENFEE 0x02 /* System Error on Non-Fatal Error */ +#define PCI_EXP_RTCTL_SEFEE 0x04 /* System Error on Fatal Error */ +#define PCI_EXP_RTCTL_PMEIE 0x08 /* PME Interrupt Enable */ +#define PCI_EXP_RTCTL_CRSSVE 0x10 /* CRS Software Visibility Enable */ +#define PCI_EXP_RTCAP 30 /* Root Capabilities */ +#define PCI_EXP_RTSTA 32 /* Root Status */ + +/* Extended Capabilities (PCI-X 2.0 and Express) */ +#define PCI_EXT_CAP_ID(header) (header & 0x0000ffff) +#define PCI_EXT_CAP_VER(header) ((header >> 16) & 0xf) +#define PCI_EXT_CAP_NEXT(header) ((header >> 20) & 0xffc) + +#define PCI_EXT_CAP_ID_ERR 1 
+#define PCI_EXT_CAP_ID_VC 2 +#define PCI_EXT_CAP_ID_DSN 3 +#define PCI_EXT_CAP_ID_PWR 4 + +/* Advanced Error Reporting */ +#define PCI_ERR_UNCOR_STATUS 4 /* Uncorrectable Error Status */ +#define PCI_ERR_UNC_TRAIN 0x00000001 /* Training */ +#define PCI_ERR_UNC_DLP 0x00000010 /* Data Link Protocol */ +#define PCI_ERR_UNC_POISON_TLP 0x00001000 /* Poisoned TLP */ +#define PCI_ERR_UNC_FCP 0x00002000 /* Flow Control Protocol */ +#define PCI_ERR_UNC_COMP_TIME 0x00004000 /* Completion Timeout */ +#define PCI_ERR_UNC_COMP_ABORT 0x00008000 /* Completer Abort */ +#define PCI_ERR_UNC_UNX_COMP 0x00010000 /* Unexpected Completion */ +#define PCI_ERR_UNC_RX_OVER 0x00020000 /* Receiver Overflow */ +#define PCI_ERR_UNC_MALF_TLP 0x00040000 /* Malformed TLP */ +#define PCI_ERR_UNC_ECRC 0x00080000 /* ECRC Error Status */ +#define PCI_ERR_UNC_UNSUP 0x00100000 /* Unsupported Request */ +#define PCI_ERR_UNCOR_MASK 8 /* Uncorrectable Error Mask */ + /* Same bits as above */ +#define PCI_ERR_UNCOR_SEVER 12 /* Uncorrectable Error Severity */ + /* Same bits as above */ +#define PCI_ERR_COR_STATUS 16 /* Correctable Error Status */ +#define PCI_ERR_COR_RCVR 0x00000001 /* Receiver Error Status */ +#define PCI_ERR_COR_BAD_TLP 0x00000040 /* Bad TLP Status */ +#define PCI_ERR_COR_BAD_DLLP 0x00000080 /* Bad DLLP Status */ +#define PCI_ERR_COR_REP_ROLL 0x00000100 /* REPLAY_NUM Rollover */ +#define PCI_ERR_COR_REP_TIMER 0x00001000 /* Replay Timer Timeout */ +#define PCI_ERR_COR_MASK 20 /* Correctable Error Mask */ + /* Same bits as above */ +#define PCI_ERR_CAP 24 /* Advanced Error Capabilities */ +#define PCI_ERR_CAP_FEP(x) ((x) & 31) /* First Error Pointer */ +#define PCI_ERR_CAP_ECRC_GENC 0x00000020 /* ECRC Generation Capable */ +#define PCI_ERR_CAP_ECRC_GENE 0x00000040 /* ECRC Generation Enable */ +#define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ +#define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ +#define PCI_ERR_HEADER_LOG 28 /* Header Log Register (16 bytes) */ 
+#define PCI_ERR_ROOT_COMMAND 44 /* Root Error Command */ +/* Correctable Err Reporting Enable */ +#define PCI_ERR_ROOT_CMD_COR_EN 0x00000001 +/* Non-fatal Err Reporting Enable */ +#define PCI_ERR_ROOT_CMD_NONFATAL_EN 0x00000002 +/* Fatal Err Reporting Enable */ +#define PCI_ERR_ROOT_CMD_FATAL_EN 0x00000004 +#define PCI_ERR_ROOT_STATUS 48 +#define PCI_ERR_ROOT_COR_RCV 0x00000001 /* ERR_COR Received */ +/* Multi ERR_COR Received */ +#define PCI_ERR_ROOT_MULTI_COR_RCV 0x00000002 +/* ERR_FATAL/NONFATAL Recevied */ +#define PCI_ERR_ROOT_UNCOR_RCV 0x00000004 +/* Multi ERR_FATAL/NONFATAL Recevied */ +#define PCI_ERR_ROOT_MULTI_UNCOR_RCV 0x00000008 +#define PCI_ERR_ROOT_FIRST_FATAL 0x00000010 /* First Fatal */ +#define PCI_ERR_ROOT_NONFATAL_RCV 0x00000020 /* Non-Fatal Received */ +#define PCI_ERR_ROOT_FATAL_RCV 0x00000040 /* Fatal Received */ +#define PCI_ERR_ROOT_COR_SRC 52 +#define PCI_ERR_ROOT_SRC 54 + +/* Virtual Channel */ +#define PCI_VC_PORT_REG1 4 +#define PCI_VC_PORT_REG2 8 +#define PCI_VC_PORT_CTRL 12 +#define PCI_VC_PORT_STATUS 14 +#define PCI_VC_RES_CAP 16 +#define PCI_VC_RES_CTRL 20 +#define PCI_VC_RES_STATUS 26 + +/* Power Budgeting */ +#define PCI_PWR_DSR 4 /* Data Select Register */ +#define PCI_PWR_DATA 8 /* Data Register */ +#define PCI_PWR_DATA_BASE(x) ((x) & 0xff) /* Base Power */ +#define PCI_PWR_DATA_SCALE(x) (((x) >> 8) & 3) /* Data Scale */ +#define PCI_PWR_DATA_PM_SUB(x) (((x) >> 10) & 7) /* PM Sub State */ +#define PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */ +#define PCI_PWR_DATA_TYPE(x) (((x) >> 15) & 7) /* Type */ +#define PCI_PWR_DATA_RAIL(x) (((x) >> 18) & 7) /* Power Rail */ +#define PCI_PWR_CAP 12 /* Capability */ +#define PCI_PWR_CAP_BUDGET(x) ((x) & 1) /* Included in system budget */ + +/* + * Hypertransport sub capability types + * + * Unfortunately there are both 3 bit and 5 bit capability types defined + * in the HT spec, catering for that is a little messy. 
You probably don't + * want to use these directly, just use pci_find_ht_capability() and it + * will do the right thing for you. + */ +#define HT_3BIT_CAP_MASK 0xE0 +#define HT_CAPTYPE_SLAVE 0x00 /* Slave/Primary link configuration */ +#define HT_CAPTYPE_HOST 0x20 /* Host/Secondary link configuration */ + +#define HT_5BIT_CAP_MASK 0xF8 +#define HT_CAPTYPE_IRQ 0x80 /* IRQ Configuration */ +#define HT_CAPTYPE_REMAPPING_40 0xA0 /* 40 bit address remapping */ +#define HT_CAPTYPE_REMAPPING_64 0xA2 /* 64 bit address remapping */ +#define HT_CAPTYPE_UNITID_CLUMP 0x90 /* Unit ID clumping */ +#define HT_CAPTYPE_EXTCONF 0x98 /* Extended Configuration Space Access */ +#define HT_CAPTYPE_MSI_MAPPING 0xA8 /* MSI Mapping Capability */ +#define HT_MSI_FLAGS 0x02 /* Offset to flags */ +#define HT_MSI_FLAGS_ENABLE 0x1 /* Mapping enable */ +#define HT_MSI_FLAGS_FIXED 0x2 /* Fixed mapping only */ +#define HT_MSI_FIXED_ADDR 0x00000000FEE00000ULL /* Fixed addr */ +#define HT_MSI_ADDR_LO 0x04 /* Offset to low addr bits */ +#define HT_MSI_ADDR_LO_MASK 0xFFF00000 /* Low address bit mask */ +#define HT_MSI_ADDR_HI 0x08 /* Offset to high addr bits */ +#define HT_CAPTYPE_DIRECT_ROUTE 0xB0 /* Direct routing configuration */ +#define HT_CAPTYPE_VCSET 0xB8 /* Virtual Channel configuration */ +#define HT_CAPTYPE_ERROR_RETRY 0xC0 /* Retry on error configuration */ +#define HT_CAPTYPE_GEN3 0xD0 /* Generation 3 hypertransport configuration */ +#define HT_CAPTYPE_PM 0xE0 /* Hypertransport powermanagement configuration */ + + +#endif /* LINUX_PCI_REGS_H */ diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/emulate.c --- a/xen/arch/x86/hvm/svm/emulate.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/hvm/svm/emulate.c Thu Sep 27 12:22:16 2007 -0600 @@ -59,8 +59,8 @@ extern int inst_copy_from_guest(unsigned #define DECODE_SIB_BASE(prefix, sib) DECODE_MODRM_RM(prefix, sib) -static inline unsigned long DECODE_GPR_VALUE(struct vmcb_struct *vmcb, - struct cpu_user_regs *regs, u8 gpr_rm) +static 
inline unsigned long DECODE_GPR_VALUE( + struct cpu_user_regs *regs, u8 gpr_rm) { unsigned long value; switch (gpr_rm) @@ -78,7 +78,7 @@ static inline unsigned long DECODE_GPR_V value = regs->ebx; break; case 0x4: - value = (unsigned long)vmcb->rsp; + value = regs->esp; case 0x5: value = regs->ebp; break; @@ -172,7 +172,7 @@ unsigned long get_effective_addr_modrm64 } else { - effective_addr = DECODE_GPR_VALUE(vmcb, regs, modrm_rm); + effective_addr = DECODE_GPR_VALUE(regs, modrm_rm); } break; @@ -202,12 +202,12 @@ unsigned long get_effective_addr_modrm64 #if __x86_64__ /* 64-bit mode */ if (vmcb->cs.attr.fields.l && hvm_long_mode_enabled(v)) - return vmcb->rip + inst_len + *size + disp; + return regs->eip + inst_len + *size + disp; #endif return disp; default: - effective_addr = DECODE_GPR_VALUE(vmcb, regs, modrm_rm); + effective_addr = DECODE_GPR_VALUE(regs, modrm_rm); } @@ -251,7 +251,7 @@ unsigned long get_effective_addr_sib(str sib_idx = DECODE_SIB_INDEX(prefix, sib); sib_base = DECODE_SIB_BASE(prefix, sib); - base = DECODE_GPR_VALUE(vmcb, regs, sib_base); + base = DECODE_GPR_VALUE(regs, sib_base); if ((unsigned long)-1 == base) { @@ -293,7 +293,7 @@ unsigned long get_effective_addr_sib(str if (4 == sib_idx) return base; - effective_addr = DECODE_GPR_VALUE(vmcb, regs, sib_idx); + effective_addr = DECODE_GPR_VALUE(regs, sib_idx); effective_addr <<= sib_scale; @@ -326,7 +326,8 @@ unsigned long svm_rip2pointer(struct vcp * no matter what kind of addressing is used. */ struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - unsigned long p = vmcb->cs.base + vmcb->rip; + unsigned long p = vmcb->cs.base + guest_cpu_user_regs()->eip; + ASSERT(v == current); if (!(vmcb->cs.attr.fields.l && hvm_long_mode_enabled(v))) return (u32)p; /* mask to 32 bits */ /* NB. Should mask to 16 bits if in real mode or 16-bit protected mode. 
*/ diff -r eae7b887e5ac -r ee498c9af856 xen/arch/x86/hvm/svm/svm.c --- a/xen/arch/x86/hvm/svm/svm.c Thu Sep 27 09:16:23 2007 -0600 +++ b/xen/arch/x86/hvm/svm/svm.c Thu Sep 27 12:22:16 2007 -0600 @@ -72,6 +72,14 @@ static void *root_vmcb[NR_CPUS] __read_m /* hardware assisted paging bits */ extern int opt_hap_enabled; +static void inline __update_guest_eip( + struct cpu_user_regs *regs, int inst_len) +{ + ASSERT(inst_len > 0); + regs->eip += inst_len; + regs->eflags &= ~X86_EFLAGS_RF; +} + static void svm_inject_exception( struct vcpu *v, int trap, int ev, int error_code) { @@ -106,30 +114,6 @@ static int svm_lme_is_set(struct vcpu *v #else return 0; #endif -} - -static void svm_store_cpu_guest_regs( - struct vcpu *v, struct cpu_user_regs *regs, unsigned long *crs) -{ - struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - - if ( regs != NULL ) - { - regs->ss = vmcb->ss.sel; - regs->esp = vmcb->rsp; - regs->eflags = vmcb->rflags; - regs->cs = vmcb->cs.sel; - regs->eip = vmcb->rip; - } - - if ( crs != NULL ) - { - /* Returning the guest's regs */ - crs[0] = v->arch.hvm_vcpu.guest_cr[0]; - crs[2] = v->arch.hvm_vcpu.guest_cr[2]; - crs[3] = v->arch.hvm_vcpu.guest_cr[3]; - crs[4] = v->arch.hvm_vcpu.guest_cr[4]; - } } static enum handler_return long_mode_do_msr_write(struct cpu_user_regs *regs) @@ -247,29 +231,10 @@ int svm_vmcb_save(struct vcpu *v, struct { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - c->rip = vmcb->rip; - -#ifdef HVM_DEBUG_SUSPEND - printk("%s: eip=0x%"PRIx64".\n", - __func__, - inst_len, c->eip); -#endif - - c->rsp = vmcb->rsp; - c->rflags = vmcb->rflags; - c->cr0 = v->arch.hvm_vcpu.guest_cr[0]; c->cr2 = v->arch.hvm_vcpu.guest_cr[2]; c->cr3 = v->arch.hvm_vcpu.guest_cr[3]; c->cr4 = v->arch.hvm_vcpu.guest_cr[4]; - -#ifdef HVM_DEBUG_SUSPEND - printk("%s: cr3=0x%"PRIx64", cr0=0x%"PRIx64", cr4=0x%"PRIx64".\n", - __func__, - c->cr3, - c->cr0, - c->cr4); -#endif c->idtr_limit = vmcb->idtr.limit; c->idtr_base = vmcb->idtr.base; @@ -368,10 +333,6 @@ int 
svm_vmcb_restore(struct vcpu *v, str v->arch.guest_table = pagetable_from_pfn(mfn); } - - vmcb->rip = c->rip; - vmcb->rsp = c->rsp; - vmcb->rflags = c->rflags; v->arch.hvm_vcpu.guest_cr[0] = c->cr0 | X86_CR0_ET; v->arch.hvm_vcpu.guest_cr[2] = c->cr2; @@ -532,7 +493,8 @@ static int svm_interrupts_enabled(struct return !vmcb->interrupt_shadow; ASSERT((type == hvm_intack_pic) || (type == hvm_intack_lapic)); - return !irq_masked(vmcb->rflags) && !vmcb->interrupt_shadow; + return (!irq_masked(guest_cpu_user_regs()->eflags) && + !vmcb->interrupt_shadow); } static int svm_guest_x86_mode(struct vcpu *v) @@ -541,7 +503,7 @@ static int svm_guest_x86_mode(struct vcp if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) ) return 0; - if ( unlikely(vmcb->rflags & X86_EFLAGS_VM) ) + if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) ) return 1; if ( hvm_long_mode_enabled(v) && likely(vmcb->cs.attr.fields.l) ) return 8; @@ -618,9 +580,7 @@ static void svm_sync_vmcb(struct vcpu *v arch_svm->vmcb_in_sync = 1; - asm volatile ( - ".byte 0x0f,0x01,0xdb" /* vmsave */ - : : "a" (__pa(arch_svm->vmcb)) ); + svm_vmsave(arch_svm->vmcb); } static unsigned long svm_get_segment_base(struct vcpu *v, enum x86_segment seg) @@ -649,6 +609,9 @@ static void svm_get_segment_register(str struct segment_register *reg) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + ASSERT(v == current); + switch ( seg ) { case x86_seg_cs: @@ -685,7 +648,61 @@ static void svm_get_segment_register(str svm_sync_vmcb(v); memcpy(reg, &vmcb->ldtr, sizeof(*reg)); break; - default: BUG(); + default: + BUG(); + } +} + +static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg, + struct segment_register *reg) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + ASSERT(v == current); + + switch ( seg ) + { + case x86_seg_cs: + memcpy(&vmcb->cs, reg, sizeof(*reg)); + break; + case x86_seg_ds: + memcpy(&vmcb->ds, reg, sizeof(*reg)); + break; + case x86_seg_es: + memcpy(&vmcb->es, reg, 
sizeof(*reg)); + break; + case x86_seg_fs: + svm_sync_vmcb(v); + memcpy(&vmcb->fs, reg, sizeof(*reg)); + svm_vmload(vmcb); + break; + case x86_seg_gs: + svm_sync_vmcb(v); + memcpy(&vmcb->gs, reg, sizeof(*reg)); + svm_vmload(vmcb); + break; + case x86_seg_ss: + memcpy(&vmcb->ss, reg, sizeof(*reg)); + vmcb->cpl = vmcb->ss.attr.fields.dpl; + break; + case x86_seg_tr: + svm_sync_vmcb(v); + memcpy(&vmcb->tr, reg, sizeof(*reg)); + svm_vmload(vmcb); + break; + case x86_seg_gdtr: + memcpy(&vmcb->gdtr, reg, sizeof(*reg)); + break; + case x86_seg_idtr: + memcpy(&vmcb->idtr, reg, sizeof(*reg)); + break; + case x86_seg_ldtr: + svm_sync_vmcb(v); + memcpy(&vmcb->ldtr, reg, sizeof(*reg)); + svm_vmload(vmcb); + break; + default: + BUG(); } } @@ -744,7 +761,6 @@ static void svm_init_ap_context( */ svm_reset_to_realmode(v, regs); /* Adjust the vmcb's hidden register state. */ - vmcb->rip = 0; vmcb->cs.sel = cs_sel; vmcb->cs.base = (cs_sel << 4); } @@ -769,17 +785,6 @@ static void svm_init_hypercall_page(stru *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */ } -static void svm_load_cpu_guest_regs(struct vcpu *v, struct cpu_user_regs *regs) -{ - struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - - vmcb->ss.sel = regs->ss; - vmcb->rsp = regs->esp; - vmcb->rflags = regs->eflags | 2UL; - vmcb->cs.sel = regs->cs; - vmcb->rip = regs->eip; -} - static void svm_ctxt_switch_from(struct vcpu *v) { int cpu = smp_processor_id(); @@ -787,10 +792,7 @@ static void svm_ctxt_switch_from(struct svm_save_dr(v); svm_sync_vmcb(v); - - asm volatile ( - ".byte 0x0f,0x01,0xda" /* vmload */ - : : "a" (__pa(root_vmcb[cpu])) ); + svm_vmload(root_vmcb[cpu]); #ifdef __x86_64__ /* Resume use of ISTs now that the host TR is reinstated. 
*/ @@ -826,12 +828,8 @@ static void svm_ctxt_switch_to(struct vc svm_restore_dr(v); - asm volatile ( - ".byte 0x0f,0x01,0xdb" /* vmsave */ - : : "a" (__pa(root_vmcb[cpu])) ); - asm volatile ( - ".byte 0x0f,0x01,0xda" /* vmload */ - : : "a" (__pa(v->arch.hvm_svm.vmcb)) ); + svm_vmsave(root_vmcb[cpu]); + svm_vmload(v->arch.hvm_svm.vmcb); } static void svm_do_resume(struct vcpu *v) @@ -918,14 +916,13 @@ static struct hvm_function_table svm_fun .domain_destroy = svm_domain_destroy, .vcpu_initialise = svm_vcpu_initialise, .vcpu_destroy = svm_vcpu_destroy, - .store_cpu_guest_regs = svm_store_cpu_guest_regs, - .load_cpu_guest_regs = svm_load_cpu_guest_regs, .save_cpu_ctxt = svm_save_vmcb_ctxt, .load_cpu_ctxt = svm_load_vmcb_ctxt, .interrupts_enabled = svm_interrupts_enabled, .guest_x86_mode = svm_guest_x86_mode, .get_segment_base = svm_get_segment_base, .get_segment_register = svm_get_segment_register, + .set_segment_register = svm_set_segment_register, .update_host_cr3 = svm_update_host_cr3, .update_guest_cr = svm_update_guest_cr, .update_guest_efer = svm_update_guest_efer, @@ -1111,7 +1108,7 @@ static void svm_vmexit_do_cpuid(struct v inst_len = __get_instruction_length(v, INSTR_CPUID, NULL); ASSERT(inst_len > 0); - __update_guest_eip(vmcb, inst_len); + __update_guest_eip(regs, inst_len); } static unsigned long *get_reg_p( @@ -1143,7 +1140,7 @@ static unsigned long *get_reg_p( reg_p = (unsigned long *)®s->ebp; break; case SVM_REG_ESP: - reg_p = (unsigned long *)&vmcb->rsp; + reg_p = (unsigned long *)®s->esp; break; #ifdef __x86_64__ case SVM_REG_R8: @@ -1315,7 +1312,7 @@ static int svm_get_io_address( * than one byte (+ maybe rep-prefix), we have some prefix so we need * to figure out what it is... */ - isize = vmcb->exitinfo2 - vmcb->rip; + isize = vmcb->exitinfo2 - regs->eip; if (info.fields.rep) isize --; @@ -1468,7 +1465,6 @@ static void svm_io_instruction(struct vc /* Copy current guest state into io instruction state structure. 
*/ memcpy(regs, guest_cpu_user_regs(), HVM_CONTEXT_STACK_BYTES); - svm_store_cpu_guest_regs(v, regs, NULL); info.bytes = vmcb->exitinfo1; @@ -1486,12 +1482,12 @@ static void svm_io_instruction(struct vc if (dir==IOREQ_READ) HVMTRACE_2D(IO_READ, v, port, size); else - HVMTRACE_2D(IO_WRITE, v, port, size); + HVMTRACE_3D(IO_WRITE, v, port, size, regs->eax); HVM_DBG_LOG(DBG_LEVEL_IO, "svm_io_instruction: port 0x%x eip=%x:%"PRIx64", " "exit_qualification = %"PRIx64, - port, vmcb->cs.sel, vmcb->rip, info.bytes); + port, vmcb->cs.sel, (uint64_t)regs->eip, info.bytes); /* string instruction */ if (info.fields.str) @@ -1742,7 +1738,7 @@ static void svm_cr_access( if (index > 0 && (buffer[index-1] & 0xF0) == 0x40) prefix = buffer[index-1]; - HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip); + HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long)regs->eip); switch ( match ) @@ -1763,6 +1759,7 @@ static void svm_cr_access( vmcb->exception_intercepts &= ~(1U << TRAP_no_device); vmcb->cr0 &= ~X86_CR0_TS; /* clear TS */ v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS; /* clear TS */ + HVMTRACE_0D(CLTS, current); break; case INSTR_LMSW: @@ -1770,6 +1767,7 @@ static void svm_cr_access( value = get_reg(gpreg, regs, vmcb) & 0xF; value = (v->arch.hvm_vcpu.guest_cr[0] & ~0xF) | value; result = svm_set_cr0(value); + HVMTRACE_1D(LMSW, current, value); break; case INSTR_SMSW: @@ -1837,7 +1835,7 @@ static void svm_cr_access( ASSERT(inst_len); if ( result ) - __update_guest_eip(vmcb, inst_len); + __update_guest_eip(regs, inst_len); } static void svm_do_msr_access( @@ -1916,7 +1914,7 @@ static void svm_do_msr_access( regs->edx = msr_content >> 32; done: - HVMTRACE_2D(MSR_READ, v, ecx, msr_content); + hvmtrace_msr_read(v, ecx, msr_content); HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx", ecx, (unsigned long)regs->eax, (unsigned long)regs->edx); @@ -1926,7 +1924,7 @@ static void svm_do_msr_access( { msr_content = (u32)regs->eax | ((u64)regs->edx << 32); - 
HVMTRACE_2D(MSR_WRITE, v, ecx, msr_content); + hvmtrace_msr_write(v, ecx, msr_content); switch (ecx) { @@ -1960,14 +1958,15 @@ static void svm_do_msr_access( inst_len = __get_instruction_length(v, INSTR_WRMSR, NULL); } - __update_guest_eip(vmcb, inst_len); -} - -static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb) + __update_guest_eip(regs, inst_len); +} + +static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb, + struct cpu_user_regs *regs) { enum hvm_intack type = hvm_vcpu_has_pending_irq(current); - __update_guest_eip(vmcb, 1); + __update_guest_eip(regs, 1); /* Check for interrupt not handled or new interrupt. */ if ( vmcb->eventinj.fields.v || @@ -1978,13 +1977,12 @@ static void svm_vmexit_do_hlt(struct vmc } HVMTRACE_1D(HLT, current, /*int pending=*/ 0); - hvm_hlt(vmcb->rflags); -} - -static void svm_vmexit_do_invd(struct vcpu *v) -{ - struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - int inst_len; + hvm_hlt(regs->eflags); +} + +static void svm_vmexit_do_invd(struct cpu_user_regs *regs) +{ + int inst_len; /* Invalidate the cache - we can't really do that safely - maybe we should * WBINVD, but I think it's just fine to completely ignore it - we should @@ -1996,8 +1994,8 @@ static void svm_vmexit_do_invd(struct vc */ gdprintk(XENLOG_WARNING, "INVD instruction intercepted - ignored\n"); - inst_len = __get_instruction_length(v, INSTR_INVD, NULL); - __update_guest_eip(vmcb, inst_len); + inst_len = __get_instruction_length(current, INSTR_INVD, NULL); + __update_guest_eip(regs, inst_len); } void svm_handle_invlpg(const short invlpga, struct cpu_user_regs *regs) @@ -2006,7 +2004,6 @@ void svm_handle_invlpg(const short invlp u8 opcode[MAX_INST_LEN], prefix, length = MAX_INST_LEN; unsigned long g_vaddr; int inst_len; - struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; /* * Unknown how many bytes the invlpg instruction will take. 
Use the @@ -2023,7 +2020,7 @@ void svm_handle_invlpg(const short invlp { inst_len = __get_instruction_length(v, INSTR_INVLPGA, opcode); ASSERT(inst_len > 0); - __update_guest_eip(vmcb, inst_len); + __update_guest_eip(regs, inst_len); /* * The address is implicit on this instruction. At the moment, we don't @@ -2050,7 +2047,7 @@ void svm_handle_invlpg(const short invlp &opcode[inst_len], &length); inst_len += length; - __update_guest_eip (vmcb, inst_len); + __update_guest_eip(regs, inst_len); } HVMTRACE_3D(INVLPG, v, (invlpga?1:0), g_vaddr, (invlpga?regs->ecx:0)); @@ -2073,6 +2070,8 @@ static int svm_reset_to_realmode(struct memset(regs, 0, sizeof(struct cpu_user_regs)); + regs->eflags = 2; + v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET; svm_update_guest_cr(v, 0); @@ -2085,7 +2084,7 @@ static int svm_reset_to_realmode(struct vmcb->efer = EFER_SVME; /* This will jump to ROMBIOS */ - vmcb->rip = 0xFFF0; + regs->eip = 0xFFF0; /* Set up the segment registers and all their hidden states. */ vmcb->cs.sel = 0xF000; @@ -2138,16 +2137,12 @@ static int svm_reset_to_realmode(struct vmcb->idtr.limit = 0x3ff; vmcb->idtr.base = 0x00; - vmcb->rax = 0; - vmcb->rsp = 0; - return 0; } asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs) { unsigned int exit_reason; - unsigned long eip; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; eventinj_t eventinj; @@ -2165,7 +2160,7 @@ asmlinkage void svm_vmexit_handler(struc exit_reason = vmcb->exitcode; - HVMTRACE_2D(VMEXIT, v, vmcb->rip, exit_reason); + hvmtrace_vmexit(v, regs->eip, exit_reason); if ( unlikely(exit_reason == VMEXIT_INVALID) ) { @@ -2174,7 +2169,6 @@ asmlinkage void svm_vmexit_handler(struc } perfc_incra(svmexits, exit_reason); - eip = vmcb->rip; /* Event delivery caused this intercept? Queue for redelivery. */ eventinj = vmcb->exitintinfo; @@ -2211,7 +2205,7 @@ asmlinkage void svm_vmexit_handler(struc goto exit_and_crash; /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. 
*/ inst_len = __get_instruction_length(v, INSTR_INT3, NULL); - __update_guest_eip(vmcb, inst_len); + __update_guest_eip(regs, inst_len); domain_pause_for_debugger(); break; @@ -2242,7 +2236,6 @@ asmlinkage void svm_vmexit_handler(struc case VMEXIT_EXCEPTION_MC: HVMTRACE_0D(MCE, v); - svm_store_cpu_guest_regs(v, regs, NULL); do_machine_check(regs); break; _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |