[Xen-changelog] [xen-unstable] merge with xen-unstable.hg
# HG changeset patch # User Isaku Yamahata <yamahata@xxxxxxxxxxxxx> # Date 1214965837 -32400 # Node ID 08f77df14cba8e2dfe580779bb9ca2f64e1ae0ae # Parent 11318234588e61b45df5a06fe6a29264854ba22a # Parent 19970181d6a46aee1199857b6d3c6bedc7507121 merge with xen-unstable.hg --- docs/ChangeLog | 9 extras/mini-os/arch/x86/mm.c | 11 extras/mini-os/blkfront.c | 1 extras/mini-os/fbfront.c | 2 extras/mini-os/fs-front.c | 10 extras/mini-os/lib/sys.c | 2 extras/mini-os/netfront.c | 6 stubdom/grub.patches/99minios | 10 stubdom/grub/Makefile | 2 tools/blktap/drivers/Makefile | 10 tools/blktap/drivers/blktapctrl.c | 2 tools/blktap/drivers/block-qcow.c | 35 + tools/blktap/drivers/block-qcow2.c | 5 tools/blktap/drivers/check_gcrypt | 14 tools/blktap/lib/blktaplib.h | 2 tools/debugger/xenitp/xenitp.c | 24 tools/examples/xend-config.sxp | 3 tools/firmware/hvmloader/hvmloader.c | 10 tools/firmware/rombios/rombios.c | 35 - tools/ioemu/hw/xen_console.c | 8 tools/ioemu/target-i386-dm/exec-dm.c | 17 tools/ioemu/xenstore.c | 11 tools/libxc/ia64/xc_ia64_hvm_build.c | 7 tools/libxc/ia64/xc_ia64_linux_restore.c | 24 tools/libxc/ia64/xc_ia64_linux_save.c | 19 tools/libxc/xc_core.c | 8 tools/libxc/xc_core_ia64.c | 3 tools/libxc/xc_core_ia64.h | 2 tools/libxc/xc_domain.c | 65 -- tools/libxc/xc_domain_restore.c | 12 tools/libxc/xc_domain_save.c | 20 tools/libxc/xc_misc.c | 28 tools/libxc/xc_pagetab.c | 4 tools/libxc/xc_private.h | 4 tools/libxc/xc_ptrace.c | 34 - tools/libxc/xc_ptrace_core.c | 8 tools/libxc/xc_resume.c | 10 tools/libxc/xenctrl.h | 44 + tools/libxc/xg_save_restore.h | 22 tools/python/xen/util/blkif.py | 41 - tools/python/xen/xend/XendConfig.py | 2 tools/python/xen/xend/XendOptions.py | 7 tools/python/xen/xend/image.py | 20 tools/python/xen/xend/server/blkif.py | 6 tools/python/xen/xm/main.py | 3 tools/tests/test_x86_emulator.c | 9 tools/xenballoon/xenballoon-monitor | 43 + tools/xenballoon/xenballoon.conf | 91 +++ tools/xenballoon/xenballoond | 205 ++++++ tools/xenballoon/xenballoond.README | 82 ++ tools/xenballoon/xenballoond.init | 91 +++ tools/xentrace/xenctx.c | 8 tools/xm-test/lib/XmTestLib/block_utils.py | 2 xen/arch/ia64/vmx/vmx_hypercall.c | 47 + xen/arch/ia64/xen/mm.c | 6 xen/arch/x86/acpi/cpufreq/Makefile | 1 xen/arch/x86/acpi/cpufreq/cpufreq.c | 139 +++- xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c | 14 xen/arch/x86/acpi/cpufreq/powernow.c | 305 ++++++++++ xen/arch/x86/acpi/cpufreq/utility.c | 103 +++ xen/arch/x86/acpi/pmstat.c | 7 xen/arch/x86/acpi/power.c | 25 xen/arch/x86/hvm/emulate.c | 113 +-- xen/arch/x86/hvm/hvm.c | 60 + xen/arch/x86/hvm/vmx/vmcs.c | 100 +-- xen/arch/x86/hvm/vmx/vmx.c | 11 xen/arch/x86/hvm/vmx/vpmu_core2.c | 20 xen/arch/x86/mm.c | 45 + xen/arch/x86/mm/shadow/common.c | 811 ++++++++++++++++++++++++++- xen/arch/x86/mm/shadow/multi.c | 559 +++++++++++++++++- xen/arch/x86/mm/shadow/multi.h | 14 xen/arch/x86/mm/shadow/private.h | 130 ++++ xen/arch/x86/mm/shadow/types.h | 5 xen/arch/x86/platform_hypercall.c | 7 xen/arch/x86/x86_emulate/x86_emulate.c | 700 ++++++++++++++++++----- xen/arch/x86/x86_emulate/x86_emulate.h | 37 - xen/common/domain.c | 259 ++++---- xen/drivers/passthrough/vtd/dmar.c | 3 xen/drivers/passthrough/vtd/dmar.h | 16 xen/drivers/passthrough/vtd/intremap.c | 7 xen/drivers/passthrough/vtd/iommu.c | 16 xen/drivers/passthrough/vtd/qinval.c | 16 xen/drivers/passthrough/vtd/utils.c | 2 xen/include/acpi/cpufreq/cpufreq.h | 3 xen/include/acpi/cpufreq/processor_perf.h | 13 xen/include/asm-x86/domain.h | 14 xen/include/asm-x86/hvm/vmx/vmcs.h | 8 
xen/include/asm-x86/mm.h | 8 xen/include/asm-x86/perfc_defn.h | 15 xen/include/public/hvm/hvm_op.h | 13 xen/include/xen/domain.h | 3 xen/include/xen/sched.h | 12 92 files changed, 3996 insertions(+), 824 deletions(-) diff -r 11318234588e -r 08f77df14cba docs/ChangeLog --- a/docs/ChangeLog Thu Jun 19 12:48:04 2008 +0900 +++ b/docs/ChangeLog Wed Jul 02 11:30:37 2008 +0900 @@ -16,6 +16,15 @@ Xen 3.3 release Xen 3.3 release --------------- +17903: Add greater than 16 xvd device availability +http://xenbits.xensource.com/xen-unstable.hg?rev/0728459b3c8d + +The tools can now attach a disk of the form: +(1<<28) | (device<<8) | partition +to support many more xvd disks and up to 256 partitions. +The linux guest frontend has been expanded to support +this new construct, while legacy guests should just ignore it. + 17538: Add XENPF_set_processor_pminfo http://xenbits.xensource.com/xen-unstable.hg?rev/5bb9093eb0e9 diff -r 11318234588e -r 08f77df14cba extras/mini-os/arch/x86/mm.c --- a/extras/mini-os/arch/x86/mm.c Thu Jun 19 12:48:04 2008 +0900 +++ b/extras/mini-os/arch/x86/mm.c Wed Jul 02 11:30:37 2008 +0900 @@ -528,18 +528,13 @@ void *map_frames_ex(unsigned long *f, un static void clear_bootstrap(void) { - xen_pfn_t mfns[] = { virt_to_mfn(&shared_info) }; - int n = sizeof(mfns)/sizeof(*mfns); pte_t nullpte = { }; /* Use first page as the CoW zero page */ memset(&_text, 0, PAGE_SIZE); - mfn_zero = pfn_to_mfn((unsigned long) &_text); - if (HYPERVISOR_update_va_mapping((unsigned long) &_text, nullpte, UVMF_INVLPG)) - printk("Unable to unmap first page\n"); - - if (free_physical_pages(mfns, n) != n) - printk("Unable to free bootstrap pages\n"); + mfn_zero = virt_to_mfn((unsigned long) &_text); + if (HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG)) + printk("Unable to unmap NULL page\n"); } void arch_init_p2m(unsigned long max_pfn) diff -r 11318234588e -r 08f77df14cba extras/mini-os/blkfront.c --- a/extras/mini-os/blkfront.c Thu Jun 19 12:48:04 2008 +0900 +++ b/extras/mini-os/blkfront.c Wed Jul 02 11:30:37 2008 +0900 @@ -125,7 +125,6 @@ struct blkfront_dev *init_blkfront(char dev->events = NULL; - // FIXME: proper frees on failures again: err = xenbus_transaction_start(&xbt); if (err) { diff -r 11318234588e -r 08f77df14cba extras/mini-os/fbfront.c --- a/extras/mini-os/fbfront.c Thu Jun 19 12:48:04 2008 +0900 +++ b/extras/mini-os/fbfront.c Wed Jul 02 11:30:37 2008 +0900 @@ -100,7 +100,6 @@ struct kbdfront_dev *init_kbdfront(char s->in_cons = s->in_prod = 0; s->out_cons = s->out_prod = 0; - // FIXME: proper frees on failures again: err = xenbus_transaction_start(&xbt); if (err) { @@ -408,7 +407,6 @@ struct fbfront_dev *init_fbfront(char *n s->pd[i] = 0; - // FIXME: proper frees on failures again: err = xenbus_transaction_start(&xbt); if (err) { diff -r 11318234588e -r 08f77df14cba extras/mini-os/fs-front.c --- a/extras/mini-os/fs-front.c Thu Jun 19 12:48:04 2008 +0900 +++ b/extras/mini-os/fs-front.c Wed Jul 02 11:30:37 2008 +0900 @@ -136,8 +136,8 @@ again: again: old_id = freelist[0]; /* Note: temporal inconsistency, since freelist[0] can be changed by someone - * else, but we are a sole owner of freelist[id], it's OK. */ - freelist[id] = old_id; + * else, but we are a sole owner of freelist[id + 1], it's OK. 
*/ + freelist[id + 1] = old_id; new_id = id; if(cmpxchg(&freelist[0], old_id, new_id) != old_id) { @@ -154,7 +154,7 @@ static inline unsigned short get_id_from again: old_id = freelist[0]; - new_id = freelist[old_id]; + new_id = freelist[old_id + 1]; if(cmpxchg(&freelist[0], old_id, new_id) != old_id) { printk("Cmpxchg on freelist remove failed.\n"); @@ -785,8 +785,8 @@ static void alloc_request_table(struct f printk("Allocating request array for import %d, nr_entries = %d.\n", import->import_id, import->nr_entries); requests = xmalloc_array(struct fs_request, import->nr_entries); - import->freelist = xmalloc_array(unsigned short, import->nr_entries); - memset(import->freelist, 0, sizeof(unsigned short) * import->nr_entries); + import->freelist = xmalloc_array(unsigned short, import->nr_entries + 1); + memset(import->freelist, 0, sizeof(unsigned short) * (import->nr_entries + 1)); for(i=0; i<import->nr_entries; i++) { /* TODO: that's a lot of memory */ diff -r 11318234588e -r 08f77df14cba extras/mini-os/lib/sys.c --- a/extras/mini-os/lib/sys.c Thu Jun 19 12:48:04 2008 +0900 +++ b/extras/mini-os/lib/sys.c Wed Jul 02 11:30:37 2008 +0900 @@ -686,7 +686,7 @@ static int select_poll(int nfds, fd_set #ifdef LIBC_VERBOSE static int nb; static int nbread[NOFILE], nbwrite[NOFILE], nbexcept[NOFILE]; - static s64_t lastshown; + static s_time_t lastshown; nb++; #endif diff -r 11318234588e -r 08f77df14cba extras/mini-os/netfront.c --- a/extras/mini-os/netfront.c Thu Jun 19 12:48:04 2008 +0900 +++ b/extras/mini-os/netfront.c Wed Jul 02 11:30:37 2008 +0900 @@ -38,7 +38,7 @@ struct netfront_dev { struct netfront_dev { domid_t dom; - unsigned short tx_freelist[NET_TX_RING_SIZE]; + unsigned short tx_freelist[NET_TX_RING_SIZE + 1]; struct semaphore tx_sem; struct net_buffer rx_buffers[NET_RX_RING_SIZE]; @@ -70,14 +70,14 @@ void init_rx_buffers(struct netfront_dev static inline void add_id_to_freelist(unsigned int id,unsigned short* freelist) { - freelist[id] = freelist[0]; + freelist[id + 1] = freelist[0]; freelist[0] = id; } static inline unsigned short get_id_from_freelist(unsigned short* freelist) { unsigned int id = freelist[0]; - freelist[0] = freelist[id]; + freelist[0] = freelist[id + 1]; return id; } diff -r 11318234588e -r 08f77df14cba stubdom/grub.patches/99minios --- a/stubdom/grub.patches/99minios Thu Jun 19 12:48:04 2008 +0900 +++ b/stubdom/grub.patches/99minios Wed Jul 02 11:30:37 2008 +0900 @@ -832,7 +832,18 @@ Index: grub/stage2/fsys_reiserfs.c Index: grub/stage2/fsys_reiserfs.c =================================================================== --- grub.orig/stage2/fsys_reiserfs.c 2008-06-16 15:18:03.410933000 +0100 -+++ grub/stage2/fsys_reiserfs.c 2008-06-16 15:18:14.786009000 +0100 ++++ grub/stage2/fsys_reiserfs.c 2008-06-20 18:33:52.002100000 +0100 +@@ -224,8 +224,8 @@ + + struct disk_child + { +- unsigned long dc_block_number; /* Disk child's block number. */ +- unsigned short dc_size; /* Disk child's used space. */ ++ __u32 dc_block_number; /* Disk child's block number. */ ++ __u16 dc_size; /* Disk child's used space. */ + }; + + #define DC_SIZE (sizeof (struct disk_child)) @@ -369,7 +369,14 @@ static __inline__ unsigned long log2 (unsigned long word) diff -r 11318234588e -r 08f77df14cba stubdom/grub/Makefile --- a/stubdom/grub/Makefile Thu Jun 19 12:48:04 2008 +0900 +++ b/stubdom/grub/Makefile Wed Jul 02 11:30:37 2008 +0900 @@ -5,7 +5,7 @@ vpath %.c ../grub-cvs BOOT=boot-$(XEN_TARGET_ARCH).o -DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I. 
+DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I$(XEN_ROOT)/tools/include -I. DEF_CPPFLAGS += -I../grub-cvs/stage1 DEF_CPPFLAGS += -I../grub-cvs/stage2 DEF_CPPFLAGS += -I../grub-cvs/netboot diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/Makefile --- a/tools/blktap/drivers/Makefile Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/blktap/drivers/Makefile Wed Jul 02 11:30:37 2008 +0900 @@ -17,8 +17,16 @@ CFLAGS += -Wp,-MD,.$(@F).d CFLAGS += -Wp,-MD,.$(@F).d DEPS = .*.d +ifeq ($(shell . ./check_gcrypt),"yes") +CFLAGS += -DUSE_GCRYPT +CRYPT_LIB := -lgcrypt +else +CRYPT_LIB := -lcrypto +$(warning *** libgcrypt not installed: falling back to libcrypto ***) +endif + LDFLAGS_blktapctrl := $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenstore) -L../lib -lblktap -LDFLAGS_img := $(LIBAIO_DIR)/libaio.a -lcrypto -lpthread -lz +LDFLAGS_img := $(LIBAIO_DIR)/libaio.a $(CRYPT_LIB) -lpthread -lz BLK-OBJS-y := block-aio.o BLK-OBJS-y += block-sync.o diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/blktapctrl.c --- a/tools/blktap/drivers/blktapctrl.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/blktap/drivers/blktapctrl.c Wed Jul 02 11:30:37 2008 +0900 @@ -127,7 +127,7 @@ static int get_new_dev(int *major, int * char *devname; tr.domid = blkif->domid; - tr.busid = (unsigned short)blkif->be_id; + tr.busid = blkif->be_id; ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF, tr ); if ( (ret <= 0)||(ret > MAX_TAP_DEV) ) { diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/block-qcow.c --- a/tools/blktap/drivers/block-qcow.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/blktap/drivers/block-qcow.c Wed Jul 02 11:30:37 2008 +0900 @@ -33,7 +33,6 @@ #include <zlib.h> #include <inttypes.h> #include <libaio.h> -#include <openssl/md5.h> #include "bswap.h" #include "aes.h" #include "tapdisk.h" @@ -146,6 +145,35 @@ struct tdqcow_state { static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset); +#ifdef USE_GCRYPT + +#include <gcrypt.h> + +static uint32_t gen_cksum(char *ptr, int len) +{ + int i; + uint32_t md[4]; + + /* Convert L1 table to big endian */ + for(i = 0; i < len / sizeof(uint64_t); i++) { + cpu_to_be64s(&((uint64_t*) ptr)[i]); + } + + /* Generate checksum */ + gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len); + + /* Convert L1 table back to native endianess */ + for(i = 0; i < len / sizeof(uint64_t); i++) { + be64_to_cpus(&((uint64_t*) ptr)[i]); + } + + return md[0]; +} + +#else /* use libcrypto */ + +#include <openssl/md5.h> + static uint32_t gen_cksum(char *ptr, int len) { int i; @@ -153,9 +181,8 @@ static uint32_t gen_cksum(char *ptr, int uint32_t ret; md = malloc(MD5_DIGEST_LENGTH); - if(!md) return 0; - + /* Convert L1 table to big endian */ for(i = 0; i < len / sizeof(uint64_t); i++) { cpu_to_be64s(&((uint64_t*) ptr)[i]); @@ -175,6 +202,8 @@ static uint32_t gen_cksum(char *ptr, int free(md); return ret; } + +#endif static int get_filesize(char *filename, uint64_t *size, struct stat *st) { diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/block-qcow2.c --- a/tools/blktap/drivers/block-qcow2.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/blktap/drivers/block-qcow2.c Wed Jul 02 11:30:37 2008 +0900 @@ -254,10 +254,7 @@ static int bdrv_pread(int fd, int64_t of */ static int bdrv_pwrite(int fd, int64_t offset, const void *buf, int count) { - int ret; - - ret = lseek(fd, offset, SEEK_SET); - if (ret != offset) { + if (lseek(fd, offset, SEEK_SET) == -1) { DPRINTF("bdrv_pwrite failed seek (%#"PRIx64").\n", offset); return -1; } diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/check_gcrypt 
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/blktap/drivers/check_gcrypt Wed Jul 02 11:30:37 2008 +0900 @@ -0,0 +1,14 @@ +#!/bin/sh + +cat > .gcrypt.c << EOF +#include <gcrypt.h> +int main(void) { return 0; } +EOF + +if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then + echo "yes" +else + echo "no" +fi + +rm -f .gcrypt* diff -r 11318234588e -r 08f77df14cba tools/blktap/lib/blktaplib.h --- a/tools/blktap/lib/blktaplib.h Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/blktap/lib/blktaplib.h Wed Jul 02 11:30:37 2008 +0900 @@ -161,7 +161,7 @@ typedef struct tapdev_info { typedef struct domid_translate { unsigned short domid; - unsigned short busid; + uint32_t busid; } domid_translate_t ; typedef struct image { diff -r 11318234588e -r 08f77df14cba tools/debugger/xenitp/xenitp.c --- a/tools/debugger/xenitp/xenitp.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/debugger/xenitp/xenitp.c Wed Jul 02 11:30:37 2008 +0900 @@ -57,6 +57,16 @@ static int cur_vcpu; #define CFM_SOF_MASK 0x3f int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr); + +/* wrapper for vcpu_gest_context_any_t */ +static int xc_ia64_vcpu_getcontext(int xc_handle, + uint32_t domid, + uint32_t vcpu, + vcpu_guest_context_t *ctxt) +{ + return xc_vcpu_getcontext(xc_handle, domid, vcpu, + (vcpu_guest_context_any_t *)ctxt); +} static inline unsigned int ctx_slot (vcpu_guest_context_t *ctx) { @@ -729,7 +739,7 @@ int wait_domain (int vcpu, vcpu_guest_co fflush (stdout); nanosleep (&ts, NULL); } - return xc_vcpu_getcontext (xc_handle, domid, vcpu, ctx); + return xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, ctx); } int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr) @@ -945,13 +955,13 @@ char *parse_arg (char **buf) return res; } -vcpu_guest_context_t vcpu_ctx[MAX_VIRT_CPUS]; +vcpu_guest_context_any_t vcpu_ctx_any[MAX_VIRT_CPUS]; int vcpu_setcontext (int vcpu) { int ret; - ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx[vcpu]); + ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx_any[vcpu]); if (ret < 0) perror ("xc_vcpu_setcontext"); @@ -1518,7 +1528,7 @@ enum cmd_status do_command (int vcpu, ch int flag_ambiguous; cur_vcpu = vcpu; - cur_ctx = &vcpu_ctx[vcpu]; + cur_ctx = &vcpu_ctx_any[vcpu].c; /* Handle repeat last-command. */ if (*line == 0) { @@ -1575,7 +1585,7 @@ void xenitp (int vcpu) int ret; struct sigaction sa; - cur_ctx = &vcpu_ctx[vcpu]; + cur_ctx = &vcpu_ctx_any[vcpu].c; xc_handle = xc_interface_open (); /* for accessing control interface */ @@ -1588,9 +1598,9 @@ void xenitp (int vcpu) exit (-1); } - ret = xc_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx); + ret = xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx); if (ret < 0) { - perror ("xc_vcpu_getcontext"); + perror ("xc_ia64_vcpu_getcontext"); exit (-1); } diff -r 11318234588e -r 08f77df14cba tools/examples/xend-config.sxp --- a/tools/examples/xend-config.sxp Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/examples/xend-config.sxp Wed Jul 02 11:30:37 2008 +0900 @@ -242,3 +242,6 @@ # Script to run when the label of a resource has changed. #(resource-label-change-script '') + +# Rotation count of qemu-dm log file. 
+#(qemu-dm-logrotate-count 10) diff -r 11318234588e -r 08f77df14cba tools/firmware/hvmloader/hvmloader.c --- a/tools/firmware/hvmloader/hvmloader.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/firmware/hvmloader/hvmloader.c Wed Jul 02 11:30:37 2008 +0900 @@ -206,10 +206,12 @@ static void pci_setup(void) pci_writew(devfn, 0x3d, 0x0001); break; case 0x0101: - /* PIIX3 IDE */ - ASSERT((vendor_id == 0x8086) && (device_id == 0x7010)); - pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */ - pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */ + if ( vendor_id == 0x8086 ) + { + /* Intel ICHs since PIIX3: enable IDE legacy mode. */ + pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */ + pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */ + } break; } diff -r 11318234588e -r 08f77df14cba tools/firmware/rombios/rombios.c --- a/tools/firmware/rombios/rombios.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/firmware/rombios/rombios.c Wed Jul 02 11:30:37 2008 +0900 @@ -9783,6 +9783,27 @@ smbios_init: #endif +#if BX_TCGBIOS +; The section between the POST entry and the NMI entry is filling up +; and causes crashes if this code was directly there +tcpa_post_part1: + call _tcpa_acpi_init + + push dword #0 + call _tcpa_initialize_tpm + add sp, #4 + + call _tcpa_do_measure_POSTs + call _tcpa_wake_event /* specs: 3.2.3.7 */ + ret + +tcpa_post_part2: + call _tcpa_calling_int19h /* specs: 8.2.3 step 1 */ + call _tcpa_add_event_separators /* specs: 8.2.3 step 2 */ + /* we do not call int 19h handler but keep following eventlog */ + call _tcpa_returned_int19h /* specs: 8.2.3 step 3/7 */ + ret +#endif ;; for 'C' strings and other data, insert them here with @@ -10003,14 +10024,7 @@ post_default_ints: mov 0x0410, ax #if BX_TCGBIOS - call _tcpa_acpi_init - - push dword #0 - call _tcpa_initialize_tpm - add sp, #4 - - call _tcpa_do_measure_POSTs - call _tcpa_wake_event /* specs: 3.2.3.7 */ + call tcpa_post_part1 #endif ;; Parallel setup @@ -10138,10 +10152,7 @@ post_default_ints: call _interactive_bootkey #if BX_TCGBIOS - call _tcpa_calling_int19h /* specs: 8.2.3 step 1 */ - call _tcpa_add_event_separators /* specs: 8.2.3 step 2 */ - /* we do not call int 19h handler but keep following eventlog */ - call _tcpa_returned_int19h /* specs: 8.2.3 step 3/7 */ + call tcpa_post_part2 #endif ;; Start the boot sequence. 
See the comments in int19_relocated diff -r 11318234588e -r 08f77df14cba tools/ioemu/hw/xen_console.c --- a/tools/ioemu/hw/xen_console.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/ioemu/hw/xen_console.c Wed Jul 02 11:30:37 2008 +0900 @@ -160,16 +160,18 @@ int xs_gather(struct xs_handle *xs, cons static int domain_create_ring(struct domain *dom) { - int err, remote_port, ring_ref, rc; + int err, remote_port, ring_ref, limit, rc; err = xs_gather(dom->xsh, dom->serialpath, "ring-ref", "%u", &ring_ref, "port", "%i", &remote_port, + "limit", "%i", &limit, NULL); if (err) { err = xs_gather(dom->xsh, dom->conspath, "ring-ref", "%u", &ring_ref, "port", "%i", &remote_port, + "limit", "%i", &limit, NULL); if (err) { fprintf(stderr, "Console: failed to find ring-ref/port yet\n"); @@ -178,7 +180,9 @@ static int domain_create_ring(struct dom dom->use_consolepath = 1; } else dom->use_consolepath = 0; - fprintf(stderr, "Console: got ring-ref %d port %d\n", ring_ref, remote_port); + dom->buffer.max_capacity = limit; + fprintf(stderr, "Console: got ring-ref %d port %d limit %d\n", + ring_ref, remote_port, limit); if ((ring_ref == dom->ring_ref) && (remote_port == dom->remote_port)) goto out; diff -r 11318234588e -r 08f77df14cba tools/ioemu/target-i386-dm/exec-dm.c --- a/tools/ioemu/target-i386-dm/exec-dm.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/ioemu/target-i386-dm/exec-dm.c Wed Jul 02 11:30:37 2008 +0900 @@ -483,9 +483,11 @@ static void memcpy_words(void *dst, void } #endif -void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, - int len, int is_write) -{ +void cpu_physical_memory_rw(target_phys_addr_t _addr, uint8_t *buf, + int _len, int is_write) +{ + target_phys_addr_t addr = _addr; + int len = _len; int l, io_index; uint8_t *ptr; uint32_t val; @@ -520,6 +522,7 @@ void cpu_physical_memory_rw(target_phys_ } else if ((ptr = phys_ram_addr(addr)) != NULL) { /* Writing to RAM */ memcpy_words(ptr, buf, l); +#ifndef CONFIG_STUBDOM if (logdirty_bitmap != NULL) { /* Record that we have dirtied this frame */ unsigned long pfn = addr >> TARGET_PAGE_BITS; @@ -531,6 +534,7 @@ void cpu_physical_memory_rw(target_phys_ |= 1UL << pfn % HOST_LONG_BITS; } } +#endif #ifdef __ia64__ sync_icache(ptr, l); #endif @@ -566,6 +570,13 @@ void cpu_physical_memory_rw(target_phys_ addr += l; } +#ifdef CONFIG_STUBDOM + if (logdirty_bitmap != NULL) + xc_hvm_modified_memory(xc_handle, domid, _addr >> TARGET_PAGE_BITS, + (_addr + _len + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS + - _addr >> TARGET_PAGE_BITS); +#endif + mapcache_unlock(); } #endif diff -r 11318234588e -r 08f77df14cba tools/ioemu/xenstore.c --- a/tools/ioemu/xenstore.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/ioemu/xenstore.c Wed Jul 02 11:30:37 2008 +0900 @@ -260,8 +260,6 @@ void xenstore_parse_domain_config(int hv /* autoguess qcow vs qcow2 */ } else if (!strcmp(drv,"file") || !strcmp(drv,"phy")) { format = &bdrv_raw; - } else if (!strcmp(drv,"phy")) { - format = &bdrv_raw; } else { format = bdrv_find_format(drv); if (!format) { @@ -404,6 +402,10 @@ void xenstore_process_logdirty_event(voi /* No key yet: wait for the next watch */ return; +#ifdef CONFIG_STUBDOM + /* We pass the writes to hypervisor */ + seg = (void*)1; +#else strncpy(key_terminated, key_ascii, 16); free(key_ascii); key = (key_t) strtoull(key_terminated, NULL, 16); @@ -419,11 +421,6 @@ void xenstore_process_logdirty_event(voi fprintf(logfile, "%s: key=%16.16llx size=%lu\n", __FUNCTION__, (unsigned long long)key, logdirty_bitmap_size); -#ifdef CONFIG_STUBDOM - /* XXX we just can't use 
shm. */ - fprintf(logfile, "Log dirty is not implemented in stub domains!\n"); - return; -#else shmid = shmget(key, 2 * logdirty_bitmap_size, S_IRUSR|S_IWUSR); if (shmid == -1) { fprintf(logfile, "Log-dirty: shmget failed: segment %16.16llx " diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_hvm_build.c --- a/tools/libxc/ia64/xc_ia64_hvm_build.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/ia64/xc_ia64_hvm_build.c Wed Jul 02 11:30:37 2008 +0900 @@ -1052,7 +1052,8 @@ int int xc_hvm_build(int xc_handle, uint32_t domid, int memsize, const char *image_name) { - vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt; + vcpu_guest_context_any_t st_ctxt_any; + vcpu_guest_context_t *ctxt = &st_ctxt_any.c; char *image = NULL; unsigned long image_size; unsigned long nr_pages; @@ -1079,14 +1080,14 @@ xc_hvm_build(int xc_handle, uint32_t dom free(image); - memset(ctxt, 0, sizeof(*ctxt)); + memset(&st_ctxt_any, 0, sizeof(st_ctxt_any)); ctxt->regs.ip = 0x80000000ffffffb0UL; ctxt->regs.ar.fpsr = xc_ia64_fpsr_default(); ctxt->regs.cr.itir = 14 << 2; ctxt->regs.psr = IA64_PSR_AC | IA64_PSR_BN; ctxt->regs.cr.dcr = 0; ctxt->regs.cr.pta = 15 << 2; - return xc_vcpu_setcontext(xc_handle, domid, 0, ctxt); + return xc_vcpu_setcontext(xc_handle, domid, 0, &st_ctxt_any); error_out: free(image); diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_linux_restore.c --- a/tools/libxc/ia64/xc_ia64_linux_restore.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/ia64/xc_ia64_linux_restore.c Wed Jul 02 11:30:37 2008 +0900 @@ -117,8 +117,9 @@ xc_ia64_recv_unallocated_list(int xc_han static int xc_ia64_recv_vcpu_context(int xc_handle, int io_fd, uint32_t dom, - uint32_t vcpu, vcpu_guest_context_t *ctxt) -{ + uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any) +{ + vcpu_guest_context_t *ctxt = &ctxt_any->c; if (read_exact(io_fd, ctxt, sizeof(*ctxt))) { ERROR("Error when reading ctxt"); return -1; @@ -128,14 +129,14 @@ xc_ia64_recv_vcpu_context(int xc_handle, /* Initialize and set registers. */ ctxt->flags = VGCF_EXTRA_REGS | VGCF_SET_CR_IRR | VGCF_online; - if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt) != 0) { + if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt_any) != 0) { ERROR("Couldn't set vcpu context"); return -1; } /* Just a check. */ ctxt->flags = 0; - if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) { + if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) { ERROR("Could not get vcpu context"); return -1; } @@ -226,19 +227,20 @@ xc_ia64_pv_recv_vcpu_context(int xc_hand int rc = -1; /* A copy of the CPU context of the guest. */ - vcpu_guest_context_t ctxt; - - if (lock_pages(&ctxt, sizeof(ctxt))) { + vcpu_guest_context_any_t ctxt_any; + vcpu_guest_context_t *ctxt = &ctxt_any.c; + + if (lock_pages(&ctxt_any, sizeof(ctxt_any))) { /* needed for build domctl, but might as well do early */ ERROR("Unable to lock_pages ctxt"); return -1; } - if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt)) + if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt_any)) goto out; /* Then get privreg page. */ - if (read_page(xc_handle, io_fd, dom, ctxt.privregs_pfn) < 0) { + if (read_page(xc_handle, io_fd, dom, ctxt->privregs_pfn) < 0) { ERROR("Could not read vcpu privregs"); goto out; } @@ -441,12 +443,12 @@ xc_ia64_hvm_recv_context(int xc_handle, /* vcpu context */ for (i = 0; i <= info.max_vcpu_id; i++) { /* A copy of the CPU context of the guest. 
*/ - vcpu_guest_context_t ctxt; + vcpu_guest_context_any_t ctxt_any; if (!__test_bit(i, vcpumap)) continue; - if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt)) + if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any)) goto out; /* system context of vcpu is recieved as hvm context. */ diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_linux_save.c --- a/tools/libxc/ia64/xc_ia64_linux_save.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/ia64/xc_ia64_linux_save.c Wed Jul 02 11:30:37 2008 +0900 @@ -180,9 +180,10 @@ xc_ia64_send_unallocated_list(int xc_han static int xc_ia64_send_vcpu_context(int xc_handle, int io_fd, uint32_t dom, - uint32_t vcpu, vcpu_guest_context_t *ctxt) -{ - if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) { + uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any) +{ + vcpu_guest_context_t *ctxt = &ctxt_any->c; + if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) { ERROR("Could not get vcpu context"); return -1; } @@ -269,17 +270,19 @@ xc_ia64_pv_send_context(int xc_handle, i /* vcpu context */ for (i = 0; i <= info->max_vcpu_id; i++) { /* A copy of the CPU context of the guest. */ - vcpu_guest_context_t ctxt; + vcpu_guest_context_any_t ctxt_any; + vcpu_guest_context_t *ctxt = &ctxt_any.c; + char *mem; if (!__test_bit(i, vcpumap)) continue; - if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt)) + if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any)) goto out; mem = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, - PROT_READ|PROT_WRITE, ctxt.privregs_pfn); + PROT_READ|PROT_WRITE, ctxt->privregs_pfn); if (mem == NULL) { ERROR("cannot map privreg page"); goto out; @@ -337,12 +340,12 @@ xc_ia64_hvm_send_context(int xc_handle, /* vcpu context */ for (i = 0; i <= info->max_vcpu_id; i++) { /* A copy of the CPU context of the guest. */ - vcpu_guest_context_t ctxt; + vcpu_guest_context_any_t ctxt_any; if (!__test_bit(i, vcpumap)) continue; - if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt)) + if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any)) goto out; /* system context of vcpu is sent as hvm context. 
*/ diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core.c --- a/tools/libxc/xc_core.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_core.c Wed Jul 02 11:30:37 2008 +0900 @@ -407,7 +407,7 @@ xc_domain_dumpcore_via_callback(int xc_h int nr_vcpus = 0; char *dump_mem, *dump_mem_start = NULL; - vcpu_guest_context_t ctxt[MAX_VIRT_CPUS]; + vcpu_guest_context_any_t ctxt[MAX_VIRT_CPUS]; struct xc_core_arch_context arch_ctxt; char dummy[PAGE_SIZE]; int dummy_len; @@ -581,10 +581,10 @@ xc_domain_dumpcore_via_callback(int xc_h PERROR("Could not get section header for .xen_prstatus"); goto out; } - filesz = sizeof(ctxt[0]) * nr_vcpus; + filesz = sizeof(ctxt[0].c) * nr_vcpus; sts = xc_core_shdr_set(shdr, strtab, XEN_DUMPCORE_SEC_PRSTATUS, SHT_PROGBITS, offset, filesz, - __alignof__(ctxt[0]), sizeof(ctxt[0])); + __alignof__(ctxt[0].c), sizeof(ctxt[0].c)); if ( sts != 0 ) goto out; offset += filesz; @@ -707,7 +707,7 @@ xc_domain_dumpcore_via_callback(int xc_h goto out; /* prstatus: .xen_prstatus */ - sts = dump_rtn(args, (char *)&ctxt, sizeof(ctxt[0]) * nr_vcpus); + sts = dump_rtn(args, (char *)&ctxt[0].c, sizeof(ctxt[0].c) * nr_vcpus); if ( sts != 0 ) goto out; diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core_ia64.c --- a/tools/libxc/xc_core_ia64.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_core_ia64.c Wed Jul 02 11:30:37 2008 +0900 @@ -308,9 +308,10 @@ xc_core_arch_context_free(struct xc_core int xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt, - vcpu_guest_context_t* ctxt, + vcpu_guest_context_any_t* ctxt_any, int xc_handle, uint32_t domid) { + vcpu_guest_context_t *ctxt = &ctxt_any->c; mapped_regs_t* mapped_regs; if ( ctxt->privregs_pfn == VGC_PRIVREGS_HVM ) diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core_ia64.h --- a/tools/libxc/xc_core_ia64.h Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_core_ia64.h Wed Jul 02 11:30:37 2008 +0900 @@ -40,7 +40,7 @@ xc_core_arch_context_free(struct xc_core xc_core_arch_context_free(struct xc_core_arch_context* arch_ctxt); int xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt, - vcpu_guest_context_t* ctxt, + vcpu_guest_context_any_t* ctxt, int xc_handle, uint32_t domid); int xc_core_arch_context_get_shdr(struct xc_core_arch_context* arch_ctxt, diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_domain.c Wed Jul 02 11:30:37 2008 +0900 @@ -298,30 +298,21 @@ int xc_vcpu_getcontext(int xc_handle, int xc_vcpu_getcontext(int xc_handle, uint32_t domid, uint32_t vcpu, - vcpu_guest_context_t *ctxt) -{ - int rc; - DECLARE_DOMCTL; - size_t sz = sizeof(vcpu_guest_context_either_t); + vcpu_guest_context_any_t *ctxt) +{ + int rc; + DECLARE_DOMCTL; + size_t sz = sizeof(vcpu_guest_context_any_t); domctl.cmd = XEN_DOMCTL_getvcpucontext; domctl.domain = (domid_t)domid; domctl.u.vcpucontext.vcpu = (uint16_t)vcpu; - set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt); - - /* - * We may be asked to lock either a 32-bit or a 64-bit context. Lock the - * larger of the two if possible, otherwise fall back to native size. 
- */ + set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c); + + if ( (rc = lock_pages(ctxt, sz)) != 0 ) - { - sz = sizeof(*ctxt); - if ( (rc = lock_pages(ctxt, sz)) != 0 ) - return rc; - } - + return rc; rc = do_domctl(xc_handle, &domctl); - unlock_pages(ctxt, sz); return rc; @@ -626,32 +617,28 @@ int xc_vcpu_setcontext(int xc_handle, int xc_vcpu_setcontext(int xc_handle, uint32_t domid, uint32_t vcpu, - vcpu_guest_context_t *ctxt) -{ - DECLARE_DOMCTL; - int rc; - size_t sz = sizeof(vcpu_guest_context_either_t); + vcpu_guest_context_any_t *ctxt) +{ + DECLARE_DOMCTL; + int rc; + size_t sz = sizeof(vcpu_guest_context_any_t); + + if (ctxt == NULL) + { + errno = EINVAL; + return -1; + } domctl.cmd = XEN_DOMCTL_setvcpucontext; domctl.domain = domid; domctl.u.vcpucontext.vcpu = vcpu; - set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt); - - /* - * We may be asked to lock either a 32-bit or a 64-bit context. Lock the - * larger of the two if possible, otherwise fall back to native size. - */ - if ( (ctxt != NULL) && (rc = lock_pages(ctxt, sz)) != 0 ) - { - sz = sizeof(*ctxt); - if ( (rc = lock_pages(ctxt, sz)) != 0 ) - return rc; - } - + set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c); + + if ( (rc = lock_pages(ctxt, sz)) != 0 ) + return rc; rc = do_domctl(xc_handle, &domctl); - - if ( ctxt != NULL ) - unlock_pages(ctxt, sz); + + unlock_pages(ctxt, sz); return rc; } diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain_restore.c --- a/tools/libxc/xc_domain_restore.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_domain_restore.c Wed Jul 02 11:30:37 2008 +0900 @@ -153,7 +153,7 @@ static xen_pfn_t *load_p2m_frame_list( int io_fd, int *pae_extended_cr3, int *ext_vcpucontext) { xen_pfn_t *p2m_frame_list; - vcpu_guest_context_either_t ctxt; + vcpu_guest_context_any_t ctxt; xen_pfn_t p2m_fl_zero; /* Read first entry of P2M list, or extended-info signature (~0UL). */ @@ -284,12 +284,12 @@ int xc_domain_restore(int xc_handle, int /* The new domain's shared-info frame number. */ unsigned long shared_info_frame; unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */ - shared_info_either_t *old_shared_info = - (shared_info_either_t *)shared_info_page; - shared_info_either_t *new_shared_info; + shared_info_any_t *old_shared_info = + (shared_info_any_t *)shared_info_page; + shared_info_any_t *new_shared_info; /* A copy of the CPU context of the guest. */ - vcpu_guest_context_either_t ctxt; + vcpu_guest_context_any_t ctxt; /* A table containing the type of each PFN (/not/ MFN!). */ unsigned long *pfn_type = NULL; @@ -304,7 +304,7 @@ int xc_domain_restore(int xc_handle, int xen_pfn_t *p2m_frame_list = NULL; /* A temporary mapping of the guest's start_info page. */ - start_info_either_t *start_info; + start_info_any_t *start_info; /* Our mapping of the current region (batch) */ char *region_base; diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain_save.c --- a/tools/libxc/xc_domain_save.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_domain_save.c Wed Jul 02 11:30:37 2008 +0900 @@ -412,7 +412,7 @@ static int suspend_and_state(int (*suspe ** it to update the MFN to a reasonable value. 
*/ static void *map_frame_list_list(int xc_handle, uint32_t dom, - shared_info_either_t *shinfo) + shared_info_any_t *shinfo) { int count = 100; void *p; @@ -628,9 +628,9 @@ static xen_pfn_t *map_and_save_p2m_table int io_fd, uint32_t dom, unsigned long p2m_size, - shared_info_either_t *live_shinfo) -{ - vcpu_guest_context_either_t ctxt; + shared_info_any_t *live_shinfo) +{ + vcpu_guest_context_any_t ctxt; /* Double and single indirect references to the live P2M table */ void *live_p2m_frame_list_list = NULL; @@ -735,7 +735,7 @@ static xen_pfn_t *map_and_save_p2m_table p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]); } - if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) ) + if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) ) { ERROR("Could not get vcpu context"); goto out; @@ -814,7 +814,7 @@ int xc_domain_save(int xc_handle, int io unsigned long shared_info_frame; /* A copy of the CPU context of the guest. */ - vcpu_guest_context_either_t ctxt; + vcpu_guest_context_any_t ctxt; /* A table containing the type of each PFN (/not/ MFN!). */ unsigned long *pfn_type = NULL; @@ -824,7 +824,7 @@ int xc_domain_save(int xc_handle, int io char page[PAGE_SIZE]; /* Live mapping of shared info structure */ - shared_info_either_t *live_shinfo = NULL; + shared_info_any_t *live_shinfo = NULL; /* base of the region in which domain memory is mapped */ unsigned char *region_base = NULL; @@ -1536,7 +1536,7 @@ int xc_domain_save(int xc_handle, int io } } - if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) ) + if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) ) { ERROR("Could not get vcpu context"); goto out; @@ -1556,7 +1556,7 @@ int xc_domain_save(int xc_handle, int io if ( !(vcpumap & (1ULL << i)) ) continue; - if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) ) + if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) ) { ERROR("No context for VCPU%d", i); goto out; @@ -1624,7 +1624,7 @@ int xc_domain_save(int xc_handle, int io * Reset the MFN to be a known-invalid value. See map_frame_list_list(). 
*/ memcpy(page, live_shinfo, PAGE_SIZE); - SET_FIELD(((shared_info_either_t *)page), + SET_FIELD(((shared_info_any_t *)page), arch.pfn_to_mfn_frame_list_list, 0); if ( write_exact(io_fd, page, PAGE_SIZE) ) { diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_misc.c --- a/tools/libxc/xc_misc.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_misc.c Wed Jul 02 11:30:37 2008 +0900 @@ -253,6 +253,34 @@ int xc_hvm_track_dirty_vram( arg.first_pfn = first_pfn; arg.nr = nr; set_xen_guest_handle(arg.dirty_bitmap, (uint8_t *)dirty_bitmap); + + if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 ) + { + PERROR("Could not lock memory"); + return rc; + } + + rc = do_xen_hypercall(xc_handle, &hypercall); + + unlock_pages(&arg, sizeof(arg)); + + return rc; +} + +int xc_hvm_modified_memory( + int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr) +{ + DECLARE_HYPERCALL; + struct xen_hvm_modified_memory arg; + int rc; + + hypercall.op = __HYPERVISOR_hvm_op; + hypercall.arg[0] = HVMOP_modified_memory; + hypercall.arg[1] = (unsigned long)&arg; + + arg.domid = dom; + arg.first_pfn = first_pfn; + arg.nr = nr; if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 ) { diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_pagetab.c --- a/tools/libxc/xc_pagetab.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_pagetab.c Wed Jul 02 11:30:37 2008 +0900 @@ -48,7 +48,7 @@ unsigned long xc_translate_foreign_addre unsigned long xc_translate_foreign_address(int xc_handle, uint32_t dom, int vcpu, unsigned long long virt ) { - vcpu_guest_context_t ctx; + vcpu_guest_context_any_t ctx; unsigned long long cr3; void *pd, *pt, *pdppage = NULL, *pdp, *pml = NULL; unsigned long long pde, pte, pdpe, pmle; @@ -78,7 +78,7 @@ unsigned long xc_translate_foreign_addre DPRINTF("failed to retreive vcpu context\n"); goto out; } - cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.ctrlreg[3])) << PAGE_SHIFT; + cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.c.ctrlreg[3])) << PAGE_SHIFT; /* Page Map Level 4 */ diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_private.h --- a/tools/libxc/xc_private.h Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_private.h Wed Jul 02 11:30:37 2008 +0900 @@ -188,9 +188,9 @@ int xc_map_foreign_ranges(int xc_handle, privcmd_mmap_entry_t *entries, int nr); void *map_domain_va_core(unsigned long domfd, int cpu, void *guest_va, - vcpu_guest_context_t *ctxt); + vcpu_guest_context_any_t *ctxt); int xc_waitdomain_core(int xc_handle, int domain, int *status, - int options, vcpu_guest_context_t *ctxt); + int options, vcpu_guest_context_any_t *ctxt); void bitmap_64_to_byte(uint8_t *bp, const uint64_t *lp, int nbits); void bitmap_byte_to_64(uint64_t *lp, const uint8_t *bp, int nbits); diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_ptrace.c --- a/tools/libxc/xc_ptrace.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_ptrace.c Wed Jul 02 11:30:37 2008 +0900 @@ -40,9 +40,9 @@ static int current_isfile; static int current_isfile; static int current_is_hvm; -static uint64_t online_cpumap; -static uint64_t regs_valid; -static vcpu_guest_context_t ctxt[MAX_VIRT_CPUS]; +static uint64_t online_cpumap; +static uint64_t regs_valid; +static vcpu_guest_context_any_t ctxt[MAX_VIRT_CPUS]; extern int ffsll(long long int); #define FOREACH_CPU(cpumap, i) for ( cpumap = online_cpumap; (i = ffsll(cpumap)); cpumap &= ~(1 << (index - 1)) ) @@ -96,9 +96,9 @@ xc_register_event_handler(thr_ev_handler } static inline int -paging_enabled(vcpu_guest_context_t *v) -{ - unsigned long cr0 = v->ctrlreg[0]; 
+paging_enabled(vcpu_guest_context_any_t *v) +{ + unsigned long cr0 = v->c.ctrlreg[0]; return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG); } @@ -174,7 +174,7 @@ map_domain_va_32( l2 = xc_map_foreign_range( xc_handle, current_domid, PAGE_SIZE, PROT_READ, - xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3])); + xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3])); if ( l2 == NULL ) return NULL; @@ -216,7 +216,7 @@ map_domain_va_pae( l3 = xc_map_foreign_range( xc_handle, current_domid, PAGE_SIZE, PROT_READ, - xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3])); + xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3])); if ( l3 == NULL ) return NULL; @@ -264,12 +264,12 @@ map_domain_va_64( uint64_t *l4, *l3, *l2, *l1; static void *v[MAX_VIRT_CPUS]; - if ((ctxt[cpu].ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */ + if ((ctxt[cpu].c.ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */ return map_domain_va_32(xc_handle, cpu, guest_va, perm); l4 = xc_map_foreign_range( xc_handle, current_domid, PAGE_SIZE, PROT_READ, - xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3])); + xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3])); if ( l4 == NULL ) return NULL; @@ -494,26 +494,26 @@ xc_ptrace( case PTRACE_GETREGS: if (!current_isfile && fetch_regs(xc_handle, cpu, NULL)) goto out_error; - SET_PT_REGS(pt, ctxt[cpu].user_regs); + SET_PT_REGS(pt, ctxt[cpu].c.user_regs); memcpy(data, &pt, sizeof(struct gdb_regs)); break; case PTRACE_GETFPREGS: if (!current_isfile && fetch_regs(xc_handle, cpu, NULL)) goto out_error; - memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof (elf_fpregset_t)); + memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof (elf_fpregset_t)); break; case PTRACE_GETFPXREGS: if (!current_isfile && fetch_regs(xc_handle, cpu, NULL)) goto out_error; - memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt)); + memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof(ctxt[cpu].c.fpu_ctxt)); break; case PTRACE_SETREGS: if (current_isfile) goto out_unsupported; /* XXX not yet supported */ - SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].user_regs); + SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].c.user_regs); if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu, &ctxt[cpu]))) goto out_error_domctl; @@ -525,7 +525,7 @@ xc_ptrace( /* XXX we can still have problems if the user switches threads * during single-stepping - but that just seems retarded */ - ctxt[cpu].user_regs.eflags |= PSL_T; + ctxt[cpu].c.user_regs.eflags |= PSL_T; if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu, &ctxt[cpu]))) goto out_error_domctl; @@ -542,9 +542,9 @@ xc_ptrace( if (fetch_regs(xc_handle, cpu, NULL)) goto out_error; /* Clear trace flag */ - if ( ctxt[cpu].user_regs.eflags & PSL_T ) + if ( ctxt[cpu].c.user_regs.eflags & PSL_T ) { - ctxt[cpu].user_regs.eflags &= ~PSL_T; + ctxt[cpu].c.user_regs.eflags &= ~PSL_T; if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu, &ctxt[cpu]))) goto out_error_domctl; diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_ptrace_core.c --- a/tools/libxc/xc_ptrace_core.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_ptrace_core.c Wed Jul 02 11:30:37 2008 +0900 @@ -641,24 +641,24 @@ static const struct xc_core_format_type* void * map_domain_va_core(unsigned long domfd, int cpu, void *guest_va, - vcpu_guest_context_t *ctxt) + vcpu_guest_context_any_t *ctxt) { if (current_format_type == NULL) return NULL; return (current_format_type->map_domain_va_core)(domfd, cpu, guest_va, - ctxt); + &ctxt->c); } int xc_waitdomain_core(int xc_handle, int domfd, int *status, int options, - vcpu_guest_context_t *ctxt) + vcpu_guest_context_any_t *ctxt) { int ret; int i; for (i = 0; i < 
NR_FORMAT_TYPE; i++) { ret = (format_type[i].waitdomain_core)(xc_handle, domfd, status, - options, ctxt); + options, &ctxt->c); if (ret == 0) { current_format_type = &format_type[i]; break; diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_resume.c --- a/tools/libxc/xc_resume.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xc_resume.c Wed Jul 02 11:30:37 2008 +0900 @@ -13,7 +13,7 @@ static int modify_returncode(int xc_handle, uint32_t domid) { - vcpu_guest_context_either_t ctxt; + vcpu_guest_context_any_t ctxt; xc_dominfo_t info; xen_capabilities_info_t caps; int rc; @@ -39,7 +39,7 @@ static int modify_returncode(int xc_hand return -1; } - if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt.c)) != 0 ) + if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt)) != 0 ) return rc; if ( !info.hvm ) @@ -49,7 +49,7 @@ static int modify_returncode(int xc_hand else ctxt.x32.user_regs.eax = 1; - if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt.c)) != 0 ) + if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt)) != 0 ) return rc; return 0; @@ -89,7 +89,7 @@ static int xc_domain_resume_any(int xc_h int i, rc = -1; #if defined(__i386__) || defined(__x86_64__) unsigned long mfn, p2m_size = 0; - vcpu_guest_context_t ctxt; + vcpu_guest_context_any_t ctxt; start_info_t *start_info; shared_info_t *shinfo = NULL; xen_pfn_t *p2m_frame_list_list = NULL; @@ -167,7 +167,7 @@ static int xc_domain_resume_any(int xc_h goto out; } - mfn = ctxt.user_regs.edx; + mfn = ctxt.c.user_regs.edx; start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn); diff -r 11318234588e -r 08f77df14cba tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xenctrl.h Wed Jul 02 11:30:37 2008 +0900 @@ -30,6 +30,11 @@ #include <xen/xsm/acm.h> #include <xen/xsm/acm_ops.h> #include <xen/xsm/flask_op.h> + +#if defined(__i386__) || defined(__x86_64__) +#include <xen/foreign/x86_32.h> +#include <xen/foreign/x86_64.h> +#endif #ifdef __ia64__ #define XC_PAGE_SHIFT 14 @@ -162,6 +167,35 @@ typedef struct xc_dominfo { } xc_dominfo_t; typedef xen_domctl_getdomaininfo_t xc_domaininfo_t; + +typedef union +{ +#if defined(__i386__) || defined(__x86_64__) + vcpu_guest_context_x86_64_t x64; + vcpu_guest_context_x86_32_t x32; +#endif + vcpu_guest_context_t c; +} vcpu_guest_context_any_t; + +typedef union +{ +#if defined(__i386__) || defined(__x86_64__) + shared_info_x86_64_t x64; + shared_info_x86_32_t x32; +#endif + shared_info_t s; +} shared_info_any_t; + +typedef union +{ +#if defined(__i386__) || defined(__x86_64__) + start_info_x86_64_t x64; + start_info_x86_32_t x32; +#endif + start_info_t s; +} start_info_any_t; + + int xc_domain_create(int xc_handle, uint32_t ssidref, xen_domain_handle_t handle, @@ -307,7 +341,7 @@ int xc_vcpu_setcontext(int xc_handle, int xc_vcpu_setcontext(int xc_handle, uint32_t domid, uint32_t vcpu, - vcpu_guest_context_t *ctxt); + vcpu_guest_context_any_t *ctxt); /** * This function will return information about one or more domains, using a * single hypercall. 
The domain information will be stored into the supplied @@ -368,7 +402,7 @@ int xc_vcpu_getcontext(int xc_handle, int xc_vcpu_getcontext(int xc_handle, uint32_t domid, uint32_t vcpu, - vcpu_guest_context_t *ctxt); + vcpu_guest_context_any_t *ctxt); typedef xen_domctl_getvcpuinfo_t xc_vcpuinfo_t; int xc_vcpu_getinfo(int xc_handle, @@ -894,6 +928,12 @@ int xc_hvm_track_dirty_vram( int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr, unsigned long *bitmap); + +/* + * Notify that some pages got modified by the Device Model + */ +int xc_hvm_modified_memory( + int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr); typedef enum { XC_ERROR_NONE = 0, diff -r 11318234588e -r 08f77df14cba tools/libxc/xg_save_restore.h --- a/tools/libxc/xg_save_restore.h Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/libxc/xg_save_restore.h Wed Jul 02 11:30:37 2008 +0900 @@ -112,28 +112,6 @@ static inline int get_platform_info(int #define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL)) -/* 32-on-64 support: saving 32bit guests from 64bit tools and vice versa */ -typedef union -{ - vcpu_guest_context_x86_64_t x64; - vcpu_guest_context_x86_32_t x32; - vcpu_guest_context_t c; -} vcpu_guest_context_either_t; - -typedef union -{ - shared_info_x86_64_t x64; - shared_info_x86_32_t x32; - shared_info_t s; -} shared_info_either_t; - -typedef union -{ - start_info_x86_64_t x64; - start_info_x86_32_t x32; - start_info_t s; -} start_info_either_t; - #define GET_FIELD(_p, _f) ((guest_width==8) ? ((_p)->x64._f) : ((_p)->x32._f)) #define SET_FIELD(_p, _f, _v) do { \ diff -r 11318234588e -r 08f77df14cba tools/python/xen/util/blkif.py --- a/tools/python/xen/util/blkif.py Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/python/xen/util/blkif.py Wed Jul 02 11:30:37 2008 +0900 @@ -16,8 +16,11 @@ def blkdev_name_to_number(name): n = expand_dev_name(name) + devname = 'virtual-device' + devnum = None + try: - return os.stat(n).st_rdev + return (devname, os.stat(n).st_rdev) except Exception, ex: pass @@ -25,28 +28,30 @@ def blkdev_name_to_number(name): if re.match( '/dev/sd[a-z]([1-9]|1[0-5])?$', n): major = scsi_major[(ord(n[7:8]) - ord('a')) / 16] minor = ((ord(n[7:8]) - ord('a')) % 16) * 16 + int(n[8:] or 0) - return major * 256 + minor - if re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n): + devnum = major * 256 + minor + elif re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n): major = scsi_major[((ord(n[7:8]) - ord('a') + 1) * 26 + (ord(n[8:9]) - ord('a'))) / 16 ] minor = (((ord(n[7:8]) - ord('a') + 1 ) * 26 + (ord(n[8:9]) - ord('a'))) % 16) * 16 + int(n[9:] or 0) - return major * 256 + minor - - if re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n): + devnum = major * 256 + minor + elif re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n): ide_majors = [ 3, 22, 33, 34, 56, 57, 88, 89, 90, 91 ] major = ide_majors[(ord(n[7:8]) - ord('a')) / 2] minor = ((ord(n[7:8]) - ord('a')) % 2) * 64 + int(n[8:] or 0) - return major * 256 + minor + devnum = major * 256 + minor + elif re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?$', n): + devnum = (202 << 8) + ((ord(n[8:9]) - ord('a')) << 4) + int(n[9:] or 0) + elif re.match('/dev/xvd[q-z]([1-9]|1[0-5])?$', n): + devname = 'virtual-device-ext' + devnum = (1 << 28) + ((ord(n[8:9]) - ord('a')) << 8) + int(n[9:] or 0) + elif re.match('/dev/xvd[a-i][a-z]([1-9]|1[0-5])?$', n): + devname = 'virtual-device-ext' + devnum = (1 << 28) + (((ord(n[8:9]) - ord('a') + 1) * 26 + (ord(n[9:10]) - ord('a'))) << 8) + int(n[10:] or 0) + elif re.match( '^(0x)[0-9a-fA-F]+$', name ): + devnum = string.atoi(name, 16) 
+ elif re.match('^[0-9]+$', name): + devnum = string.atoi(name, 10) - if re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?', n): - return 202 * 256 + 16 * (ord(n[8:9]) - ord('a')) + int(n[9:] or 0) - - if re.match( '^(0x)[0-9a-fA-F]+$', name ): - return string.atoi(name,16) - - if re.match('^[0-9]+$', name): - return string.atoi(name, 10) - - return None + return (devname, devnum) def blkdev_segment(name): """Take the given block-device name (e.g. '/dev/sda1', 'hda') @@ -58,7 +63,7 @@ def blkdev_segment(name): type: 'Disk' or identifying name for partition type """ val = None - n = blkdev_name_to_number(name) + (name, n) = blkdev_name_to_number(name) if not n is None: val = { 'device' : n, 'start_sector' : long(0), diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/XendConfig.py --- a/tools/python/xen/xend/XendConfig.py Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/python/xen/xend/XendConfig.py Wed Jul 02 11:30:37 2008 +0900 @@ -1123,7 +1123,7 @@ class XendConfig(dict): try: devid = int(dev2) except ValueError: - devid = blkdev_name_to_number(dev2) + (xenbus, devid) = blkdev_name_to_number(dev2) if devid == None: log.debug("The device %s is not device name", dev2) return None diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/XendOptions.py --- a/tools/python/xen/xend/XendOptions.py Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/python/xen/xend/XendOptions.py Wed Jul 02 11:30:37 2008 +0900 @@ -132,6 +132,9 @@ class XendOptions: """Default script to configure a backend network interface""" vif_script = osdep.vif_script + """Default rotation count of qemu-dm log file.""" + qemu_dm_logrotate_count = 10 + def __init__(self): self.configure() @@ -350,6 +353,10 @@ class XendOptions: def get_vnc_x509_verify(self): return self.get_config_string('vnc-x509-verify', self.xend_vnc_x509_verify) + + def get_qemu_dm_logrotate_count(self): + return self.get_config_int("qemu-dm-logrotate-count", + self.qemu_dm_logrotate_count) class XendOptionsFile(XendOptions): diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/python/xen/xend/image.py Wed Jul 02 11:30:37 2008 +0900 @@ -378,13 +378,23 @@ class ImageHandler: # keep track of pid and spawned options to kill it later self.logfile = "/var/log/xen/qemu-dm-%s.log" % str(self.vm.info['name_label']) - if os.path.exists(self.logfile): - if os.path.exists(self.logfile + ".1"): - os.unlink(self.logfile + ".1") - os.rename(self.logfile, self.logfile + ".1") + + # rotate log + logfile_mode = os.O_WRONLY|os.O_CREAT|os.O_APPEND + logrotate_count = XendOptions.instance().get_qemu_dm_logrotate_count() + if logrotate_count > 0: + logfile_mode |= os.O_TRUNC + if os.path.exists("%s.%d" % (self.logfile, logrotate_count)): + os.unlink("%s.%d" % (self.logfile, logrotate_count)) + for n in range(logrotate_count - 1, 0, -1): + if os.path.exists("%s.%d" % (self.logfile, n)): + os.rename("%s.%d" % (self.logfile, n), + "%s.%d" % (self.logfile, (n + 1))) + if os.path.exists(self.logfile): + os.rename(self.logfile, self.logfile + ".1") null = os.open("/dev/null", os.O_RDONLY) - logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND) + logfd = os.open(self.logfile, logfile_mode) sys.stderr.flush() pid = os.fork() diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/server/blkif.py --- a/tools/python/xen/xend/server/blkif.py Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/python/xen/xend/server/blkif.py Wed Jul 02 11:30:37 2008 +0900 @@ -81,11 +81,11 @@ class 
BlkifController(DevController): if security.on() == xsconstants.XS_POLICY_ACM: self.do_access_control(config, uname) - devid = blkif.blkdev_name_to_number(dev) + (device_path, devid) = blkif.blkdev_name_to_number(dev) if devid is None: raise VmError('Unable to find number for device (%s)' % (dev)) - front = { 'virtual-device' : "%i" % devid, + front = { device_path : "%i" % devid, 'device-type' : dev_type } @@ -204,5 +204,5 @@ class BlkifController(DevController): dev = devid.split('/')[-1] dev = int(dev) except ValueError: - dev = blkif.blkdev_name_to_number(dev) + (device_path, dev) = blkif.blkdev_name_to_number(dev) return dev diff -r 11318234588e -r 08f77df14cba tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/python/xen/xm/main.py Wed Jul 02 11:30:37 2008 +0900 @@ -2022,8 +2022,7 @@ def xm_block_list(args): map(server.xenapi.VBD.get_runtime_properties, vbd_refs) vbd_devs = \ map(server.xenapi.VBD.get_device, vbd_refs) - vbd_devids = \ - map(blkdev_name_to_number, vbd_devs) + vbd_devids = [blkdev_name_to_number(x)[1] for x in vbd_devs] devs = map(lambda (devid, prop): [devid, map2sxp(prop)], zip(vbd_devids, vbd_properties)) else: diff -r 11318234588e -r 08f77df14cba tools/tests/test_x86_emulator.c --- a/tools/tests/test_x86_emulator.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/tests/test_x86_emulator.c Wed Jul 02 11:30:37 2008 +0900 @@ -22,23 +22,22 @@ static int read( static int read( unsigned int seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - *val = 0; - memcpy(val, (void *)offset, bytes); + memcpy(p_data, (void *)offset, bytes); return X86EMUL_OKAY; } static int write( unsigned int seg, unsigned long offset, - unsigned long val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - memcpy((void *)offset, &val, bytes); + memcpy((void *)offset, p_data, bytes); return X86EMUL_OKAY; } diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoon-monitor --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xenballoon/xenballoon-monitor Wed Jul 02 11:30:37 2008 +0900 @@ -0,0 +1,43 @@ +#!/bin/bash +# +# xenballoon-monitor - monitor certain stats from xenballoond +# (run in dom0 with "watch -d xenballoon-monitor" for xentop-like output) +# +# Copyright (C) 2009 Oracle Corporation and/or its affiliates. 
+# All rights reserved +# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx> +# +# Hint: Use "xm sched-credit -d 0 -w 2000" to watch on heavily loaded machines +# +echo "id mem-kb tgt-kb commit swapin swapout pgin pgout active(sec)" +for i in `xenstore-list /local/domain`; do + if [ "$i" -ne 0 ]; then + tot=0; tgt=0; sin=0; sout=0; pgin=0; pgout=0; cmt=0; up=0; idle=0; act=0; + if xenstore-exists /local/domain/$i/memory/meminfo; then + tot=`xenstore-read /local/domain/$i/memory/meminfo | grep MemTotal \ + | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` + cmt=`xenstore-read /local/domain/$i/memory/meminfo | grep Committed_AS \ + | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` + fi + if xenstore-exists /local/domain/$i/memory/selftarget; then + tgt=`xenstore-read /local/domain/$i/memory/selftarget` + fi + if xenstore-exists /local/domain/$i/memory/vmstat; then + sin=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpin \ + | cut -d" " -f2` + sout=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpout \ + | cut -d" " -f2` + pgin=`xenstore-read /local/domain/$i/memory/vmstat | grep pgpgin \ + | cut -d" " -f2` + pgout=`xenstore-read /local/domain/$i/memory/vmstat | grep pgout \ + | cut -d" " -f2` + fi + if xenstore-exists /local/domain/$i/memory/uptime; then + up=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f1` + idle=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f2` + act=`echo $up - $idle | bc -iq` + fi + printf "%2d %8d%8d%8d%9d%9d%10d%10d%10.2f\n" $i $tot $tgt $cmt $sin $sout $pgin $pgout $act + fi +done +echo Free memory: `xm info | grep free | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` MB diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoon.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xenballoon/xenballoon.conf Wed Jul 02 11:30:37 2008 +0900 @@ -0,0 +1,91 @@ +## Path: System/xen +## Description: xen domain start/stop on boot +## Type: string +## Default: + +# NOTE: "xenbus is enabled" means not only that /proc/xen/xenbus exists +# but also that /usr/bin/xenstore-* tools are installed. + +## Type: boolean +## Default: false +# +# If XENBALLOON_SELF is true, selfballooning will occur, meaning the +# balloon driver will grow and shrink according to available memory. +# If xenbus is enabled, may be overridden by {memory/selfballoon}==0 +# If false but xenballoond is able to communicate with domain0 via +# xenbus, balloon targets will be set by domain0 +# +XENBALLOON_SELF=false + +## Type: integer (must be > 0) +## Default: 1 +# +# If self-ballooning, number of seconds between checks/adjustments. +# If xenbus is enabled, may be overridden by {memory/interval} +XENBALLOON_SELF_INTERVAL=1 + +## Type: integer (must be > 0) +## Default: 1 +# +# If NOT self-ballooning but xenbus is enabled, number of seconds between +# checks/adjustments. May be overridden by {memory/interval} +XENBALLOON_INTERVAL=1 + +## Type: integer (must be > 0) +## Default: 10 +# +# When current > target, reduces rate at which target memory is ballooned +# out. For a value of n, 1/n of the difference will be ballooned. +# This value applies both to selfballooning and directed ballooning. +# May be overridden by {memory/downhysteresis} +XENBALLOON_AUTO_DOWNHYSTERESIS=10 + +## Type: integer (must be > 0) +## Default: 1 +# +# When current < target, reduces rate at which target memory is reclaimed +# (if available). For a value of n, 1/n of the difference will be ballooned. +# This value applies both to selfballooning and directed ballooning. 
+# May be overridden by {memory/uphysteresis} +XENBALLOON_AUTO_UPHYSTERESIS=1 + +## Type: integer (must be >= 0) +## Default: 0 +# +# In order to avoid ballooning so much memory that a guest experiences +# out-of-memory errors (OOMs), memory will not be ballooned out below +# a minimum target, in MB. If this value is 0 (default), an heuristic +# based on the maximum amount of memory will be used. (The heuristic +# provides the same minimum as recent versions of the balloon driver but +# early versions of the balloon driver did not enforce a minimum.) +XENBALLOON_MINMEM=0 + +## Type: string +## Default: "/var/run/xenballoon-maxmem" +# +# Location where memory high-water mark is stored; if a guest supports +# hot-add memory, maxmem might increase across time and the minimum +# target heuristic is based on max memory. NOTE: Reboot after changing +# this variable, else overballooning may occur. +XENBALLOON_MAXMEMFILE=/var/run/xenballoon-maxmem + +## Type: integer (0 or 1) +## Default: 1 +# +# If xenbus is enabled, whether selfballooning or directed ballooning, +# place the result of 'cat /proc/meminfo" on xenbus at memory/meminfo +XENBALLOON_SEND_MEMINFO=1 + +## Type: integer (0 or 1) +## Default: 1 +# +# If xenbus is enabled, whether selfballooning or directed ballooning, +# place the result of 'cat /proc/vmstat" on xenbus at memory/vmstat +XENBALLOON_SEND_VMSTAT=1 + +## Type: integer (0 or 1) +## Default: 1 +# +# If xenbus is enabled, whether selfballooning or directed ballooning, +# place the result of 'cat /proc/uptime" on xenbus at memory/uptime +XENBALLOON_SEND_UPTIME=1 diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xenballoon/xenballoond Wed Jul 02 11:30:37 2008 +0900 @@ -0,0 +1,205 @@ +#!/bin/bash +# +# Copyright (C) 2008 Oracle Corporation and/or its affiliates. +# All rights reserved. +# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx> +# +# xenballoond - In-guest engine for Xen memory ballooning +# Version: 080630 +# +# Two "policies" are implemented: +# - Selfballooning: Adjust memory periodically, with no (or little) input +# from domain0. Target memory is determined solely by the +# Committed_AS line in /proc/meminfo, but parameters may adjust +# the rate at which the target is achieved. +# - Directed ballooning: Adjust memory solely as directed by domain0 +# +# Under some circumstances, "output" may also be generated; the contents +# of /proc/meminfo and /proc/vmstat may be periodically placed on xenbus. +# +# If xenbus is running and the /usr/bin/xenstore-* tools are installed, +# "xenbus is enabled". +# +# Parameters are documented in /etc/sysconfig/xenballoon.conf. Although +# some are not used with directed ballooning, all must be set properly. +# If xenbus is enabled, some of these parameters may be overridden by values +# set by domain0 via xenbus. 
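[Editor's note -- illustrative only, not part of the patch above.] The xenballoond header comment describes the selfballooning policy: the target is taken from Committed_AS in /proc/meminfo, and the downhysteresis/uphysteresis parameters rate-limit each adjustment to 1/n of the difference between current and target memory. The following standalone C sketch restates that arithmetic under the stated assumption that balloon_step() and the sample values are names and numbers chosen here for illustration, not code from the patch:

/*
 * Sketch of the hysteresis-limited target step that balloon_to_target()
 * performs in shell.  Compile with: cc -o balloon-step balloon-step.c
 */
#include <stdio.h>
#include <stdint.h>

/* One adjustment step: returns the new balloon target in bytes. */
static uint64_t balloon_step(uint64_t cur_bytes, uint64_t tgt_bytes,
                             unsigned downhys, unsigned uphys)
{
    if (cur_bytes > tgt_bytes && downhys != 0)
        /* shrinking: balloon out only 1/downhys of the difference */
        return cur_bytes - (cur_bytes - tgt_bytes) / downhys;
    if (cur_bytes < tgt_bytes && uphys != 0)
        /* growing: reclaim only 1/uphys of the difference */
        return cur_bytes + (tgt_bytes - cur_bytes) / uphys;
    return tgt_bytes;
}

int main(void)
{
    uint64_t cur = 1024ULL << 20;      /* current size: 1024 MB */
    uint64_t tgt = 512ULL << 20;       /* Committed_AS-derived target: 512 MB */
    unsigned downhys = 10, uphys = 1;  /* defaults from xenballoon.conf */
    int i;

    for (i = 0; i < 5; i++) {
        cur = balloon_step(cur, tgt, downhys, uphys);
        printf("step %d: %llu MB\n", i + 1, (unsigned long long)(cur >> 20));
    }
    return 0;
}

With the shipped defaults (downhysteresis=10, uphysteresis=1) the balloon inflates gradually toward a lower target but deflates immediately when the guest's committed memory rises.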
+ +minmb() { + RETVAL=$XENBALLOON_MINMEM + if [ $RETVAL -ne 0 ]; then + return $RETVAL + fi + kb=`cat $XENBALLOON_MAXMEMFILE` + let "mb=$kb/1024" + let "pages=$kb/4" + # this algorithm from drivers/xen/balloon/balloon.c:minimum_target() + # which was added to balloon.c in 2008 to avoid ballooning too small + # it is unnecessary here except to accomodate pre-2008 balloon drivers + # note that ranges are adjusted because a VM with "memory=1024" + # gets somewhat less than 1024MB + if [ $mb -lt 125 ]; then + let RETVAL="$(( 8 + ($pages >> 9) ))" + elif [ $mb -lt 500 ]; then + let RETVAL="$(( 40 + ($pages >> 10) ))" + elif [ $mb -lt 2000 ]; then + let RETVAL="$(( 104 + ($pages >> 11) ))" + else + let RETVAL="$(( 296 + ($pages >> 13) ))" + fi + return # value returned in RETVAL in mB +} + +curkb() { + kb=`grep MemTotal /proc/meminfo | sed 's/ */ /' | \ + cut -f2 -d' '` + RETVAL=$kb + return # value returned in RETVAL in kB +} + +downhysteresis() { + RETVAL=$XENBALLOON_AUTO_DOWNHYSTERESIS + if [ $xenstore_enabled = "true" ]; then + if xenstore-exists memory/downhysteresis ; then + RETVAL=`xenstore-read memory/downhysteresis` + fi + fi + return +} + +uphysteresis() { + RETVAL=$XENBALLOON_AUTO_UPHYSTERESIS + if [ $xenstore_enabled = "true" ]; then + if xenstore-exists memory/uphysteresis ; then + RETVAL=`xenstore-read memory/uphysteresis` + fi + fi + return +} + +selfballoon_eval() { + if [ $xenstore_enabled = "true" ]; then + if xenstore-exists memory/selfballoon; then + RETVAL=`xenstore-read memory/selfballoon` + if [ $RETVAL -eq 1 ]; then + selfballoon_enabled=true + return + fi + fi + fi + selfballoon_enabled=$XENBALLOON_SELF + return +} + +selftarget() { + tgtkb=`grep Committed_AS /proc/meminfo | sed 's/ */ /' | cut -f2 -d' '` + minmb + let "minbytes=$RETVAL*1024*1024" + let "tgtbytes=$tgtkb*1024" + if [ $tgtbytes -lt $minbytes ]; then + let "tgtbytes=$minbytes" + fi + RETVAL=$tgtbytes # value returned in RETVAL in bytes + return +} + +# $1 == 1 means use selftarget, else target in kB +balloon_to_target() { + if [ "$1" -eq 1 ]; then + selftarget + tgtbytes=$RETVAL + else + let "tgtbytes=$(( $1 * 1024 ))" + fi + curkb + let "curbytes=$RETVAL*1024" + if [ $curbytes -gt $tgtbytes ]; then + downhysteresis + downhys=$RETVAL + if [ $downhys -ne 0 ]; then + let "tgtbytes=$(( $curbytes - \ + ( ( $curbytes - $tgtbytes ) / $downhys ) ))" + fi + else if [ $curbytes -lt $tgtbytes ]; then + uphysteresis + uphys=$RETVAL + let "tgtbytes=$(( $curbytes + \ + ( ( $tgtbytes - $curbytes ) / $uphys ) ))" + fi + fi + echo $tgtbytes > /proc/xen/balloon + if [ $xenstore_enabled = "true" ]; then + let "tgtkb=$(( $tgtbytes/1024 ))" + xenstore-write memory/selftarget $tgtkb + fi +} + +send_memory_stats() { + if [ ! $xenstore_enabled = "true" ]; then + return + fi + if [ $XENBALLOON_SEND_MEMINFO ]; then + xenstore-write memory/meminfo "`cat /proc/meminfo`" + fi + if [ $XENBALLOON_SEND_VMSTAT ]; then + xenstore-write memory/vmstat "`cat /proc/vmstat`" + fi + if [ $XENBALLOON_SEND_UPTIME ]; then + xenstore-write memory/uptime "`cat /proc/uptime`" + fi +} + +if [ ! -f /proc/xen/balloon ]; then + echo "$0: no balloon driver installed" + exit 0 +fi +if [ ! -f /proc/meminfo ]; then + echo "$0: can't read /proc/meminfo" + exit 0 +fi +xenstore_enabled=true +if [ -f /usr/bin/xenstore-exists -a -f /usr/bin/xenstore-read -a \ + -f /usr/bin/xenstore-write ]; then + xenstore_enabled=true +else + echo "$0: missing /usr/bin/xenstore-* tools, disabling directed ballooning" + xenstore_enabled=false +fi + +. 
/etc/sysconfig/xenballoon.conf + +while true; +do + # handle special case for PV domains with hot-add memory + if [ ! -f $XENBALLOON_MAXMEMFILE ]; then + maxkb=0 + else + maxkb=`cat $XENBALLOON_MAXMEMFILE` + fi + curkb=`grep MemTotal /proc/meminfo | sed 's/ */ /' | cut -f2 -d' '` + if [ $curkb -gt $maxkb ]; then + echo $curkb > $XENBALLOON_MAXMEMFILE + fi + interval=$XENBALLOON_INTERVAL + # do self-ballooning + selfballoon_eval + if [ $selfballoon_enabled = "true" ]; then + balloon_to_target 1 + interval=$XENBALLOON_SELF_INTERVAL + # or do directed ballooning + elif [ $xenstore_enabled = "true" ]; then + if xenstore-exists memory/target ; then + tgtkb=`xenstore-read memory/target` + balloon_to_target $tgtkb + fi + interval=$XENBALLOON_INTERVAL + fi + send_memory_stats + if [ $xenstore_enabled = "true" ]; then + if xenstore-exists memory/interval ; then + interval=`xenstore-read memory/interval` + fi + fi + sleep $interval +done & + diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond.README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xenballoon/xenballoond.README Wed Jul 02 11:30:37 2008 +0900 @@ -0,0 +1,82 @@ +Xenballoond.README +Preliminary version 0.1, 2008/06/30 + +Copyright (C) 2008 Oracle Corporation and/or its affiliates. +All rights reserved. +Written by Dan Magenheimer <dan.magenheimer@xxxxxxxxxx> + +INTRODUCTION + +Xenballoond runs in guest domains and both implements selfballooning and +provides metrics to dom0 for (future) directed ballooning. Both capabilities +provide a foundation for basic "memory overcommit" functionality. + +With selfballooning enabled, xenballoond uses the Committed_AS value found +in /proc/meminfo as a first approximation of how much memory is required +by the guest and feeds this statistic back to the balloon driver to inflate +or deflate the balloon as required to achieve the target guest memory size. +Hysteresis parameters may be adjusted to rate-limit balloon inflation +and deflation. + +If configured, certain selfballooning parameters -- including notably +enabling/disabling of self-ballooning -- can be controlled from domain0. +(These are fully documented in xenballoon.conf.) + +If configured, the following guest statistics are sent back to domain0: +- /proc/meminfo +- /proc/vmstat +- /proc/uptime +In a future release, some of these values will be used by a policy module +in domain0 to control guest balloon size and provide memory balancing +across all guests on a given system. + +Note that no page sharing (content-based or otherwise) is implemented +and no VMM-based swapping is necessary. 
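[Editor's note -- illustrative only, not part of the patch.] The README's introduction ends by noting that the guest is sized from Committed_AS; the minmb() helper in xenballoond (borrowed from the balloon driver's minimum_target()) then enforces a floor derived from the guest's maximum memory so that ballooning never drives the guest into OOM. A self-contained C restatement of that heuristic follows; min_target_mb() and the sample sizes are names and values chosen here for illustration:

/*
 * Sketch of the minimum-target heuristic used by xenballoond's minmb().
 * Compile with: cc -o min-target min-target.c
 */
#include <stdio.h>

/* max_kb: the memory high-water mark recorded in XENBALLOON_MAXMEMFILE,
 * in kB.  Returns the minimum allowed guest memory in MB. */
static unsigned long min_target_mb(unsigned long max_kb)
{
    unsigned long mb = max_kb / 1024;
    unsigned long pages = max_kb / 4;      /* 4 kB pages */

    if (mb < 125)
        return 8 + (pages >> 9);
    else if (mb < 500)
        return 40 + (pages >> 10);
    else if (mb < 2000)
        return 104 + (pages >> 11);
    else
        return 296 + (pages >> 13);
}

int main(void)
{
    unsigned long sizes_kb[] = { 128 * 1024, 512 * 1024, 1024 * 1024, 4096 * 1024 };
    unsigned i;

    for (i = 0; i < sizeof(sizes_kb) / sizeof(sizes_kb[0]); i++)
        printf("maxmem %5lu MB -> minimum target %4lu MB\n",
               sizes_kb[i] / 1024, min_target_mb(sizes_kb[i]));
    return 0;
}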
+ +For more information, see: +http://www.xen.org/files/xensummitboston08/MemoryOvercommit-XenSummit2008.pdf +http://wiki.xensource.com/xenwiki/Open_Topics_For_Discussion?action=AttachFile&do=get&target=Memory+Overcommit.pdf + +INSTALLATION AND DEPLOYMENT + +In this preliminary release: +- directed ballooning is not implemented, though a monitor is provided +- only Redhat-based guests are supported + +Guest prerequisites to use xenballoond: +- each guest must be configured with adequate[1] swap space +- each guest must have the balloon driver installed (/proc/xen/balloon exists) +- if directed ballooning (or monitoring) is desired, xenstore tools must be + installed in each guest in /usr/bin [2] + +[1] for best results, for a guest that is configured with maxmem=N and + requires Z MB of swap space without xenballoond, available swap should + be increased to N+Z MB when xenballoond is running +[2] specifically xenstore-read, xenstore-exists, and xenstore-write must + be installed. Binaries can be obtained, for example, by building + xen-vvv.gz/tools in a guest-binary-compatible development tree + +Instructions to install/deploy xenballoond (in Redhat-based system): +- in each guest: + - ensure pre-requisites are met (see above) + - place xenballoon.conf in /etc/sysconfig + - place xenballoond in /usr/sbin + - copy xenballoond.init to /etc/rc.d/init.d/xenballoond (note file rename) + - edit /etc/sysconfig/xenballoond.conf as desired (especially note that + selfballooning defaults as off) + - start xenballoond with "service xenballoond start", and/or configure + xenballoond to start at init (e.g. "chkconfig xenballoond on") +- in domain0: + - if monitoring is desired, xenballoon-monitor may be installed in /usr/sbin +- note that certain xenballoond.conf variables may be overridden by domain0 + if xenstore is running in the guest; these are fully documented in + xenballoond.conf + +TODO: +080630 modifications to support SUSE-based and debian-based guests +080630 domain0 ballooning policy module +080630 experiment with more aggressive (optionally) memory minimum targets +080630 BUG: xenballoond doesn't properly record the fact that it's running; + e.g. flipping between run levels 5 and 3 launches additional daemons +080630 BUG: reports of possible incompatibilites between ballooning and + save/restore/migrate have not been duplicated diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond.init --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xenballoon/xenballoond.init Wed Jul 02 11:30:37 2008 +0900 @@ -0,0 +1,91 @@ +#!/bin/bash +# +# xenballoond Script to start and stop Xen ballooning daemon. +# +# Copyright (C) 2008 Oracle Corporation and/or its affiliates. +# All rights reserved. +# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx> +# +# chkconfig: 2345 98 01 +# description: Starts and stops the Xen control daemon. +### BEGIN INIT INFO +# Provides: xenballoond +# Required-Start: $syslog $remote_fs +# Should-Start: +# Required-Stop: $syslog $remote_fs +# Should-Stop: +# Default-Start: 3 4 5 +# Default-Stop: 0 1 2 6 +# Default-Enabled: yes +# Short-Description: Start/stop xend +# Description: Starts and stops the Xen ballooning daemon. +### END INIT INFO + +# Source function library +. /etc/init.d/functions + +#don't use in domain0 +[ -f /proc/xen/capabilities ] && \ + grep -q "control_d" /proc/xen/capabilities && exit 0 + +if [ -f /etc/sysconfig/xenballoon.conf ]; then + . /etc/sysconfig/xenballoon.conf +fi + +# Check that balloon driver is present +[ ! 
-f /proc/xen/balloon ] && exit 0 + +# Record original memory (in kB) +[ -z "$XENBALLOON_MAXMEMFILE" ] && exit 0 +let maxmem=`grep MemTotal /proc/meminfo | sed 's/ */ /' | cut -f2 -d' '` +if [ -f "$XENBALLOON_MAXMEMFILE" ]; then + let oldmax=`cat $XENBALLOON_MAXMEMFILE` + if [ $oldmax -gt $maxmem ]; then + let maxmem=oldmax + fi +fi +echo $maxmem > $XENBALLOON_MAXMEMFILE + +RETVAL=0 +prog="xenballoond" + +start() { + # Start daemons. + echo -n $"Starting $prog: " + daemon xenballoond $OPTIONS + RETVAL=$? + echo + return $RETVAL +} + +stop() { + echo -n $"Shutting down $prog: " + killproc xenballoond + RETVAL=$? + echo + return $RETVAL +} + +# See how we were called. +case "$1" in + start) + start + ;; + stop) + stop + ;; + status) + status xenballoond + RETVAL=$? + ;; + restart|reload) + stop + start + RETVAL=$? + ;; + *) + echo $"Usage: $0 {start|stop|restart|status}" + exit 1 +esac + +exit $RETVAL diff -r 11318234588e -r 08f77df14cba tools/xentrace/xenctx.c --- a/tools/xentrace/xenctx.c Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/xentrace/xenctx.c Wed Jul 02 11:30:37 2008 +0900 @@ -702,7 +702,7 @@ void dump_ctx(int vcpu) void dump_ctx(int vcpu) { int ret; - vcpu_guest_context_t ctx; + vcpu_guest_context_any_t ctx; xc_dominfo_t dominfo; xc_handle = xc_interface_open(); /* for accessing control interface */ @@ -727,10 +727,10 @@ void dump_ctx(int vcpu) exit(-1); } - print_ctx(&ctx); + print_ctx(&ctx.c); #ifndef NO_TRANSLATION - if (is_kernel_text(INSTR_POINTER((&ctx.user_regs)))) - print_stack(&ctx, vcpu); + if (is_kernel_text(INSTR_POINTER((&ctx.c.user_regs)))) + print_stack(&ctx.c, vcpu); #endif if (!dominfo.paused) { diff -r 11318234588e -r 08f77df14cba tools/xm-test/lib/XmTestLib/block_utils.py --- a/tools/xm-test/lib/XmTestLib/block_utils.py Thu Jun 19 12:48:04 2008 +0900 +++ b/tools/xm-test/lib/XmTestLib/block_utils.py Wed Jul 02 11:30:37 2008 +0900 @@ -15,7 +15,7 @@ __all__ = [ "block_attach", "block_detac def get_state(domain, devname): - number = xen.util.blkif.blkdev_name_to_number(devname) + (path, number) = xen.util.blkif.blkdev_name_to_number(devname) s, o = traceCommand("xm block-list %s | awk '/^%d/ {print $4}'" % (domain.getName(), number)) if s != 0: diff -r 11318234588e -r 08f77df14cba xen/arch/ia64/vmx/vmx_hypercall.c --- a/xen/arch/ia64/vmx/vmx_hypercall.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/ia64/vmx/vmx_hypercall.c Wed Jul 02 11:30:37 2008 +0900 @@ -204,6 +204,53 @@ do_hvm_op(unsigned long op, XEN_GUEST_HA rc = -ENOSYS; break; + case HVMOP_modified_memory: + { + struct xen_hvm_modified_memory a; + struct domain *d; + unsigned long pfn; + + if ( copy_from_guest(&a, arg, 1) ) + return -EFAULT; + + if ( a.domid == DOMID_SELF ) + { + d = rcu_lock_current_domain(); + } + else + { + if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL ) + return -ESRCH; + if ( !IS_PRIV_FOR(current->domain, d) ) + { + rc = -EPERM; + goto param_fail3; + } + } + + rc = -EINVAL; + if ( !is_hvm_domain(d) ) + goto param_fail3; + + rc = -EINVAL; + if ( a.first_pfn > domain_get_maximum_gpfn(d) + || a.first_pfn + a.nr - 1 < a.first_pfn + || a.first_pfn + a.nr - 1 > domain_get_maximum_gpfn(d)) + goto param_fail3; + + rc = 0; + if ( !d->arch.shadow_bitmap ) + goto param_fail3; + + for (pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++) + if (pfn < d->arch.shadow_bitmap_size) + set_bit(pfn, d->arch.shadow_bitmap); + + param_fail3: + rcu_unlock_domain(d); + break; + } + default: gdprintk(XENLOG_INFO, "Bad HVM op %ld.\n", op); rc = -ENOSYS; diff -r 11318234588e -r 08f77df14cba xen/arch/ia64/xen/mm.c 
--- a/xen/arch/ia64/xen/mm.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/ia64/xen/mm.c Wed Jul 02 11:30:37 2008 +0900 @@ -207,7 +207,7 @@ alloc_dom_xen_and_dom_io(void) * Any Xen-heap pages that we will allow to be mapped will have * their domain field set to dom_xen. */ - dom_xen = alloc_domain(DOMID_XEN); + dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0); BUG_ON(dom_xen == NULL); /* @@ -215,7 +215,7 @@ alloc_dom_xen_and_dom_io(void) * This domain owns I/O pages that are within the range of the page_info * array. Mappings occur at the priv of the caller. */ - dom_io = alloc_domain(DOMID_IO); + dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0); BUG_ON(dom_io == NULL); } @@ -1553,7 +1553,7 @@ expose_p2m_init(void) * Initialise our DOMID_P2M domain. * This domain owns m2p table pages. */ - dom_p2m = alloc_domain(DOMID_P2M); + dom_p2m = domain_create(DOMID_P2M, DOMCRF_dummy, 0); BUG_ON(dom_p2m == NULL); dom_p2m->max_pages = ~0U; diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/Makefile --- a/xen/arch/x86/acpi/cpufreq/Makefile Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/acpi/cpufreq/Makefile Wed Jul 02 11:30:37 2008 +0900 @@ -1,3 +1,4 @@ obj-y += cpufreq.o obj-y += cpufreq.o obj-y += utility.o obj-y += cpufreq_ondemand.o +obj-y += powernow.o diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/cpufreq.c --- a/xen/arch/x86/acpi/cpufreq/cpufreq.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c Wed Jul 02 11:30:37 2008 +0900 @@ -47,6 +47,10 @@ struct processor_pminfo processor_pminfo struct processor_pminfo processor_pminfo[NR_CPUS]; struct cpufreq_policy xen_px_policy[NR_CPUS]; +static cpumask_t *cpufreq_dom_pt; +static cpumask_t cpufreq_dom_mask; +static unsigned int cpufreq_dom_max; + enum { UNDEFINED_CAPABLE = 0, SYSTEM_INTEL_MSR_CAPABLE, @@ -60,7 +64,6 @@ struct acpi_cpufreq_data { struct processor_performance *acpi_data; struct cpufreq_frequency_table *freq_table; unsigned int max_freq; - unsigned int resume; unsigned int cpu_feature; }; @@ -328,14 +331,16 @@ static int acpi_cpufreq_target(struct cp next_perf_state = data->freq_table[next_state].index; if (perf->state == next_perf_state) { - if (unlikely(data->resume)) { - printk("xen_pminfo: @acpi_cpufreq_target, " - "Called after resume, resetting to P%d\n", + if (unlikely(policy->resume)) { + printk(KERN_INFO "Called after resume, resetting to P%d\n", next_perf_state); - data->resume = 0; + policy->resume = 0; } - else + else { + printk(KERN_INFO "Already at target state (P%d)\n", + next_perf_state); return 0; + } } switch (data->cpu_feature) { @@ -531,7 +536,7 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol * the first call to ->target() should result in us actually * writing something to the appropriate registers. 
*/ - data->resume = 1; + policy->resume = 1; return result; @@ -549,61 +554,101 @@ static struct cpufreq_driver acpi_cpufre .init = acpi_cpufreq_cpu_init, }; -int acpi_cpufreq_init(void) -{ - unsigned int i, ret = 0; - unsigned int dom, max_dom = 0; - cpumask_t *pt, dom_mask; - - cpus_clear(dom_mask); +void cpufreq_dom_exit(void) +{ + cpufreq_dom_max = 0; + cpus_clear(cpufreq_dom_mask); + if (cpufreq_dom_pt) + xfree(cpufreq_dom_pt); +} + +int cpufreq_dom_init(void) +{ + unsigned int i; + + cpufreq_dom_max = 0; + cpus_clear(cpufreq_dom_mask); for_each_online_cpu(i) { - cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask); - if (max_dom < processor_pminfo[i].perf.domain_info.domain) - max_dom = processor_pminfo[i].perf.domain_info.domain; - } - max_dom++; - - pt = xmalloc_array(cpumask_t, max_dom); - if (!pt) + cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask); + if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain) + cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain; + } + cpufreq_dom_max++; + + cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max); + if (!cpufreq_dom_pt) return -ENOMEM; - memset(pt, 0, max_dom * sizeof(cpumask_t)); - - /* get cpumask of each psd domain */ + memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t)); + for_each_online_cpu(i) - cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]); + cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]); for_each_online_cpu(i) - processor_pminfo[i].perf.shared_cpu_map = - pt[processor_pminfo[i].perf.domain_info.domain]; - - cpufreq_driver = &acpi_cpufreq_driver; - - /* setup cpufreq infrastructure */ + processor_pminfo[i].perf.shared_cpu_map = + cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]; + + return 0; +} + +static int cpufreq_cpu_init(void) +{ + int i, ret = 0; + for_each_online_cpu(i) { xen_px_policy[i].cpu = i; ret = px_statistic_init(i); if (ret) - goto out; + return ret; ret = acpi_cpufreq_cpu_init(&xen_px_policy[i]); if (ret) - goto out; - } - - /* setup ondemand cpufreq */ - for (dom=0; dom<max_dom; dom++) { - if (!cpu_isset(dom, dom_mask)) + return ret; + } + return ret; +} + +int cpufreq_dom_dbs(unsigned int event) +{ + int cpu, dom, ret = 0; + + for (dom=0; dom<cpufreq_dom_max; dom++) { + if (!cpu_isset(dom, cpufreq_dom_mask)) continue; - i = first_cpu(pt[dom]); - ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START); + cpu = first_cpu(cpufreq_dom_pt[dom]); + ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event); if (ret) - goto out; - } - -out: - xfree(pt); - + return ret; + } return ret; } + +int acpi_cpufreq_init(void) +{ + int ret = 0; + + /* setup cpumask of psd dom and shared cpu map of cpu */ + ret = cpufreq_dom_init(); + if (ret) + goto err; + + /* setup cpufreq driver */ + cpufreq_driver = &acpi_cpufreq_driver; + + /* setup cpufreq infrastructure */ + ret = cpufreq_cpu_init(); + if (ret) + goto err; + + /* setup cpufreq dbs according to dom coordiation */ + ret = cpufreq_dom_dbs(CPUFREQ_GOV_START); + if (ret) + goto err; + + return ret; + +err: + cpufreq_dom_exit(); + return ret; +} diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c --- a/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c Wed Jul 02 11:30:37 2008 +0900 @@ -52,7 +52,7 @@ static struct dbs_tuners { static struct timer dbs_timer[NR_CPUS]; -static inline uint64_t get_cpu_idle_time(unsigned int cpu) +inline uint64_t 
get_cpu_idle_time(unsigned int cpu) { uint64_t idle_ns; struct vcpu *v; @@ -79,6 +79,12 @@ static void dbs_check_cpu(struct cpu_dbs return; policy = this_dbs_info->cur_policy; + + if (unlikely(policy->resume)) { + __cpufreq_driver_target(policy, policy->max,CPUFREQ_RELATION_H); + return; + } + cur_ns = NOW(); total_ns = cur_ns - this_dbs_info->prev_cpu_wall; this_dbs_info->prev_cpu_wall = NOW(); @@ -217,8 +223,7 @@ int cpufreq_governor_dbs(struct cpufreq_ break; case CPUFREQ_GOV_STOP: - if (this_dbs_info->enable) - dbs_timer_exit(this_dbs_info); + dbs_timer_exit(this_dbs_info); dbs_enable--; break; @@ -233,5 +238,4 @@ int cpufreq_governor_dbs(struct cpufreq_ break; } return 0; -} - +} diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/powernow.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/acpi/cpufreq/powernow.c Wed Jul 02 11:30:37 2008 +0900 @@ -0,0 +1,305 @@ +/* + * powernow - AMD Architectural P-state Driver ($Revision: 1.4 $) + * + * Copyright (C) 2008 Mark Langsdorf <mark.langsdorf@xxxxxxx> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
+ * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <xen/types.h> +#include <xen/errno.h> +#include <xen/delay.h> +#include <xen/cpumask.h> +#include <xen/timer.h> +#include <xen/xmalloc.h> +#include <asm/bug.h> +#include <asm/msr.h> +#include <asm/io.h> +#include <asm/config.h> +#include <asm/processor.h> +#include <asm/percpu.h> +#include <asm/cpufeature.h> +#include <acpi/acpi.h> +#include <acpi/cpufreq/cpufreq.h> + +#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 +#define USE_HW_PSTATE 0x00000080 +#define HW_PSTATE_MASK 0x00000007 +#define HW_PSTATE_VALID_MASK 0x80000000 +#define HW_PSTATE_MAX_MASK 0x000000f0 +#define HW_PSTATE_MAX_SHIFT 4 +#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ +#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ +#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ +#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ + +extern struct processor_pminfo processor_pminfo[NR_CPUS]; +extern struct cpufreq_policy xen_px_policy[NR_CPUS]; + +struct powernow_cpufreq_data { + struct processor_performance *acpi_data; + struct cpufreq_frequency_table *freq_table; + unsigned int max_freq; + unsigned int resume; + unsigned int cpu_feature; +}; + +static struct powernow_cpufreq_data *drv_data[NR_CPUS]; + +struct drv_cmd { + unsigned int type; + cpumask_t mask; + u64 addr; + u32 val; +}; + +static void transition_pstate(void *drvcmd) +{ + struct drv_cmd *cmd; + cmd = (struct drv_cmd *) drvcmd; + + wrmsr(MSR_PSTATE_CTRL, cmd->val, 0); +} + +static int powernow_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ + struct powernow_cpufreq_data *data = drv_data[policy->cpu]; + struct processor_performance *perf; + struct cpufreq_freqs freqs; + cpumask_t online_policy_cpus; + struct drv_cmd cmd; + unsigned int next_state = 0; /* Index into freq_table */ + unsigned int next_perf_state = 0; /* Index into perf table */ + int result = 0; + + if (unlikely(data == NULL || + data->acpi_data == NULL || data->freq_table == NULL)) { + return -ENODEV; + } + + perf = data->acpi_data; + result = cpufreq_frequency_table_target(policy, + data->freq_table, + target_freq, + relation, &next_state); + if (unlikely(result)) + return -ENODEV; + + online_policy_cpus = policy->cpus; + + next_perf_state = data->freq_table[next_state].index; + if (perf->state == next_perf_state) { + if (unlikely(data->resume)) + data->resume = 0; + else + return 0; + } + + cpus_clear(cmd.mask); + + if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) + cmd.mask = online_policy_cpus; + else + cpu_set(policy->cpu, cmd.mask); + + freqs.old = perf->states[perf->state].core_frequency * 1000; + freqs.new = data->freq_table[next_state].frequency; + + cmd.val = next_perf_state; + + on_selected_cpus( cmd.mask, transition_pstate, (void *) &cmd, 0, 0); + + perf->state = next_perf_state; + policy->cur = freqs.new; + + return result; +} + +static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i; + unsigned int valid_states = 0; + unsigned int cpu = policy->cpu; + struct powernow_cpufreq_data *data; + unsigned int result = 0; + struct processor_performance *perf; + u32 max_hw_pstate, hi = 0, lo = 0; + + data = xmalloc(struct powernow_cpufreq_data); + if (!data) + return -ENOMEM; + memset(data, 0, sizeof(struct powernow_cpufreq_data)); + + drv_data[cpu] = data; + + data->acpi_data = &processor_pminfo[cpu].perf; + + perf = data->acpi_data; + policy->shared_type 
= perf->shared_type; + + /* + * Will let policy->cpus know about dependency only when software + * coordination is required. + */ + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || + policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { + policy->cpus = perf->shared_cpu_map; + } else { + policy->cpus = cpumask_of_cpu(cpu); + } + + /* capability check */ + if (perf->state_count <= 1) { + printk("No P-States\n"); + result = -ENODEV; + goto err_unreg; + } + rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); + max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; + + if (perf->control_register.space_id != perf->status_register.space_id) { + result = -ENODEV; + goto err_unreg; + } + + data->freq_table = xmalloc_array(struct cpufreq_frequency_table, + (perf->state_count+1)); + if (!data->freq_table) { + result = -ENOMEM; + goto err_unreg; + } + + /* detect transition latency */ + policy->cpuinfo.transition_latency = 0; + for (i=0; i<perf->state_count; i++) { + if ((perf->states[i].transition_latency * 1000) > + policy->cpuinfo.transition_latency) + policy->cpuinfo.transition_latency = + perf->states[i].transition_latency * 1000; + } + + data->max_freq = perf->states[0].core_frequency * 1000; + /* table init */ + for (i=0; i<perf->state_count && i<max_hw_pstate; i++) { + if (i>0 && perf->states[i].core_frequency >= + data->freq_table[valid_states-1].frequency / 1000) + continue; + + data->freq_table[valid_states].index = perf->states[i].control & HW_PSTATE_MASK; + data->freq_table[valid_states].frequency = + perf->states[i].core_frequency * 1000; + valid_states++; + } + data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; + perf->state = 0; + + result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); + if (result) + goto err_freqfree; + + /* + * the first call to ->target() should result in us actually + * writing something to the appropriate registers. 
+ */ + data->resume = 1; + + policy->cur = data->freq_table[i].frequency; + return result; + +err_freqfree: + xfree(data->freq_table); +err_unreg: + xfree(data); + drv_data[cpu] = NULL; + + return result; +} + +static struct cpufreq_driver powernow_cpufreq_driver = { + .target = powernow_cpufreq_target, + .init = powernow_cpufreq_cpu_init, +}; + +int powernow_cpufreq_init(void) +{ + unsigned int i, ret = 0; + unsigned int dom, max_dom = 0; + cpumask_t *pt, dom_mask; + + cpus_clear(dom_mask); + + for_each_online_cpu(i) { + struct cpuinfo_x86 *c = &cpu_data[i]; + if (c->x86_vendor != X86_VENDOR_AMD) + ret = -ENODEV; + else + { + u32 eax, ebx, ecx, edx; + cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) + ret = -ENODEV; + } + if (ret) + return ret; + cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask); + if (max_dom < processor_pminfo[i].perf.domain_info.domain) + max_dom = processor_pminfo[i].perf.domain_info.domain; + } + max_dom++; + + pt = xmalloc_array(cpumask_t, max_dom); + if (!pt) + return -ENOMEM; + memset(pt, 0, max_dom * sizeof(cpumask_t)); + + /* get cpumask of each psd domain */ + for_each_online_cpu(i) + cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]); + + for_each_online_cpu(i) + processor_pminfo[i].perf.shared_cpu_map = + pt[processor_pminfo[i].perf.domain_info.domain]; + + cpufreq_driver = &powernow_cpufreq_driver; + + /* setup cpufreq infrastructure */ + for_each_online_cpu(i) { + xen_px_policy[i].cpu = i; + + ret = powernow_cpufreq_cpu_init(&xen_px_policy[i]); + if (ret) + goto cpufreq_init_out; + } + + /* setup ondemand cpufreq */ + for (dom=0; dom<max_dom; dom++) { + if (!cpu_isset(dom, dom_mask)) + continue; + i = first_cpu(pt[dom]); + ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START); + if (ret) + goto cpufreq_init_out; + } + +cpufreq_init_out: + xfree(pt); + + return ret; +} diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/utility.c --- a/xen/arch/x86/acpi/cpufreq/utility.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/acpi/cpufreq/utility.c Wed Jul 02 11:30:37 2008 +0900 @@ -37,6 +37,41 @@ struct cpufreq_driver *cpufreq_driver; * Px STATISTIC INFO * *********************************************************************/ +void px_statistic_suspend(void) +{ + int cpu; + uint64_t now; + + now = NOW(); + + for_each_online_cpu(cpu) { + struct pm_px *pxpt = &px_statistic_data[cpu]; + uint64_t total_idle_ns; + uint64_t tmp_idle_ns; + + total_idle_ns = get_cpu_idle_time(cpu); + tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall; + + pxpt->u.pt[pxpt->u.cur].residency += + now - pxpt->prev_state_wall; + pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns; + } +} + +void px_statistic_resume(void) +{ + int cpu; + uint64_t now; + + now = NOW(); + + for_each_online_cpu(cpu) { + struct pm_px *pxpt = &px_statistic_data[cpu]; + pxpt->prev_state_wall = now; + pxpt->prev_idle_wall = get_cpu_idle_time(cpu); + } +} + void px_statistic_update(cpumask_t cpumask, uint8_t from, uint8_t to) { uint32_t i; @@ -47,15 +82,22 @@ void px_statistic_update(cpumask_t cpuma for_each_cpu_mask(i, cpumask) { struct pm_px *pxpt = &px_statistic_data[i]; uint32_t statnum = processor_pminfo[i].perf.state_count; + uint64_t total_idle_ns; + uint64_t tmp_idle_ns; + + total_idle_ns = get_cpu_idle_time(i); + tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall; pxpt->u.last = from; pxpt->u.cur = to; pxpt->u.pt[to].count++; pxpt->u.pt[from].residency += now - pxpt->prev_state_wall; + pxpt->u.pt[from].residency 
-= tmp_idle_ns; (*(pxpt->u.trans_pt + from*statnum + to))++; pxpt->prev_state_wall = now; + pxpt->prev_idle_wall = total_idle_ns; } } @@ -87,6 +129,7 @@ int px_statistic_init(int cpuid) pxpt->u.pt[i].freq = pmpt->perf.states[i].core_frequency; pxpt->prev_state_wall = NOW(); + pxpt->prev_idle_wall = get_cpu_idle_time(cpuid); return 0; } @@ -107,6 +150,7 @@ void px_statistic_reset(int cpuid) } pxpt->prev_state_wall = NOW(); + pxpt->prev_idle_wall = get_cpu_idle_time(cpuid); } @@ -242,3 +286,62 @@ int __cpufreq_driver_getavg(struct cpufr return ret; } + + +/********************************************************************* + * CPUFREQ SUSPEND/RESUME * + *********************************************************************/ + +void cpufreq_suspend(void) +{ + int cpu; + + /* to protect the case when Px was controlled by dom0-kernel */ + /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */ + for_each_online_cpu(cpu) { + struct processor_performance *perf = &processor_pminfo[cpu].perf; + + if (!perf->init) + return; + } + + cpufreq_dom_dbs(CPUFREQ_GOV_STOP); + + cpufreq_dom_exit(); + + px_statistic_suspend(); +} + +int cpufreq_resume(void) +{ + int cpu, ret = 0; + + /* 1. to protect the case when Px was controlled by dom0-kernel */ + /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */ + /* 2. set state and resume flag to sync cpu to right state and freq */ + for_each_online_cpu(cpu) { + struct processor_performance *perf = &processor_pminfo[cpu].perf; + struct cpufreq_policy *policy = &xen_px_policy[cpu]; + + if (!perf->init) + goto err; + perf->state = 0; + policy->resume = 1; + } + + px_statistic_resume(); + + ret = cpufreq_dom_init(); + if (ret) + goto err; + + ret = cpufreq_dom_dbs(CPUFREQ_GOV_START); + if (ret) + goto err; + + return ret; + +err: + cpufreq_dom_exit(); + return ret; +} diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/pmstat.c --- a/xen/arch/x86/acpi/pmstat.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/acpi/pmstat.c Wed Jul 02 11:30:37 2008 +0900 @@ -71,11 +71,18 @@ int do_get_pm_info(struct xen_sysctl_get case PMSTAT_get_pxstat: { uint64_t now, ct; + uint64_t total_idle_ns; + uint64_t tmp_idle_ns; + + total_idle_ns = get_cpu_idle_time(op->cpuid); + tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall; now = NOW(); pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.ppc; pxpt->u.pt[pxpt->u.cur].residency += now - pxpt->prev_state_wall; + pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns; pxpt->prev_state_wall = now; + pxpt->prev_idle_wall = total_idle_ns; ct = pmpt->perf.state_count; if ( copy_to_guest(op->u.getpx.trans_pt, pxpt->u.trans_pt, ct*ct) ) diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/power.c --- a/xen/arch/x86/acpi/power.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/acpi/power.c Wed Jul 02 11:30:37 2008 +0900 @@ -27,7 +27,7 @@ #include <public/platform.h> #include <asm/tboot.h> -#define pmprintk(_l, _f, _a...) 
printk(_l "<PM> " _f "\n", ## _a ) +#include <acpi/cpufreq/cpufreq.h> static char opt_acpi_sleep[20]; string_param("acpi_sleep", opt_acpi_sleep); @@ -124,9 +124,11 @@ static int enter_state(u32 state) if ( !spin_trylock(&pm_lock) ) return -EBUSY; - pmprintk(XENLOG_INFO, "Preparing system for ACPI S%d state.", state); + printk(XENLOG_INFO "Preparing system for ACPI S%d state.", state); freeze_domains(); + + cpufreq_suspend(); disable_nonboot_cpus(); if ( num_online_cpus() != 1 ) @@ -139,11 +141,14 @@ static int enter_state(u32 state) acpi_sleep_prepare(state); + console_start_sync(); + printk("Entering ACPI S%d state.\n", state); + local_irq_save(flags); if ( (error = device_power_down()) ) { - pmprintk(XENLOG_ERR, "Some devices failed to power down."); + printk(XENLOG_ERR "Some devices failed to power down."); goto done; } @@ -162,8 +167,6 @@ static int enter_state(u32 state) break; } - pmprintk(XENLOG_DEBUG, "Back to C."); - /* Restore CR4 and EFER from cached values. */ write_cr4(read_cr4()); if ( cpu_has_efer ) @@ -171,16 +174,18 @@ static int enter_state(u32 state) device_power_up(); - pmprintk(XENLOG_INFO, "Finishing wakeup from ACPI S%d state.", state); + printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.", state); done: local_irq_restore(flags); + console_end_sync(); acpi_sleep_post(state); if ( !hvm_cpu_up() ) BUG(); enable_cpu: enable_nonboot_cpus(); + cpufreq_resume(); thaw_domains(); spin_unlock(&pm_lock); return error; @@ -206,7 +211,7 @@ int acpi_enter_sleep(struct xenpf_enter_ ((sleep->pm1a_cnt_val ^ sleep->pm1b_cnt_val) & ACPI_BITMASK_SLEEP_ENABLE) ) { - pmprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting."); + gdprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting."); return -EINVAL; } @@ -278,7 +283,7 @@ acpi_status asmlinkage acpi_enter_sleep_ if ( tboot_in_measured_env() ) { tboot_sleep(sleep_state); - pmprintk(XENLOG_ERR, "TBOOT failed entering s3 state\n"); + printk(XENLOG_ERR "TBOOT failed entering s3 state\n"); return_ACPI_STATUS(AE_ERROR); } @@ -320,7 +325,7 @@ static int __init acpi_sleep_init(void) p += strspn(p, ", \t"); } - printk(XENLOG_INFO "<PM> ACPI (supports"); + printk(XENLOG_INFO "ACPI sleep modes:"); for ( i = 0; i < ACPI_S_STATE_COUNT; i++ ) { if ( i == ACPI_STATE_S3 ) @@ -331,7 +336,7 @@ static int __init acpi_sleep_init(void) else sleep_states[i] = 0; } - printk(")\n"); + printk("\n"); return 0; } diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/emulate.c --- a/xen/arch/x86/hvm/emulate.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/hvm/emulate.c Wed Jul 02 11:30:37 2008 +0900 @@ -21,15 +21,33 @@ static int hvmemul_do_io( int is_mmio, paddr_t addr, unsigned long *reps, int size, - paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) -{ + paddr_t ram_gpa, int dir, int df, void *p_data) +{ + paddr_t value = ram_gpa; + int value_is_ptr = (p_data == NULL); struct vcpu *curr = current; vcpu_iodata_t *vio = get_ioreq(curr); ioreq_t *p = &vio->vp_ioreq; int rc; - /* Only retrieve the value from singleton (non-REP) reads. */ - ASSERT((val == NULL) || ((dir == IOREQ_READ) && !value_is_ptr)); + /* + * Weird-sized accesses have undefined behaviour: we discard writes + * and read all-ones. 
+ */ + if ( unlikely((size > sizeof(long)) || (size & (size - 1))) ) + { + gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size); + ASSERT(p_data != NULL); /* cannot happen with a REP prefix */ + if ( dir == IOREQ_READ ) + memset(p_data, ~0, size); + return X86EMUL_UNHANDLEABLE; + } + + if ( (p_data != NULL) && (dir == IOREQ_WRITE) ) + { + memcpy(&value, p_data, size); + p_data = NULL; + } if ( is_mmio && !value_is_ptr ) { @@ -47,8 +65,7 @@ static int hvmemul_do_io( unsigned int bytes = curr->arch.hvm_vcpu.mmio_large_read_bytes; if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) ) { - *val = 0; - memcpy(val, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa], + memcpy(p_data, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa], size); return X86EMUL_OKAY; } @@ -61,7 +78,7 @@ static int hvmemul_do_io( break; case HVMIO_completed: curr->arch.hvm_vcpu.io_state = HVMIO_none; - if ( val == NULL ) + if ( p_data == NULL ) return X86EMUL_UNHANDLEABLE; goto finish_access; case HVMIO_dispatched: @@ -82,7 +99,7 @@ static int hvmemul_do_io( } curr->arch.hvm_vcpu.io_state = - (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion; + (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion; p->dir = dir; p->data_is_ptr = value_is_ptr; @@ -116,7 +133,7 @@ static int hvmemul_do_io( break; case X86EMUL_UNHANDLEABLE: hvm_send_assist_req(curr); - rc = (val != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY; + rc = (p_data != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY; break; default: BUG(); @@ -126,8 +143,8 @@ static int hvmemul_do_io( return rc; finish_access: - if ( val != NULL ) - *val = curr->arch.hvm_vcpu.io_data; + if ( p_data != NULL ) + memcpy(p_data, &curr->arch.hvm_vcpu.io_data, size); if ( is_mmio && !value_is_ptr ) { @@ -152,7 +169,7 @@ static int hvmemul_do_io( sizeof(curr->arch.hvm_vcpu.mmio_large_read)) ) { memcpy(&curr->arch.hvm_vcpu.mmio_large_read[addr - pa], - val, size); + p_data, size); curr->arch.hvm_vcpu.mmio_large_read_bytes += size; } } @@ -163,18 +180,16 @@ static int hvmemul_do_io( static int hvmemul_do_pio( unsigned long port, unsigned long *reps, int size, - paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) -{ - return hvmemul_do_io(0, port, reps, size, value, - dir, df, value_is_ptr, val); + paddr_t ram_gpa, int dir, int df, void *p_data) +{ + return hvmemul_do_io(0, port, reps, size, ram_gpa, dir, df, p_data); } static int hvmemul_do_mmio( paddr_t gpa, unsigned long *reps, int size, - paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val) -{ - return hvmemul_do_io(1, gpa, reps, size, value, - dir, df, value_is_ptr, val); + paddr_t ram_gpa, int dir, int df, void *p_data) +{ + return hvmemul_do_io(1, gpa, reps, size, ram_gpa, dir, df, p_data); } /* @@ -287,7 +302,7 @@ static int __hvmemul_read( static int __hvmemul_read( enum x86_segment seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, enum hvm_access_type access_type, struct hvm_emulate_ctxt *hvmemul_ctxt) @@ -302,8 +317,6 @@ static int __hvmemul_read( seg, offset, bytes, access_type, hvmemul_ctxt, &addr); if ( rc != X86EMUL_OKAY ) return rc; - - *val = 0; if ( unlikely(curr->arch.hvm_vcpu.mmio_gva == (addr & PAGE_MASK)) && curr->arch.hvm_vcpu.mmio_gva ) @@ -314,7 +327,7 @@ static int __hvmemul_read( gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off); if ( (off + bytes) <= PAGE_SIZE ) return hvmemul_do_mmio(gpa, &reps, bytes, 0, - IOREQ_READ, 0, 0, val); + IOREQ_READ, 0, p_data); } if ( (seg != x86_seg_none) && @@ -322,15 +335,13 @@ static int 
__hvmemul_read( pfec |= PFEC_user_mode; rc = ((access_type == hvm_access_insn_fetch) ? - hvm_fetch_from_guest_virt(val, addr, bytes, pfec) : - hvm_copy_from_guest_virt(val, addr, bytes, pfec)); + hvm_fetch_from_guest_virt(p_data, addr, bytes, pfec) : + hvm_copy_from_guest_virt(p_data, addr, bytes, pfec)); if ( rc == HVMCOPY_bad_gva_to_gfn ) return X86EMUL_EXCEPTION; if ( rc == HVMCOPY_bad_gfn_to_mfn ) { - unsigned long reps = 1; - if ( access_type == hvm_access_insn_fetch ) return X86EMUL_UNHANDLEABLE; @@ -339,7 +350,7 @@ static int __hvmemul_read( if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, 0, val); + return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, p_data); } return X86EMUL_OKAY; @@ -348,19 +359,19 @@ static int hvmemul_read( static int hvmemul_read( enum x86_segment seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { return __hvmemul_read( - seg, offset, val, bytes, hvm_access_read, + seg, offset, p_data, bytes, hvm_access_read, container_of(ctxt, struct hvm_emulate_ctxt, ctxt)); } static int hvmemul_insn_fetch( enum x86_segment seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -371,19 +382,18 @@ static int hvmemul_insn_fetch( /* Fall back if requested bytes are not in the prefetch cache. */ if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) ) return __hvmemul_read( - seg, offset, val, bytes, + seg, offset, p_data, bytes, hvm_access_insn_fetch, hvmemul_ctxt); /* Hit the cache. Simple memcpy. */ - *val = 0; - memcpy(val, &hvmemul_ctxt->insn_buf[insn_off], bytes); + memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes); return X86EMUL_OKAY; } static int hvmemul_write( enum x86_segment seg, unsigned long offset, - unsigned long val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -406,29 +416,27 @@ static int hvmemul_write( unsigned int off = addr & (PAGE_SIZE - 1); gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off); if ( (off + bytes) <= PAGE_SIZE ) - return hvmemul_do_mmio(gpa, &reps, bytes, val, - IOREQ_WRITE, 0, 0, NULL); + return hvmemul_do_mmio(gpa, &reps, bytes, 0, + IOREQ_WRITE, 0, p_data); } if ( (seg != x86_seg_none) && (hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) ) pfec |= PFEC_user_mode; - rc = hvm_copy_to_guest_virt(addr, &val, bytes, pfec); + rc = hvm_copy_to_guest_virt(addr, p_data, bytes, pfec); if ( rc == HVMCOPY_bad_gva_to_gfn ) return X86EMUL_EXCEPTION; if ( rc == HVMCOPY_bad_gfn_to_mfn ) { - unsigned long reps = 1; - rc = hvmemul_linear_to_phys( addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt); if ( rc != X86EMUL_OKAY ) return rc; - return hvmemul_do_mmio(gpa, &reps, bytes, val, - IOREQ_WRITE, 0, 0, NULL); + return hvmemul_do_mmio(gpa, &reps, bytes, 0, + IOREQ_WRITE, 0, p_data); } return X86EMUL_OKAY; @@ -442,12 +450,8 @@ static int hvmemul_cmpxchg( unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - unsigned long new = 0; - if ( bytes > sizeof(new) ) - return X86EMUL_UNHANDLEABLE; - memcpy(&new, p_new, bytes); /* Fix this in case the guest is really relying on r-m-w atomicity. 
*/ - return hvmemul_write(seg, offset, new, bytes, ctxt); + return hvmemul_write(seg, offset, p_new, bytes, ctxt); } static int hvmemul_rep_ins( @@ -480,7 +484,7 @@ static int hvmemul_rep_ins( return rc; return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ, - !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); } static int hvmemul_rep_outs( @@ -513,7 +517,7 @@ static int hvmemul_rep_outs( return rc; return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE, - !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); } static int hvmemul_rep_movs( @@ -563,14 +567,14 @@ static int hvmemul_rep_movs( if ( !p2m_is_ram(p2mt) ) return hvmemul_do_mmio( sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ, - !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); (void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt); if ( p2m_is_ram(p2mt) ) return X86EMUL_UNHANDLEABLE; return hvmemul_do_mmio( dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE, - !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL); + !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL); } static int hvmemul_read_segment( @@ -607,7 +611,8 @@ static int hvmemul_read_io( struct x86_emulate_ctxt *ctxt) { unsigned long reps = 1; - return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, 0, val); + *val = 0; + return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, val); } static int hvmemul_write_io( @@ -617,7 +622,7 @@ static int hvmemul_write_io( struct x86_emulate_ctxt *ctxt) { unsigned long reps = 1; - return hvmemul_do_pio(port, &reps, bytes, val, IOREQ_WRITE, 0, 0, NULL); + return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_WRITE, 0, &val); } static int hvmemul_read_cr( diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/hvm/hvm.c Wed Jul 02 11:30:37 2008 +0900 @@ -2529,6 +2529,66 @@ long do_hvm_op(unsigned long op, XEN_GUE break; } + case HVMOP_modified_memory: + { + struct xen_hvm_modified_memory a; + struct domain *d; + unsigned long pfn; + + if ( copy_from_guest(&a, arg, 1) ) + return -EFAULT; + + if ( a.domid == DOMID_SELF ) + { + d = rcu_lock_current_domain(); + } + else + { + if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL ) + return -ESRCH; + if ( !IS_PRIV_FOR(current->domain, d) ) + { + rc = -EPERM; + goto param_fail3; + } + } + + rc = -EINVAL; + if ( !is_hvm_domain(d) ) + goto param_fail3; + + rc = xsm_hvm_param(d, op); + if ( rc ) + goto param_fail3; + + rc = -EINVAL; + if ( (a.first_pfn > domain_get_maximum_gpfn(d)) || + ((a.first_pfn + a.nr - 1) < a.first_pfn) || + ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) ) + goto param_fail3; + + rc = 0; + if ( !paging_mode_log_dirty(d) ) + goto param_fail3; + + for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ ) + { + p2m_type_t t; + mfn_t mfn = gfn_to_mfn(d, pfn, &t); + if ( mfn_x(mfn) != INVALID_MFN ) + { + paging_mark_dirty(d, mfn_x(mfn)); + /* These are most probably not page tables any more */ + /* don't take a long time and don't die either */ + sh_remove_shadows(d->vcpu[0], mfn, 1, 0); + } + } + + param_fail3: + rcu_unlock_domain(d); + break; + } + default: { gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op); diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vmcs.c --- a/xen/arch/x86/hvm/vmx/vmcs.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/hvm/vmx/vmcs.c Wed Jul 02 11:30:37 2008 +0900 @@ -677,10 +677,11 @@ static 
int construct_vmcs(struct vcpu *v return 0; } -int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val) -{ - unsigned int i, msr_count = v->arch.hvm_vmx.msr_count; - const struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area; +int vmx_read_guest_msr(u32 msr, u64 *val) +{ + struct vcpu *curr = current; + unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count; + const struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; for ( i = 0; i < msr_count; i++ ) { @@ -694,10 +695,11 @@ int vmx_read_guest_msr(struct vcpu *v, u return -ESRCH; } -int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val) -{ - unsigned int i, msr_count = v->arch.hvm_vmx.msr_count; - struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area; +int vmx_write_guest_msr(u32 msr, u64 val) +{ + struct vcpu *curr = current; + unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count; + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; for ( i = 0; i < msr_count; i++ ) { @@ -711,10 +713,20 @@ int vmx_write_guest_msr(struct vcpu *v, return -ESRCH; } -int vmx_add_guest_msr(struct vcpu *v, u32 msr) -{ - unsigned int i, msr_count = v->arch.hvm_vmx.msr_count; - struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area; +int vmx_add_guest_msr(u32 msr) +{ + struct vcpu *curr = current; + unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count; + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area; + + if ( msr_area == NULL ) + { + if ( (msr_area = alloc_xenheap_page()) == NULL ) + return -ENOMEM; + curr->arch.hvm_vmx.msr_area = msr_area; + __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area)); + __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); + } for ( i = 0; i < msr_count; i++ ) if ( msr_area[i].index == msr ) @@ -723,29 +735,29 @@ int vmx_add_guest_msr(struct vcpu *v, u3 if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) ) return -ENOSPC; - if ( msr_area == NULL ) - { - if ( (msr_area = alloc_xenheap_page()) == NULL ) - return -ENOMEM; - v->arch.hvm_vmx.msr_area = msr_area; - __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area)); - __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); - } - msr_area[msr_count].index = msr; msr_area[msr_count].mbz = 0; msr_area[msr_count].data = 0; - v->arch.hvm_vmx.msr_count = ++msr_count; + curr->arch.hvm_vmx.msr_count = ++msr_count; __vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count); __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count); return 0; } -int vmx_add_host_load_msr(struct vcpu *v, u32 msr) -{ - unsigned int i, msr_count = v->arch.hvm_vmx.host_msr_count; - struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.host_msr_area; +int vmx_add_host_load_msr(u32 msr) +{ + struct vcpu *curr = current; + unsigned int i, msr_count = curr->arch.hvm_vmx.host_msr_count; + struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.host_msr_area; + + if ( msr_area == NULL ) + { + if ( (msr_area = alloc_xenheap_page()) == NULL ) + return -ENOMEM; + curr->arch.hvm_vmx.host_msr_area = msr_area; + __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); + } for ( i = 0; i < msr_count; i++ ) if ( msr_area[i].index == msr ) @@ -754,18 +766,10 @@ int vmx_add_host_load_msr(struct vcpu *v if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) ) return -ENOSPC; - if ( msr_area == NULL ) - { - if ( (msr_area = alloc_xenheap_page()) == NULL ) - return -ENOMEM; - v->arch.hvm_vmx.host_msr_area = msr_area; - __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area)); - } - msr_area[msr_count].index = msr; msr_area[msr_count].mbz = 0; rdmsrl(msr, 
msr_area[msr_count].data); - v->arch.hvm_vmx.host_msr_count = ++msr_count; + curr->arch.hvm_vmx.host_msr_count = ++msr_count; __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count); return 0; @@ -776,21 +780,17 @@ int vmx_create_vmcs(struct vcpu *v) struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx; int rc; - if ( arch_vmx->vmcs == NULL ) - { - if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL ) - return -ENOMEM; - - INIT_LIST_HEAD(&arch_vmx->active_list); - __vmpclear(virt_to_maddr(arch_vmx->vmcs)); - arch_vmx->active_cpu = -1; - arch_vmx->launched = 0; - } + if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL ) + return -ENOMEM; + + INIT_LIST_HEAD(&arch_vmx->active_list); + __vmpclear(virt_to_maddr(arch_vmx->vmcs)); + arch_vmx->active_cpu = -1; + arch_vmx->launched = 0; if ( (rc = construct_vmcs(v)) != 0 ) { vmx_free_vmcs(arch_vmx->vmcs); - arch_vmx->vmcs = NULL; return rc; } @@ -801,13 +801,13 @@ void vmx_destroy_vmcs(struct vcpu *v) { struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx; - if ( arch_vmx->vmcs == NULL ) - return; - vmx_clear_vmcs(v); vmx_free_vmcs(arch_vmx->vmcs); - arch_vmx->vmcs = NULL; + + free_xenheap_page(v->arch.hvm_vmx.host_msr_area); + free_xenheap_page(v->arch.hvm_vmx.msr_area); + free_xenheap_page(v->arch.hvm_vmx.msr_bitmap); } void vm_launch_fail(void) diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vmx.c --- a/xen/arch/x86/hvm/vmx/vmx.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Jul 02 11:30:37 2008 +0900 @@ -1523,7 +1523,8 @@ static int vmx_cr_access(unsigned long e break; case VMX_CONTROL_REG_ACCESS_TYPE_LMSW: value = v->arch.hvm_vcpu.guest_cr[0]; - value = (value & ~0xFFFF) | ((exit_qualification >> 16) & 0xFFFF); + /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */ + value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf); HVMTRACE_LONG_1D(LMSW, current, value); return !hvm_set_cr0(value); default: @@ -1655,7 +1656,7 @@ static int vmx_msr_read_intercept(struct goto done; } - if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 ) + if ( vmx_read_guest_msr(ecx, &msr_content) == 0 ) break; if ( is_last_branch_msr(ecx) ) @@ -1817,12 +1818,12 @@ static int vmx_msr_write_intercept(struc for ( ; (rc == 0) && lbr->count; lbr++ ) for ( i = 0; (rc == 0) && (i < lbr->count); i++ ) - if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 ) + if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 ) vmx_disable_intercept_for_msr(v, lbr->base + i); } if ( (rc < 0) || - (vmx_add_host_load_msr(v, ecx) < 0) ) + (vmx_add_host_load_msr(ecx) < 0) ) vmx_inject_hw_exception(v, TRAP_machine_check, 0); else { @@ -1842,7 +1843,7 @@ static int vmx_msr_write_intercept(struc switch ( long_mode_do_msr_write(regs) ) { case HNDL_unhandled: - if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) && + if ( (vmx_write_guest_msr(ecx, msr_content) != 0) && !is_last_branch_msr(ecx) ) wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx); break; diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vpmu_core2.c --- a/xen/arch/x86/hvm/vmx/vpmu_core2.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c Wed Jul 02 11:30:37 2008 +0900 @@ -219,12 +219,12 @@ static int core2_vpmu_alloc_resource(str return 0; wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) ) - return 0; - - if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) ) - return 0; - vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, -1ULL); + if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) + return 0; + + if ( 
vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) ) + return 0; + vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, -1ULL); pmu_enable = xmalloc_bytes(sizeof(struct core2_pmu_enable) + (core2_get_pmc_count()-1)*sizeof(char)); @@ -347,7 +347,7 @@ static int core2_vpmu_do_wrmsr(struct cp break; case MSR_CORE_PERF_FIXED_CTR_CTRL: non_global_ctrl = msr_content; - vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); + vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); global_ctrl >>= 32; for ( i = 0; i < 3; i++ ) { @@ -359,7 +359,7 @@ static int core2_vpmu_do_wrmsr(struct cp break; default: tmp = ecx - MSR_P6_EVNTSEL0; - vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); + vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl); if ( tmp >= 0 && tmp < core2_get_pmc_count() ) core2_vpmu_cxt->pmu_enable->arch_pmc_enable[tmp] = (global_ctrl >> tmp) & (msr_content >> 22) & 1; @@ -385,7 +385,7 @@ static int core2_vpmu_do_wrmsr(struct cp if ( type != MSR_TYPE_GLOBAL ) wrmsrl(ecx, msr_content); else - vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content); + vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content); return 1; } @@ -410,7 +410,7 @@ static int core2_vpmu_do_rdmsr(struct cp msr_content = core2_vpmu_cxt->global_ovf_status; break; case MSR_CORE_PERF_GLOBAL_CTRL: - vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &msr_content); + vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &msr_content); break; default: rdmsrl(regs->ecx, msr_content); diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/mm.c Wed Jul 02 11:30:37 2008 +0900 @@ -219,7 +219,7 @@ void __init arch_init_memory(void) * Any Xen-heap pages that we will allow to be mapped will have * their domain field set to dom_xen. */ - dom_xen = alloc_domain(DOMID_XEN); + dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0); BUG_ON(dom_xen == NULL); /* @@ -227,7 +227,7 @@ void __init arch_init_memory(void) * This domain owns I/O pages that are within the range of the page_info * array. Mappings occur at the priv of the caller. */ - dom_io = alloc_domain(DOMID_IO); + dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0); BUG_ON(dom_io == NULL); /* First 1MB of RAM is historically marked as I/O. */ @@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page { struct domain *d = page_get_owner(page); - /* Never allow a shadowed frame to go from type count 0 to 1 */ - if ( d && shadow_mode_enabled(d) ) - shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page))); + /* Normally we should never let a page go from type count 0 + * to type count 1 when it is shadowed. One exception: + * out-of-sync shadowed pages are allowed to become + * writeable. 
*/ + if ( d && shadow_mode_enabled(d) + && (page->count_info & PGC_page_table) + && !((page->shadow_flags & (1u<<29)) + && type == PGT_writable_page) ) + shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page))); ASSERT(!(x & PGT_pae_xen_l2)); if ( (x & PGT_type_mask) != type ) @@ -3533,15 +3539,14 @@ static int ptwr_emulated_read( static int ptwr_emulated_read( enum x86_segment seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { unsigned int rc; unsigned long addr = offset; - *val = 0; - if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 ) + if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 ) { propagate_page_fault(addr + bytes - rc, 0); /* read fault */ return X86EMUL_EXCEPTION; @@ -3568,7 +3573,7 @@ static int ptwr_emulated_update( /* Only allow naturally-aligned stores within the original %cr2 page. */ if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) ) { - MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)", + MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)", ptwr_ctxt->cr2, addr, bytes); return X86EMUL_UNHANDLEABLE; } @@ -3676,10 +3681,21 @@ static int ptwr_emulated_write( static int ptwr_emulated_write( enum x86_segment seg, unsigned long offset, - unsigned long val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { + paddr_t val = 0; + + if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) ) + { + MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)", + offset, bytes); + return X86EMUL_UNHANDLEABLE; + } + + memcpy(&val, p_data, bytes); + return ptwr_emulated_update( offset, 0, val, bytes, 0, container_of(ctxt, struct ptwr_emulate_ctxt, ctxt)); @@ -3694,10 +3710,17 @@ static int ptwr_emulated_cmpxchg( struct x86_emulate_ctxt *ctxt) { paddr_t old = 0, new = 0; - if ( bytes > sizeof(paddr_t) ) + + if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) ) + { + MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)", + offset, bytes); return X86EMUL_UNHANDLEABLE; + } + memcpy(&old, p_old, bytes); memcpy(&new, p_new, bytes); + return ptwr_emulated_update( offset, old, new, bytes, 1, container_of(ctxt, struct ptwr_emulate_ctxt, ctxt)); diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/common.c --- a/xen/arch/x86/mm/shadow/common.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/mm/shadow/common.c Wed Jul 02 11:30:37 2008 +0900 @@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d /* Use shadow pagetables for log-dirty support */ paging_log_dirty_init(d, shadow_enable_log_dirty, shadow_disable_log_dirty, shadow_clean_dirty_bitmap); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + d->arch.paging.shadow.oos_active = 0; +#endif } /* Setup the shadow-specfic parts of a vcpu struct. 
Note: The most important @@ -64,6 +68,16 @@ void shadow_domain_init(struct domain *d */ void shadow_vcpu_init(struct vcpu *v) { +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + int i; + + for ( i = 0; i < SHADOW_OOS_PAGES; i++ ) + { + v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN); + v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN); + } +#endif + v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3); } @@ -131,7 +145,7 @@ static int static int hvm_read(enum x86_segment seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, enum hvm_access_type access_type, struct sh_emulate_ctxt *sh_ctxt) @@ -144,12 +158,10 @@ hvm_read(enum x86_segment seg, if ( rc ) return rc; - *val = 0; - if ( access_type == hvm_access_insn_fetch ) - rc = hvm_fetch_from_guest_virt(val, addr, bytes, 0); + rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0); else - rc = hvm_copy_from_guest_virt(val, addr, bytes, 0); + rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0); switch ( rc ) { @@ -167,20 +179,20 @@ static int static int hvm_emulate_read(enum x86_segment seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; - return hvm_read(seg, offset, val, bytes, hvm_access_read, + return hvm_read(seg, offset, p_data, bytes, hvm_access_read, container_of(ctxt, struct sh_emulate_ctxt, ctxt)); } static int hvm_emulate_insn_fetch(enum x86_segment seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -192,19 +204,18 @@ hvm_emulate_insn_fetch(enum x86_segment /* Fall back if requested bytes are not in the prefetch cache. */ if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) ) - return hvm_read(seg, offset, val, bytes, + return hvm_read(seg, offset, p_data, bytes, hvm_access_insn_fetch, sh_ctxt); /* Hit the cache. Simple memcpy. 
*/ - *val = 0; - memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes); + memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes); return X86EMUL_OKAY; } static int hvm_emulate_write(enum x86_segment seg, unsigned long offset, - unsigned long val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -227,7 +238,7 @@ hvm_emulate_write(enum x86_segment seg, return rc; return v->arch.paging.mode->shadow.x86_emulate_write( - v, addr, &val, bytes, sh_ctxt); + v, addr, p_data, bytes, sh_ctxt); } static int @@ -279,7 +290,7 @@ static int static int pv_emulate_read(enum x86_segment seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -288,8 +299,7 @@ pv_emulate_read(enum x86_segment seg, if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; - *val = 0; - if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 ) + if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 ) { propagate_page_fault(offset + bytes - rc, 0); /* read fault */ return X86EMUL_EXCEPTION; @@ -301,7 +311,7 @@ static int static int pv_emulate_write(enum x86_segment seg, unsigned long offset, - unsigned long val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { @@ -311,7 +321,7 @@ pv_emulate_write(enum x86_segment seg, if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; return v->arch.paging.mode->shadow.x86_emulate_write( - v, offset, &val, bytes, sh_ctxt); + v, offset, p_data, bytes, sh_ctxt); } static int @@ -427,6 +437,585 @@ void shadow_continue_emulation(struct sh } } } + + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) +/**************************************************************************/ +/* Out-of-sync shadows. */ + +/* From time to time, we let a shadowed pagetable page go out of sync + * with its shadow: the guest is allowed to write directly to the page, + * and those writes are not synchronously reflected in the shadow. + * This lets us avoid many emulations if the guest is writing a lot to a + * pagetable, but it relaxes a pretty important invariant in the shadow + * pagetable design. Therefore, some rules: + * + * 1. Only L1 pagetables may go out of sync: any page that is shadowed + * at at higher level must be synchronously updated. This makes + * using linear shadow pagetables much less dangerous. + * That means that: (a) unsyncing code needs to check for higher-level + * shadows, and (b) promotion code needs to resync. + * + * 2. All shadow operations on a guest page require the page to be brought + * back into sync before proceeding. This must be done under the + * shadow lock so that the page is guaranteed to remain synced until + * the operation completes. + * + * Exceptions to this rule: the pagefault and invlpg handlers may + * update only one entry on an out-of-sync page without resyncing it. + * + * 3. Operations on shadows that do not start from a guest page need to + * be aware that they may be handling an out-of-sync shadow. + * + * 4. Operations that do not normally take the shadow lock (fast-path + * #PF handler, INVLPG) must fall back to a locking, syncing version + * if they see an out-of-sync table. + * + * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG) + * must explicitly resync all relevant pages or update their + * shadows. + * + * Currently out-of-sync pages are listed in a simple open-addressed + * hash table with a second chance (must resist temptation to radically + * over-engineer hash tables...) 
The virtual address of the access + * which caused us to unsync the page is also kept in the hash table, as + * a hint for finding the writable mappings later. + * + * We keep a hash per vcpu, because we want as much as possible to do + * the re-sync on the save vcpu we did the unsync on, so the VA hint + * will be valid. + */ + + +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL +static void sh_oos_audit(struct domain *d) +{ + int idx, expected_idx, expected_idx_alt; + struct page_info *pg; + struct vcpu *v; + + for_each_vcpu(d, v) + { + for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) + { + mfn_t *oos = v->arch.paging.shadow.oos; + if ( !mfn_valid(oos[idx]) ) + continue; + + expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES; + expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES); + if ( idx != expected_idx && idx != expected_idx_alt ) + { + printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n", + __func__, idx, mfn_x(oos[idx]), + expected_idx, expected_idx_alt); + BUG(); + } + pg = mfn_to_page(oos[idx]); + if ( !(pg->count_info & PGC_page_table) ) + { + printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n", + __func__, idx, mfn_x(oos[idx]), pg->count_info); + BUG(); + } + if ( !(pg->shadow_flags & SHF_out_of_sync) ) + { + printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n", + __func__, idx, mfn_x(oos[idx]), pg->shadow_flags); + BUG(); + } + if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) ) + { + printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n", + __func__, idx, mfn_x(oos[idx]), pg->shadow_flags); + BUG(); + } + } + } +} +#endif + +#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES +void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn) +{ + int idx; + struct vcpu *v; + mfn_t *oos; + + ASSERT(mfn_is_out_of_sync(gmfn)); + + for_each_vcpu(d, v) + { + oos = v->arch.paging.shadow.oos; + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; + if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) + idx = (idx + 1) % SHADOW_OOS_PAGES; + + if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) + return; + } + + SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn)); + BUG(); +} +#endif + +/* Update the shadow, but keep the page out of sync. 
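/*
 * [Editor's aside -- illustrative sketch, not part of this changeset.]
 * The out-of-sync list described above is a tiny per-vcpu open-addressed
 * hash with one "second chance" slot: an entry for a given gmfn can only
 * live at (gmfn % SHADOW_OOS_PAGES) or in the next slot, modulo the table
 * size.  A minimal standalone model of the two-probe lookup, using plain
 * unsigned longs instead of mfn_t (OOS_SLOTS and oos_lookup are invented
 * names; the real value of SHADOW_OOS_PAGES is defined elsewhere):
 */
#include <stddef.h>
#define OOS_SLOTS 3                       /* stand-in for SHADOW_OOS_PAGES */
static size_t oos_lookup(const unsigned long table[OOS_SLOTS],
                         unsigned long gmfn)
{
    size_t idx = gmfn % OOS_SLOTS;        /* primary slot */
    if ( table[idx] != gmfn )
        idx = (idx + 1) % OOS_SLOTS;      /* second-chance slot */
    return (table[idx] == gmfn) ? idx : (size_t)-1;
}
/*
 * sh_resync(), oos_hash_remove() and oos_snapshot_lookup() later in this
 * file probe exactly these two slots in each vcpu's table before giving
 * up with a BUG().
 */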
*/ +static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn) +{ + struct page_info *pg = mfn_to_page(gmfn); + + ASSERT(mfn_valid(gmfn)); + ASSERT(page_is_out_of_sync(pg)); + + /* Call out to the appropriate per-mode resyncing function */ + if ( pg->shadow_flags & SHF_L1_32 ) + SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn); + else if ( pg->shadow_flags & SHF_L1_PAE ) + SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn); +#if CONFIG_PAGING_LEVELS >= 4 + else if ( pg->shadow_flags & SHF_L1_64 ) + SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn); +#endif +} + +#define _FIXUP_IDX(_b, _i) ((_b) * SHADOW_OOS_FT_HASH + (_i)) + +void oos_fixup_add(struct vcpu *v, mfn_t gmfn, + mfn_t smfn, unsigned long off) +{ + int idx, i, free = 0, free_slot = 0; + struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups; + + idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH; + for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ ) + { + if ( !mfn_valid(fixups[_FIXUP_IDX(idx, i)].gmfn) + || !mfn_is_out_of_sync(fixups[_FIXUP_IDX(idx, i)].gmfn) ) + { + free = 1; + free_slot = _FIXUP_IDX(idx, i); + } + else if ( (mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn)) + && (mfn_x(fixups[_FIXUP_IDX(idx, i)].smfn) == mfn_x(smfn)) + && (fixups[_FIXUP_IDX(idx, i)].off == off) ) + { + perfc_incr(shadow_oos_fixup_no_add); + return; + } + } + + if ( free ) + { + if ( !v->arch.paging.shadow.oos_fixup_used ) + v->arch.paging.shadow.oos_fixup_used = 1; + fixups[free_slot].gmfn = gmfn; + fixups[free_slot].smfn = smfn; + fixups[free_slot].off = off; + perfc_incr(shadow_oos_fixup_add_ok); + return; + } + + + perfc_incr(shadow_oos_fixup_add_fail); +} + +void oos_fixup_remove(struct vcpu *v, mfn_t gmfn) +{ + int idx, i; + struct domain *d = v->domain; + + perfc_incr(shadow_oos_fixup_remove); + + /* If the domain is dying we might get called when deallocating + * the shadows. Fixup tables are already freed so exit now. 
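/*
 * [Editor's aside -- illustrative sketch, not part of this changeset.]
 * oos_fixup_add() above records, for an unsynced guest page, where a
 * writable shadow L1 entry pointing at it lives (shadow mfn plus slot
 * offset), so the page can later be re-protected without a brute-force
 * search.  The table is a small hash: each gmfn maps to one bucket of
 * SHADOW_OOS_FT_ENTRIES slots; an exact duplicate is a no-op, a free or
 * stale slot is reused, and a full bucket simply drops the hint.
 * Simplified standalone model (a 2-D array stands in for the flat
 * allocation the patch addresses via _FIXUP_IDX(); all names and sizes
 * here are illustrative):
 */
#define FT_HASH    4                      /* stand-in for SHADOW_OOS_FT_HASH */
#define FT_ENTRIES 2                      /* stand-in for SHADOW_OOS_FT_ENTRIES */
#define NO_MFN     (~0UL)                 /* stand-in for INVALID_MFN */
struct fixup { unsigned long gmfn, smfn, off; };    /* cf. struct oos_fixup */

static int fixup_add(struct fixup table[FT_HASH][FT_ENTRIES],
                     unsigned long gmfn, unsigned long smfn, unsigned long off)
{
    struct fixup *bucket = table[gmfn % FT_HASH];
    int i, free_slot = -1;

    for ( i = 0; i < FT_ENTRIES; i++ )
    {
        if ( bucket[i].gmfn == gmfn && bucket[i].smfn == smfn
             && bucket[i].off == off )
            return 0;                     /* already recorded: nothing to do */
        if ( bucket[i].gmfn == NO_MFN )   /* free (or, in the patch, stale) */
            free_slot = i;
    }

    if ( free_slot < 0 )
        return -1;                        /* bucket full: the hint is dropped and
                                           * resync falls back to the usual
                                           * write-access search */

    bucket[free_slot] = (struct fixup){ gmfn, smfn, off };
    return 1;
}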
*/ + if ( d->is_dying ) + return; + + idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH; + for_each_vcpu(d, v) + { + struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups; + for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ ) + if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn) ) + fixups[_FIXUP_IDX(idx, i)].gmfn = _mfn(INVALID_MFN); + } +} + +int oos_fixup_flush(struct vcpu *v) +{ + int i, rc = 0; + struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups; + + perfc_incr(shadow_oos_fixup_flush); + + if ( !v->arch.paging.shadow.oos_fixup_used ) + return 0; + + for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ ) + { + if ( mfn_valid(fixups[i].gmfn) ) + { + if ( mfn_is_out_of_sync(fixups[i].gmfn) ) + rc |= sh_remove_write_access_from_sl1p(v, fixups[i].gmfn, + fixups[i].smfn, + fixups[i].off); + fixups[i].gmfn = _mfn(INVALID_MFN); + } + } + + v->arch.paging.shadow.oos_fixup_used = 0; + + return rc; +} + +int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn) +{ + int idx, i, rc = 0; + struct domain *d = v->domain; + + perfc_incr(shadow_oos_fixup_flush_gmfn); + + idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH; + for_each_vcpu(d, v) + { + struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups; + + for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ ) + { + if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) != mfn_x(gmfn) ) + continue; + + rc |= sh_remove_write_access_from_sl1p(v, + fixups[_FIXUP_IDX(idx,i)].gmfn, + fixups[_FIXUP_IDX(idx,i)].smfn, + fixups[_FIXUP_IDX(idx,i)].off); + + fixups[_FIXUP_IDX(idx,i)].gmfn = _mfn(INVALID_MFN); + } + } + + return rc; +} + +static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn, unsigned long va) +{ + int ftlb = 0; + + ftlb |= oos_fixup_flush_gmfn(v, gmfn); + + switch ( sh_remove_write_access(v, gmfn, 0, va) ) + { + default: + case 0: + break; + + case 1: + ftlb |= 1; + break; + + case -1: + /* An unfindable writeable typecount has appeared, probably via a + * grant table entry: can't shoot the mapping, so try to unshadow + * the page. If that doesn't work either, the guest is granting + * his pagetables and must be killed after all. + * This will flush the tlb, so we can return with no worries. */ + sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */); + return 1; + } + + if ( ftlb ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + + return 0; +} + + +/* Pull all the entries on an out-of-sync page back into sync. */ +static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va, mfn_t snp) +{ + struct page_info *pg = mfn_to_page(gmfn); + + ASSERT(shadow_locked_by_me(v->domain)); + ASSERT(mfn_is_out_of_sync(gmfn)); + /* Guest page must be shadowed *only* as L1 when out of sync. */ + ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask + & ~SHF_L1_ANY)); + ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn))); + + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n", + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va); + + /* Need to pull write access so the page *stays* in sync. */ + if ( oos_remove_write_access(v, gmfn, va) ) + { + /* Page has been unshadowed. */ + return; + } + + /* No more writable mappings of this page, please */ + pg->shadow_flags &= ~SHF_oos_may_write; + + /* Update the shadows with current guest entries. 
*/ + _sh_resync_l1(v, gmfn, snp); + + /* Now we know all the entries are synced, and will stay that way */ + pg->shadow_flags &= ~SHF_out_of_sync; + perfc_incr(shadow_resync); +} + + +/* Add an MFN to the list of out-of-sync guest pagetables */ +static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va) +{ + int idx, oidx, swap = 0; + void *gptr, *gsnpptr; + mfn_t *oos = v->arch.paging.shadow.oos; + unsigned long *oos_va = v->arch.paging.shadow.oos_va; + mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; + + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; + oidx = idx; + + if ( mfn_valid(oos[idx]) + && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx ) + { + /* Punt the current occupant into the next slot */ + SWAP(oos[idx], gmfn); + SWAP(oos_va[idx], va); + swap = 1; + idx = (idx + 1) % SHADOW_OOS_PAGES; + } + if ( mfn_valid(oos[idx]) ) + { + /* Crush the current occupant. */ + _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]); + perfc_incr(shadow_unsync_evict); + } + oos[idx] = gmfn; + oos_va[idx] = va; + + if ( swap ) + SWAP(oos_snapshot[idx], oos_snapshot[oidx]); + + gptr = sh_map_domain_page(oos[oidx]); + gsnpptr = sh_map_domain_page(oos_snapshot[oidx]); + memcpy(gsnpptr, gptr, PAGE_SIZE); + sh_unmap_domain_page(gptr); + sh_unmap_domain_page(gsnpptr); +} + +/* Remove an MFN from the list of out-of-sync guest pagetables */ +static void oos_hash_remove(struct vcpu *v, mfn_t gmfn) +{ + int idx; + mfn_t *oos; + struct domain *d = v->domain; + + SHADOW_PRINTK("D%dV%d gmfn %lx\n", + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); + + for_each_vcpu(d, v) + { + oos = v->arch.paging.shadow.oos; + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; + if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) + idx = (idx + 1) % SHADOW_OOS_PAGES; + if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) + { + oos[idx] = _mfn(INVALID_MFN); + return; + } + } + + SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); + BUG(); +} + +mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn) +{ + int idx; + mfn_t *oos; + mfn_t *oos_snapshot; + struct domain *d = v->domain; + + for_each_vcpu(d, v) + { + oos = v->arch.paging.shadow.oos; + oos_snapshot = v->arch.paging.shadow.oos_snapshot; + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; + if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) + idx = (idx + 1) % SHADOW_OOS_PAGES; + if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) + { + return oos_snapshot[idx]; + } + } + + SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); + BUG(); + return _mfn(INVALID_MFN); +} + +/* Pull a single guest page back into sync */ +void sh_resync(struct vcpu *v, mfn_t gmfn) +{ + int idx; + mfn_t *oos; + unsigned long *oos_va; + mfn_t *oos_snapshot; + struct domain *d = v->domain; + + for_each_vcpu(d, v) + { + oos = v->arch.paging.shadow.oos; + oos_va = v->arch.paging.shadow.oos_va; + oos_snapshot = v->arch.paging.shadow.oos_snapshot; + idx = mfn_x(gmfn) % SHADOW_OOS_PAGES; + if ( mfn_x(oos[idx]) != mfn_x(gmfn) ) + idx = (idx + 1) % SHADOW_OOS_PAGES; + + if ( mfn_x(oos[idx]) == mfn_x(gmfn) ) + { + _sh_resync(v, gmfn, oos_va[idx], oos_snapshot[idx]); + oos[idx] = _mfn(INVALID_MFN); + return; + } + } + + SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn)); + BUG(); +} + +/* Figure out whether it's definitely safe not to sync this l1 table, + * by making a call out to the mode in which that shadow was made. 
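/*
 * [Editor's aside -- illustrative sketch, not part of this changeset.]
 * oos_hash_add() above finishes by copying the whole guest L1 page into a
 * snapshot page.  That snapshot is what makes cheap resync possible:
 * sh_resync_l1() (added to multi.c later in this patch) only revalidates
 * the entries whose guest PTE now differs from the snapshot, then brings
 * the snapshot back up to date.  A simplified standalone model of that
 * compare-and-refresh loop (pte_t, PTES_PER_PAGE and the revalidate()
 * hook are stand-ins for the real guest_l1e_t machinery):
 */
#include <string.h>
typedef unsigned long pte_t;              /* stand-in for guest_l1e_t */
#define PTES_PER_PAGE 512

static void resync_against_snapshot(pte_t *guest, pte_t *snapshot,
                                    void (*revalidate)(unsigned int, pte_t))
{
    unsigned int i;

    for ( i = 0; i < PTES_PER_PAGE; i++ )
        if ( memcmp(&snapshot[i], &guest[i], sizeof(pte_t)) != 0 )
        {
            revalidate(i, guest[i]);      /* re-propagate just this entry */
            snapshot[i] = guest[i];       /* snapshot matches the guest again */
        }
}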
*/ +static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn) +{ + struct page_info *pg = mfn_to_page(gl1mfn); + if ( pg->shadow_flags & SHF_L1_32 ) + return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn); + else if ( pg->shadow_flags & SHF_L1_PAE ) + return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn); +#if CONFIG_PAGING_LEVELS >= 4 + else if ( pg->shadow_flags & SHF_L1_64 ) + return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn); +#endif + SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n", + mfn_x(gl1mfn)); + BUG(); + return 0; /* BUG() is no longer __attribute__((noreturn)). */ +} + + +/* Pull all out-of-sync pages back into sync. Pages brought out of sync + * on other vcpus are allowed to remain out of sync, but their contents + * will be made safe (TLB flush semantics); pages unsynced by this vcpu + * are brought back into sync and write-protected. If skip != 0, we try + * to avoid resyncing at all if we think we can get away with it. */ +void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking) +{ + int idx; + struct vcpu *other; + mfn_t *oos = v->arch.paging.shadow.oos; + unsigned long *oos_va = v->arch.paging.shadow.oos_va; + mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; + + SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id); + + ASSERT(do_locking || shadow_locked_by_me(v->domain)); + + if ( !this ) + goto resync_others; + + if ( do_locking ) + shadow_lock(v->domain); + + if ( oos_fixup_flush(v) ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + + /* First: resync all of this vcpu's oos pages */ + for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) + if ( mfn_valid(oos[idx]) ) + { + /* Write-protect and sync contents */ + _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]); + oos[idx] = _mfn(INVALID_MFN); + } + + if ( do_locking ) + shadow_unlock(v->domain); + + resync_others: + if ( !others ) + return; + + /* Second: make all *other* vcpus' oos pages safe. */ + for_each_vcpu(v->domain, other) + { + if ( v == other ) + continue; + + if ( do_locking ) + shadow_lock(v->domain); + + oos = other->arch.paging.shadow.oos; + oos_va = other->arch.paging.shadow.oos_va; + oos_snapshot = other->arch.paging.shadow.oos_snapshot; + for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) + { + if ( !mfn_valid(oos[idx]) ) + continue; + + if ( skip ) + { + /* Update the shadows and leave the page OOS. */ + if ( sh_skip_sync(v, oos[idx]) ) + continue; + _sh_resync_l1(other, oos[idx], oos_snapshot[idx]); + } + else + { + /* Write-protect and sync contents */ + _sh_resync(other, oos[idx], oos_va[idx], oos_snapshot[idx]); + oos[idx] = _mfn(INVALID_MFN); + } + } + + if ( do_locking ) + shadow_unlock(v->domain); + } +} + +/* Allow a shadowed page to go out of sync */ +int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va) +{ + struct page_info *pg; + + ASSERT(shadow_locked_by_me(v->domain)); + + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n", + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va); + + pg = mfn_to_page(gmfn); + + /* Guest page must be shadowed *only* as L1 and *only* once when out + * of sync. Also, get out now if it's already out of sync. 
+ * Also, can't safely unsync if some vcpus have paging disabled.*/ + if ( pg->shadow_flags & + ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync) + || sh_page_has_multiple_shadows(pg) + || !is_hvm_domain(v->domain) + || !v->domain->arch.paging.shadow.oos_active ) + return 0; + + pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write; + oos_hash_add(v, gmfn, va); + perfc_incr(shadow_unsync); + return 1; +} + +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ + /**************************************************************************/ /* Code for "promoting" a guest page to the point where the shadow code is @@ -440,6 +1029,12 @@ void shadow_promote(struct vcpu *v, mfn_ ASSERT(mfn_valid(gmfn)); +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Is the page already shadowed and out of sync? */ + if ( page_is_out_of_sync(page) ) + sh_resync(v, gmfn); +#endif + /* We should never try to promote a gmfn that has writeable mappings */ ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page || (page->u.inuse.type_info & PGT_count_mask) == 0 @@ -463,7 +1058,17 @@ void shadow_demote(struct vcpu *v, mfn_t clear_bit(type, &page->shadow_flags); if ( (page->shadow_flags & SHF_page_type_mask) == 0 ) + { +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Was the page out of sync? */ + if ( page_is_out_of_sync(page) ) + { + oos_hash_remove(v, gmfn); + oos_fixup_remove(v, gmfn); + } +#endif clear_bit(_PGC_page_table, &page->count_info); + } } /**************************************************************************/ @@ -674,7 +1279,8 @@ shadow_order(unsigned int shadow_type) 0, /* SH_type_l3_64_shadow */ 0, /* SH_type_l4_64_shadow */ 2, /* SH_type_p2m_table */ - 0 /* SH_type_monitor_table */ + 0, /* SH_type_monitor_table */ + 0 /* SH_type_oos_snapshot */ }; ASSERT(shadow_type < SH_type_unused); return type_to_order[shadow_type]; @@ -1220,6 +1826,14 @@ static unsigned int sh_set_allocation(st sp = list_entry(d->arch.paging.shadow.freelists[order].next, struct shadow_page_info, list); list_del(&sp->list); +#if defined(__x86_64__) + /* + * Re-instate lock field which we overwrite with shadow_page_info. + * This was safe, since the lock is only used on guest pages. + */ + for ( j = 0; j < 1U << order; j++ ) + spin_lock_init(&((struct page_info *)sp)[j].lock); +#endif d->arch.paging.shadow.free_pages -= 1 << order; d->arch.paging.shadow.total_pages -= 1 << order; free_domheap_pages((struct page_info *)sp, order); @@ -1297,6 +1911,27 @@ static void sh_hash_audit_bucket(struct /* Bad shadow flags on guest page? */ BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) ); /* Bad type count on guest page? */ +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( sp->type == SH_type_l1_32_shadow + || sp->type == SH_type_l1_pae_shadow + || sp->type == SH_type_l1_64_shadow ) + { + if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page + && (gpg->u.inuse.type_info & PGT_count_mask) != 0 ) + { + if ( !page_is_out_of_sync(gpg) ) + { + SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")" + " and not OOS but has typecount %#lx\n", + sp->backpointer, + mfn_x(shadow_page_to_mfn(sp)), + gpg->u.inuse.type_info); + BUG(); + } + } + } + else /* Not an l1 */ +#endif if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page && (gpg->u.inuse.type_info & PGT_count_mask) != 0 ) { @@ -1608,7 +2243,8 @@ void sh_destroy_shadow(struct vcpu *v, m /* Remove all writeable mappings of a guest frame from the shadow tables * Returns non-zero if we need to flush TLBs. 
* level and fault_addr desribe how we found this to be a pagetable; - * level==0 means we have some other reason for revoking write access.*/ + * level==0 means we have some other reason for revoking write access. + * If level==0 we are allowed to fail, returning -1. */ int sh_remove_write_access(struct vcpu *v, mfn_t gmfn, unsigned int level, @@ -1659,7 +2295,12 @@ int sh_remove_write_access(struct vcpu * return 0; /* Early exit if it's already a pagetable, or otherwise not writeable */ - if ( sh_mfn_is_a_page_table(gmfn) + if ( (sh_mfn_is_a_page_table(gmfn) +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Unless they've been allowed to go out of sync with their shadows */ + && !mfn_oos_may_write(gmfn) +#endif + ) || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) return 0; @@ -1676,7 +2317,7 @@ int sh_remove_write_access(struct vcpu * } #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC - if ( v == current && level != 0 ) + if ( v == current ) { unsigned long gfn; /* Heuristic: there is likely to be only one writeable mapping, @@ -1690,6 +2331,8 @@ int sh_remove_write_access(struct vcpu * return 1; \ } while (0) + if ( level == 0 && fault_addr ) + GUESS(fault_addr, 6); if ( v->arch.paging.mode->guest_levels == 2 ) { @@ -1773,13 +2416,19 @@ int sh_remove_write_access(struct vcpu * #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */ /* Brute-force search of all the shadows, by walking the hash */ - perfc_incr(shadow_writeable_bf); + if ( level == 0 ) + perfc_incr(shadow_writeable_bf_1); + else + perfc_incr(shadow_writeable_bf); hash_foreach(v, callback_mask, callbacks, gmfn); /* If that didn't catch the mapping, then there's some non-pagetable * mapping -- ioreq page, grant mapping, &c. */ if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 ) { + if ( level == 0 ) + return -1; + SHADOW_ERROR("can't remove write access to mfn %lx: guest has " "%lu special-use mappings of it\n", mfn_x(gmfn), (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask)); @@ -1790,7 +2439,34 @@ int sh_remove_write_access(struct vcpu * return 1; } - +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) +int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, + mfn_t smfn, unsigned long off) +{ + struct shadow_page_info *sp = mfn_to_shadow_page(smfn); + + ASSERT(mfn_valid(smfn)); + ASSERT(mfn_valid(gmfn)); + + if ( sp->type == SH_type_l1_32_shadow ) + { + return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2) + (v, gmfn, smfn, off); + } +#if CONFIG_PAGING_LEVELS >= 3 + else if ( sp->type == SH_type_l1_pae_shadow ) + return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3) + (v, gmfn, smfn, off); +#if CONFIG_PAGING_LEVELS >= 4 + else if ( sp->type == SH_type_l1_64_shadow ) + return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4) + (v, gmfn, smfn, off); +#endif +#endif + + return 0; +} +#endif /**************************************************************************/ /* Remove all mappings of a guest frame from the shadow tables. 
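/*
 * [Editor's aside -- illustrative sketch, not part of this changeset.]
 * sh_remove_write_access_from_sl1p() above is the consumer of the fixup
 * table: instead of hunting through every shadow for writable mappings of
 * an out-of-sync gmfn, it goes straight to the recorded (smfn, offset)
 * and clears _PAGE_RW from that single shadow L1 entry -- provided the
 * entry still maps gmfn read/write.  Simplified standalone model (plain
 * integers stand in for shadow_l1e_t; real entries keep the frame number
 * and flags in a fuller layout):
 */
#define PTE_PRESENT 0x1UL
#define PTE_RW      0x2UL
#define PTE_MFN(e)  ((e) >> 12)           /* simplified frame-number extract */

static int rm_write_from_recorded_slot(unsigned long *sl1_table,
                                       unsigned long off, unsigned long gmfn)
{
    unsigned long e = sl1_table[off];

    if ( (e & (PTE_PRESENT | PTE_RW)) != (PTE_PRESENT | PTE_RW)
         || PTE_MFN(e) != gmfn )
        return 0;                         /* stale hint: fall back to the
                                           * heuristics / brute-force search */

    sl1_table[off] = e & ~PTE_RW;         /* revoke write access in the shadow */
    return 1;
}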
@@ -2127,6 +2803,36 @@ static void sh_update_paging_modes(struc } #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( v->arch.paging.shadow.oos_fixups == NULL ) + { + int i; + v->arch.paging.shadow.oos_fixups = + alloc_xenheap_pages(SHADOW_OOS_FT_ORDER); + if ( v->arch.paging.shadow.oos_fixups == NULL ) + { + SHADOW_ERROR("Could not allocate OOS fixup table" + " for dom %u vcpu %u\n", + v->domain->domain_id, v->vcpu_id); + domain_crash(v->domain); + return; + } + for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ ) + v->arch.paging.shadow.oos_fixups[i].gmfn = _mfn(INVALID_MFN); + } + + if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN ) + { + int i; + for(i = 0; i < SHADOW_OOS_PAGES; i++) + { + shadow_prealloc(d, SH_type_oos_snapshot, 1); + v->arch.paging.shadow.oos_snapshot[i] = + shadow_alloc(d, SH_type_oos_snapshot, 0); + } + } +#endif /* OOS */ + // Valid transitions handled by this function: // - For PV guests: // - after a shadow mode has been changed @@ -2158,6 +2864,13 @@ static void sh_update_paging_modes(struc /// ASSERT(shadow_mode_translate(d)); ASSERT(shadow_mode_external(d)); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Need to resync all our pages now, because if a page goes out + * of sync with paging enabled and is resynced with paging + * disabled, the resync will go wrong. */ + shadow_resync_all(v, 0); +#endif /* OOS */ if ( !hvm_paging_enabled(v) ) { @@ -2254,6 +2967,27 @@ static void sh_update_paging_modes(struc // This *does* happen, at least for CR4.PGE... } +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* We need to check that all the vcpus have paging enabled to + * unsync PTs. */ + if ( is_hvm_domain(d) ) + { + int pe = 1; + struct vcpu *vptr; + + for_each_vcpu(d, vptr) + { + if ( !hvm_paging_enabled(vptr) ) + { + pe = 0; + break; + } + } + + d->arch.paging.shadow.oos_active = pe; + } +#endif /* OOS */ + v->arch.paging.mode->update_cr3(v, 0); } @@ -2426,17 +3160,36 @@ void shadow_teardown(struct domain *d) } } -#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) +#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) /* Free the virtual-TLB array attached to each vcpu */ for_each_vcpu(d, v) { +#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) if ( v->arch.paging.vtlb ) { xfree(v->arch.paging.vtlb); v->arch.paging.vtlb = NULL; } - } #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( v->arch.paging.shadow.oos_fixups ) + { + free_xenheap_pages(v->arch.paging.shadow.oos_fixups, + SHADOW_OOS_FT_ORDER); + v->arch.paging.shadow.oos_fixups = NULL; + } + + { + int i; + mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot; + for(i = 0; i < SHADOW_OOS_PAGES; i++) + if ( mfn_valid(oos_snapshot[i]) ) + shadow_free(d, oos_snapshot[i]); + } +#endif /* OOS */ + } +#endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */ list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist) { @@ -3044,7 +3797,11 @@ void shadow_audit_tables(struct vcpu *v) if ( !(SHADOW_AUDIT_ENABLE) ) return; - + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + sh_oos_audit(v->domain); +#endif + if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL ) mask = ~1; /* Audit every table in the system */ else diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/multi.c --- a/xen/arch/x86/mm/shadow/multi.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/mm/shadow/multi.c Wed Jul 02 11:30:37 2008 +0900 @@ -305,22 +305,54 
@@ shadow_check_gwalk(struct vcpu *v, unsig } /* Remove write access permissions from a gwalk_t in a batch, and - * return OR-ed result for TLB flush hint + * return OR-ed result for TLB flush hint and need to rewalk the guest + * pages. + * + * Syncing pages will remove write access to that page; but it may + * also give write access to other pages in the path. If we resync any + * pages, re-walk from the beginning. */ +#define GW_RMWR_FLUSHTLB 1 +#define GW_RMWR_REWALK 2 + static inline uint32_t gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw) { - int rc = 0; + uint32_t rc = 0; #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - rc = sh_remove_write_access(v, gw->l3mfn, 3, va); -#endif - rc |= sh_remove_write_access(v, gw->l2mfn, 2, va); -#endif +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( mfn_is_out_of_sync(gw->l3mfn) ) + { + sh_resync(v, gw->l3mfn); + rc = GW_RMWR_REWALK; + } + else +#endif /* OOS */ + if ( sh_remove_write_access(v, gw->l3mfn, 3, va) ) + rc = GW_RMWR_FLUSHTLB; +#endif /* GUEST_PAGING_LEVELS >= 4 */ + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( mfn_is_out_of_sync(gw->l2mfn) ) + { + sh_resync(v, gw->l2mfn); + rc |= GW_RMWR_REWALK; + } + else +#endif /* OOS */ + if ( sh_remove_write_access(v, gw->l2mfn, 2, va) ) + rc |= GW_RMWR_FLUSHTLB; +#endif /* GUEST_PAGING_LEVELS >= 3 */ + if ( !(guest_supports_superpages(v) && - (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) ) - rc |= sh_remove_write_access(v, gw->l1mfn, 1, va); + (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + && !mfn_is_out_of_sync(gw->l1mfn) +#endif /* OOS */ + && sh_remove_write_access(v, gw->l1mfn, 1, va) ) + rc |= GW_RMWR_FLUSHTLB; return rc; } @@ -882,7 +914,12 @@ _sh_propagate(struct vcpu *v, // protect guest page tables // - if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) ) + if ( unlikely((level == 1) + && sh_mfn_is_a_page_table(target_mfn) +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) + && !mfn_oos_may_write(target_mfn) +#endif /* OOS */ + ) ) { if ( shadow_mode_trap_reads(d) ) { @@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v domain_crash(v->domain); return SHADOW_SET_ERROR; } +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) + shadow_resync_all(v, 0); +#endif } /* Write the new entry */ @@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v | (((unsigned long)sl3e) & ~PAGE_MASK)); if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT ) + { /* About to install a new reference */ if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) ) { domain_crash(v->domain); return SHADOW_SET_ERROR; - } + } +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC ) + shadow_resync_all(v, 0); +#endif + } /* Write the new entry */ shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn); @@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v | (((unsigned long)sl2e) & ~PAGE_MASK)); if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) + { + mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e); + /* About to install a new reference */ - if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) ) + if ( !sh_get_ref(v, sl1mfn, paddr) ) { domain_crash(v->domain); return SHADOW_SET_ERROR; - } + } +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + { + struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn); + mfn_t gl1mfn = _mfn(sp->backpointer); + + /* If the shadow is a fl1 then the backpointer contains + the GFN instead of the GMFN, and it's definitely not + OOS. 
*/ + if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn) + && mfn_is_out_of_sync(gl1mfn) ) + sh_resync(v, gl1mfn); + } +#endif + } /* Write the new entry */ #if GUEST_PAGING_LEVELS == 2 @@ -1347,6 +1409,9 @@ static int shadow_set_l1e(struct vcpu *v int flags = 0; struct domain *d = v->domain; shadow_l1e_t old_sl1e; +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC + mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e); +#endif ASSERT(sl1e != NULL); old_sl1e = *sl1e; @@ -1363,8 +1428,18 @@ static int shadow_set_l1e(struct vcpu *v /* Doesn't look like a pagetable. */ flags |= SHADOW_SET_ERROR; new_sl1e = shadow_l1e_empty(); - } else { + } + else + { shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d); +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC + if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn) + && (shadow_l1e_get_flags(new_sl1e) & _PAGE_RW) ) + { + oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e)); + } +#endif + } } } @@ -2532,6 +2607,9 @@ static int validate_gl1e(struct vcpu *v, mfn_t gmfn; p2m_type_t p2mt; int result = 0; +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + mfn_t gl1mfn; +#endif /* OOS */ perfc_incr(shadow_validate_gl1e_calls); @@ -2539,10 +2617,138 @@ static int validate_gl1e(struct vcpu *v, gmfn = gfn_to_mfn(v->domain, gfn, &p2mt); l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt); + result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer); + if ( mfn_valid(gl1mfn) + && mfn_is_out_of_sync(gl1mfn) ) + { + /* Update the OOS snapshot. */ + mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn); + guest_l1e_t *snp; + + ASSERT(mfn_valid(snpmfn)); + + snp = sh_map_domain_page(snpmfn); + snp[guest_index(new_ge)] = new_gl1e; + sh_unmap_domain_page(snp); + } +#endif /* OOS */ + + return result; +} + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) +/**************************************************************************/ +/* Special validation function for re-syncing out-of-sync shadows. + * Walks the *shadow* page, and for every entry that it finds, + * revalidates the guest entry that corresponds to it. + * N.B. This function is called with the vcpu that unsynced the page, + * *not* the one that is causing it to be resynced. */ +void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn) +{ + mfn_t sl1mfn; + shadow_l1e_t *sl1p; + guest_l1e_t *gl1p, *gp, *snp; + int rc = 0; + + ASSERT(mfn_valid(snpmfn)); + + sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow); + ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */ + + snp = sh_map_domain_page(snpmfn); + gp = sh_map_domain_page(gl1mfn); + gl1p = gp; + + SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, { + guest_l1e_t gl1e = *gl1p; + guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p); + + if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) ) + { + gfn_t gfn; + mfn_t gmfn; + p2m_type_t p2mt; + shadow_l1e_t nsl1e; + + gfn = guest_l1e_get_gfn(gl1e); + gmfn = gfn_to_mfn(v->domain, gfn, &p2mt); + l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt); + rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn); + + *snpl1p = gl1e; + } + }); + + sh_unmap_domain_page(gp); + sh_unmap_domain_page(snp); + + /* Setting shadow L1 entries should never need us to flush the TLB */ + ASSERT(!(rc & SHADOW_SET_FLUSH)); +} + +/* Figure out whether it's definitely safe not to sync this l1 table. 
+ * That is: if we can tell that it's only used once, and that the + * toplevel shadow responsible is not one of ours. + * N.B. This function is called with the vcpu that required the resync, + * *not* the one that originally unsynced the page, but it is + * called in the *mode* of the vcpu that unsynced it. Clear? Good. */ +int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn) +{ + struct shadow_page_info *sp; + mfn_t smfn; + + smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow); + ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */ - result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn); - return result; -} + /* Up to l2 */ + sp = mfn_to_shadow_page(smfn); + if ( sp->count != 1 || !sp->up ) + return 0; + smfn = _mfn(sp->up >> PAGE_SHIFT); + ASSERT(mfn_valid(smfn)); + +#if (SHADOW_PAGING_LEVELS == 4) + /* up to l3 */ + sp = mfn_to_shadow_page(smfn); + if ( sp->count != 1 || !sp->up ) + return 0; + smfn = _mfn(sp->up >> PAGE_SHIFT); + ASSERT(mfn_valid(smfn)); + + /* up to l4 */ + sp = mfn_to_shadow_page(smfn); + if ( sp->count != 1 + || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up ) + return 0; + smfn = _mfn(sp->up >> PAGE_SHIFT); + ASSERT(mfn_valid(smfn)); + +#if (GUEST_PAGING_LEVELS == 2) + /* In 2-on-3 shadow mode the up pointer contains the link to the + * shadow page, but the shadow_table contains only the first of the + * four pages that makes the PAE top shadow tables. */ + smfn = _mfn(mfn_x(smfn) & ~0x3UL); +#endif + +#endif + + if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn) +#if (SHADOW_PAGING_LEVELS == 3) + || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn) + || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn) + || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn) +#endif + ) + return 0; + + /* Only in use in one toplevel shadow, and it's not the one we're + * running on */ + return 1; +} +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ /**************************************************************************/ @@ -2725,6 +2931,10 @@ static void sh_prefetch(struct vcpu *v, shadow_l1e_t sl1e; u32 gflags; p2m_type_t p2mt; +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + guest_l1e_t *snpl1p = NULL; +#endif /* OOS */ + /* Prefetch no further than the end of the _shadow_ l1 MFN */ dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e; @@ -2737,6 +2947,17 @@ static void sh_prefetch(struct vcpu *v, /* Normal guest page; grab the next guest entry */ gl1p = sh_map_domain_page(gw->l1mfn); gl1p += guest_l1_table_offset(gw->va); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( mfn_is_out_of_sync(gw->l1mfn) ) + { + mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn); + + ASSERT(mfn_valid(snpmfn)); + snpl1p = sh_map_domain_page(snpmfn); + snpl1p += guest_l1_table_offset(gw->va); + } +#endif /* OOS */ } for ( i = 1; i < dist ; i++ ) @@ -2774,9 +2995,18 @@ static void sh_prefetch(struct vcpu *v, /* Propagate the entry. 
*/ l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt); (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( snpl1p != NULL ) + snpl1p[i] = gl1e; +#endif /* OOS */ } if ( gl1p != NULL ) sh_unmap_domain_page(gl1p); +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( snpl1p != NULL ) + sh_unmap_domain_page(snpl1p); +#endif /* OOS */ } #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */ @@ -2805,6 +3035,7 @@ static int sh_page_fault(struct vcpu *v, int r; fetch_type_t ft = 0; p2m_type_t p2mt; + uint32_t rc; #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION int fast_emul = 0; #endif @@ -2830,6 +3061,17 @@ static int sh_page_fault(struct vcpu *v, { fast_emul = 1; gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Fall back to the slow path if we're trying to emulate + writes to an out of sync page. */ + if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) ) + { + v->arch.paging.last_write_emul_ok = 0; + goto page_fault_slow_path; + } +#endif /* OOS */ + perfc_incr(shadow_fault_fast_emulate); goto early_emulation; } @@ -2855,6 +3097,31 @@ static int sh_page_fault(struct vcpu *v, sizeof(sl1e)) == 0) && sh_l1e_is_magic(sl1e)) ) { +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* First, need to check that this isn't an out-of-sync + * shadow l1e. If it is, we fall back to the slow path, which + * will sync it up again. */ + { + shadow_l2e_t sl2e; + mfn_t gl1mfn; + if ( (__copy_from_user(&sl2e, + (sh_linear_l2_table(v) + + shadow_l2_linear_offset(va)), + sizeof(sl2e)) != 0) + || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) + || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page( + shadow_l2e_get_mfn(sl2e))->backpointer)) + || unlikely(mfn_is_out_of_sync(gl1mfn)) ) + { + /* Hit the slow path as if there had been no + * shadow entry at all, and let it tidy up */ + ASSERT(regs->error_code & PFEC_page_present); + regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present); + goto page_fault_slow_path; + } + } +#endif /* SHOPT_OUT_OF_SYNC */ + if ( sh_l1e_is_gnp(sl1e) ) { /* Not-present in a guest PT: pass to the guest as @@ -2890,6 +3157,10 @@ static int sh_page_fault(struct vcpu *v, return EXCRET_fault_fixed; } } + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + page_fault_slow_path: +#endif #endif /* SHOPT_FAST_FAULT_PATH */ /* Detect if this page fault happened while we were already in Xen @@ -2904,7 +3175,21 @@ static int sh_page_fault(struct vcpu *v, return 0; } - if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 ) + rewalk: + rc = guest_walk_tables(v, va, &gw, regs->error_code); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( !(rc & _PAGE_PRESENT) ) + regs->error_code |= PFEC_page_present; + else if ( regs->error_code & PFEC_page_present ) + { + SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB" + " flushing. Have fun debugging it.\n"); + regs->error_code &= ~PFEC_page_present; + } +#endif + + if ( rc != 0 ) { perfc_incr(shadow_fault_bail_real_fault); SHADOW_PRINTK("not a shadow fault\n"); @@ -2948,7 +3233,10 @@ static int sh_page_fault(struct vcpu *v, shadow_lock(d); - if ( gw_remove_write_accesses(v, va, &gw) ) + rc = gw_remove_write_accesses(v, va, &gw); + + /* First bit set: Removed write access to a page. 
*/ + if ( rc & GW_RMWR_FLUSHTLB ) { /* Write permission removal is also a hint that other gwalks * overlapping with this one may be inconsistent @@ -2958,11 +3246,20 @@ static int sh_page_fault(struct vcpu *v, flush_tlb_mask(d->domain_dirty_cpumask); } +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Second bit set: Resynced a page. Re-walk needed. */ + if ( rc & GW_RMWR_REWALK ) + { + shadow_unlock(d); + goto rewalk; + } +#endif /* OOS */ + if ( !shadow_check_gwalk(v, va, &gw) ) { perfc_incr(shadow_inconsistent_gwalk); shadow_unlock(d); - return EXCRET_fault_fixed; + goto rewalk; } shadow_audit_tables(v); @@ -2991,17 +3288,45 @@ static int sh_page_fault(struct vcpu *v, return 0; } +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Always unsync when writing to L1 page tables. */ + if ( sh_mfn_is_a_page_table(gmfn) + && ft == ft_demand_write ) + sh_unsync(v, gmfn, va); +#endif /* OOS */ + /* Calculate the shadow entry and write it */ l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt); r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn); +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + if ( mfn_valid(gw.l1mfn) + && mfn_is_out_of_sync(gw.l1mfn) ) + { + /* Update the OOS snapshot. */ + mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn); + guest_l1e_t *snp; + + ASSERT(mfn_valid(snpmfn)); + + snp = sh_map_domain_page(snpmfn); + snp[guest_l1_table_offset(va)] = gw.l1e; + sh_unmap_domain_page(snp); + } +#endif /* OOS */ + #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH /* Prefetch some more shadow entries */ sh_prefetch(v, &gw, ptr_sl1e, sl1mfn); #endif /* Need to emulate accesses to page tables */ - if ( sh_mfn_is_a_page_table(gmfn) ) + if ( sh_mfn_is_a_page_table(gmfn) +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Unless they've been allowed to go out of sync with their shadows */ + && !mfn_is_out_of_sync(gmfn) +#endif + ) { if ( ft == ft_demand_write ) { @@ -3215,6 +3540,7 @@ sh_invlpg(struct vcpu *v, unsigned long * instruction should be issued on the hardware, or 0 if it's safe not * to do so. */ { + mfn_t sl1mfn; shadow_l2e_t sl2e; perfc_incr(shadow_invlpg); @@ -3278,12 +3604,64 @@ sh_invlpg(struct vcpu *v, unsigned long // If so, then we'll need to flush the entire TLB (because that's // easier than invalidating all of the individual 4K pages). // - if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type + sl1mfn = shadow_l2e_get_mfn(sl2e); + if ( mfn_to_shadow_page(sl1mfn)->type == SH_type_fl1_shadow ) { flush_tlb_local(); return 0; } + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Check to see if the SL1 is out of sync. */ + { + mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer); + struct page_info *pg = mfn_to_page(gl1mfn); + if ( mfn_valid(gl1mfn) + && page_is_out_of_sync(pg) ) + { + /* The test above may give false positives, since we don't + * hold the shadow lock yet. Check again with the lock held. */ + shadow_lock(v->domain); + + /* This must still be a copy-from-user because we didn't + * have the shadow lock last time we checked, and the + * higher-level shadows might have disappeared under our + * feet. 
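/*
 * [Editor's aside -- illustrative sketch, not part of this changeset.]
 * Condensed shape of the new rewalk logic in sh_page_fault(): resyncing
 * an out-of-sync page while revoking write access can itself change the
 * guest walk, so GW_RMWR_REWALK (and a failed shadow_check_gwalk()) now
 * restart the walk instead of returning early.  The model below omits
 * locking and all the real work; only the GW_RMWR_* values are taken from
 * the patch, every other name is a stand-in:
 */
#include <stdbool.h>
#define GW_RMWR_FLUSHTLB 1
#define GW_RMWR_REWALK   2
struct gwalk { unsigned long va; };
static int walk_guest_tables(struct gwalk *gw)          { (void)gw; return 0; }
static unsigned int rm_write_accesses(struct gwalk *gw) { (void)gw; return 0; }
static bool gwalk_consistent(const struct gwalk *gw)    { (void)gw; return true; }
static void flush_tlbs(void)                            { }

static int fixup_fault(unsigned long va)
{
    struct gwalk gw = { .va = va };
    unsigned int rc;

    for ( ; ; )
    {
        if ( walk_guest_tables(&gw) != 0 )
            return 0;                     /* genuine guest fault: inject it */
        rc = rm_write_accesses(&gw);
        if ( rc & GW_RMWR_FLUSHTLB )
            flush_tlbs();                 /* overlapping walks may be stale */
        if ( rc & GW_RMWR_REWALK )
            continue;                     /* a resync changed the guest walk */
        if ( !gwalk_consistent(&gw) )
            continue;                     /* guest rewrote its tables: redo */
        break;                            /* walk is stable: install the sl1e */
    }
    return 1;
}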
*/ + if ( __copy_from_user(&sl2e, + sh_linear_l2_table(v) + + shadow_l2_linear_offset(va), + sizeof (sl2e)) != 0 ) + { + perfc_incr(shadow_invlpg_fault); + shadow_unlock(v->domain); + return 0; + } + + if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) ) + { + shadow_unlock(v->domain); + return 0; + } + + sl1mfn = shadow_l2e_get_mfn(sl2e); + gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer); + pg = mfn_to_page(gl1mfn); + + if ( likely(sh_mfn_is_a_page_table(gl1mfn) + && page_is_out_of_sync(pg) ) ) + { + shadow_l1e_t *sl1; + sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va); + /* Remove the shadow entry that maps this VA */ + (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn); + } + shadow_unlock(v->domain); + /* Need the invlpg, to pick up the disappeareance of the sl1e */ + return 1; + } + } +#endif return 1; } @@ -3710,6 +4088,13 @@ sh_update_cr3(struct vcpu *v, int do_loc return; } +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Need to resync all the shadow entries on a TLB flush. Resync + * current vcpus OOS pages before switching to the new shadow + * tables so that the VA hint is still valid. */ + shadow_resync_current_vcpu(v, do_locking); +#endif + if ( do_locking ) shadow_lock(v->domain); ASSERT(shadow_locked_by_me(v->domain)); @@ -3938,11 +4323,70 @@ sh_update_cr3(struct vcpu *v, int do_loc /* Release the lock, if we took it (otherwise it's the caller's problem) */ if ( do_locking ) shadow_unlock(v->domain); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Need to resync all the shadow entries on a TLB flush. We only + * update the shadows, leaving the pages out of sync. Also, we try + * to skip synchronization of shadows not mapped in the new + * tables. */ + shadow_sync_other_vcpus(v, do_locking); +#endif + } /**************************************************************************/ /* Functions to revoke guest rights */ + +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC +int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, + mfn_t smfn, unsigned long off) +{ + int r; + shadow_l1e_t *sl1p, sl1e; + struct shadow_page_info *sp; + + ASSERT(mfn_valid(gmfn)); + ASSERT(mfn_valid(smfn)); + + sp = mfn_to_shadow_page(smfn); + + if ( sp->mbz != 0 || +#if GUEST_PAGING_LEVELS == 4 + (sp->type != SH_type_l1_64_shadow) +#elif GUEST_PAGING_LEVELS == 3 + (sp->type != SH_type_l1_pae_shadow) +#elif GUEST_PAGING_LEVELS == 2 + (sp->type != SH_type_l1_32_shadow) +#endif + ) + goto fail; + + sl1p = sh_map_domain_page(smfn); + sl1p += off; + sl1e = *sl1p; + if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW)) + != (_PAGE_PRESENT|_PAGE_RW)) + || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) ) + { + sh_unmap_domain_page(sl1p); + goto fail; + } + + /* Found it! Need to remove its write permissions. */ + sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); + r = shadow_set_l1e(v, sl1p, sl1e, smfn); + ASSERT( !(r & SHADOW_SET_ERROR) ); + + sh_unmap_domain_page(sl1p); + perfc_incr(shadow_writeable_h_7); + return 1; + + fail: + perfc_incr(shadow_writeable_h_8); + return 0; +} +#endif /* OOS */ #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn) @@ -4437,23 +4881,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v, #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES -#define AUDIT_FAIL(_level, _fmt, _a...) 
do { \ - printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \ - "gl" #_level "mfn = %" PRI_mfn \ - " sl" #_level "mfn = %" PRI_mfn \ - " &gl" #_level "e = %p &sl" #_level "e = %p" \ - " gl" #_level "e = %" SH_PRI_gpte \ - " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \ - GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ - _level, guest_index(gl ## _level ## e), \ - mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ - gl ## _level ## e, sl ## _level ## e, \ - gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \ - ##_a); \ - BUG(); \ - done = 1; \ +#define AUDIT_FAIL(_level, _fmt, _a...) do { \ + printk("Shadow %u-on-%u audit failed at level %i, index %i\n" \ + "gl" #_level "mfn = %" PRI_mfn \ + " sl" #_level "mfn = %" PRI_mfn \ + " &gl" #_level "e = %p &sl" #_level "e = %p" \ + " gl" #_level "e = %" SH_PRI_gpte \ + " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n", \ + GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ + _level, guest_index(gl ## _level ## e), \ + mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ + gl ## _level ## e, sl ## _level ## e, \ + gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \ + ##_a); \ + BUG(); \ + done = 1; \ } while (0) +#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do { \ + printk("Shadow %u-on-%u audit failed at level %i\n" \ + "gl" #_level "mfn = %" PRI_mfn \ + " sl" #_level "mfn = %" PRI_mfn \ + " Error: " _fmt "\n", \ + GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS, \ + _level, \ + mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn), \ + ##_a); \ + BUG(); \ + done = 1; \ +} while (0) static char * sh_audit_flags(struct vcpu *v, int level, int gflags, int sflags) @@ -4494,6 +4950,16 @@ int sh_audit_l1_table(struct vcpu *v, mf /* Follow the backpointer */ gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */ + if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) ) + { + oos_audit_hash_is_present(v->domain, gl1mfn); + return 0; + } +#endif + gl1e = gp = sh_map_domain_page(gl1mfn); SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, { @@ -4574,6 +5040,13 @@ int sh_audit_l2_table(struct vcpu *v, mf /* Follow the backpointer */ gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Only L1's may be out of sync. */ + if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) ) + AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn)); +#endif + gl2e = gp = sh_map_domain_page(gl2mfn); SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, { @@ -4616,6 +5089,13 @@ int sh_audit_l3_table(struct vcpu *v, mf /* Follow the backpointer */ gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Only L1's may be out of sync. */ + if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) ) + AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn)); +#endif + gl3e = gp = sh_map_domain_page(gl3mfn); SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, { @@ -4656,6 +5136,13 @@ int sh_audit_l4_table(struct vcpu *v, mf /* Follow the backpointer */ gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer); + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) + /* Only L1's may be out of sync. 
*/ + if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) ) + AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn)); +#endif + gl4e = gp = sh_map_domain_page(gl4mfn); SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain, { diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/multi.h --- a/xen/arch/x86/mm/shadow/multi.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/mm/shadow/multi.h Wed Jul 02 11:30:37 2008 +0900 @@ -115,3 +115,17 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_ extern struct paging_mode SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS); + +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC +extern void +SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS) + (struct vcpu *v, mfn_t gmfn, mfn_t snpmfn); + +extern int +SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS) + (struct vcpu*v, mfn_t gmfn); + +extern int +SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p, GUEST_LEVELS) + (struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off); +#endif diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/private.h --- a/xen/arch/x86/mm/shadow/private.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/mm/shadow/private.h Wed Jul 02 11:30:37 2008 +0900 @@ -63,8 +63,9 @@ extern int shadow_audit_enable; #define SHOPT_SKIP_VERIFY 0x20 /* Skip PTE v'fy when safe to do so */ #define SHOPT_VIRTUAL_TLB 0x40 /* Cache guest v->p translations */ #define SHOPT_FAST_EMULATION 0x80 /* Fast write emulation */ - -#define SHADOW_OPTIMIZATIONS 0xff +#define SHOPT_OUT_OF_SYNC 0x100 /* Allow guest writes to L1 PTs */ + +#define SHADOW_OPTIMIZATIONS 0x1ff /****************************************************************************** @@ -195,9 +196,9 @@ struct shadow_page_info u32 tlbflush_timestamp; }; struct { - unsigned int type:4; /* What kind of shadow is this? */ + unsigned int type:5; /* What kind of shadow is this? */ unsigned int pinned:1; /* Is the shadow pinned? */ - unsigned int count:27; /* Reference count */ + unsigned int count:26; /* Reference count */ u32 mbz; /* Must be zero: this is where the owner * field lives in a non-shadow page */ } __attribute__((packed)); @@ -242,7 +243,8 @@ static inline void shadow_check_page_str #define SH_type_max_shadow (13U) #define SH_type_p2m_table (14U) /* in use as the p2m table */ #define SH_type_monitor_table (15U) /* in use as a monitor table */ -#define SH_type_unused (16U) +#define SH_type_oos_snapshot (16U) /* in use as OOS snapshot */ +#define SH_type_unused (17U) /* * What counts as a pinnable shadow? @@ -301,6 +303,72 @@ static inline int sh_type_is_pinnable(st #define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE) #define SHF_64 (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64) +#define SHF_L1_ANY (SHF_L1_32|SHF_L1_PAE|SHF_L1_64) + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) +/* Marks a guest L1 page table which is shadowed but not write-protected. + * If set, then *only* L1 shadows (SHF_L1_*) are allowed. + * + * out_of_sync indicates that the shadow tables may not reflect the + * guest tables. If it is clear, then the shadow tables *must* reflect + * the guest tables. + * + * oos_may_write indicates that a page may have writable mappings. + * + * Most of the time the flags are synonymous. There is a short period of time + * during resync that oos_may_write is clear but out_of_sync is not. If a + * codepath is called during that time and is sensitive to oos issues, it may + * need to use the second flag. 
+ */ +#define SHF_out_of_sync (1u<<30) +#define SHF_oos_may_write (1u<<29) + +/* Fixup tables are a non-complete writable-mappings reverse map for + OOS pages. This let us quickly resync pages (avoiding brute-force + search of the shadows) when the va hint is not sufficient (i.e., + the pagetable is mapped in multiple places and in multiple + shadows.) */ +#define SHADOW_OOS_FT_ENTRIES \ + ((PAGE_SIZE << SHADOW_OOS_FT_ORDER) \ + / (SHADOW_OOS_FT_HASH * sizeof(struct oos_fixup))) + +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ + +static inline int sh_page_has_multiple_shadows(struct page_info *pg) +{ + u32 shadows; + if ( !(pg->count_info & PGC_page_table) ) + return 0; + shadows = pg->shadow_flags & SHF_page_type_mask; + /* More than one type bit set in shadow-flags? */ + return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 ); +} + +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) +/* The caller must verify this is reasonable to call; i.e., valid mfn, + * domain is translated, &c */ +static inline int page_is_out_of_sync(struct page_info *p) +{ + return (p->count_info & PGC_page_table) + && (p->shadow_flags & SHF_out_of_sync); +} + +static inline int mfn_is_out_of_sync(mfn_t gmfn) +{ + return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn))); +} + +static inline int page_oos_may_write(struct page_info *p) +{ + return (p->count_info & PGC_page_table) + && (p->shadow_flags & SHF_oos_may_write); +} + +static inline int mfn_oos_may_write(mfn_t gmfn) +{ + return page_oos_may_write(mfn_to_page(mfn_x(gmfn))); +} +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ /****************************************************************************** * Various function declarations @@ -351,7 +419,57 @@ int shadow_cmpxchg_guest_entry(struct vc int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p, intpte_t *old, intpte_t new, mfn_t gmfn); - +#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) +/* Allow a shadowed page to go out of sync */ +int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va); + +/* Pull an out-of-sync page back into sync. */ +void sh_resync(struct vcpu *v, mfn_t gmfn); + +void oos_fixup_add(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off); + +int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, + mfn_t smfn, unsigned long offset); + +/* Pull all out-of-sync shadows back into sync. If skip != 0, we try + * to avoid resyncing where we think we can get away with it. */ + +void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking); + +static inline void +shadow_resync_all(struct vcpu *v, int do_locking) +{ + sh_resync_all(v, + 0 /* skip */, + 1 /* this */, + 1 /* others */, + do_locking); +} + +static inline void +shadow_resync_current_vcpu(struct vcpu *v, int do_locking) +{ + sh_resync_all(v, + 0 /* skip */, + 1 /* this */, + 0 /* others */, + do_locking); +} + +static inline void +shadow_sync_other_vcpus(struct vcpu *v, int do_locking) +{ + sh_resync_all(v, + 1 /* skip */, + 0 /* this */, + 1 /* others */, + do_locking); +} + +void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn); +mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn); + +#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */ /****************************************************************************** * Flags used in the return value of the shadow_set_lXe() functions... 
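The private.h hunk above introduces the out-of-sync (OOS) interface: a shadowed guest L1 may keep writable mappings, its page is tagged SHF_out_of_sync, and the sh_resync*/shadow_resync_* calls bring the shadows back in step. The sketch below is illustrative only and is not part of the changeset; it assumes the declarations added in this hunk (mfn_t, mfn_is_out_of_sync(), shadow_resync_all()) plus the usual shadow-lock discipline, and shows the intended calling pattern before a shadow is trusted.

#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
/* Illustrative sketch, not part of the patch: before relying on the
 * shadow of a guest L1, check whether the guest frame has been allowed
 * to go out of sync and, if so, resync every vcpu's OOS pages.
 * do_locking == 1 asks sh_resync_all() to take the shadow lock itself. */
static int example_ensure_gl1_in_sync(struct vcpu *v, mfn_t gl1mfn)
{
    if ( mfn_is_out_of_sync(gl1mfn) )
    {
        shadow_resync_all(v, 1 /* do_locking */);
        return 1;   /* shadows were stale and have been resynced */
    }
    return 0;       /* shadows already reflect the guest table */
}
#endif /* SHOPT_OUT_OF_SYNC */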
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/types.h --- a/xen/arch/x86/mm/shadow/types.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/mm/shadow/types.h Wed Jul 02 11:30:37 2008 +0900 @@ -438,6 +438,11 @@ struct shadow_walk_t #define sh_guess_wrmap INTERNAL_NAME(sh_guess_wrmap) #define sh_clear_shadow_entry INTERNAL_NAME(sh_clear_shadow_entry) +#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC +#define sh_resync_l1 INTERNAL_NAME(sh_resync_l1) +#define sh_safe_not_to_sync INTERNAL_NAME(sh_safe_not_to_sync) +#define sh_rm_write_access_from_sl1p INTERNAL_NAME(sh_rm_write_access_from_sl1p) +#endif /* The sh_guest_(map|get)_* functions depends on Xen's paging levels */ #define sh_guest_map_l1e \ diff -r 11318234588e -r 08f77df14cba xen/arch/x86/platform_hypercall.c --- a/xen/arch/x86/platform_hypercall.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/platform_hypercall.c Wed Jul 02 11:30:37 2008 +0900 @@ -408,7 +408,12 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe cpu_count++; } if ( cpu_count == num_online_cpus() ) - ret = acpi_cpufreq_init(); + { + if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) + ret = powernow_cpufreq_init(); + else + ret = acpi_cpufreq_init(); + } break; } diff -r 11318234588e -r 08f77df14cba xen/arch/x86/x86_emulate/x86_emulate.c --- a/xen/arch/x86/x86_emulate/x86_emulate.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/x86_emulate/x86_emulate.c Wed Jul 02 11:30:37 2008 +0900 @@ -142,12 +142,14 @@ static uint8_t opcode_table[256] = { ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0xD0 - 0xD7 */ - ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, - ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, + ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, + ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0xD8 - 0xDF */ - 0, ImplicitOps|ModRM|Mov, 0, ImplicitOps|ModRM|Mov, - 0, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, + ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, + ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, + ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, + ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, /* 0xE0 - 0xE7 */ ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, @@ -216,7 +218,7 @@ static uint8_t twobyte_table[256] = { ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov, /* 0xA0 - 0xA7 */ ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM, - DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0, + DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0, /* 0xA8 - 0xAF */ ImplicitOps, ImplicitOps, 0, DstBitBase|SrcReg|ModRM, DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, DstReg|SrcMem|ModRM, @@ -246,8 +248,20 @@ static uint8_t twobyte_table[256] = { /* Type, address-of, and value of an instruction's operand. */ struct operand { enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; - unsigned int bytes; - unsigned long val, orig_val; + unsigned int bytes; + + /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. */ + union { + unsigned long val; + uint32_t bigval[4]; + }; + + /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. */ + union { + unsigned long orig_val; + uint32_t orig_bigval[4]; + }; + union { /* OP_REG: Pointer to register field. */ unsigned long *reg; @@ -466,7 +480,7 @@ do{ asm volatile ( /* Fetch next part of the instruction being emulated. 
*/ #define insn_fetch_bytes(_size) \ -({ unsigned long _x, _eip = _regs.eip; \ +({ unsigned long _x = 0, _eip = _regs.eip; \ if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \ _regs.eip += (_size); /* real hardware doesn't truncate */ \ generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15, \ @@ -594,6 +608,18 @@ do{ struct fpu_insn_ctxt fic; put_fpu(&fic); \ } while (0) +#define emulate_fpu_insn_memsrc(_op, _arg) \ +do{ struct fpu_insn_ctxt fic; \ + get_fpu(X86EMUL_FPU_fpu, &fic); \ + asm volatile ( \ + "movb $2f-1f,%0 \n" \ + "1: " _op " %1 \n" \ + "2: \n" \ + : "=m" (fic.insn_bytes) \ + : "m" (_arg) : "memory" ); \ + put_fpu(&fic); \ +} while (0) + #define emulate_fpu_insn_stub(_bytes...) \ do{ uint8_t stub[] = { _bytes, 0xc3 }; \ struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 }; \ @@ -654,6 +680,19 @@ static void __put_rep_prefix( if ( rep_prefix ) \ __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \ }) + +/* Compatibility function: read guest memory, zero-extend result to a ulong. */ +static int read_ulong( + enum x86_segment seg, + unsigned long offset, + unsigned long *val, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + *val = 0; + return ops->read(seg, offset, val, bytes, ctxt); +} /* * Unsigned multiplication with double-word result. @@ -841,7 +880,8 @@ static int ioport_access_check( (tr.limit < 0x67) ) goto raise_exception; - if ( (rc = ops->read(x86_seg_none, tr.base + 0x66, &iobmp, 2, ctxt)) ) + if ( (rc = read_ulong(x86_seg_none, tr.base + 0x66, + &iobmp, 2, ctxt, ops)) ) return rc; /* Ensure TSS includes two bytes including byte containing first port. */ @@ -849,7 +889,8 @@ static int ioport_access_check( if ( tr.limit <= iobmp ) goto raise_exception; - if ( (rc = ops->read(x86_seg_none, tr.base + iobmp, &iobmp, 2, ctxt)) ) + if ( (rc = read_ulong(x86_seg_none, tr.base + iobmp, + &iobmp, 2, ctxt, ops)) ) return rc; if ( (iobmp & (((1<<bytes)-1) << (first_port&7))) != 0 ) goto raise_exception; @@ -941,12 +982,12 @@ protmode_load_seg( goto raise_exn; do { - if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8), - &val, 4, ctxt)) ) + if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8), + &val, 4, ctxt, ops)) ) return rc; desc.a = val; - if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8) + 4, - &val, 4, ctxt)) ) + if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8) + 4, + &val, 4, ctxt, ops)) ) return rc; desc.b = val; @@ -992,14 +1033,15 @@ protmode_load_seg( if ( (desc.b & (5u<<9)) == (4u<<9) ) goto raise_exn; /* Non-conforming segment: check DPL against RPL and CPL. */ - if ( ((desc.b & (6u<<9)) != (6u<<9)) && ((dpl < cpl) || (dpl < rpl)) ) + if ( ((desc.b & (6u<<9)) != (6u<<9)) && + ((dpl < cpl) || (dpl < rpl)) ) goto raise_exn; break; } /* Ensure Accessed flag is set. */ new_desc_b = desc.b | 0x100; - rc = ((desc.b & 0x100) ? X86EMUL_OKAY : + rc = ((desc.b & 0x100) ? X86EMUL_OKAY : ops->cmpxchg( x86_seg_none, desctab.base + (sel & 0xfff8) + 4, &desc.b, &new_desc_b, 4, ctxt)); @@ -1061,16 +1103,16 @@ decode_register( case 2: p = ®s->edx; break; case 3: p = ®s->ebx; break; case 4: p = (highbyte_regs ? - ((unsigned char *)®s->eax + 1) : + ((unsigned char *)®s->eax + 1) : (unsigned char *)®s->esp); break; case 5: p = (highbyte_regs ? - ((unsigned char *)®s->ecx + 1) : + ((unsigned char *)®s->ecx + 1) : (unsigned char *)®s->ebp); break; case 6: p = (highbyte_regs ? 
- ((unsigned char *)®s->edx + 1) : + ((unsigned char *)®s->edx + 1) : (unsigned char *)®s->esi); break; case 7: p = (highbyte_regs ? - ((unsigned char *)®s->ebx + 1) : + ((unsigned char *)®s->ebx + 1) : (unsigned char *)®s->edi); break; #if defined(__x86_64__) case 8: p = ®s->r8; break; @@ -1402,8 +1444,8 @@ x86_emulate( case 8: src.val = *(uint64_t *)src.reg; break; } } - else if ( (rc = ops->read(src.mem.seg, src.mem.off, - &src.val, src.bytes, ctxt)) ) + else if ( (rc = read_ulong(src.mem.seg, src.mem.off, + &src.val, src.bytes, ctxt, ops)) ) goto done; break; case SrcImm: @@ -1494,8 +1536,8 @@ x86_emulate( } else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */ { - if ( (rc = ops->read(dst.mem.seg, dst.mem.off, - &dst.val, dst.bytes, ctxt)) ) + if ( (rc = read_ulong(dst.mem.seg, dst.mem.off, + &dst.val, dst.bytes, ctxt, ops)) ) goto done; dst.orig_val = dst.val; } @@ -1571,8 +1613,8 @@ x86_emulate( int lb, ub, idx; generate_exception_if(mode_64bit() || (src.type != OP_MEM), EXC_UD, -1); - if ( (rc = ops->read(src.mem.seg, src.mem.off + op_bytes, - &src_val2, op_bytes, ctxt)) ) + if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes, + &src_val2, op_bytes, ctxt, ops)) ) goto done; ub = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2; lb = (op_bytes == 2) ? (int16_t)src.val : (int32_t)src.val; @@ -1588,8 +1630,8 @@ x86_emulate( /* movsxd */ if ( src.type == OP_REG ) src.val = *(int32_t *)src.reg; - else if ( (rc = ops->read(src.mem.seg, src.mem.off, - &src.val, 4, ctxt)) ) + else if ( (rc = read_ulong(src.mem.seg, src.mem.off, + &src.val, 4, ctxt, ops)) ) goto done; dst.val = (int32_t)src.val; } @@ -1613,8 +1655,8 @@ x86_emulate( unsigned long src1; /* ModR/M source operand */ if ( ea.type == OP_REG ) src1 = *ea.reg; - else if ( (rc = ops->read(ea.mem.seg, ea.mem.off, - &src1, op_bytes, ctxt)) ) + else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, + &src1, op_bytes, ctxt, ops)) ) goto done; _regs.eflags &= ~(EFLG_OF|EFLG_CF); switch ( dst.bytes ) @@ -1720,8 +1762,8 @@ x86_emulate( /* 64-bit mode: POP defaults to a 64-bit operand. 
*/ if ( mode_64bit() && (dst.bytes == 4) ) dst.bytes = 8; - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes), - &dst.val, dst.bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes), + &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; break; @@ -1773,8 +1815,8 @@ x86_emulate( dst.val = x86_seg_es; les: /* dst.val identifies the segment */ generate_exception_if(src.type != OP_MEM, EXC_UD, -1); - if ( (rc = ops->read(src.mem.seg, src.mem.off + src.bytes, - &sel, 2, ctxt)) != 0 ) + if ( (rc = read_ulong(src.mem.seg, src.mem.off + src.bytes, + &sel, 2, ctxt, ops)) != 0 ) goto done; if ( (rc = load_seg(dst.val, (uint16_t)sel, ctxt, ops)) != 0 ) goto done; @@ -2020,8 +2062,8 @@ x86_emulate( dst.bytes = op_bytes = 8; if ( dst.type == OP_REG ) dst.val = *dst.reg; - else if ( (rc = ops->read(dst.mem.seg, dst.mem.off, - &dst.val, 8, ctxt)) != 0 ) + else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off, + &dst.val, 8, ctxt, ops)) != 0 ) goto done; } src.val = _regs.eip; @@ -2036,8 +2078,8 @@ x86_emulate( generate_exception_if(dst.type != OP_MEM, EXC_UD, -1); - if ( (rc = ops->read(dst.mem.seg, dst.mem.off+dst.bytes, - &sel, 2, ctxt)) ) + if ( (rc = read_ulong(dst.mem.seg, dst.mem.off+dst.bytes, + &sel, 2, ctxt, ops)) ) goto done; if ( (modrm_reg & 7) == 3 ) /* call */ @@ -2046,9 +2088,9 @@ x86_emulate( fail_if(ops->read_segment == NULL); if ( (rc = ops->read_segment(x86_seg_cs, ®, ctxt)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), - reg.sel, op_bytes, ctxt)) || + ®.sel, op_bytes, ctxt)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), - _regs.eip, op_bytes, ctxt)) ) + &_regs.eip, op_bytes, ctxt)) ) goto done; } @@ -2066,12 +2108,12 @@ x86_emulate( dst.bytes = 8; if ( dst.type == OP_REG ) dst.val = *dst.reg; - else if ( (rc = ops->read(dst.mem.seg, dst.mem.off, - &dst.val, 8, ctxt)) != 0 ) + else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off, + &dst.val, 8, ctxt, ops)) != 0 ) goto done; } if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), - dst.val, dst.bytes, ctxt)) != 0 ) + &dst.val, dst.bytes, ctxt)) != 0 ) goto done; dst.type = OP_NONE; break; @@ -2106,7 +2148,7 @@ x86_emulate( &dst.val, dst.bytes, ctxt); else rc = ops->write( - dst.mem.seg, dst.mem.off, dst.val, dst.bytes, ctxt); + dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt); if ( rc != 0 ) goto done; default: @@ -2153,7 +2195,7 @@ x86_emulate( if ( mode_64bit() && (op_bytes == 4) ) op_bytes = 8; if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), - reg.sel, op_bytes, ctxt)) != 0 ) + ®.sel, op_bytes, ctxt)) != 0 ) goto done; break; } @@ -2165,8 +2207,8 @@ x86_emulate( /* 64-bit mode: POP defaults to a 64-bit operand. 
*/ if ( mode_64bit() && (op_bytes == 4) ) op_bytes = 8; - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), - &dst.val, op_bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), + &dst.val, op_bytes, ctxt, ops)) != 0 ) goto done; if ( (rc = load_seg(src.val, (uint16_t)dst.val, ctxt, ops)) != 0 ) return rc; @@ -2275,8 +2317,8 @@ x86_emulate( dst.bytes = op_bytes; if ( mode_64bit() && (dst.bytes == 4) ) dst.bytes = 8; - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes), - &dst.val, dst.bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes), + &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; break; @@ -2288,7 +2330,7 @@ x86_emulate( generate_exception_if(mode_64bit(), EXC_UD, -1); for ( i = 0; i < 8; i++ ) if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), - regs[i], op_bytes, ctxt)) != 0 ) + ®s[i], op_bytes, ctxt)) != 0 ) goto done; break; } @@ -2303,8 +2345,8 @@ x86_emulate( generate_exception_if(mode_64bit(), EXC_UD, -1); for ( i = 0; i < 8; i++ ) { - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), - &dst.val, op_bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), + &dst.val, op_bytes, ctxt, ops)) != 0 ) goto done; switch ( op_bytes ) { @@ -2382,8 +2424,8 @@ x86_emulate( } else { - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi), - &dst.val, dst.bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), + &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; fail_if(ops->write_io == NULL); if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 ) @@ -2455,9 +2497,9 @@ x86_emulate( if ( (rc = ops->read_segment(x86_seg_cs, ®, ctxt)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), - reg.sel, op_bytes, ctxt)) || + ®.sel, op_bytes, ctxt)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes), - _regs.eip, op_bytes, ctxt)) ) + &_regs.eip, op_bytes, ctxt)) ) goto done; if ( (rc = load_seg(x86_seg_cs, sel, ctxt, ops)) != 0 ) @@ -2483,8 +2525,8 @@ x86_emulate( /* 64-bit mode: POP defaults to a 64-bit operand. */ if ( mode_64bit() && (op_bytes == 4) ) op_bytes = 8; - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), - &dst.val, op_bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), + &dst.val, op_bytes, ctxt, ops)) != 0 ) goto done; if ( op_bytes == 2 ) dst.val = (uint16_t)dst.val | (_regs.eflags & 0xffff0000u); @@ -2507,8 +2549,8 @@ x86_emulate( dst.type = OP_REG; dst.reg = (unsigned long *)&_regs.eax; dst.bytes = (d & ByteOp) ? 1 : op_bytes; - if ( (rc = ops->read(ea.mem.seg, insn_fetch_bytes(ad_bytes), - &dst.val, dst.bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(ea.mem.seg, insn_fetch_bytes(ad_bytes), + &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; break; @@ -2536,8 +2578,8 @@ x86_emulate( } else { - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi), - &dst.val, dst.bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), + &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; dst.type = OP_MEM; nr_reps = 1; @@ -2556,10 +2598,10 @@ x86_emulate( unsigned long next_eip = _regs.eip; get_rep_prefix(); src.bytes = dst.bytes = (d & ByteOp) ? 
1 : op_bytes; - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi), - &dst.val, dst.bytes, ctxt)) || - (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi), - &src.val, src.bytes, ctxt)) ) + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), + &dst.val, dst.bytes, ctxt, ops)) || + (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi), + &src.val, src.bytes, ctxt, ops)) ) goto done; register_address_increment( _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); @@ -2592,8 +2634,8 @@ x86_emulate( dst.type = OP_REG; dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.reg = (unsigned long *)&_regs.eax; - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi), - &dst.val, dst.bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi), + &dst.val, dst.bytes, ctxt, ops)) != 0 ) goto done; register_address_increment( _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); @@ -2606,8 +2648,8 @@ x86_emulate( get_rep_prefix(); src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.val = _regs.eax; - if ( (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi), - &src.val, src.bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi), + &src.val, src.bytes, ctxt, ops)) != 0 ) goto done; register_address_increment( _regs.edi, (_regs.eflags & EFLG_DF) ? -src.bytes : src.bytes); @@ -2624,8 +2666,8 @@ x86_emulate( case 0xc3: /* ret (near) */ { int offset = (b == 0xc2) ? insn_fetch_type(uint16_t) : 0; op_bytes = mode_64bit() ? 8 : op_bytes; - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset), - &dst.val, op_bytes, ctxt)) != 0 ) + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset), + &dst.val, op_bytes, ctxt, ops)) != 0 ) goto done; _regs.eip = dst.val; break; @@ -2640,7 +2682,7 @@ x86_emulate( dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes; dst.reg = (unsigned long *)&_regs.ebp; if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), - _regs.ebp, dst.bytes, ctxt)) ) + &_regs.ebp, dst.bytes, ctxt)) ) goto done; dst.val = _regs.esp; @@ -2650,14 +2692,14 @@ x86_emulate( { unsigned long ebp, temp_data; ebp = truncate_word(_regs.ebp - i*dst.bytes, ctxt->sp_size/8); - if ( (rc = ops->read(x86_seg_ss, ebp, - &temp_data, dst.bytes, ctxt)) || + if ( (rc = read_ulong(x86_seg_ss, ebp, + &temp_data, dst.bytes, ctxt, ops)) || (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), - temp_data, dst.bytes, ctxt)) ) + &temp_data, dst.bytes, ctxt)) ) goto done; } if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes), - dst.val, dst.bytes, ctxt)) ) + &dst.val, dst.bytes, ctxt)) ) goto done; } @@ -2683,8 +2725,8 @@ x86_emulate( /* Second writeback, to %%ebp. */ dst.reg = (unsigned long *)&_regs.ebp; - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes), - &dst.val, dst.bytes, ctxt)) ) + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes), + &dst.val, dst.bytes, ctxt, ops)) ) goto done; break; @@ -2692,10 +2734,10 @@ x86_emulate( case 0xcb: /* ret (far) */ { int offset = (b == 0xca) ? insn_fetch_type(uint16_t) : 0; op_bytes = mode_64bit() ? 
8 : op_bytes; - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), - &dst.val, op_bytes, ctxt)) || - (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset), - &src.val, op_bytes, ctxt)) || + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), + &dst.val, op_bytes, ctxt, ops)) || + (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset), + &src.val, op_bytes, ctxt, ops)) || (rc = load_seg(x86_seg_cs, (uint16_t)src.val, ctxt, ops)) ) goto done; _regs.eip = dst.val; @@ -2729,12 +2771,12 @@ x86_emulate( if ( !mode_iopl() ) mask |= EFLG_IF; fail_if(!in_realmode(ctxt, ops)); - if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), - &eip, op_bytes, ctxt)) || - (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), - &cs, op_bytes, ctxt)) || - (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes), - &eflags, op_bytes, ctxt)) ) + if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), + &eip, op_bytes, ctxt, ops)) || + (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), + &cs, op_bytes, ctxt, ops)) || + (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes), + &eflags, op_bytes, ctxt, ops)) ) goto done; if ( op_bytes == 2 ) eflags = (uint16_t)eflags | (_regs.eflags & 0xffff0000u); @@ -2779,12 +2821,64 @@ x86_emulate( case 0xd7: /* xlat */ { unsigned long al = (uint8_t)_regs.eax; - if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.ebx + al), - &al, 1, ctxt)) != 0 ) + if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.ebx + al), + &al, 1, ctxt, ops)) != 0 ) goto done; *(uint8_t *)&_regs.eax = al; break; } + + case 0xd8: /* FPU 0xd8 */ + switch ( modrm ) + { + case 0xc0 ... 0xc7: /* fadd %stN,%stN */ + case 0xc8 ... 0xcf: /* fmul %stN,%stN */ + case 0xd0 ... 0xd7: /* fcom %stN,%stN */ + case 0xd8 ... 0xdf: /* fcomp %stN,%stN */ + case 0xe0 ... 0xe7: /* fsub %stN,%stN */ + case 0xe8 ... 0xef: /* fsubr %stN,%stN */ + case 0xf0 ... 0xf7: /* fdiv %stN,%stN */ + case 0xf8 ... 
0xff: /* fdivr %stN,%stN */ + emulate_fpu_insn_stub(0xd8, modrm); + break; + default: + fail_if(modrm >= 0xc0); + ea.bytes = 4; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + switch ( modrm_reg & 7 ) + { + case 0: /* fadd */ + emulate_fpu_insn_memsrc("fadds", src.val); + break; + case 1: /* fmul */ + emulate_fpu_insn_memsrc("fmuls", src.val); + break; + case 2: /* fcom */ + emulate_fpu_insn_memsrc("fcoms", src.val); + break; + case 3: /* fcomp */ + emulate_fpu_insn_memsrc("fcomps", src.val); + break; + case 4: /* fsub */ + emulate_fpu_insn_memsrc("fsubs", src.val); + break; + case 5: /* fsubr */ + emulate_fpu_insn_memsrc("fsubrs", src.val); + break; + case 6: /* fdiv */ + emulate_fpu_insn_memsrc("fdivs", src.val); + break; + case 7: /* fdivr */ + emulate_fpu_insn_memsrc("fdivrs", src.val); + break; + default: + goto cannot_emulate; + } + } + break; case 0xd9: /* FPU 0xd9 */ switch ( modrm ) @@ -2822,28 +2916,269 @@ x86_emulate( emulate_fpu_insn_stub(0xd9, modrm); break; default: - fail_if((modrm_reg & 7) != 7); fail_if(modrm >= 0xc0); - /* fnstcw m2byte */ - ea.bytes = 2; - dst = ea; - emulate_fpu_insn_memdst("fnstcw", dst.val); + switch ( modrm_reg & 7 ) + { + case 0: /* fld m32fp */ + ea.bytes = 4; + src = ea; + if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + emulate_fpu_insn_memsrc("flds", src.val); + break; + case 2: /* fstp m32fp */ + ea.bytes = 4; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fsts", dst.val); + break; + case 3: /* fstp m32fp */ + ea.bytes = 4; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fstps", dst.val); + break; + /* case 4: fldenv - TODO */ + case 5: /* fldcw m2byte */ + ea.bytes = 2; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + emulate_fpu_insn_memsrc("fldcw", src.val); + break; + /* case 6: fstenv - TODO */ + case 7: /* fnstcw m2byte */ + ea.bytes = 2; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fnstcw", dst.val); + break; + default: + goto cannot_emulate; + } + } + break; + + case 0xda: /* FPU 0xda */ + switch ( modrm ) + { + case 0xc0 ... 0xc7: /* fcmovb %stN */ + case 0xc8 ... 0xcf: /* fcmove %stN */ + case 0xd0 ... 0xd7: /* fcmovbe %stN */ + case 0xd8 ... 0xdf: /* fcmovu %stN */ + case 0xe9: /* fucompp */ + emulate_fpu_insn_stub(0xda, modrm); + break; + default: + fail_if(modrm >= 0xc0); + ea.bytes = 8; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + switch ( modrm_reg & 7 ) + { + case 0: /* fiadd m64i */ + emulate_fpu_insn_memsrc("fiaddl", src.val); + break; + case 1: /* fimul m64i */ + emulate_fpu_insn_memsrc("fimul", src.val); + break; + case 2: /* ficom m64i */ + emulate_fpu_insn_memsrc("ficoml", src.val); + break; + case 3: /* ficomp m64i */ + emulate_fpu_insn_memsrc("ficompl", src.val); + break; + case 4: /* fisub m64i */ + emulate_fpu_insn_memsrc("fisubl", src.val); + break; + case 5: /* fisubr m64i */ + emulate_fpu_insn_memsrc("fisubrl", src.val); + break; + case 6: /* fidiv m64i */ + emulate_fpu_insn_memsrc("fidivl", src.val); + break; + case 7: /* fidivr m64i */ + emulate_fpu_insn_memsrc("fidivrl", src.val); + break; + default: + goto cannot_emulate; + } } break; case 0xdb: /* FPU 0xdb */ - fail_if(modrm != 0xe3); - /* fninit */ - emulate_fpu_insn("fninit"); + switch ( modrm ) + { + case 0xc0 ... 0xc7: /* fcmovnb %stN */ + case 0xc8 ... 
0xcf: /* fcmovne %stN */ + case 0xd0 ... 0xd7: /* fcmovnbe %stN */ + case 0xd8 ... 0xdf: /* fcmovnu %stN */ + emulate_fpu_insn_stub(0xdb, modrm); + break; + case 0xe2: /* fnclex */ + emulate_fpu_insn("fnclex"); + break; + case 0xe3: /* fninit */ + emulate_fpu_insn("fninit"); + break; + case 0xe4: /* fsetpm - 287 only, ignored by 387 */ + break; + case 0xe8 ... 0xef: /* fucomi %stN */ + case 0xf0 ... 0xf7: /* fcomi %stN */ + emulate_fpu_insn_stub(0xdb, modrm); + break; + default: + fail_if(modrm >= 0xc0); + switch ( modrm_reg & 7 ) + { + case 0: /* fild m32i */ + ea.bytes = 4; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + emulate_fpu_insn_memsrc("fildl", src.val); + break; + case 1: /* fisttp m32i */ + ea.bytes = 4; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fisttpl", dst.val); + break; + case 2: /* fist m32i */ + ea.bytes = 4; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fistl", dst.val); + break; + case 3: /* fistp m32i */ + ea.bytes = 4; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fistpl", dst.val); + break; + case 5: /* fld m80fp */ + ea.bytes = 10; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, + &src.val, src.bytes, ctxt)) != 0 ) + goto done; + emulate_fpu_insn_memdst("fldt", src.val); + break; + case 7: /* fstp m80fp */ + ea.bytes = 10; + dst.type = OP_MEM; + dst = ea; + emulate_fpu_insn_memdst("fstpt", dst.val); + break; + default: + goto cannot_emulate; + } + } + break; + + case 0xdc: /* FPU 0xdc */ + switch ( modrm ) + { + case 0xc0 ... 0xc7: /* fadd %stN */ + case 0xc8 ... 0xcf: /* fmul %stN */ + case 0xe0 ... 0xe7: /* fsubr %stN */ + case 0xe8 ... 0xef: /* fsub %stN */ + case 0xf0 ... 0xf7: /* fdivr %stN */ + case 0xf8 ... 0xff: /* fdiv %stN */ + emulate_fpu_insn_stub(0xdc, modrm); + break; + default: + fail_if(modrm >= 0xc0); + ea.bytes = 8; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + switch ( modrm_reg & 7 ) + { + case 0: /* fadd m64fp */ + emulate_fpu_insn_memsrc("faddl", src.val); + break; + case 1: /* fmul m64fp */ + emulate_fpu_insn_memsrc("fmull", src.val); + break; + case 2: /* fcom m64fp */ + emulate_fpu_insn_memsrc("fcoml", src.val); + break; + case 3: /* fcomp m64fp */ + emulate_fpu_insn_memsrc("fcompl", src.val); + break; + case 4: /* fsub m64fp */ + emulate_fpu_insn_memsrc("fsubl", src.val); + break; + case 5: /* fsubr m64fp */ + emulate_fpu_insn_memsrc("fsubrl", src.val); + break; + case 6: /* fdiv m64fp */ + emulate_fpu_insn_memsrc("fdivl", src.val); + break; + case 7: /* fdivr m64fp */ + emulate_fpu_insn_memsrc("fdivrl", src.val); + break; + } + } break; case 0xdd: /* FPU 0xdd */ - fail_if((modrm_reg & 7) != 7); - fail_if(modrm >= 0xc0); - /* fnstsw m2byte */ - ea.bytes = 2; - dst = ea; - emulate_fpu_insn_memdst("fnstsw", dst.val); + switch ( modrm ) + { + case 0xc0 ... 0xc7: /* ffree %stN */ + case 0xd0 ... 0xd7: /* fst %stN */ + case 0xd8 ... 0xdf: /* fstp %stN */ + case 0xe0 ... 0xe7: /* fucom %stN */ + case 0xe8 ... 
0xef: /* fucomp %stN */ + emulate_fpu_insn_stub(0xdd, modrm); + break; + default: + fail_if(modrm >= 0xc0); + switch ( modrm_reg & 7 ) + { + case 0: /* fld m64fp */; + ea.bytes = 8; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + emulate_fpu_insn_memsrc("fldl", src.val); + break; + case 1: /* fisttp m64i */ + ea.bytes = 8; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fisttpll", dst.val); + break; + case 2: /* fst m64fp */ + ea.bytes = 8; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memsrc("fstl", dst.val); + break; + case 3: /* fstp m64fp */ + ea.bytes = 8; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fstpl", dst.val); + break; + case 7: /* fnstsw m2byte */ + ea.bytes = 2; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fnstsw", dst.val); + break; + default: + goto cannot_emulate; + } + } break; case 0xde: /* FPU 0xde */ @@ -2859,17 +3194,120 @@ x86_emulate( emulate_fpu_insn_stub(0xde, modrm); break; default: - goto cannot_emulate; + fail_if(modrm >= 0xc0); + ea.bytes = 2; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + switch ( modrm_reg & 7 ) + { + case 0: /* fiadd m16i */ + emulate_fpu_insn_memsrc("fiadd", src.val); + break; + case 1: /* fimul m16i */ + emulate_fpu_insn_memsrc("fimul", src.val); + break; + case 2: /* ficom m16i */ + emulate_fpu_insn_memsrc("ficom", src.val); + break; + case 3: /* ficomp m16i */ + emulate_fpu_insn_memsrc("ficomp", src.val); + break; + case 4: /* fisub m16i */ + emulate_fpu_insn_memsrc("fisub", src.val); + break; + case 5: /* fisubr m16i */ + emulate_fpu_insn_memsrc("fisubr", src.val); + break; + case 6: /* fidiv m16i */ + emulate_fpu_insn_memsrc("fidiv", src.val); + break; + case 7: /* fidivr m16i */ + emulate_fpu_insn_memsrc("fidivr", src.val); + break; + default: + goto cannot_emulate; + } } break; case 0xdf: /* FPU 0xdf */ - fail_if(modrm != 0xe0); - /* fnstsw %ax */ - dst.bytes = 2; - dst.type = OP_REG; - dst.reg = (unsigned long *)&_regs.eax; - emulate_fpu_insn_memdst("fnstsw", dst.val); + switch ( modrm ) + { + case 0xe0: + /* fnstsw %ax */ + dst.bytes = 2; + dst.type = OP_REG; + dst.reg = (unsigned long *)&_regs.eax; + emulate_fpu_insn_memdst("fnstsw", dst.val); + break; + case 0xf0 ... 0xf7: /* fcomip %stN */ + case 0xf8 ... 
0xff: /* fucomip %stN */ + emulate_fpu_insn_stub(0xdf, modrm); + break; + default: + fail_if(modrm >= 0xc0); + switch ( modrm_reg & 7 ) + { + case 0: /* fild m16i */ + ea.bytes = 2; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + emulate_fpu_insn_memsrc("fild", src.val); + break; + case 1: /* fisttp m16i */ + ea.bytes = 2; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fisttp", dst.val); + break; + case 2: /* fist m16i */ + ea.bytes = 2; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fist", dst.val); + break; + case 3: /* fistp m16i */ + ea.bytes = 2; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fistp", dst.val); + break; + case 4: /* fbld m80dec */ + ea.bytes = 10; + dst = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, + &src.val, src.bytes, ctxt)) != 0 ) + goto done; + emulate_fpu_insn_memdst("fbld", src.val); + break; + case 5: /* fild m64i */ + ea.bytes = 8; + src = ea; + if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val, + src.bytes, ctxt)) != 0 ) + goto done; + emulate_fpu_insn_memsrc("fildll", src.val); + break; + case 6: /* fbstp packed bcd */ + ea.bytes = 10; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fbstp", dst.val); + break; + case 7: /* fistp m64i */ + ea.bytes = 8; + dst = ea; + dst.type = OP_MEM; + emulate_fpu_insn_memdst("fistpll", dst.val); + break; + default: + goto cannot_emulate; + } + } break; case 0xe0 ... 0xe2: /* loop{,z,nz} */ { @@ -2924,7 +3362,6 @@ x86_emulate( /* out */ fail_if(ops->write_io == NULL); rc = ops->write_io(port, op_bytes, _regs.eax, ctxt); - } else { @@ -3242,9 +3679,9 @@ x86_emulate( if ( op_bytes == 2 ) reg.base &= 0xffffff; if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, - reg.limit, 2, ctxt)) || + ®.limit, 2, ctxt)) || (rc = ops->write(ea.mem.seg, ea.mem.off+2, - reg.base, mode_64bit() ? 8 : 4, ctxt)) ) + ®.base, mode_64bit() ? 8 : 4, ctxt)) ) goto done; break; case 2: /* lgdt */ @@ -3252,10 +3689,10 @@ x86_emulate( generate_exception_if(ea.type != OP_MEM, EXC_UD, -1); fail_if(ops->write_segment == NULL); memset(®, 0, sizeof(reg)); - if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, - &limit, 2, ctxt)) || - (rc = ops->read(ea.mem.seg, ea.mem.off+2, - &base, mode_64bit() ? 8 : 4, ctxt)) ) + if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0, + &limit, 2, ctxt, ops)) || + (rc = read_ulong(ea.mem.seg, ea.mem.off+2, + &base, mode_64bit() ? 8 : 4, ctxt, ops)) ) goto done; reg.base = base; reg.limit = limit; @@ -3267,7 +3704,8 @@ x86_emulate( goto done; break; case 4: /* smsw */ - ea.bytes = 2; + if ( ea.type == OP_MEM ) + ea.bytes = 2; dst = ea; fail_if(ops->read_cr == NULL); if ( (rc = ops->read_cr(0, &dst.val, ctxt)) ) @@ -3281,11 +3719,11 @@ x86_emulate( goto done; if ( ea.type == OP_REG ) cr0w = *ea.reg; - else if ( (rc = ops->read(ea.mem.seg, ea.mem.off, - &cr0w, 2, ctxt)) ) + else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, + &cr0w, 2, ctxt, ops)) ) goto done; - cr0 &= 0xffff0000; - cr0 |= (uint16_t)cr0w; + /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. 
*/ + cr0 = (cr0 & ~0xe) | (cr0w & 0xf); if ( (rc = ops->write_cr(0, cr0, ctxt)) ) goto done; break; @@ -3404,8 +3842,10 @@ x86_emulate( if ( ea.type == OP_MEM ) { unsigned long lval, hval; - if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) || - (rc = ops->read(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) ) + if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0, + &lval, 4, ctxt, ops)) || + (rc = read_ulong(ea.mem.seg, ea.mem.off+4, + &hval, 4, ctxt, ops)) ) goto done; val = ((uint64_t)hval << 32) | (uint32_t)lval; stub[2] = modrm & 0x38; /* movq (%eax),%mmN */ @@ -3428,8 +3868,8 @@ x86_emulate( if ( ea.type == OP_MEM ) { unsigned long lval = (uint32_t)val, hval = (uint32_t)(val >> 32); - if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, lval, 4, ctxt)) || - (rc = ops->write(ea.mem.seg, ea.mem.off+4, hval, 4, ctxt)) ) + if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) || + (rc = ops->write(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) ) goto done; } break; @@ -3481,8 +3921,8 @@ x86_emulate( /* Get actual old value. */ for ( i = 0; i < (op_bytes/sizeof(long)); i++ ) - if ( (rc = ops->read(ea.mem.seg, ea.mem.off + i*sizeof(long), - &old[i], sizeof(long), ctxt)) != 0 ) + if ( (rc = read_ulong(ea.mem.seg, ea.mem.off + i*sizeof(long), + &old[i], sizeof(long), ctxt, ops)) != 0 ) goto done; /* Get expected and proposed values. */ diff -r 11318234588e -r 08f77df14cba xen/arch/x86/x86_emulate/x86_emulate.h --- a/xen/arch/x86/x86_emulate/x86_emulate.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/arch/x86/x86_emulate/x86_emulate.h Wed Jul 02 11:30:37 2008 +0900 @@ -102,7 +102,8 @@ enum x86_emulate_fpu_type { }; /* - * These operations represent the instruction emulator's interface to memory. + * These operations represent the instruction emulator's interface to memory, + * I/O ports, privileged state... pretty much everything other than GPRs. * * NOTES: * 1. If the access fails (cannot emulate, or a standard access faults) then @@ -110,8 +111,7 @@ enum x86_emulate_fpu_type { * some out-of-band mechanism, unknown to the emulator. The memop signals * failure by returning X86EMUL_EXCEPTION to the emulator, which will * then immediately bail. - * 2. Valid access sizes are 1, 2, 4 and 8 (x86/64 only) bytes. - * 3. The emulator cannot handle 64-bit mode emulation on an x86/32 system. + * 2. The emulator cannot handle 64-bit mode emulation on an x86/32 system. */ struct x86_emulate_ops { @@ -121,19 +121,25 @@ struct x86_emulate_ops * All memory-access functions: * @seg: [IN ] Segment being dereferenced (specified as x86_seg_??). * @offset:[IN ] Offset within segment. + * @p_data:[IN ] Pointer to i/o data buffer (length is @bytes) * Read functions: * @val: [OUT] Value read, zero-extended to 'ulong'. * Write functions: * @val: [IN ] Value to write (low-order bytes used as req'd). * Variable-length access functions: - * @bytes: [IN ] Number of bytes to read or write. - */ - - /* read: Emulate a memory read. */ + * @bytes: [IN ] Number of bytes to read or write. Valid access sizes are + * 1, 2, 4 and 8 (x86/64 only) bytes, unless otherwise + * stated. + */ + + /* + * read: Emulate a memory read. + * @bytes: Access length (0 < @bytes < 4096). 
+ */ int (*read)( enum x86_segment seg, unsigned long offset, - unsigned long *val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt); @@ -144,15 +150,18 @@ struct x86_emulate_ops int (*insn_fetch)( enum x86_segment seg, unsigned long offset, - unsigned long *val, - unsigned int bytes, - struct x86_emulate_ctxt *ctxt); - - /* write: Emulate a memory write. */ + void *p_data, + unsigned int bytes, + struct x86_emulate_ctxt *ctxt); + + /* + * write: Emulate a memory write. + * @bytes: Access length (0 < @bytes < 4096). + */ int (*write)( enum x86_segment seg, unsigned long offset, - unsigned long val, + void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt); diff -r 11318234588e -r 08f77df14cba xen/common/domain.c --- a/xen/common/domain.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/common/domain.c Wed Jul 02 11:30:37 2008 +0900 @@ -73,21 +73,133 @@ int current_domain_id(void) return current->domain->domain_id; } -struct domain *alloc_domain(domid_t domid) +static struct domain *alloc_domain_struct(void) +{ + return xmalloc(struct domain); +} + +static void free_domain_struct(struct domain *d) +{ + xfree(d); +} + +static void __domain_finalise_shutdown(struct domain *d) +{ + struct vcpu *v; + + BUG_ON(!spin_is_locked(&d->shutdown_lock)); + + if ( d->is_shut_down ) + return; + + for_each_vcpu ( d, v ) + if ( !v->paused_for_shutdown ) + return; + + d->is_shut_down = 1; + send_guest_global_virq(dom0, VIRQ_DOM_EXC); +} + +static void vcpu_check_shutdown(struct vcpu *v) +{ + struct domain *d = v->domain; + + spin_lock(&d->shutdown_lock); + + if ( d->is_shutting_down ) + { + if ( !v->paused_for_shutdown ) + vcpu_pause_nosync(v); + v->paused_for_shutdown = 1; + v->defer_shutdown = 0; + __domain_finalise_shutdown(d); + } + + spin_unlock(&d->shutdown_lock); +} + +struct vcpu *alloc_vcpu( + struct domain *d, unsigned int vcpu_id, unsigned int cpu_id) +{ + struct vcpu *v; + + BUG_ON(d->vcpu[vcpu_id] != NULL); + + if ( (v = alloc_vcpu_struct()) == NULL ) + return NULL; + + v->domain = d; + v->vcpu_id = vcpu_id; + + v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline; + v->runstate.state_entry_time = NOW(); + + if ( !is_idle_domain(d) ) + { + set_bit(_VPF_down, &v->pause_flags); + v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]); + } + + if ( sched_init_vcpu(v, cpu_id) != 0 ) + { + free_vcpu_struct(v); + return NULL; + } + + if ( vcpu_initialise(v) != 0 ) + { + sched_destroy_vcpu(v); + free_vcpu_struct(v); + return NULL; + } + + d->vcpu[vcpu_id] = v; + if ( vcpu_id != 0 ) + d->vcpu[v->vcpu_id-1]->next_in_list = v; + + /* Must be called after making new vcpu visible to for_each_vcpu(). */ + vcpu_check_shutdown(v); + + return v; +} + +struct vcpu *alloc_idle_vcpu(unsigned int cpu_id) { struct domain *d; - - if ( (d = xmalloc(struct domain)) == NULL ) + struct vcpu *v; + unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS; + + if ( (v = idle_vcpu[cpu_id]) != NULL ) + return v; + + d = (vcpu_id == 0) ? 
+ domain_create(IDLE_DOMAIN_ID, 0, 0) : + idle_vcpu[cpu_id - vcpu_id]->domain; + BUG_ON(d == NULL); + + v = alloc_vcpu(d, vcpu_id, cpu_id); + idle_vcpu[cpu_id] = v; + + return v; +} + +struct domain *domain_create( + domid_t domid, unsigned int domcr_flags, ssidref_t ssidref) +{ + struct domain *d, **pd; + enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2, + INIT_gnttab = 1u<<3, INIT_arch = 1u<<4 }; + int init_status = 0; + + if ( (d = alloc_domain_struct()) == NULL ) return NULL; memset(d, 0, sizeof(*d)); d->domain_id = domid; if ( xsm_alloc_security_domain(d) != 0 ) - { - free_domain(d); - return NULL; - } + goto fail; + init_status |= INIT_xsm; atomic_set(&d->refcnt, 1); spin_lock_init(&d->domain_lock); @@ -97,132 +209,17 @@ struct domain *alloc_domain(domid_t domi INIT_LIST_HEAD(&d->page_list); INIT_LIST_HEAD(&d->xenpage_list); - return d; -} - -void free_domain(struct domain *d) -{ - xsm_free_security_domain(d); - xfree(d); -} - -static void __domain_finalise_shutdown(struct domain *d) -{ - struct vcpu *v; - - BUG_ON(!spin_is_locked(&d->shutdown_lock)); - - if ( d->is_shut_down ) - return; - - for_each_vcpu ( d, v ) - if ( !v->paused_for_shutdown ) - return; - - d->is_shut_down = 1; - send_guest_global_virq(dom0, VIRQ_DOM_EXC); -} - -static void vcpu_check_shutdown(struct vcpu *v) -{ - struct domain *d = v->domain; - - spin_lock(&d->shutdown_lock); - - if ( d->is_shutting_down ) - { - if ( !v->paused_for_shutdown ) - vcpu_pause_nosync(v); - v->paused_for_shutdown = 1; - v->defer_shutdown = 0; - __domain_finalise_shutdown(d); - } - - spin_unlock(&d->shutdown_lock); -} - -struct vcpu *alloc_vcpu( - struct domain *d, unsigned int vcpu_id, unsigned int cpu_id) -{ - struct vcpu *v; - - BUG_ON(d->vcpu[vcpu_id] != NULL); - - if ( (v = alloc_vcpu_struct()) == NULL ) - return NULL; - - v->domain = d; - v->vcpu_id = vcpu_id; - - v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline; - v->runstate.state_entry_time = NOW(); - - if ( !is_idle_domain(d) ) - { - set_bit(_VPF_down, &v->pause_flags); - v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]); - } - - if ( sched_init_vcpu(v, cpu_id) != 0 ) - { - free_vcpu_struct(v); - return NULL; - } - - if ( vcpu_initialise(v) != 0 ) - { - sched_destroy_vcpu(v); - free_vcpu_struct(v); - return NULL; - } - - d->vcpu[vcpu_id] = v; - if ( vcpu_id != 0 ) - d->vcpu[v->vcpu_id-1]->next_in_list = v; - - /* Must be called after making new vcpu visible to for_each_vcpu(). */ - vcpu_check_shutdown(v); - - return v; -} - -struct vcpu *alloc_idle_vcpu(unsigned int cpu_id) -{ - struct domain *d; - struct vcpu *v; - unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS; - - if ( (v = idle_vcpu[cpu_id]) != NULL ) - return v; - - d = (vcpu_id == 0) ? 
- domain_create(IDLE_DOMAIN_ID, 0, 0) : - idle_vcpu[cpu_id - vcpu_id]->domain; - BUG_ON(d == NULL); - - v = alloc_vcpu(d, vcpu_id, cpu_id); - idle_vcpu[cpu_id] = v; - - return v; -} - -struct domain *domain_create( - domid_t domid, unsigned int domcr_flags, ssidref_t ssidref) -{ - struct domain *d, **pd; - enum { INIT_evtchn = 1, INIT_gnttab = 2, INIT_arch = 8 }; - int init_status = 0; - - if ( (d = alloc_domain(domid)) == NULL ) - return NULL; - if ( domcr_flags & DOMCRF_hvm ) d->is_hvm = 1; if ( (domid == 0) && opt_dom0_vcpus_pin ) d->is_pinned = 1; + if ( domcr_flags & DOMCRF_dummy ) + return d; + rangeset_domain_initialise(d); + init_status |= INIT_rangeset; if ( !is_idle_domain(d) ) { @@ -278,8 +275,11 @@ struct domain *domain_create( grant_table_destroy(d); if ( init_status & INIT_evtchn ) evtchn_destroy(d); - rangeset_domain_destroy(d); - free_domain(d); + if ( init_status & INIT_rangeset ) + rangeset_domain_destroy(d); + if ( init_status & INIT_xsm ) + xsm_free_security_domain(d); + free_domain_struct(d); return NULL; } @@ -535,7 +535,8 @@ static void complete_domain_destroy(stru if ( d->target != NULL ) put_domain(d->target); - free_domain(d); + xsm_free_security_domain(d); + free_domain_struct(d); send_guest_global_virq(dom0, VIRQ_DOM_EXC); } diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/dmar.c --- a/xen/drivers/passthrough/vtd/dmar.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/drivers/passthrough/vtd/dmar.c Wed Jul 02 11:30:37 2008 +0900 @@ -383,7 +383,8 @@ acpi_parse_one_drhd(struct acpi_dmar_ent dmaru->address = drhd->address; dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */ INIT_LIST_HEAD(&dmaru->ioapic_list); - dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %lx\n", dmaru->address); + dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n", + dmaru->address); dev_scope_start = (void *)(drhd + 1); dev_scope_end = ((void *)drhd) + header->length; diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/dmar.h --- a/xen/drivers/passthrough/vtd/dmar.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/drivers/passthrough/vtd/dmar.h Wed Jul 02 11:30:37 2008 +0900 @@ -42,28 +42,28 @@ struct acpi_ioapic_unit { struct acpi_drhd_unit { struct list_head list; - unsigned long address; /* register base address of the unit */ - struct pci_dev *devices; /* target devices */ + u64 address; /* register base address of the unit */ + struct pci_dev *devices; /* target devices */ int devices_cnt; - u8 include_all:1; + u8 include_all:1; struct iommu *iommu; struct list_head ioapic_list; }; struct acpi_rmrr_unit { struct list_head list; - unsigned long base_address; - unsigned long end_address; + u64 base_address; + u64 end_address; struct pci_dev *devices; /* target devices */ int devices_cnt; - u8 allow_all:1; + u8 allow_all:1; }; struct acpi_atsr_unit { struct list_head list; - struct pci_dev *devices; /* target devices */ + struct pci_dev *devices; /* target devices */ int devices_cnt; - u8 all_ports:1; + u8 all_ports:1; }; #define for_each_iommu(domain, iommu) \ diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/intremap.c --- a/xen/drivers/passthrough/vtd/intremap.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/drivers/passthrough/vtd/intremap.c Wed Jul 02 11:30:37 2008 +0900 @@ -52,7 +52,7 @@ static void remap_entry_to_ioapic_rte( unsigned long flags; struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); - if ( ir_ctrl == NULL || ir_ctrl->iremap_index < 0 ) + if ( ir_ctrl == NULL ) { dprintk(XENLOG_ERR VTDPREFIX, "remap_entry_to_ioapic_rte: ir_ctl is 
not ready\n"); @@ -153,6 +153,7 @@ static void ioapic_rte_to_remap_entry(st } memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); + iommu_flush_cache_entry(iremap_entry); iommu_flush_iec_index(iommu, 0, index); invalidate_sync(iommu); @@ -170,7 +171,8 @@ unsigned int io_apic_read_remap_rte( struct iommu *iommu = ioapic_to_iommu(mp_ioapics[apic].mpc_apicid); struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu); - if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ) + if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 || + ir_ctrl->iremap_index == -1 ) { *IO_APIC_BASE(apic) = reg; return *(IO_APIC_BASE(apic)+4); @@ -377,6 +379,7 @@ static void msi_msg_to_remap_entry( remap_rte->data = 0; memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry)); + iommu_flush_cache_entry(iremap_entry); iommu_flush_iec_index(iommu, 0, index); invalidate_sync(iommu); diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/iommu.c --- a/xen/drivers/passthrough/vtd/iommu.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/drivers/passthrough/vtd/iommu.c Wed Jul 02 11:30:37 2008 +0900 @@ -1269,7 +1269,6 @@ static int domain_context_mapping( } static int domain_context_unmap_one( - struct domain *domain, struct iommu *iommu, u8 bus, u8 devfn) { @@ -1300,7 +1299,6 @@ static int domain_context_unmap_one( } static int domain_context_unmap( - struct domain *domain, struct iommu *iommu, struct pci_dev *pdev) { @@ -1320,14 +1318,13 @@ static int domain_context_unmap( PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS); break; case DEV_TYPE_PCIe_ENDPOINT: - ret = domain_context_unmap_one(domain, iommu, + ret = domain_context_unmap_one(iommu, (u8)(pdev->bus), (u8)(pdev->devfn)); break; case DEV_TYPE_PCI: if ( pdev->bus == 0 ) ret = domain_context_unmap_one( - domain, iommu, - (u8)(pdev->bus), (u8)(pdev->devfn)); + iommu, (u8)(pdev->bus), (u8)(pdev->devfn)); else { if ( bus2bridge[pdev->bus].bus != 0 ) @@ -1335,7 +1332,7 @@ static int domain_context_unmap( "domain_context_unmap:" "bus2bridge[%d].bus != 0\n", pdev->bus); - ret = domain_context_unmap_one(domain, iommu, + ret = domain_context_unmap_one(iommu, (u8)(bus2bridge[pdev->bus].bus), (u8)(bus2bridge[pdev->bus].devfn)); @@ -1345,8 +1342,7 @@ static int domain_context_unmap( for ( func = 0; func < 8; func++ ) { ret = domain_context_unmap_one( - domain, iommu, - pdev->bus, (u8)PCI_DEVFN(dev, func)); + iommu, pdev->bus, (u8)PCI_DEVFN(dev, func)); if ( ret ) return ret; } @@ -1389,7 +1385,7 @@ void reassign_device_ownership( found: drhd = acpi_find_matched_drhd_unit(pdev); iommu = drhd->iommu; - domain_context_unmap(source, iommu, pdev); + domain_context_unmap(iommu, pdev); /* Move pci device from the source domain to target domain. 
*/ spin_lock_irqsave(&source_hd->iommu_list_lock, flags); @@ -1589,7 +1585,7 @@ static int iommu_prepare_rmrr_dev( struct pci_dev *pdev) { struct acpi_drhd_unit *drhd; - unsigned long size; + u64 size; int ret; /* page table init */ diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/qinval.c --- a/xen/drivers/passthrough/vtd/qinval.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/drivers/passthrough/vtd/qinval.c Wed Jul 02 11:30:37 2008 +0900 @@ -222,7 +222,7 @@ int invalidate_sync(struct iommu *iommu) int ret = -1; struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu); - if ( qi_ctrl->qinval_maddr == 0 ) + if ( qi_ctrl->qinval_maddr != 0 ) { ret = queue_invalidate_wait(iommu, 0, 1, 1, 1, &qi_ctrl->qinval_poll_status); @@ -416,7 +416,6 @@ int qinval_setup(struct iommu *iommu) int qinval_setup(struct iommu *iommu) { s_time_t start_time; - u32 status = 0; struct qi_ctrl *qi_ctrl; struct iommu_flush *flush; @@ -450,15 +449,12 @@ int qinval_setup(struct iommu *iommu) /* Make sure hardware complete it */ start_time = NOW(); - for ( ; ; ) - { - status = dmar_readl(iommu->reg, DMAR_GSTS_REG); - if ( status & DMA_GSTS_QIES ) - break; + while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_QIES) ) + { if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) ) panic("Cannot set QIE field for queue invalidation\n"); cpu_relax(); } - status = 0; - return status; -} + + return 0; +} diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/utils.c --- a/xen/drivers/passthrough/vtd/utils.c Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/drivers/passthrough/vtd/utils.c Wed Jul 02 11:30:37 2008 +0900 @@ -166,7 +166,7 @@ void print_iommu_regs(struct acpi_drhd_u struct iommu *iommu = drhd->iommu; printk("---- print_iommu_regs ----\n"); - printk("print_iommu_regs: drhd->address = %lx\n", drhd->address); + printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address); printk("print_iommu_regs: DMAR_VER_REG = %x\n", dmar_readl(iommu->reg,DMAR_VER_REG)); printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n", diff -r 11318234588e -r 08f77df14cba xen/include/acpi/cpufreq/cpufreq.h --- a/xen/include/acpi/cpufreq/cpufreq.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/include/acpi/cpufreq/cpufreq.h Wed Jul 02 11:30:37 2008 +0900 @@ -36,7 +36,10 @@ struct cpufreq_policy { unsigned int max; /* in kHz */ unsigned int cur; /* in kHz, only needed if cpufreq * governors are used */ + unsigned int resume; /* flag for cpufreq 1st run + * S3 wakeup, hotplug cpu, etc */ }; +extern struct cpufreq_policy xen_px_policy[NR_CPUS]; #define CPUFREQ_SHARED_TYPE_NONE (0) /* None */ #define CPUFREQ_SHARED_TYPE_HW (1) /* HW does needed coordination */ diff -r 11318234588e -r 08f77df14cba xen/include/acpi/cpufreq/processor_perf.h --- a/xen/include/acpi/cpufreq/processor_perf.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/include/acpi/cpufreq/processor_perf.h Wed Jul 02 11:30:37 2008 +0900 @@ -6,9 +6,21 @@ int get_cpu_id(u8); int acpi_cpufreq_init(void); +int powernow_cpufreq_init(void); + void px_statistic_update(cpumask_t, uint8_t, uint8_t); int px_statistic_init(int); void px_statistic_reset(int); +void px_statistic_suspend(void); +void px_statistic_resume(void); + +void cpufreq_dom_exit(void); +int cpufreq_dom_init(void); +int cpufreq_dom_dbs(unsigned int); +void cpufreq_suspend(void); +int cpufreq_resume(void); + +inline uint64_t get_cpu_idle_time(unsigned int); struct processor_performance { uint32_t state; @@ -44,6 +56,7 @@ struct pm_px { struct pm_px { struct px_stat u; uint64_t prev_state_wall; + uint64_t prev_idle_wall; }; 
extern struct pm_px px_statistic_data[NR_CPUS]; diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/include/asm-x86/domain.h Wed Jul 02 11:30:37 2008 +0900 @@ -103,6 +103,9 @@ struct shadow_domain { * emulation and remove write permission */ atomic_t gtable_dirty_version; + + /* OOS */ + int oos_active; }; struct shadow_vcpu { @@ -122,6 +125,17 @@ struct shadow_vcpu { unsigned long last_emulated_frame; /* Last MFN that we emulated a write successfully */ unsigned long last_emulated_mfn; + + /* Shadow out-of-sync: pages that this vcpu has let go out of sync */ + mfn_t oos[SHADOW_OOS_PAGES]; + unsigned long oos_va[SHADOW_OOS_PAGES]; + mfn_t oos_snapshot[SHADOW_OOS_PAGES]; + struct oos_fixup { + mfn_t gmfn; + mfn_t smfn; + unsigned long off; + } *oos_fixups; + int oos_fixup_used; }; /************************************************/ diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/hvm/vmx/vmcs.h --- a/xen/include/asm-x86/hvm/vmx/vmcs.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Wed Jul 02 11:30:37 2008 +0900 @@ -333,10 +333,10 @@ enum vmcs_field { #define VMCS_VPID_WIDTH 16 void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr); -int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val); -int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val); -int vmx_add_guest_msr(struct vcpu *v, u32 msr); -int vmx_add_host_load_msr(struct vcpu *v, u32 msr); +int vmx_read_guest_msr(u32 msr, u64 *val); +int vmx_write_guest_msr(u32 msr, u64 val); +int vmx_add_guest_msr(u32 msr); +int vmx_add_host_load_msr(u32 msr); #endif /* ASM_X86_HVM_VMX_VMCS_H__ */ diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/include/asm-x86/mm.h Wed Jul 02 11:30:37 2008 +0900 @@ -130,6 +130,14 @@ static inline u32 pickle_domptr(struct d /* The order of the largest allocation unit we use for shadow pages */ #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */ +/* The number of out-of-sync shadows we allow per vcpu (prime, please) */ +#define SHADOW_OOS_PAGES 3 + +/* The order OOS fixup tables per vcpu */ +#define SHADOW_OOS_FT_ORDER 1 +/* OOS fixup tables hash entries */ +#define SHADOW_OOS_FT_HASH 13 + #define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain)) #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d)) diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/perfc_defn.h --- a/xen/include/asm-x86/perfc_defn.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/include/asm-x86/perfc_defn.h Wed Jul 02 11:30:37 2008 +0900 @@ -80,7 +80,11 @@ PERFCOUNTER(shadow_writeable_h_3, "shad PERFCOUNTER(shadow_writeable_h_3, "shadow writeable: 64b w2k3") PERFCOUNTER(shadow_writeable_h_4, "shadow writeable: linux low/solaris") PERFCOUNTER(shadow_writeable_h_5, "shadow writeable: linux high") +PERFCOUNTER(shadow_writeable_h_6, "shadow writeable: unsync va") +PERFCOUNTER(shadow_writeable_h_7, "shadow writeable: sl1p") +PERFCOUNTER(shadow_writeable_h_8, "shadow writeable: sl1p failed") PERFCOUNTER(shadow_writeable_bf, "shadow writeable brute-force") +PERFCOUNTER(shadow_writeable_bf_1, "shadow writeable resync bf") PERFCOUNTER(shadow_mappings, "shadow removes all mappings") PERFCOUNTER(shadow_mappings_bf, "shadow rm-mappings brute-force") PERFCOUNTER(shadow_early_unshadow, "shadow unshadows for fork/exit") @@ -101,4 +105,15 @@ PERFCOUNTER(shadow_em_ex_non_pt, "shad 
PERFCOUNTER(shadow_em_ex_non_pt, "shadow extra non-pt-write op") PERFCOUNTER(shadow_em_ex_fail, "shadow extra emulation failed") +PERFCOUNTER(shadow_oos_fixup_add_ok, "shadow OOS fixups adds") +PERFCOUNTER(shadow_oos_fixup_no_add, "shadow OOS fixups no adds") +PERFCOUNTER(shadow_oos_fixup_add_fail, "shadow OOS fixups adds failed") +PERFCOUNTER(shadow_oos_fixup_remove, "shadow OOS fixups removes") +PERFCOUNTER(shadow_oos_fixup_flush, "shadow OOS fixups flushes") +PERFCOUNTER(shadow_oos_fixup_flush_gmfn,"shadow OOS fixups gmfn flushes") + +PERFCOUNTER(shadow_unsync, "shadow OOS unsyncs") +PERFCOUNTER(shadow_unsync_evict, "shadow OOS evictions") +PERFCOUNTER(shadow_resync, "shadow OOS resyncs") + /*#endif*/ /* __XEN_PERFC_DEFN_H__ */ diff -r 11318234588e -r 08f77df14cba xen/include/public/hvm/hvm_op.h --- a/xen/include/public/hvm/hvm_op.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/include/public/hvm/hvm_op.h Wed Jul 02 11:30:37 2008 +0900 @@ -92,6 +92,19 @@ typedef struct xen_hvm_track_dirty_vram typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t; DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t); +/* Notify that some pages got modified by the Device Model. */ +#define HVMOP_modified_memory 7 +struct xen_hvm_modified_memory { + /* Domain to be updated. */ + domid_t domid; + /* First pfn. */ + uint64_aligned_t first_pfn; + /* Number of pages. */ + uint64_aligned_t nr; +}; +typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t; +DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t); + #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */ #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */ diff -r 11318234588e -r 08f77df14cba xen/include/xen/domain.h --- a/xen/include/xen/domain.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/include/xen/domain.h Wed Jul 02 11:30:37 2008 +0900 @@ -15,9 +15,6 @@ int boot_vcpu( struct domain *d, int vcpuid, vcpu_guest_context_u ctxt); struct vcpu *alloc_idle_vcpu(unsigned int cpu_id); void vcpu_reset(struct vcpu *v); - -struct domain *alloc_domain(domid_t domid); -void free_domain(struct domain *d); struct xen_domctl_getdomaininfo; void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info); diff -r 11318234588e -r 08f77df14cba xen/include/xen/sched.h --- a/xen/include/xen/sched.h Thu Jun 19 12:48:04 2008 +0900 +++ b/xen/include/xen/sched.h Wed Jul 02 11:30:37 2008 +0900 @@ -315,10 +315,14 @@ struct domain *domain_create( struct domain *domain_create( domid_t domid, unsigned int domcr_flags, ssidref_t ssidref); /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */ -#define _DOMCRF_hvm 0 -#define DOMCRF_hvm (1U<<_DOMCRF_hvm) -#define _DOMCRF_hap 1 -#define DOMCRF_hap (1U<<_DOMCRF_hap) +#define _DOMCRF_hvm 0 +#define DOMCRF_hvm (1U<<_DOMCRF_hvm) + /* DOMCRF_hap: Create a domain with hardware-assisted paging. */ +#define _DOMCRF_hap 1 +#define DOMCRF_hap (1U<<_DOMCRF_hap) + /* DOMCRF_dummy: Create a dummy domain (not scheduled; not on domain list) */ +#define _DOMCRF_dummy 2 +#define DOMCRF_dummy (1U<<_DOMCRF_dummy) int construct_dom0( struct domain *d, _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog