[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [xen-unstable] merge with xen-unstable.hg
# HG changeset patch # User Alex Williamson <alex.williamson@xxxxxx> # Date 1175627091 21600 # Node ID f378c424e0ced4cbc584e5c6125d065f1cc05d0c # Parent fc9e2f7920c95229caaf5ad8fc44965dd891f600 # Parent 7e431ea834a877b1f0c90bdb1e6f1346da4e81cc merge with xen-unstable.hg --- README | 22 docs/src/user.tex | 4 linux-2.6-xen-sparse/arch/ia64/Kconfig | 9 linux-2.6-xen-sparse/drivers/xen/Kconfig | 16 linux-2.6-xen-sparse/drivers/xen/Makefile | 7 linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c | 38 linux-2.6-xen-sparse/drivers/xen/core/Makefile | 3 linux-2.6-xen-sparse/drivers/xen/gntdev/Makefile | 1 linux-2.6-xen-sparse/drivers/xen/gntdev/gntdev.c | 973 +++++++++++++++++++++++ linux-2.6-xen-sparse/drivers/xen/util.c | 22 linux-2.6-xen-sparse/include/linux/mm.h | 4 linux-2.6-xen-sparse/include/xen/driver_util.h | 3 linux-2.6-xen-sparse/include/xen/public/gntdev.h | 105 ++ linux-2.6-xen-sparse/mm/memory.c | 9 tools/blktap/drivers/qcow2raw.c | 9 tools/examples/xmexample.hvm | 4 tools/ioemu/target-i386-dm/helper2.c | 33 tools/ioemu/vl.c | 3 tools/ioemu/vl.h | 6 tools/ioemu/xenstore.c | 69 + tools/libxc/ia64/xc_ia64_linux_restore.c | 51 - tools/libxc/xc_core.c | 4 tools/libxc/xc_core_x86.c | 12 tools/libxc/xc_hvm_restore.c | 14 tools/libxc/xc_hvm_save.c | 7 tools/libxc/xc_linux.c | 156 +++ tools/libxc/xc_linux_restore.c | 85 +- tools/libxc/xc_linux_save.c | 66 - tools/libxc/xc_resume.c | 4 tools/libxc/xenctrl.h | 59 + tools/libxc/xenguest.h | 9 tools/libxc/xg_private.h | 9 tools/python/xen/lowlevel/scf/scf.c | 2 tools/python/xen/xend/XendCheckpoint.py | 13 tools/python/xen/xend/XendConfig.py | 3 tools/python/xen/xend/XendDomainInfo.py | 8 tools/python/xen/xend/balloon.py | 18 tools/python/xen/xend/image.py | 3 tools/python/xen/xend/osdep.py | 50 + tools/python/xen/xend/server/SrvServer.py | 4 tools/python/xen/xend/server/relocate.py | 8 tools/python/xen/xm/create.py | 6 tools/python/xen/xm/main.py | 8 tools/python/xen/xm/xenapi_create.py | 1 tools/xcutils/xc_restore.c | 33 
tools/xenstat/xentop/xentop.c | 2 xen/arch/x86/hvm/hvm.c | 9 xen/arch/x86/hvm/intercept.c | 38 xen/arch/x86/hvm/io.c | 11 xen/arch/x86/hvm/platform.c | 20 xen/arch/x86/hvm/rtc.c | 8 xen/arch/x86/hvm/svm/vmcb.c | 28 xen/arch/x86/hvm/vmx/vmcs.c | 2 xen/arch/x86/hvm/vmx/vmx.c | 13 xen/arch/x86/mm.c | 3 xen/arch/x86/mm/hap/hap.c | 68 - xen/arch/x86/mm/shadow/multi.c | 4 xen/arch/x86/setup.c | 4 xen/arch/x86/time.c | 2 xen/arch/x86/traps.c | 17 xen/arch/x86/x86_32/traps.c | 7 xen/arch/x86/x86_64/traps.c | 10 xen/common/domain.c | 121 ++ xen/common/domctl.c | 5 xen/common/page_alloc.c | 12 xen/common/symbols.c | 12 xen/drivers/char/console.c | 8 xen/include/asm-x86/domain.h | 1 xen/include/asm-x86/hvm/io.h | 2 xen/include/asm-x86/hvm/support.h | 1 xen/include/asm-x86/hvm/vmx/vmcs.h | 1 xen/include/asm-x86/processor.h | 8 xen/include/asm-x86/time.h | 5 xen/include/public/hvm/ioreq.h | 1 xen/include/xen/sched.h | 12 75 files changed, 2055 insertions(+), 353 deletions(-) diff -r fc9e2f7920c9 -r f378c424e0ce README --- a/README Fri Mar 30 17:18:42 2007 -0600 +++ b/README Tue Apr 03 13:04:51 2007 -0600 @@ -177,3 +177,25 @@ 5. To rebuild a kernel with a modified c an initial ram disk, just like a native system e.g. # depmod 2.6.16-xen # mkinitrd -v -f --with=aacraid --with=sd_mod --with=scsi_mod initrd-2.6.16-xen.img 2.6.16-xen + + +Python Runtime Libraries +======================== + +Xend (the Xen daemon) has the following runtime dependencies: + + * Python 2.3 or later. + In many distros, the XML-aspects to the standard library + (xml.dom.minidom etc) are broken out into a separate python-xml package. + This is also required. 
+ + URL: http://www.python.org/ + Debian: python, python-xml + + * For optional SSL support, pyOpenSSL: + URL: http://pyopenssl.sourceforge.net/ + Debian: python-pyopenssl + + * For optional PAM support, PyPAM: + URL: http://www.pangalactic.org/PyPAM/ + Debian: python-pam diff -r fc9e2f7920c9 -r f378c424e0ce docs/src/user.tex --- a/docs/src/user.tex Fri Mar 30 17:18:42 2007 -0600 +++ b/docs/src/user.tex Tue Apr 03 13:04:51 2007 -0600 @@ -3250,6 +3250,10 @@ editing \path{grub.conf}. \item [ dma\_emergency\_pool=xxx ] Specify lower bound on size of DMA pool below which ordinary allocations will fail rather than fall back to allocating from the DMA pool. +\item [ hap ] Instruct Xen to detect hardware-assisted paging support, such + as AMD-V's nested paging or Intel\textregistered VT's extended paging. If + available, Xen will use hardware-assisted paging instead of shadow paging + for guest memory management. \end{description} In addition, the following options may be specified on the Xen command diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/arch/ia64/Kconfig --- a/linux-2.6-xen-sparse/arch/ia64/Kconfig Fri Mar 30 17:18:42 2007 -0600 +++ b/linux-2.6-xen-sparse/arch/ia64/Kconfig Tue Apr 03 13:04:51 2007 -0600 @@ -576,15 +576,6 @@ source "crypto/Kconfig" # override default values of drivers/xen/Kconfig # if XEN -config XEN_UTIL - default n - -config XEN_BALLOON - default y - -config XEN_REBOOT - default y - config XEN_SMPBOOT default n endif diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/drivers/xen/Kconfig --- a/linux-2.6-xen-sparse/drivers/xen/Kconfig Fri Mar 30 17:18:42 2007 -0600 +++ b/linux-2.6-xen-sparse/drivers/xen/Kconfig Tue Apr 03 13:04:51 2007 -0600 @@ -253,22 +253,6 @@ config NO_IDLE_HZ bool default y -config XEN_UTIL - bool - default y - -config XEN_BALLOON - bool - default y - -config XEN_DEVMEM - bool - default y - -config XEN_REBOOT - bool - default y - config XEN_SMPBOOT bool default y diff -r fc9e2f7920c9 -r f378c424e0ce 
linux-2.6-xen-sparse/drivers/xen/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/Makefile Fri Mar 30 17:18:42 2007 -0600 +++ b/linux-2.6-xen-sparse/drivers/xen/Makefile Tue Apr 03 13:04:51 2007 -0600 @@ -3,10 +3,11 @@ obj-y += evtchn/ obj-y += evtchn/ obj-y += privcmd/ obj-y += xenbus/ +obj-y += gntdev/ +obj-y += balloon/ +obj-y += char/ -obj-$(CONFIG_XEN_UTIL) += util.o -obj-$(CONFIG_XEN_BALLOON) += balloon/ -obj-$(CONFIG_XEN_DEVMEM) += char/ +obj-y += util.o obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/ obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/ diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Fri Mar 30 17:18:42 2007 -0600 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Tue Apr 03 13:04:51 2007 -0600 @@ -44,6 +44,7 @@ #include <asm/hypervisor.h> #include "common.h" #include <xen/balloon.h> +#include <xen/driver_util.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/mm.h> @@ -55,30 +56,6 @@ #define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */ #define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */ - - -struct class *xen_class; -EXPORT_SYMBOL_GPL(xen_class); - -/* - * Setup the xen class. This should probably go in another file, but - * since blktap is the only user of it so far, it gets to keep it. 
- */ -int setup_xen_class(void) -{ - int ret; - - if (xen_class) - return 0; - - xen_class = class_create(THIS_MODULE, "xen"); - if ((ret = IS_ERR(xen_class))) { - xen_class = NULL; - return ret; - } - - return 0; -} /* * The maximum number of requests that can be outstanding at any time @@ -347,6 +324,7 @@ static const struct file_operations blkt static tap_blkif_t *get_next_free_dev(void) { + struct class *class; tap_blkif_t *info; int minor; @@ -409,9 +387,10 @@ found: wmb(); tapfds[minor] = info; - class_device_create(xen_class, NULL, - MKDEV(blktap_major, minor), NULL, - "blktap%d", minor); + if ((class = get_xen_class()) != NULL) + class_device_create(class, NULL, + MKDEV(blktap_major, minor), NULL, + "blktap%d", minor); } out: @@ -1487,6 +1466,7 @@ static int __init blkif_init(void) static int __init blkif_init(void) { int i, ret; + struct class *class; if (!is_running_on_xen()) return -ENODEV; @@ -1522,7 +1502,7 @@ static int __init blkif_init(void) DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i); /* Make sure the xen class exists */ - if (!setup_xen_class()) { + if ((class = get_xen_class()) != NULL) { /* * This will allow udev to create the blktap ctrl device. * We only want to create blktap0 first. We don't want @@ -1530,7 +1510,7 @@ static int __init blkif_init(void) * We only create the device when a request of a new device is * made. */ - class_device_create(xen_class, NULL, + class_device_create(class, NULL, MKDEV(blktap_major, 0), NULL, "blktap0"); } else { diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/drivers/xen/core/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/core/Makefile Fri Mar 30 17:18:42 2007 -0600 +++ b/linux-2.6-xen-sparse/drivers/xen/core/Makefile Tue Apr 03 13:04:51 2007 -0600 @@ -2,12 +2,11 @@ # Makefile for the linux kernel. 
# -obj-y := evtchn.o gnttab.o features.o +obj-y := evtchn.o gnttab.o features.o reboot.o machine_reboot.o obj-$(CONFIG_PROC_FS) += xen_proc.o obj-$(CONFIG_SYSFS) += hypervisor_sysfs.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o -obj-$(CONFIG_XEN_REBOOT) += reboot.o machine_reboot.o obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o obj-$(CONFIG_KEXEC) += machine_kexec.o diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/drivers/xen/gntdev/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/drivers/xen/gntdev/Makefile Tue Apr 03 13:04:51 2007 -0600 @@ -0,0 +1,1 @@ +obj-y := gntdev.o diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/drivers/xen/gntdev/gntdev.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/drivers/xen/gntdev/gntdev.c Tue Apr 03 13:04:51 2007 -0600 @@ -0,0 +1,973 @@ +/****************************************************************************** + * gntdev.c + * + * Device for accessing (in user-space) pages that have been granted by other + * domains. + * + * Copyright (c) 2006-2007, D G Murray. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/atomic.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <xen/gnttab.h> +#include <asm/hypervisor.h> +#include <xen/balloon.h> +#include <xen/evtchn.h> +#include <xen/driver_util.h> + +#include <linux/types.h> +#include <xen/public/gntdev.h> + + +#define DRIVER_AUTHOR "Derek G. Murray <Derek.Murray@xxxxxxxxxxxx>" +#define DRIVER_DESC "User-space granted page access driver" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); + +#define MAX_GRANTS 128 + +/* A slot can be in one of three states: + * + * 0. GNTDEV_SLOT_INVALID: + * This slot is not associated with a grant reference, and is therefore free + * to be overwritten by a new grant reference. + * + * 1. GNTDEV_SLOT_NOT_YET_MAPPED: + * This slot is associated with a grant reference (via the + * IOCTL_GNTDEV_MAP_GRANT_REF ioctl), but it has not yet been mmap()-ed. + * + * 2. GNTDEV_SLOT_MAPPED: + * This slot is associated with a grant reference, and has been mmap()-ed. + */ +typedef enum gntdev_slot_state { + GNTDEV_SLOT_INVALID = 0, + GNTDEV_SLOT_NOT_YET_MAPPED, + GNTDEV_SLOT_MAPPED +} gntdev_slot_state_t; + +#define GNTDEV_INVALID_HANDLE -1 +#define GNTDEV_FREE_LIST_INVALID -1 +/* Each opened instance of gntdev is associated with a list of grants, + * represented by an array of elements of the following type, + * gntdev_grant_info_t. 
+ */ +typedef struct gntdev_grant_info { + gntdev_slot_state_t state; + union { + uint32_t free_list_index; + struct { + domid_t domid; + grant_ref_t ref; + grant_handle_t kernel_handle; + grant_handle_t user_handle; + uint64_t dev_bus_addr; + } valid; + } u; +} gntdev_grant_info_t; + +/* Private data structure, which is stored in the file pointer for files + * associated with this device. + */ +typedef struct gntdev_file_private_data { + + /* Array of grant information. */ + gntdev_grant_info_t grants[MAX_GRANTS]; + + /* Read/write semaphore used to protect the grants array. */ + struct rw_semaphore grants_sem; + + /* An array of indices of free slots in the grants array. + * N.B. An entry in this list may temporarily have the value + * GNTDEV_FREE_LIST_INVALID if the corresponding slot has been removed + * from the list by the contiguous allocator, but the list has not yet + * been compressed. However, this is not visible across invocations of + * the device. + */ + int32_t free_list[MAX_GRANTS]; + + /* The number of free slots in the grants array. */ + uint32_t free_list_size; + + /* Read/write semaphore used to protect the free list. */ + struct rw_semaphore free_list_sem; + + /* Index of the next slot after the most recent contiguous allocation, + * for use in a next-fit allocator. + */ + uint32_t next_fit_index; + + /* Used to map grants into the kernel, before mapping them into user + * space. + */ + struct page **foreign_pages; + +} gntdev_file_private_data_t; + +/* Module lifecycle operations. */ +static int __init gntdev_init(void); +static void __exit gntdev_exit(void); + +module_init(gntdev_init); +module_exit(gntdev_exit); + +/* File operations. 
*/ +static int gntdev_open(struct inode *inode, struct file *flip); +static int gntdev_release(struct inode *inode, struct file *flip); +static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma); +static int gntdev_ioctl (struct inode *inode, struct file *flip, + unsigned int cmd, unsigned long arg); + +static struct file_operations gntdev_fops = { + .owner = THIS_MODULE, + .open = gntdev_open, + .release = gntdev_release, + .mmap = gntdev_mmap, + .ioctl = gntdev_ioctl +}; + +/* VM operations. */ +static void gntdev_vma_close(struct vm_area_struct *vma); +static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, int is_fullmm); + +static struct vm_operations_struct gntdev_vmops = { + .close = gntdev_vma_close, + .zap_pte = gntdev_clear_pte +}; + +/* Global variables. */ + +/* The driver major number, for use when unregistering the driver. */ +static int gntdev_major; + +#define GNTDEV_NAME "gntdev" + +/* Memory mapping functions + * ------------------------ + * + * Every granted page is mapped into both kernel and user space, and the two + * following functions return the respective virtual addresses of these pages. + * + * When shadow paging is disabled, the granted page is mapped directly into + * user space; when it is enabled, it is mapped into the kernel and remapped + * into user space using vm_insert_page() (see gntdev_mmap(), below). + */ + +/* Returns the virtual address (in user space) of the @page_index'th page + * in the given VM area. + */ +static inline unsigned long get_user_vaddr (struct vm_area_struct *vma, + int page_index) +{ + return (unsigned long) vma->vm_start + (page_index << PAGE_SHIFT); +} + +/* Returns the virtual address (in kernel space) of the @slot_index'th page + * mapped by the gntdev instance that owns the given private data struct. 
+ */ +static inline unsigned long get_kernel_vaddr (gntdev_file_private_data_t *priv, + int slot_index) +{ + unsigned long pfn; + void *kaddr; + pfn = page_to_pfn(priv->foreign_pages[slot_index]); + kaddr = pfn_to_kaddr(pfn); + return (unsigned long) kaddr; +} + +/* Helper functions. */ + +/* Adds information about a grant reference to the list of grants in the file's + * private data structure. Returns non-zero on failure. On success, sets the + * value of *offset to the offset that should be mmap()-ed in order to map the + * grant reference. + */ +static int add_grant_reference(struct file *flip, + struct ioctl_gntdev_grant_ref *op, + uint64_t *offset) +{ + gntdev_file_private_data_t *private_data + = (gntdev_file_private_data_t *) flip->private_data; + + uint32_t slot_index; + + if (unlikely(private_data->free_list_size == 0)) { + return -ENOMEM; + } + + slot_index = private_data->free_list[--private_data->free_list_size]; + + /* Copy the grant information into file's private data. */ + private_data->grants[slot_index].state = GNTDEV_SLOT_NOT_YET_MAPPED; + private_data->grants[slot_index].u.valid.domid = op->domid; + private_data->grants[slot_index].u.valid.ref = op->ref; + + /* The offset is calculated as the index of the chosen entry in the + * file's private data's array of grant information. This is then + * shifted to give an offset into the virtual "file address space". + */ + *offset = slot_index << PAGE_SHIFT; + + return 0; +} + +/* Adds the @count grant references to the contiguous range in the slot array + * beginning at @first_slot. It is assumed that @first_slot was returned by a + * previous invocation of find_contiguous_free_range(), during the same + * invocation of the driver. 
+ */ +static int add_grant_references(struct file *flip, + int count, + struct ioctl_gntdev_grant_ref *ops, + uint32_t first_slot) +{ + gntdev_file_private_data_t *private_data + = (gntdev_file_private_data_t *) flip->private_data; + int i; + + for (i = 0; i < count; ++i) { + + /* First, mark the slot's entry in the free list as invalid. */ + int free_list_index = + private_data->grants[first_slot+i].u.free_list_index; + private_data->free_list[free_list_index] = + GNTDEV_FREE_LIST_INVALID; + + /* Now, update the slot. */ + private_data->grants[first_slot+i].state = + GNTDEV_SLOT_NOT_YET_MAPPED; + private_data->grants[first_slot+i].u.valid.domid = + ops[i].domid; + private_data->grants[first_slot+i].u.valid.ref = ops[i].ref; + } + + return 0; +} + +/* Scans through the free list for @flip, removing entries that are marked as + * GNTDEV_SLOT_INVALID. This will reduce the recorded size of the free list to + * the number of valid entries. + */ +static void compress_free_list(struct file *flip) +{ + gntdev_file_private_data_t *private_data + = (gntdev_file_private_data_t *) flip->private_data; + int i, j = 0, old_size; + + old_size = private_data->free_list_size; + for (i = 0; i < old_size; ++i) { + if (private_data->free_list[i] != GNTDEV_FREE_LIST_INVALID) { + private_data->free_list[j] = + private_data->free_list[i]; + ++j; + } else { + --private_data->free_list_size; + } + } +} + +/* Searches the grant array in the private data of @flip for a range of + * @num_slots contiguous slots in the GNTDEV_SLOT_INVALID state. + * + * Returns the index of the first slot if a range is found, otherwise -ENOMEM. 
+ */ +static int find_contiguous_free_range(struct file *flip, + uint32_t num_slots) +{ + gntdev_file_private_data_t *private_data + = (gntdev_file_private_data_t *) flip->private_data; + + int i; + int start_index = private_data->next_fit_index; + int range_start = 0, range_length; + + if (private_data->free_list_size < num_slots) { + return -ENOMEM; + } + + /* First search from the start_index to the end of the array. */ + range_length = 0; + for (i = start_index; i < MAX_GRANTS; ++i) { + if (private_data->grants[i].state == GNTDEV_SLOT_INVALID) { + if (range_length == 0) { + range_start = i; + } + ++range_length; + if (range_length == num_slots) { + return range_start; + } + } + } + + /* Now search from the start of the array to the start_index. */ + range_length = 0; + for (i = 0; i < start_index; ++i) { + if (private_data->grants[i].state == GNTDEV_SLOT_INVALID) { + if (range_length == 0) { + range_start = i; + } + ++range_length; + if (range_length == num_slots) { + return range_start; + } + } + } + + return -ENOMEM; +} + +/* Interface functions. */ + +/* Initialises the driver. Called when the module is loaded. */ +static int __init gntdev_init(void) +{ + struct class *class; + struct class_device *device; + + if (!is_running_on_xen()) { + printk(KERN_ERR "You must be running Xen to use gntdev\n"); + return -ENODEV; + } + + gntdev_major = register_chrdev(0, GNTDEV_NAME, &gntdev_fops); + if (gntdev_major < 0) + { + printk(KERN_ERR "Could not register gntdev device\n"); + return -ENOMEM; + } + + /* Note that if the sysfs code fails, we will still initialise the + * device, and output the major number so that the device can be + * created manually using mknod. 
+ */ + if ((class = get_xen_class()) == NULL) { + printk(KERN_ERR "Error setting up xen_class\n"); + printk(KERN_ERR "gntdev created with major number = %d\n", + gntdev_major); + return 0; + } + + device = class_device_create(class, NULL, MKDEV(gntdev_major, 0), + NULL, GNTDEV_NAME); + if (IS_ERR(device)) { + printk(KERN_ERR "Error creating gntdev device in xen_class\n"); + printk(KERN_ERR "gntdev created with major number = %d\n", + gntdev_major); + return 0; + } + + return 0; +} + +/* Cleans up and unregisters the driver. Called when the driver is unloaded. + */ +static void __exit gntdev_exit(void) +{ + struct class *class; + if ((class = get_xen_class()) != NULL) + class_device_destroy(class, MKDEV(gntdev_major, 0)); + unregister_chrdev(gntdev_major, GNTDEV_NAME); +} + +/* Called when the device is opened. */ +static int gntdev_open(struct inode *inode, struct file *flip) +{ + gntdev_file_private_data_t *private_data; + int i; + + try_module_get(THIS_MODULE); + + /* Allocate space for the per-instance private data. */ + private_data = kmalloc(sizeof(*private_data), GFP_KERNEL); + if (!private_data) + goto nomem_out; + + /* Allocate space for the kernel-mapping of granted pages. */ + private_data->foreign_pages = + alloc_empty_pages_and_pagevec(MAX_GRANTS); + if (!private_data->foreign_pages) + goto nomem_out2; + + /* Initialise the free-list, which contains all slots at first. + */ + for (i = 0; i < MAX_GRANTS; ++i) { + private_data->free_list[MAX_GRANTS - i - 1] = i; + private_data->grants[i].state = GNTDEV_SLOT_INVALID; + private_data->grants[i].u.free_list_index = MAX_GRANTS - i - 1; + } + private_data->free_list_size = MAX_GRANTS; + private_data->next_fit_index = 0; + + init_rwsem(&private_data->grants_sem); + init_rwsem(&private_data->free_list_sem); + + flip->private_data = private_data; + + return 0; + +nomem_out2: + kfree(private_data); +nomem_out: + return -ENOMEM; +} + +/* Called when the device is closed. 
+ */ +static int gntdev_release(struct inode *inode, struct file *flip) +{ + if (flip->private_data) { + gntdev_file_private_data_t *private_data = + (gntdev_file_private_data_t *) flip->private_data; + if (private_data->foreign_pages) { + free_empty_pages_and_pagevec + (private_data->foreign_pages, MAX_GRANTS); + } + kfree(private_data); + } + module_put(THIS_MODULE); + return 0; +} + +/* Called when an attempt is made to mmap() the device. The private data from + * @flip contains the list of grant references that can be mapped. The vm_pgoff + * field of @vma contains the index into that list that refers to the grant + * reference that will be mapped. Only mappings that are a multiple of + * PAGE_SIZE are handled. + */ +static int gntdev_mmap (struct file *flip, struct vm_area_struct *vma) +{ + struct gnttab_map_grant_ref op; + unsigned long slot_index = vma->vm_pgoff; + unsigned long kernel_vaddr, user_vaddr; + uint32_t size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + uint64_t ptep; + int ret; + int flags; + int i; + struct page *page; + gntdev_file_private_data_t *private_data = flip->private_data; + + if (unlikely(!private_data)) { + printk(KERN_ERR "File's private data is NULL.\n"); + return -EINVAL; + } + + if (unlikely((size <= 0) || (size + slot_index) > MAX_GRANTS)) { + printk(KERN_ERR "Invalid number of pages or offset" + "(num_pages = %d, first_slot = %ld).\n", + size, slot_index); + return -ENXIO; + } + + if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) { + printk(KERN_ERR "Writable mappings must be shared.\n"); + return -EINVAL; + } + + /* Slots must be in the NOT_YET_MAPPED state. 
*/ + down_write(&private_data->grants_sem); + for (i = 0; i < size; ++i) { + if (private_data->grants[slot_index + i].state != + GNTDEV_SLOT_NOT_YET_MAPPED) { + printk(KERN_ERR "Slot (index = %ld) is in the wrong " + "state (%d).\n", slot_index + i, + private_data->grants[slot_index + i].state); + up_write(&private_data->grants_sem); + return -EINVAL; + } + } + + /* Install the hook for unmapping. */ + vma->vm_ops = &gntdev_vmops; + + /* The VM area contains pages from another VM. */ + vma->vm_flags |= VM_FOREIGN; + vma->vm_private_data = kzalloc(size * sizeof(struct page_struct *), + GFP_KERNEL); + if (vma->vm_private_data == NULL) { + printk(KERN_ERR "Couldn't allocate mapping structure for VM " + "area.\n"); + return -ENOMEM; + } + + /* This flag prevents Bad PTE errors when the memory is unmapped. */ + vma->vm_flags |= VM_RESERVED; + + /* This flag prevents this VM area being copied on a fork(). A better + * behaviour might be to explicitly carry out the appropriate mappings + * on fork(), but I don't know if there's a hook for this. + */ + vma->vm_flags |= VM_DONTCOPY; + +#ifdef CONFIG_X86 + /* This flag ensures that the page tables are not unpinned before the + * VM area is unmapped. Therefore Xen still recognises the PTE as + * belonging to an L1 pagetable, and the grant unmap operation will + * succeed, even if the process does not exit cleanly. + */ + vma->vm_mm->context.has_foreign_mappings = 1; +#endif + + for (i = 0; i < size; ++i) { + + flags = GNTMAP_host_map; + if (!(vma->vm_flags & VM_WRITE)) + flags |= GNTMAP_readonly; + + kernel_vaddr = get_kernel_vaddr(private_data, slot_index + i); + user_vaddr = get_user_vaddr(vma, i); + page = pfn_to_page(__pa(kernel_vaddr) >> PAGE_SHIFT); + + gnttab_set_map_op(&op, kernel_vaddr, flags, + private_data->grants[slot_index+i] + .u.valid.ref, + private_data->grants[slot_index+i] + .u.valid.domid); + + /* Carry out the mapping of the grant reference. 
*/ + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + &op, 1); + BUG_ON(ret); + if (op.status) { + printk(KERN_ERR "Error mapping the grant reference " + "into the kernel (%d). domid = %d; ref = %d\n", + op.status, + private_data->grants[slot_index+i] + .u.valid.domid, + private_data->grants[slot_index+i] + .u.valid.ref); + goto undo_map_out; + } + + /* Store a reference to the page that will be mapped into user + * space. + */ + ((struct page **) vma->vm_private_data)[i] = page; + + /* Mark mapped page as reserved. */ + SetPageReserved(page); + + /* Record the grant handle, for use in the unmap operation. */ + private_data->grants[slot_index+i].u.valid.kernel_handle = + op.handle; + private_data->grants[slot_index+i].u.valid.dev_bus_addr = + op.dev_bus_addr; + + private_data->grants[slot_index+i].state = GNTDEV_SLOT_MAPPED; + private_data->grants[slot_index+i].u.valid.user_handle = + GNTDEV_INVALID_HANDLE; + + /* Now perform the mapping to user space. */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + + /* NOT USING SHADOW PAGE TABLES. */ + /* In this case, we map the grant(s) straight into user + * space. + */ + + /* Get the machine address of the PTE for the user + * page. + */ + if ((ret = create_lookup_pte_addr(vma->vm_mm, + vma->vm_start + + (i << PAGE_SHIFT), + &ptep))) + { + printk(KERN_ERR "Error obtaining PTE pointer " + "(%d).\n", ret); + goto undo_map_out; + } + + /* Configure the map operation. */ + + /* The reference is to be used by host CPUs. */ + flags = GNTMAP_host_map; + + /* Specifies a user space mapping. */ + flags |= GNTMAP_application_map; + + /* The map request contains the machine address of the + * PTE to update. + */ + flags |= GNTMAP_contains_pte; + + if (!(vma->vm_flags & VM_WRITE)) + flags |= GNTMAP_readonly; + + gnttab_set_map_op(&op, ptep, flags, + private_data->grants[slot_index+i] + .u.valid.ref, + private_data->grants[slot_index+i] + .u.valid.domid); + + /* Carry out the mapping of the grant reference. 
*/ + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + &op, 1); + BUG_ON(ret); + if (op.status) { + printk(KERN_ERR "Error mapping the grant " + "reference into user space (%d). domid " + "= %d; ref = %d\n", op.status, + private_data->grants[slot_index+i].u + .valid.domid, + private_data->grants[slot_index+i].u + .valid.ref); + goto undo_map_out; + } + + /* Record the grant handle, for use in the unmap + * operation. + */ + private_data->grants[slot_index+i].u. + valid.user_handle = op.handle; + + /* Update p2m structure with the new mapping. */ + set_phys_to_machine(__pa(kernel_vaddr) >> PAGE_SHIFT, + FOREIGN_FRAME(private_data-> + grants[slot_index+i] + .u.valid.dev_bus_addr + >> PAGE_SHIFT)); + } else { + /* USING SHADOW PAGE TABLES. */ + /* In this case, we simply insert the page into the VM + * area. */ + ret = vm_insert_page(vma, user_vaddr, page); + } + + } + + up_write(&private_data->grants_sem); + return 0; + +undo_map_out: + /* If we have a mapping failure, the unmapping will be taken care of + * by do_mmap_pgoff(), which will eventually call gntdev_clear_pte(). + * All we need to do here is free the vma_private_data. + */ + kfree(vma->vm_private_data); + + /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file + * to NULL on failure. However, we need this in gntdev_clear_pte() to + * unmap the grants. Therefore, we smuggle a reference to the file's + * private data in the VM area's private data pointer. + */ + vma->vm_private_data = private_data; + + up_write(&private_data->grants_sem); + + return -ENOMEM; +} + +static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, int is_fullmm) +{ + int slot_index, ret; + pte_t copy; + struct gnttab_unmap_grant_ref op; + gntdev_file_private_data_t *private_data; + + /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file + * to NULL on failure. However, we need this in gntdev_clear_pte() to + * unmap the grants. 
Therefore, we smuggle a reference to the file's + * private data in the VM area's private data pointer. + */ + if (vma->vm_file) { + private_data = (gntdev_file_private_data_t *) + vma->vm_file->private_data; + } else if (vma->vm_private_data) { + private_data = (gntdev_file_private_data_t *) + vma->vm_private_data; + } else { + private_data = NULL; /* gcc warning */ + BUG(); + } + + /* Copy the existing value of the PTE for returning. */ + copy = *ptep; + + /* Calculate the grant relating to this PTE. */ + slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); + + /* Only unmap grants if the slot has been mapped. This could be being + * called from a failing mmap(). + */ + if (private_data->grants[slot_index].state == GNTDEV_SLOT_MAPPED) { + + /* First, we clear the user space mapping, if it has been made. + */ + if (private_data->grants[slot_index].u.valid.user_handle != + GNTDEV_INVALID_HANDLE && + !xen_feature(XENFEAT_auto_translated_physmap)) { + /* NOT USING SHADOW PAGE TABLES. */ + gnttab_set_unmap_op(&op, virt_to_machine(ptep), + GNTMAP_contains_pte, + private_data->grants[slot_index] + .u.valid.user_handle); + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, &op, 1); + BUG_ON(ret); + if (op.status) + printk("User unmap grant status = %d\n", + op.status); + } else { + /* USING SHADOW PAGE TABLES. */ + pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm); + } + + /* Finally, we unmap the grant from kernel space. */ + gnttab_set_unmap_op(&op, + get_kernel_vaddr(private_data, slot_index), + GNTMAP_host_map, + private_data->grants[slot_index].u.valid + .kernel_handle); + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + &op, 1); + BUG_ON(ret); + if (op.status) + printk("Kernel unmap grant status = %d\n", op.status); + + + /* Return slot to the not-yet-mapped state, so that it may be + * mapped again, or removed by a subsequent ioctl. 
+ */ + private_data->grants[slot_index].state = + GNTDEV_SLOT_NOT_YET_MAPPED; + + /* Invalidate the physical to machine mapping for this page. */ + set_phys_to_machine(__pa(get_kernel_vaddr(private_data, + slot_index)) + >> PAGE_SHIFT, INVALID_P2M_ENTRY); + + } else { + pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm); + } + + return copy; +} + +/* "Destructor" for a VM area. + */ +static void gntdev_vma_close(struct vm_area_struct *vma) { + if (vma->vm_private_data) { + kfree(vma->vm_private_data); + } +} + +/* Called when an ioctl is made on the device. + */ +static int gntdev_ioctl(struct inode *inode, struct file *flip, + unsigned int cmd, unsigned long arg) +{ + int rc = 0; + gntdev_file_private_data_t *private_data = + (gntdev_file_private_data_t *) flip->private_data; + + switch (cmd) { + case IOCTL_GNTDEV_MAP_GRANT_REF: + { + struct ioctl_gntdev_map_grant_ref op; + down_write(&private_data->grants_sem); + down_write(&private_data->free_list_sem); + + if ((rc = copy_from_user(&op, (void __user *) arg, + sizeof(op)))) { + rc = -EFAULT; + goto map_out; + } + if (unlikely(op.count <= 0)) { + rc = -EINVAL; + goto map_out; + } + + if (op.count == 1) { + if ((rc = add_grant_reference(flip, &op.refs[0], + &op.index)) < 0) { + printk(KERN_ERR "Adding grant reference " + "failed (%d).\n", rc); + goto map_out; + } + } else { + struct ioctl_gntdev_grant_ref *refs, *u; + refs = kmalloc(op.count * sizeof(*refs), GFP_KERNEL); + if (!refs) { + rc = -ENOMEM; + goto map_out; + } + u = ((struct ioctl_gntdev_map_grant_ref *)arg)->refs; + if ((rc = copy_from_user(refs, + (void __user *)u, + sizeof(*refs) * op.count))) { + printk(KERN_ERR "Copying refs from user failed" + " (%d).\n", rc); + rc = -EINVAL; + goto map_out; + } + if ((rc = find_contiguous_free_range(flip, op.count)) + < 0) { + printk(KERN_ERR "Finding contiguous range " + "failed (%d).\n", rc); + kfree(refs); + goto map_out; + } + op.index = rc << PAGE_SHIFT; + if ((rc = add_grant_references(flip, op.count, + refs, 
rc))) { + printk(KERN_ERR "Adding grant references " + "failed (%d).\n", rc); + kfree(refs); + goto map_out; + } + compress_free_list(flip); + kfree(refs); + } + if ((rc = copy_to_user((void __user *) arg, + &op, + sizeof(op)))) { + printk(KERN_ERR "Copying result back to user failed " + "(%d)\n", rc); + rc = -EFAULT; + goto map_out; + } + map_out: + up_write(&private_data->grants_sem); + up_write(&private_data->free_list_sem); + return rc; + } + case IOCTL_GNTDEV_UNMAP_GRANT_REF: + { + struct ioctl_gntdev_unmap_grant_ref op; + int i, start_index; + + down_write(&private_data->grants_sem); + down_write(&private_data->free_list_sem); + + if ((rc = copy_from_user(&op, + (void __user *) arg, + sizeof(op)))) { + rc = -EFAULT; + goto unmap_out; + } + + start_index = op.index >> PAGE_SHIFT; + + /* First, check that all pages are in the NOT_YET_MAPPED + * state. + */ + for (i = 0; i < op.count; ++i) { + if (unlikely + (private_data->grants[start_index + i].state + != GNTDEV_SLOT_NOT_YET_MAPPED)) { + if (private_data->grants[start_index + i].state + == GNTDEV_SLOT_INVALID) { + printk(KERN_ERR + "Tried to remove an invalid " + "grant at offset 0x%x.", + (start_index + i) + << PAGE_SHIFT); + rc = -EINVAL; + } else { + printk(KERN_ERR + "Tried to remove a grant which " + "is currently mmap()-ed at " + "offset 0x%x.", + (start_index + i) + << PAGE_SHIFT); + rc = -EBUSY; + } + goto unmap_out; + } + } + + /* Unmap pages and add them to the free list. 
+ */ + for (i = 0; i < op.count; ++i) { + private_data->grants[start_index+i].state = + GNTDEV_SLOT_INVALID; + private_data->grants[start_index+i].u.free_list_index = + private_data->free_list_size; + private_data->free_list[private_data->free_list_size] = + start_index + i; + ++private_data->free_list_size; + } + compress_free_list(flip); + + unmap_out: + up_write(&private_data->grants_sem); + up_write(&private_data->free_list_sem); + return rc; + } + case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: + { + struct ioctl_gntdev_get_offset_for_vaddr op; + struct vm_area_struct *vma; + unsigned long vaddr; + + if ((rc = copy_from_user(&op, + (void __user *) arg, + sizeof(op)))) { + rc = -EFAULT; + goto get_offset_out; + } + vaddr = (unsigned long)op.vaddr; + + down_read(&current->mm->mmap_sem); + vma = find_vma(current->mm, vaddr); + if (vma == NULL) { + rc = -EFAULT; + goto get_offset_unlock_out; + } + if ((!vma->vm_ops) || (vma->vm_ops != &gntdev_vmops)) { + printk(KERN_ERR "The vaddr specified does not belong " + "to a gntdev instance: %#lx\n", vaddr); + rc = -EFAULT; + goto get_offset_unlock_out; + } + if (vma->vm_start != vaddr) { + printk(KERN_ERR "The vaddr specified in an " + "IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR must be at " + "the start of the VM area. 
vma->vm_start = " + "%#lx; vaddr = %#lx\n", + vma->vm_start, vaddr); + rc = -EFAULT; + goto get_offset_unlock_out; + } + op.offset = vma->vm_pgoff << PAGE_SHIFT; + op.count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + up_read(&current->mm->mmap_sem); + if ((rc = copy_to_user((void __user *) arg, + &op, + sizeof(op)))) { + rc = -EFAULT; + goto get_offset_out; + } + goto get_offset_out; + get_offset_unlock_out: + up_read(&current->mm->mmap_sem); + get_offset_out: + return rc; + } + default: + return -ENOIOCTLCMD; + } + + return 0; +} diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/drivers/xen/util.c --- a/linux-2.6-xen-sparse/drivers/xen/util.c Fri Mar 30 17:18:42 2007 -0600 +++ b/linux-2.6-xen-sparse/drivers/xen/util.c Tue Apr 03 13:04:51 2007 -0600 @@ -4,6 +4,26 @@ #include <linux/vmalloc.h> #include <asm/uaccess.h> #include <xen/driver_util.h> + +struct class *get_xen_class(void) +{ + static struct class *xen_class; + + if (xen_class) + return xen_class; + + xen_class = class_create(THIS_MODULE, "xen"); + if (IS_ERR(xen_class)) { + printk("Failed to create xen sysfs class.\n"); + xen_class = NULL; + } + + return xen_class; +} +EXPORT_SYMBOL_GPL(get_xen_class); + +/* Todo: merge ia64 ('auto-translate physmap') versions of these functions. 
*/ +#ifndef __ia64__ static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) { @@ -46,3 +66,5 @@ void free_vm_area(struct vm_struct *area kfree(area); } EXPORT_SYMBOL_GPL(free_vm_area); + +#endif /* !__ia64__ */ diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/include/linux/mm.h --- a/linux-2.6-xen-sparse/include/linux/mm.h Fri Mar 30 17:18:42 2007 -0600 +++ b/linux-2.6-xen-sparse/include/linux/mm.h Tue Apr 03 13:04:51 2007 -0600 @@ -205,6 +205,10 @@ struct vm_operations_struct { /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page); + /* Area-specific function for clearing the PTE at @ptep. Returns the + * original value of @ptep. */ + pte_t (*zap_pte)(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, int is_fullmm); #ifdef CONFIG_NUMA int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); struct mempolicy *(*get_policy)(struct vm_area_struct *vma, diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/include/xen/driver_util.h --- a/linux-2.6-xen-sparse/include/xen/driver_util.h Fri Mar 30 17:18:42 2007 -0600 +++ b/linux-2.6-xen-sparse/include/xen/driver_util.h Tue Apr 03 13:04:51 2007 -0600 @@ -3,9 +3,12 @@ #define __ASM_XEN_DRIVER_UTIL_H__ #include <linux/vmalloc.h> +#include <linux/device.h> /* Allocate/destroy a 'vmalloc' VM area. 
*/ extern struct vm_struct *alloc_vm_area(unsigned long size); extern void free_vm_area(struct vm_struct *area); +extern struct class *get_xen_class(void); + #endif /* __ASM_XEN_DRIVER_UTIL_H__ */ diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/include/xen/public/gntdev.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/linux-2.6-xen-sparse/include/xen/public/gntdev.h Tue Apr 03 13:04:51 2007 -0600 @@ -0,0 +1,105 @@ +/****************************************************************************** + * gntdev.h + * + * Interface to /dev/xen/gntdev. + * + * Copyright (c) 2007, D G Murray + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __LINUX_PUBLIC_GNTDEV_H__ +#define __LINUX_PUBLIC_GNTDEV_H__ + +struct ioctl_gntdev_grant_ref { + /* The domain ID of the grant to be mapped. */ + uint32_t domid; + /* The grant reference of the grant to be mapped. */ + uint32_t ref; +}; + +/* + * Inserts the grant references into the mapping table of an instance + * of gntdev. N.B. This does not perform the mapping, which is deferred + * until mmap() is called with @index as the offset. + */ +#define IOCTL_GNTDEV_MAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) +struct ioctl_gntdev_map_grant_ref { + /* IN parameters */ + /* The number of grants to be mapped. */ + uint32_t count; + uint32_t pad; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* Variable IN parameter. */ + /* Array of grant references, of size @count. */ + struct ioctl_gntdev_grant_ref refs[1]; +}; + +/* + * Removes the grant references from the mapping table of an instance of + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * before this ioctl is called, or an error will result. + */ +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +struct ioctl_gntdev_unmap_grant_ref { + /* IN parameters */ + /* The offset was returned by the corresponding map operation. */ + uint64_t index; + /* The number of pages to be unmapped. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Returns the offset in the driver's address space that corresponds + * to @vaddr. This can be used to perform a munmap(), followed by an + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by + * the caller. The number of pages that were allocated at the same time as + * @vaddr is returned in @count. + * + * N.B. 
Where more than one page has been mapped into a contiguous range, the + * supplied @vaddr must correspond to the start of the range; otherwise + * an error will result. It is only possible to munmap() the entire + * contiguously-allocated range at once, and not any subrange thereof. + */ +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) +struct ioctl_gntdev_get_offset_for_vaddr { + /* IN parameters */ + /* The virtual address of the first mapped page in a range. */ + uint64_t vaddr; + /* OUT parameters */ + /* The offset that was used in the initial mmap() operation. */ + uint64_t offset; + /* The number of pages mapped in the VM area that begins at @vaddr. */ + uint32_t count; + uint32_t pad; +}; + +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff -r fc9e2f7920c9 -r f378c424e0ce linux-2.6-xen-sparse/mm/memory.c --- a/linux-2.6-xen-sparse/mm/memory.c Fri Mar 30 17:18:42 2007 -0600 +++ b/linux-2.6-xen-sparse/mm/memory.c Tue Apr 03 13:04:51 2007 -0600 @@ -659,8 +659,12 @@ static unsigned long zap_pte_range(struc page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); + if (unlikely(vma->vm_ops && vma->vm_ops->zap_pte)) + ptent = vma->vm_ops->zap_pte(vma, addr, pte, + tlb->fullmm); + else + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; @@ -755,6 +759,7 @@ static unsigned long unmap_page_range(st details = NULL; BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { diff -r fc9e2f7920c9 -r f378c424e0ce tools/blktap/drivers/qcow2raw.c --- a/tools/blktap/drivers/qcow2raw.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/blktap/drivers/qcow2raw.c Tue Apr 03 13:04:51 2007 -0600 @@ -51,7 +51,6 @@ #define BLOCK_PROCESSSZ 4096 static int maxfds, *qcowio_fd, *aio_fd, running = 1, complete = 0; -static int read_complete = 0, write_complete = 0; 
static int returned_read_events = 0, returned_write_events = 0; static int submit_events = 0; static uint32_t read_idx = 0, write_idx = 0; @@ -109,8 +108,6 @@ static int send_write_responses(struct d written += BLOCK_PROCESSSZ; returned_write_events++; write_idx = idx; - if (complete && (returned_write_events == submit_events)) - write_complete = 1; debug_output(written, dd->td_state->size << 9); free(private); @@ -126,8 +123,6 @@ static int send_read_responses(struct di returned_read_events++; read_idx = idx; - if (complete && (returned_read_events == submit_events)) - read_complete = 1; ret = ddaio.drv->td_queue_write(&ddaio, idx, BLOCK_PROCESSSZ>>9, private, send_write_responses, idx, private); @@ -136,7 +131,7 @@ static int send_read_responses(struct di return 0; } - if ( (complete && returned_read_events == submit_events) || + if ( (returned_read_events == submit_events) || (returned_read_events % 10 == 0) ) { ddaio.drv->td_submit(&ddaio); } @@ -299,6 +294,7 @@ int main(int argc, char *argv[]) } /*Attempt to read 4k sized blocks*/ + submit_events++; ret = ddqcow.drv->td_queue_read(&ddqcow, i>>9, BLOCK_PROCESSSZ>>9, buf, send_read_responses, i>>9, buf); @@ -309,7 +305,6 @@ int main(int argc, char *argv[]) exit(-1); } else { i += BLOCK_PROCESSSZ; - submit_events++; } if (i >= ddqcow.td_state->size<<9) { diff -r fc9e2f7920c9 -r f378c424e0ce tools/examples/xmexample.hvm --- a/tools/examples/xmexample.hvm Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/examples/xmexample.hvm Tue Apr 03 13:04:51 2007 -0600 @@ -180,6 +180,10 @@ serial='pty' #----------------------------------------------------------------------------- +# set the real time clock offset in seconds [default=0 i.e. 
same as dom0] +#rtc_timeoffset=3600 + +#----------------------------------------------------------------------------- # start in full screen #full-screen=1 diff -r fc9e2f7920c9 -r f378c424e0ce tools/ioemu/target-i386-dm/helper2.c --- a/tools/ioemu/target-i386-dm/helper2.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/ioemu/target-i386-dm/helper2.c Tue Apr 03 13:04:51 2007 -0600 @@ -73,6 +73,8 @@ int vcpus = 1; int vcpus = 1; int xc_handle; + +long time_offset = 0; shared_iopage_t *shared_page = NULL; @@ -439,6 +441,34 @@ void cpu_ioreq_xor(CPUState *env, ioreq_ req->data = tmp1; } +void timeoffset_get() +{ + char *p; + + p = xenstore_vm_read(domid, "rtc/timeoffset", NULL); + if (!p) + return; + + if (sscanf(p, "%ld", &time_offset) == 1) + fprintf(logfile, "Time offset set %ld\n", time_offset); + else + time_offset = 0; + + xc_domain_set_time_offset(xc_handle, domid, time_offset); + + free(p); +} + +void cpu_ioreq_timeoffset(CPUState *env, ioreq_t *req) +{ + char b[64]; + + time_offset += (ulong)req->data; + + sprintf(b, "%ld", time_offset); + xenstore_vm_write(domid, "rtc/timeoffset", b); +} + void cpu_ioreq_xchg(CPUState *env, ioreq_t *req) { unsigned long tmp1; @@ -478,6 +508,9 @@ void __handle_ioreq(CPUState *env, ioreq case IOREQ_TYPE_XCHG: cpu_ioreq_xchg(env, req); break; + case IOREQ_TYPE_TIMEOFFSET: + cpu_ioreq_timeoffset(env, req); + break; default: hw_error("Invalid ioreq type 0x%x\n", req->type); } diff -r fc9e2f7920c9 -r f378c424e0ce tools/ioemu/vl.c --- a/tools/ioemu/vl.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/ioemu/vl.c Tue Apr 03 13:04:51 2007 -0600 @@ -6670,6 +6670,9 @@ int main(int argc, char **argv) } free(page_array); #endif + + timeoffset_get(); + #else /* !CONFIG_DM */ phys_ram_base = qemu_vmalloc(phys_ram_size); diff -r fc9e2f7920c9 -r f378c424e0ce tools/ioemu/vl.h --- a/tools/ioemu/vl.h Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/ioemu/vl.h Tue Apr 03 13:04:51 2007 -0600 @@ -1276,6 +1276,12 @@ int xenstore_unsubscribe_from_hotplug_st const 
char *inst, const char *token); +int xenstore_vm_write(int domid, char *key, char *val); +char *xenstore_vm_read(int domid, char *key, int *len); + +/* helper2.c */ +extern long time_offset; +void timeoffset_get(void); /* xen_platform.c */ void pci_xen_platform_init(PCIBus *bus); diff -r fc9e2f7920c9 -r f378c424e0ce tools/ioemu/xenstore.c --- a/tools/ioemu/xenstore.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/ioemu/xenstore.c Tue Apr 03 13:04:51 2007 -0600 @@ -567,3 +567,72 @@ int xenstore_unsubscribe_from_hotplug_st return rc; } + +char *xenstore_vm_read(int domid, char *key, int *len) +{ + char *buf = NULL, *path = NULL, *value = NULL; + + if (xsh == NULL) + goto out; + + path = xs_get_domain_path(xsh, domid); + if (path == NULL) { + fprintf(logfile, "xs_get_domain_path(%d): error\n", domid); + goto out; + } + + pasprintf(&buf, "%s/vm", path); + free(path); + path = xs_read(xsh, XBT_NULL, buf, NULL); + if (path == NULL) { + fprintf(logfile, "xs_read(%s): read error\n", buf); + goto out; + } + + pasprintf(&buf, "%s/%s", path, key); + value = xs_read(xsh, XBT_NULL, buf, len); + if (value == NULL) { + fprintf(logfile, "xs_read(%s): read error\n", buf); + goto out; + } + + out: + free(path); + free(buf); + return value; +} + +int xenstore_vm_write(int domid, char *key, char *value) +{ + char *buf = NULL, *path = NULL; + int rc = -1; + + if (xsh == NULL) + goto out; + + path = xs_get_domain_path(xsh, domid); + if (path == NULL) { + fprintf(logfile, "xs_get_domain_path(%d): error\n"); + goto out; + } + + pasprintf(&buf, "%s/vm", path); + free(path); + path = xs_read(xsh, XBT_NULL, buf, NULL); + if (path == NULL) { + fprintf(logfile, "xs_read(%s): read error\n", buf); + goto out; + } + + pasprintf(&buf, "%s/%s", path, key); + rc = xs_write(xsh, XBT_NULL, buf, value, strlen(value)); + if (rc) { + fprintf(logfile, "xs_write(%s, %s): write error\n", buf, key); + goto out; + } + + out: + free(path); + free(buf); + return rc; +} diff -r fc9e2f7920c9 -r f378c424e0ce 
tools/libxc/ia64/xc_ia64_linux_restore.c --- a/tools/libxc/ia64/xc_ia64_linux_restore.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/ia64/xc_ia64_linux_restore.c Tue Apr 03 13:04:51 2007 -0600 @@ -14,8 +14,14 @@ #define PFN_TO_KB(_pfn) ((_pfn) << (PAGE_SHIFT - 10)) -/* total number of pages used by the current guest */ -static unsigned long max_pfn; +/* number of pfns this guest has (i.e. number of entries in the P2M) */ +static unsigned long p2m_size; + +/* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */ +static unsigned long nr_pfns; + +/* largest possible value of nr_pfns (i.e. domain's maximum memory size) */ +static unsigned long max_nr_pfns; static ssize_t read_exact(int fd, void *buf, size_t count) @@ -57,9 +63,9 @@ read_page(int xc_handle, int io_fd, uint int xc_linux_restore(int xc_handle, int io_fd, uint32_t dom, - unsigned long nr_pfns, unsigned int store_evtchn, - unsigned long *store_mfn, unsigned int console_evtchn, - unsigned long *console_mfn) + unsigned long p2msize, unsigned long maxnrpfns, + unsigned int store_evtchn, unsigned long *store_mfn, + unsigned int console_evtchn, unsigned long *console_mfn) { DECLARE_DOMCTL; int rc = 1, i; @@ -79,10 +85,13 @@ xc_linux_restore(int xc_handle, int io_f /* A temporary mapping of the guest's start_info page. */ start_info_t *start_info; - max_pfn = nr_pfns; - - DPRINTF("xc_linux_restore start: max_pfn = %ld\n", max_pfn); - + p2m_size = p2msize; + max_nr_pfns = maxnrpfns; + + /* For info only */ + nr_pfns = 0; + + DPRINTF("xc_linux_restore start: p2m_size = %lx\n", p2m_size); if (!read_exact(io_fd, &ver, sizeof(unsigned long))) { ERROR("Error when reading version"); @@ -99,29 +108,29 @@ xc_linux_restore(int xc_handle, int io_f return 1; } - if (xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) { + if (xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_nr_pfns)) != 0) { errno = ENOMEM; goto out; } /* Get pages. 
*/ - page_array = malloc(max_pfn * sizeof(unsigned long)); + page_array = malloc(p2m_size * sizeof(unsigned long)); if (page_array == NULL) { ERROR("Could not allocate memory"); goto out; } - for ( i = 0; i < max_pfn; i++ ) + for ( i = 0; i < p2m_size; i++ ) page_array[i] = i; - if ( xc_domain_memory_populate_physmap(xc_handle, dom, max_pfn, + if ( xc_domain_memory_populate_physmap(xc_handle, dom, p2m_size, 0, 0, page_array) ) { ERROR("Failed to allocate memory for %ld KB to dom %d.\n", - PFN_TO_KB(max_pfn), dom); - goto out; - } - DPRINTF("Allocated memory by %ld KB\n", PFN_TO_KB(max_pfn)); + PFN_TO_KB(p2m_size), dom); + goto out; + } + DPRINTF("Allocated memory by %ld KB\n", PFN_TO_KB(p2m_size)); if (!read_exact(io_fd, &domctl.u.arch_setup, sizeof(domctl.u.arch_setup))) { ERROR("read: domain setup"); @@ -131,9 +140,9 @@ xc_linux_restore(int xc_handle, int io_f /* Build firmware (will be overwritten). */ domctl.domain = (domid_t)dom; domctl.u.arch_setup.flags &= ~XEN_DOMAINSETUP_query; - domctl.u.arch_setup.bp = ((nr_pfns - 3) << PAGE_SHIFT) + domctl.u.arch_setup.bp = ((p2m_size - 3) << PAGE_SHIFT) + sizeof (start_info_t); - domctl.u.arch_setup.maxmem = (nr_pfns - 3) << PAGE_SHIFT; + domctl.u.arch_setup.maxmem = (p2m_size - 3) << PAGE_SHIFT; domctl.cmd = XEN_DOMCTL_arch_setup; if (xc_domctl(xc_handle, &domctl)) @@ -157,8 +166,6 @@ xc_linux_restore(int xc_handle, int io_f } if (gmfn == INVALID_MFN) break; - - //DPRINTF("xc_linux_restore: page %lu/%lu at %lx\n", gmfn, max_pfn, pfn); if (read_page(xc_handle, io_fd, dom, gmfn) < 0) goto out; @@ -281,7 +288,7 @@ xc_linux_restore(int xc_handle, int io_f /* Uncanonicalise the suspend-record frame number and poke resume rec. 
*/ start_info = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, gmfn); - start_info->nr_pages = max_pfn; + start_info->nr_pages = p2m_size; start_info->shared_info = shared_info_frame << PAGE_SHIFT; start_info->flags = 0; *store_mfn = start_info->store_mfn; diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xc_core.c --- a/tools/libxc/xc_core.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xc_core.c Tue Apr 03 13:04:51 2007 -0600 @@ -312,7 +312,7 @@ xc_domain_dumpcore_via_callback(int xc_h int auto_translated_physmap; xen_pfn_t *p2m = NULL; - unsigned long max_pfn = 0; + unsigned long p2m_size = 0; struct xen_dumpcore_p2m *p2m_array = NULL; uint64_t *pfn_array = NULL; @@ -396,7 +396,7 @@ xc_domain_dumpcore_via_callback(int xc_h } sts = xc_core_arch_map_p2m(xc_handle, &info, live_shinfo, - &p2m, &max_pfn); + &p2m, &p2m_size); if ( sts != 0 ) goto out; } diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xc_core_x86.c --- a/tools/libxc/xc_core_x86.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xc_core_x86.c Tue Apr 03 13:04:51 2007 -0600 @@ -38,7 +38,7 @@ xc_core_arch_memory_map_get(int xc_handl xc_core_memory_map_t **mapp, unsigned int *nr_entries) { - unsigned long max_pfn = max_gpfn(xc_handle, info->domid); + unsigned long p2m_size = max_gpfn(xc_handle, info->domid); xc_core_memory_map_t *map; map = malloc(sizeof(*map)); @@ -49,7 +49,7 @@ xc_core_arch_memory_map_get(int xc_handl } map->addr = 0; - map->size = max_pfn << PAGE_SHIFT; + map->size = p2m_size << PAGE_SHIFT; *mapp = map; *nr_entries = 1; @@ -65,13 +65,13 @@ xc_core_arch_map_p2m(int xc_handle, xc_d xen_pfn_t *live_p2m_frame_list_list = NULL; xen_pfn_t *live_p2m_frame_list = NULL; uint32_t dom = info->domid; - unsigned long max_pfn = max_gpfn(xc_handle, info->domid); + unsigned long p2m_size = max_gpfn(xc_handle, info->domid); int ret = -1; int err; - if ( max_pfn < info->nr_pages ) + if ( p2m_size < info->nr_pages ) { - ERROR("max_pfn < nr_pages -1 (%lx < %lx", max_pfn, 
info->nr_pages - 1); + ERROR("p2m_size < nr_pages -1 (%lx < %lx", p2m_size, info->nr_pages - 1); goto out; } @@ -106,7 +106,7 @@ xc_core_arch_map_p2m(int xc_handle, xc_d goto out; } - *pfnp = max_pfn; + *pfnp = p2m_size; ret = 0; diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xc_hvm_restore.c --- a/tools/libxc/xc_hvm_restore.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xc_hvm_restore.c Tue Apr 03 13:04:51 2007 -0600 @@ -95,7 +95,7 @@ int xc_hvm_restore(int xc_handle, int io unsigned long pfn_array_size = max_pfn + 1; /* Number of pages of memory the guest has. *Not* the same as max_pfn. */ - unsigned long nr_pages = max_pfn + 1; + unsigned long nr_pages = max_pfn; /* MMIO hole doesn't contain RAM */ if ( nr_pages >= HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT ) nr_pages -= HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT; @@ -270,7 +270,6 @@ int xc_hvm_restore(int xc_handle, int io }/*while 1*/ -/* xc_set_hvm_param(xc_handle, dom, HVM_PARAM_APIC_ENABLED, apic);*/ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_PAE_ENABLED, pae); xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_EVTCHN, store_evtchn); @@ -279,13 +278,22 @@ int xc_hvm_restore(int xc_handle, int io else shared_page_nr = (v_end >> PAGE_SHIFT) - 1; + /* Ensure we clear these pages */ + if ( xc_clear_domain_page(xc_handle, dom, shared_page_nr) || + xc_clear_domain_page(xc_handle, dom, shared_page_nr-1) || + xc_clear_domain_page(xc_handle, dom, shared_page_nr-2) ) { + rc = -1; + goto out; + } + xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, shared_page_nr-1); xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2); xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr); /* caculate the store_mfn , wrong val cause hang when introduceDomain */ *store_mfn = (v_end >> PAGE_SHIFT) - 2; - DPRINTF("hvm restore:calculate new store_mfn=0x%lx,v_end=0x%llx..\n", *store_mfn, v_end); + DPRINTF("hvm restore: calculate new store_mfn=0x%lx, v_end=0x%llx.\n", + *store_mfn, v_end); if 
(!read_exact(io_fd, &nr_vcpus, sizeof(uint32_t))) { ERROR("error read nr vcpu !\n"); diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xc_hvm_save.c --- a/tools/libxc/xc_hvm_save.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xc_hvm_save.c Tue Apr 03 13:04:51 2007 -0600 @@ -332,10 +332,10 @@ int xc_hvm_save(int xc_handle, int io_fd unsigned long total_sent = 0; - DPRINTF("xc_hvm_save:dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, live=%d, debug=%d.\n", - dom, max_iters, max_factor, flags, + DPRINTF("xc_hvm_save: dom=%d, max_iters=%d, max_factor=%d, flags=0x%x, " + "live=%d, debug=%d.\n", dom, max_iters, max_factor, flags, live, debug); - + /* If no explicit control parameters given, use defaults */ if(!max_iters) max_iters = DEF_MAX_ITERS; @@ -382,7 +382,6 @@ int xc_hvm_save(int xc_handle, int io_fd ERROR("HVM: Could not read magic PFN parameters"); goto out; } - DPRINTF("saved hvm domain info:max_memkb=0x%lx, max_mfn=0x%lx, " "nr_pages=0x%lx\n", info.max_memkb, max_mfn, info.nr_pages); diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xc_linux.c --- a/tools/libxc/xc_linux.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xc_linux.c Tue Apr 03 13:04:51 2007 -0600 @@ -2,6 +2,9 @@ * * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * xc_gnttab functions: + * Copyright (c) 2007, D G Murray <Derek.Murray@xxxxxxxxxxxx> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -13,6 +16,7 @@ #include <xen/memory.h> #include <xen/sys/evtchn.h> +#include <xen/sys/gntdev.h> #include <unistd.h> #include <fcntl.h> @@ -361,6 +365,158 @@ void discard_file_cache(int fd, int flus out: errno = saved_errno; +} + +#define GNTTAB_DEV_NAME "/dev/xen/gntdev" + +int xc_gnttab_open(void) +{ + struct stat st; + int fd; + int devnum; + + devnum = xc_find_device_number("gntdev"); + + /* Make sure any existing device file links to correct device. 
*/ + if ( (lstat(GNTTAB_DEV_NAME, &st) != 0) || !S_ISCHR(st.st_mode) || + (st.st_rdev != devnum) ) + (void)unlink(GNTTAB_DEV_NAME); + +reopen: + if ( (fd = open(GNTTAB_DEV_NAME, O_RDWR)) == -1 ) + { + if ( (errno == ENOENT) && + ((mkdir("/dev/xen", 0755) == 0) || (errno == EEXIST)) && + (mknod(GNTTAB_DEV_NAME, S_IFCHR|0600, devnum) == 0) ) + goto reopen; + + PERROR("Could not open grant table interface"); + return -1; + } + + return fd; +} + +int xc_gnttab_close(int xcg_handle) +{ + return close(xcg_handle); +} + +void *xc_gnttab_map_grant_ref(int xcg_handle, + uint32_t domid, + uint32_t ref, + int prot) +{ + struct ioctl_gntdev_map_grant_ref map; + void *addr; + + map.count = 1; + map.refs[0].domid = domid; + map.refs[0].ref = ref; + + if ( ioctl(xcg_handle, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) + return NULL; + + addr = mmap(NULL, PAGE_SIZE, prot, MAP_SHARED, xcg_handle, map.index); + if ( addr == MAP_FAILED ) + { + int saved_errno = errno; + struct ioctl_gntdev_unmap_grant_ref unmap_grant; + /* Unmap the driver slots used to store the grant information. 
*/ + unmap_grant.index = map.index; + unmap_grant.count = 1; + ioctl(xcg_handle, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); + errno = saved_errno; + return NULL; + } + + return addr; +} + +void *xc_gnttab_map_grant_refs(int xcg_handle, + uint32_t count, + uint32_t *domids, + uint32_t *refs, + int prot) +{ + struct ioctl_gntdev_map_grant_ref *map; + void *addr = NULL; + int i; + + map = malloc(sizeof(*map) + + (count-1) * sizeof(struct ioctl_gntdev_map_grant_ref)); + if ( map == NULL ) + return NULL; + + for ( i = 0; i < count; i++ ) + { + map->refs[i].domid = domids[i]; + map->refs[i].ref = refs[i]; + } + + map->count = count; + + if ( ioctl(xcg_handle, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) + goto out; + + addr = mmap(NULL, PAGE_SIZE * count, prot, MAP_SHARED, xcg_handle, + map->index); + if ( addr == MAP_FAILED ) + { + int saved_errno = errno; + struct ioctl_gntdev_unmap_grant_ref unmap_grant; + /* Unmap the driver slots used to store the grant information. */ + unmap_grant.index = map->index; + unmap_grant.count = count; + ioctl(xcg_handle, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); + errno = saved_errno; + addr = NULL; + } + + out: + free(map); + return addr; +} + +int xc_gnttab_munmap(int xcg_handle, + void *start_address, + uint32_t count) +{ + struct ioctl_gntdev_get_offset_for_vaddr get_offset; + struct ioctl_gntdev_unmap_grant_ref unmap_grant; + int rc; + + if ( start_address == NULL ) + { + errno = EINVAL; + return -1; + } + + /* First, it is necessary to get the offset which was initially used to + * mmap() the pages. + */ + get_offset.vaddr = (unsigned long)start_address; + if ( (rc = ioctl(xcg_handle, IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR, + &get_offset)) ) + return rc; + + if ( get_offset.count != count ) + { + errno = EINVAL; + return -1; + } + + /* Next, unmap the memory. */ + if ( (rc = munmap(start_address, count * getpagesize())) ) + return rc; + + /* Finally, unmap the driver slots used to store the grant information. 
*/ + unmap_grant.index = get_offset.offset; + unmap_grant.count = count; + if ( (rc = ioctl(xcg_handle, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant)) ) + return rc; + + return 0; } /* diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xc_linux_restore.c --- a/tools/libxc/xc_linux_restore.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xc_linux_restore.c Tue Apr 03 13:04:51 2007 -0600 @@ -22,8 +22,14 @@ static unsigned long hvirt_start; /* #levels of page tables used by the current guest */ static unsigned int pt_levels; -/* total number of pages used by the current guest */ -static unsigned long max_pfn; +/* number of pfns this guest has (i.e. number of entries in the P2M) */ +static unsigned long p2m_size; + +/* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */ +static unsigned long nr_pfns; + +/* largest possible value of nr_pfns (i.e. domain's maximum memory size) */ +static unsigned long max_nr_pfns; /* Live mapping of the table mapping each PFN to its current MFN. */ static xen_pfn_t *live_p2m = NULL; @@ -33,7 +39,6 @@ static xen_pfn_t *p2m = NULL; /* A table of P2M mappings in the current region */ static xen_pfn_t *p2m_batch = NULL; - static ssize_t read_exact(int fd, void *buf, size_t count) @@ -85,11 +90,11 @@ static int uncanonicalize_pagetable(int pfn = (pte >> PAGE_SHIFT) & MFN_MASK_X86; - if(pfn >= max_pfn) { + if(pfn >= p2m_size) { /* This "page table page" is probably not one; bail. 
*/ ERROR("Frame number in type %lu page table is out of range: " - "i=%d pfn=0x%lx max_pfn=%lu", - type >> 28, i, pfn, max_pfn); + "i=%d pfn=0x%lx p2m_size=%lu", + type >> 28, i, pfn, p2m_size); return 0; } @@ -138,8 +143,9 @@ static int uncanonicalize_pagetable(int return 1; } -int xc_linux_restore(int xc_handle, int io_fd, - uint32_t dom, unsigned long nr_pfns, + +int xc_linux_restore(int xc_handle, int io_fd, uint32_t dom, + unsigned long p2msize, unsigned long maxnrpfns, unsigned int store_evtchn, unsigned long *store_mfn, unsigned int console_evtchn, unsigned long *console_mfn) { @@ -191,9 +197,13 @@ int xc_linux_restore(int xc_handle, int unsigned int max_vcpu_id = 0; int new_ctxt_format = 0; - max_pfn = nr_pfns; - - DPRINTF("xc_linux_restore start: max_pfn = %lx\n", max_pfn); + p2m_size = p2msize; + max_nr_pfns = maxnrpfns; + + /* For info only */ + nr_pfns = 0; + + DPRINTF("xc_linux_restore start: p2m_size = %lx\n", p2m_size); /* * XXX For now, 32bit dom0's can only save/restore 32bit domUs @@ -294,8 +304,8 @@ int xc_linux_restore(int xc_handle, int } /* We want zeroed memory so use calloc rather than malloc. 
*/ - p2m = calloc(max_pfn, sizeof(xen_pfn_t)); - pfn_type = calloc(max_pfn, sizeof(unsigned long)); + p2m = calloc(p2m_size, sizeof(xen_pfn_t)); + pfn_type = calloc(p2m_size, sizeof(unsigned long)); region_mfn = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t)); p2m_batch = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t)); @@ -325,13 +335,13 @@ int xc_linux_restore(int xc_handle, int } shared_info_frame = domctl.u.getdomaininfo.shared_info_frame; - if (xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_pfn)) != 0) { + if (xc_domain_setmaxmem(xc_handle, dom, PFN_TO_KB(max_nr_pfns)) != 0) { errno = ENOMEM; goto out; } /* Mark all PFNs as invalid; we allocate on demand */ - for ( pfn = 0; pfn < max_pfn; pfn++ ) + for ( pfn = 0; pfn < p2m_size; pfn++ ) p2m[pfn] = INVALID_P2M_ENTRY; if(!(mmu = xc_init_mmu_updates(xc_handle, dom))) { @@ -352,7 +362,7 @@ int xc_linux_restore(int xc_handle, int int j, nr_mfns = 0; - this_pc = (n * 100) / max_pfn; + this_pc = (n * 100) / p2m_size; if ( (this_pc - prev_pc) >= 5 ) { PPRINTF("\b\b\b\b%3d%%", this_pc); @@ -436,6 +446,7 @@ int xc_linux_restore(int xc_handle, int if (p2m[pfn] == INVALID_P2M_ENTRY) { /* We just allocated a new mfn above; update p2m */ p2m[pfn] = p2m_batch[nr_mfns++]; + nr_pfns++; } /* setup region_mfn[] for batch map */ @@ -465,7 +476,7 @@ int xc_linux_restore(int xc_handle, int /* a bogus/unmapped page: skip it */ continue; - if ( pfn > max_pfn ) + if ( pfn > p2m_size ) { ERROR("pfn out of range"); goto out; @@ -518,7 +529,7 @@ int xc_linux_restore(int xc_handle, int else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB ) { ERROR("Bogus page type %lx page table is out of range: " - "i=%d max_pfn=%lu", pagetype, i, max_pfn); + "i=%d p2m_size=%lu", pagetype, i, p2m_size); goto out; } @@ -598,7 +609,7 @@ int xc_linux_restore(int xc_handle, int int j, k; /* First pass: find all L3TABs current in > 4G mfns and get new mfns */ - for ( i = 0; i < max_pfn; i++ ) + for ( i = 0; i < p2m_size; i++ ) { if ( ((pfn_type[i] & 
XEN_DOMCTL_PFINFO_LTABTYPE_MASK) == XEN_DOMCTL_PFINFO_L3TAB) && @@ -646,7 +657,7 @@ int xc_linux_restore(int xc_handle, int /* Second pass: find all L1TABs and uncanonicalize them */ j = 0; - for ( i = 0; i < max_pfn; i++ ) + for ( i = 0; i < p2m_size; i++ ) { if ( ((pfn_type[i] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) == XEN_DOMCTL_PFINFO_L1TAB) ) @@ -655,7 +666,7 @@ int xc_linux_restore(int xc_handle, int j++; } - if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) { + if(i == (p2m_size-1) || j == MAX_BATCH_SIZE) { if (!(region_base = xc_map_foreign_batch( xc_handle, dom, PROT_READ | PROT_WRITE, @@ -689,7 +700,7 @@ int xc_linux_restore(int xc_handle, int * will barf when doing the type-checking. */ nr_pins = 0; - for ( i = 0; i < max_pfn; i++ ) + for ( i = 0; i < p2m_size; i++ ) { if ( (pfn_type[i] & XEN_DOMCTL_PFINFO_LPINTAB) == 0 ) continue; @@ -736,7 +747,7 @@ int xc_linux_restore(int xc_handle, int } DPRINTF("\b\b\b\b100%%\n"); - DPRINTF("Memory reloaded.\n"); + DPRINTF("Memory reloaded (%ld pages of max %ld)\n", nr_pfns, max_nr_pfns); /* Get the list of PFNs that are not in the psuedo-phys map */ { @@ -808,7 +819,7 @@ int xc_linux_restore(int xc_handle, int * resume record. 
*/ pfn = ctxt.user_regs.edx; - if ((pfn >= max_pfn) || + if ((pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) { ERROR("Suspend record frame number is bad"); goto out; @@ -816,7 +827,7 @@ int xc_linux_restore(int xc_handle, int ctxt.user_regs.edx = mfn = p2m[pfn]; start_info = xc_map_foreign_range( xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn); - start_info->nr_pages = max_pfn; + start_info->nr_pages = p2m_size; start_info->shared_info = shared_info_frame << PAGE_SHIFT; start_info->flags = 0; *store_mfn = start_info->store_mfn = p2m[start_info->store_mfn]; @@ -835,7 +846,7 @@ int xc_linux_restore(int xc_handle, int for (j = 0; (512*j) < ctxt.gdt_ents; j++) { pfn = ctxt.gdt_frames[j]; - if ((pfn >= max_pfn) || + if ((pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) { ERROR("GDT frame number is bad"); goto out; @@ -846,16 +857,16 @@ int xc_linux_restore(int xc_handle, int /* Uncanonicalise the page table base pointer. */ pfn = xen_cr3_to_pfn(ctxt.ctrlreg[3]); - if (pfn >= max_pfn) { - ERROR("PT base is bad: pfn=%lu max_pfn=%lu type=%08lx", - pfn, max_pfn, pfn_type[pfn]); + if (pfn >= p2m_size) { + ERROR("PT base is bad: pfn=%lu p2m_size=%lu type=%08lx", + pfn, p2m_size, pfn_type[pfn]); goto out; } if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) != ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) { ERROR("PT base is bad. 
pfn=%lu nr=%lu type=%08lx %08lx", - pfn, max_pfn, pfn_type[pfn], + pfn, p2m_size, pfn_type[pfn], (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT); goto out; } @@ -867,16 +878,16 @@ int xc_linux_restore(int xc_handle, int { pfn = xen_cr3_to_pfn(ctxt.ctrlreg[1]); - if (pfn >= max_pfn) { - ERROR("User PT base is bad: pfn=%lu max_pfn=%lu type=%08lx", - pfn, max_pfn, pfn_type[pfn]); + if (pfn >= p2m_size) { + ERROR("User PT base is bad: pfn=%lu p2m_size=%lu type=%08lx", + pfn, p2m_size, pfn_type[pfn]); goto out; } if ( (pfn_type[pfn] & XEN_DOMCTL_PFINFO_LTABTYPE_MASK) != ((unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT) ) { ERROR("User PT base is bad. pfn=%lu nr=%lu type=%08lx %08lx", - pfn, max_pfn, pfn_type[pfn], + pfn, p2m_size, pfn_type[pfn], (unsigned long)pt_levels<<XEN_DOMCTL_PFINFO_LTAB_SHIFT); goto out; } @@ -915,7 +926,7 @@ int xc_linux_restore(int xc_handle, int /* Uncanonicalise the pfn-to-mfn table frame-number list. */ for (i = 0; i < P2M_FL_ENTRIES; i++) { pfn = p2m_frame_list[i]; - if ((pfn >= max_pfn) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) { + if ((pfn >= p2m_size) || (pfn_type[pfn] != XEN_DOMCTL_PFINFO_NOTAB)) { ERROR("PFN-to-MFN frame number is bad"); goto out; } @@ -930,8 +941,8 @@ int xc_linux_restore(int xc_handle, int goto out; } - memcpy(live_p2m, p2m, P2M_SIZE); - munmap(live_p2m, P2M_SIZE); + memcpy(live_p2m, p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT)); + munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT)); DPRINTF("Domain ready to be built.\n"); diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xc_linux_save.c --- a/tools/libxc/xc_linux_save.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xc_linux_save.c Tue Apr 03 13:04:51 2007 -0600 @@ -25,7 +25,7 @@ ** */ #define DEF_MAX_ITERS 29 /* limit us to 30 times round loop */ -#define DEF_MAX_FACTOR 3 /* never send more than 3x nr_pfns */ +#define DEF_MAX_FACTOR 3 /* never send more than 3x p2m_size */ /* max mfn of the whole machine */ @@ -37,8 
+37,8 @@ static unsigned long hvirt_start; /* #levels of page tables used by the current guest */ static unsigned int pt_levels; -/* total number of pages used by the current guest */ -static unsigned long max_pfn; +/* number of pfns this guest has (i.e. number of entries in the P2M) */ +static unsigned long p2m_size; /* Live mapping of the table mapping each PFN to its current MFN. */ static xen_pfn_t *live_p2m = NULL; @@ -57,7 +57,7 @@ static unsigned long m2p_mfn0; */ #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \ (((_mfn) < (max_mfn)) && \ - ((mfn_to_pfn(_mfn) < (max_pfn)) && \ + ((mfn_to_pfn(_mfn) < (p2m_size)) && \ (live_p2m[mfn_to_pfn(_mfn)] == (_mfn)))) @@ -79,7 +79,7 @@ static unsigned long m2p_mfn0; */ #define BITS_PER_LONG (sizeof(unsigned long) * 8) -#define BITMAP_SIZE ((max_pfn + BITS_PER_LONG - 1) / 8) +#define BITMAP_SIZE ((p2m_size + BITS_PER_LONG - 1) / 8) #define BITMAP_ENTRY(_nr,_bmap) \ ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG] @@ -343,7 +343,7 @@ static int print_stats(int xc_handle, ui } -static int analysis_phase(int xc_handle, uint32_t domid, int max_pfn, +static int analysis_phase(int xc_handle, uint32_t domid, int p2m_size, unsigned long *arr, int runs) { long long start, now; @@ -356,7 +356,7 @@ static int analysis_phase(int xc_handle, int i; xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_CLEAN, - arr, max_pfn, NULL, 0, NULL); + arr, p2m_size, NULL, 0, NULL); DPRINTF("#Flush\n"); for ( i = 0; i < 40; i++ ) { usleep(50000); @@ -682,7 +682,7 @@ int xc_linux_save(int xc_handle, int io_ /* base of the region in which domain memory is mapped */ unsigned char *region_base = NULL; - /* power of 2 order of max_pfn */ + /* power of 2 order of p2m_size */ int order_nr; /* bitmap of pages: @@ -730,7 +730,7 @@ int xc_linux_save(int xc_handle, int io_ goto out; } - max_pfn = live_shinfo->arch.max_pfn; + p2m_size = live_shinfo->arch.max_pfn; live_p2m_frame_list_list = map_frame_list_list(xc_handle, dom, live_shinfo); @@ -777,7 
+777,7 @@ int xc_linux_save(int xc_handle, int io_ memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE); /* Canonicalise the pfn-to-mfn table frame-number list. */ - for (i = 0; i < max_pfn; i += fpp) { + for (i = 0; i < p2m_size; i += fpp) { if (!translate_mfn_to_pfn(&p2m_frame_list[i/fpp])) { ERROR("Frame# in pfn-to-mfn frame list is not in pseudophys"); ERROR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp, @@ -813,12 +813,12 @@ int xc_linux_save(int xc_handle, int io_ } /* pretend we sent all the pages last iteration */ - sent_last_iter = max_pfn; - - - /* calculate the power of 2 order of max_pfn, e.g. + sent_last_iter = p2m_size; + + + /* calculate the power of 2 order of p2m_size, e.g. 15->4 16->4 17->5 */ - for (i = max_pfn-1, order_nr = 0; i ; i >>= 1, order_nr++) + for (i = p2m_size-1, order_nr = 0; i ; i >>= 1, order_nr++) continue; /* Setup to_send / to_fix and to_skip bitmaps */ @@ -844,7 +844,7 @@ int xc_linux_save(int xc_handle, int io_ return 1; } - analysis_phase(xc_handle, dom, max_pfn, to_skip, 0); + analysis_phase(xc_handle, dom, p2m_size, to_skip, 0); /* We want zeroed memory so use calloc rather than malloc. */ pfn_type = calloc(MAX_BATCH_SIZE, sizeof(*pfn_type)); @@ -867,7 +867,7 @@ int xc_linux_save(int xc_handle, int io_ { int err=0; unsigned long mfn; - for (i = 0; i < max_pfn; i++) { + for (i = 0; i < p2m_size; i++) { mfn = live_p2m[i]; if((mfn != INVALID_P2M_ENTRY) && (mfn_to_pfn(mfn) != i)) { @@ -882,8 +882,8 @@ int xc_linux_save(int xc_handle, int io_ /* Start writing out the saved-domain record. 
*/ - if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) { - ERROR("write: max_pfn"); + if (!write_exact(io_fd, &p2m_size, sizeof(unsigned long))) { + ERROR("write: p2m_size"); goto out; } @@ -929,9 +929,9 @@ int xc_linux_save(int xc_handle, int io_ DPRINTF("Saving memory pages: iter %d 0%%", iter); - while( N < max_pfn ){ - - unsigned int this_pc = (N * 100) / max_pfn; + while( N < p2m_size ){ + + unsigned int this_pc = (N * 100) / p2m_size; if ((this_pc - prev_pc) >= 5) { DPRINTF("\b\b\b\b%3d%%", this_pc); @@ -942,7 +942,7 @@ int xc_linux_save(int xc_handle, int io_ but this is fast enough for the moment. */ if (!last_iter && xc_shadow_control( xc_handle, dom, XEN_DOMCTL_SHADOW_OP_PEEK, - to_skip, max_pfn, NULL, 0, NULL) != max_pfn) { + to_skip, p2m_size, NULL, 0, NULL) != p2m_size) { ERROR("Error peeking shadow bitmap"); goto out; } @@ -950,9 +950,9 @@ int xc_linux_save(int xc_handle, int io_ /* load pfn_type[] with the mfn of all the pages we're doing in this batch. */ - for (batch = 0; batch < MAX_BATCH_SIZE && N < max_pfn ; N++) { - - int n = permute(N, max_pfn, order_nr); + for (batch = 0; batch < MAX_BATCH_SIZE && N < p2m_size ; N++) { + + int n = permute(N, p2m_size, order_nr); if (debug) { DPRINTF("%d pfn= %08lx mfn= %08lx %d [mfn]= %08lx\n", @@ -1123,7 +1123,7 @@ int xc_linux_save(int xc_handle, int io_ print_stats( xc_handle, dom, sent_this_iter, &stats, 1); DPRINTF("Total pages sent= %ld (%.2fx)\n", - total_sent, ((float)total_sent)/max_pfn ); + total_sent, ((float)total_sent)/p2m_size ); DPRINTF("(of which %ld were fixups)\n", needed_to_fix ); } @@ -1150,7 +1150,7 @@ int xc_linux_save(int xc_handle, int io_ if (((sent_this_iter > sent_last_iter) && RATE_IS_MAX()) || (iter >= max_iters) || (sent_this_iter+skip_this_iter < 50) || - (total_sent > max_pfn*max_factor)) { + (total_sent > p2m_size*max_factor)) { DPRINTF("Start last iteration\n"); last_iter = 1; @@ -1168,7 +1168,7 @@ int xc_linux_save(int xc_handle, int io_ if 
(xc_shadow_control(xc_handle, dom, XEN_DOMCTL_SHADOW_OP_CLEAN, to_send, - max_pfn, NULL, 0, &stats) != max_pfn) { + p2m_size, NULL, 0, &stats) != p2m_size) { ERROR("Error flushing shadow PT"); goto out; } @@ -1220,7 +1220,7 @@ int xc_linux_save(int xc_handle, int io_ unsigned int i,j; unsigned long pfntab[1024]; - for (i = 0, j = 0; i < max_pfn; i++) { + for (i = 0, j = 0; i < p2m_size; i++) { if (!is_mapped(live_p2m[i])) j++; } @@ -1230,13 +1230,13 @@ int xc_linux_save(int xc_handle, int io_ goto out; } - for (i = 0, j = 0; i < max_pfn; ) { + for (i = 0, j = 0; i < p2m_size; ) { if (!is_mapped(live_p2m[i])) pfntab[j++] = i; i++; - if (j == 1024 || i == max_pfn) { + if (j == 1024 || i == p2m_size) { if(!write_exact(io_fd, &pfntab, sizeof(unsigned long)*j)) { ERROR("Error when writing to state file (6b) (errno %d)", errno); @@ -1333,7 +1333,7 @@ int xc_linux_save(int xc_handle, int io_ munmap(live_p2m_frame_list, P2M_FLL_ENTRIES * PAGE_SIZE); if (live_p2m) - munmap(live_p2m, P2M_SIZE); + munmap(live_p2m, ROUNDUP(p2m_size * sizeof(xen_pfn_t), PAGE_SHIFT)); if (live_m2p) munmap(live_m2p, M2P_SIZE(max_mfn)); diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xc_resume.c --- a/tools/libxc/xc_resume.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xc_resume.c Tue Apr 03 13:04:51 2007 -0600 @@ -46,7 +46,7 @@ static int xc_domain_resume_any(int xc_h xc_dominfo_t info; int i, rc = -1; #if defined(__i386__) || defined(__x86_64__) - unsigned long mfn, max_pfn = 0; + unsigned long mfn, p2m_size = 0; vcpu_guest_context_t ctxt; start_info_t *start_info; shared_info_t *shinfo = NULL; @@ -74,7 +74,7 @@ static int xc_domain_resume_any(int xc_h goto out; } - max_pfn = shinfo->arch.max_pfn; + p2m_size = shinfo->arch.max_pfn; p2m_frame_list_list = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE, PROT_READ, diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xenctrl.h Tue Apr 03 13:04:51 2007 -0600 @@ 
-4,6 +4,9 @@ * A library for low-level access to the Xen control interfaces. * * Copyright (c) 2003-2004, K A Fraser. + * + * xc_gnttab functions: + * Copyright (c) 2007, D G Murray <Derek.Murray@xxxxxxxxxxxx> */ #ifndef XENCTRL_H @@ -740,6 +743,62 @@ evtchn_port_t xc_evtchn_pending(int xce_ */ int xc_evtchn_unmask(int xce_handle, evtchn_port_t port); +/************************** + * GRANT TABLE OPERATIONS * + **************************/ + +/* + * Return a handle to the grant table driver, or -1 on failure, in which case + * errno will be set appropriately. + */ +int xc_gnttab_open(void); + +/* + * Close a handle previously allocated with xc_gnttab_open(). + */ +int xc_gnttab_close(int xcg_handle); + +/* + * Memory maps a grant reference from one domain to a local address range. + * Mappings should be unmapped with xc_gnttab_munmap. Returns NULL on failure. + * + * @parm xcg_handle a handle on an open grant table interface + * @parm domid the domain to map memory from + * @parm ref the grant reference ID to map + * @parm prot same flag as in mmap() + */ +void *xc_gnttab_map_grant_ref(int xcg_handle, + uint32_t domid, + uint32_t ref, + int prot); + +/** + * Memory maps one or more grant references from one or more domains to a + * contiguous local address range. Mappings should be unmapped with + * xc_gnttab_munmap. Returns NULL on failure. + * + * @parm xcg_handle a handle on an open grant table interface + * @parm count the number of grant references to be mapped + * @parm domids an array of @count domain IDs by which the corresponding @refs + * were granted + * @parm refs an array of @count grant references to be mapped + * @parm prot same flag as in mmap() + */ +void *xc_gnttab_map_grant_refs(int xcg_handle, + uint32_t count, + uint32_t *domids, + uint32_t *refs, + int prot); + +/* + * Unmaps the @count pages starting at @start_address, which were mapped by a + * call to xc_gnttab_map_grant_ref or xc_gnttab_map_grant_refs. 
Returns zero + * on success, otherwise sets errno and returns non-zero. + */ +int xc_gnttab_munmap(int xcg_handle, + void *start_address, + uint32_t count); + int xc_hvm_set_pci_intx_level( int xc_handle, domid_t dom, uint8_t domain, uint8_t bus, uint8_t device, uint8_t intx, diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xenguest.h --- a/tools/libxc/xenguest.h Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xenguest.h Tue Apr 03 13:04:51 2007 -0600 @@ -43,15 +43,16 @@ int xc_hvm_save(int xc_handle, int io_fd * @parm xc_handle a handle to an open hypervisor interface * @parm fd the file descriptor to restore a domain from * @parm dom the id of the domain - * @parm nr_pfns the number of pages + * @parm p2m_size number of pages the guest has (i.e. number entries in P2M) + * @parm max_nr_pfns domains maximum real memory allocation, in pages * @parm store_evtchn the store event channel for this domain to use * @parm store_mfn returned with the mfn of the store page * @return 0 on success, -1 on failure */ int xc_linux_restore(int xc_handle, int io_fd, uint32_t dom, - unsigned long nr_pfns, unsigned int store_evtchn, - unsigned long *store_mfn, unsigned int console_evtchn, - unsigned long *console_mfn); + unsigned long p2m_size, unsigned long max_nr_pfns, + unsigned int store_evtchn, unsigned long *store_mfn, + unsigned int console_evtchn, unsigned long *console_mfn); /** * This function will restore a saved hvm domain running unmodified guest. 
diff -r fc9e2f7920c9 -r f378c424e0ce tools/libxc/xg_private.h --- a/tools/libxc/xg_private.h Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/libxc/xg_private.h Tue Apr 03 13:04:51 2007 -0600 @@ -148,17 +148,16 @@ typedef l4_pgentry_64_t l4_pgentry_t; #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1)) -/* Size in bytes of the P2M (rounded up to the nearest PAGE_SIZE bytes) */ -#define P2M_SIZE ROUNDUP((max_pfn * sizeof(xen_pfn_t)), PAGE_SHIFT) - /* Number of xen_pfn_t in a page */ #define fpp (PAGE_SIZE/sizeof(xen_pfn_t)) +/* XXX SMH: following 3 skanky macros rely on variable p2m_size being set */ + /* Number of entries in the pfn_to_mfn_frame_list_list */ -#define P2M_FLL_ENTRIES (((max_pfn)+(fpp*fpp)-1)/(fpp*fpp)) +#define P2M_FLL_ENTRIES (((p2m_size)+(fpp*fpp)-1)/(fpp*fpp)) /* Number of entries in the pfn_to_mfn_frame_list */ -#define P2M_FL_ENTRIES (((max_pfn)+fpp-1)/fpp) +#define P2M_FL_ENTRIES (((p2m_size)+fpp-1)/fpp) /* Size in bytes of the pfn_to_mfn_frame_list */ #define P2M_FL_SIZE ((P2M_FL_ENTRIES)*sizeof(unsigned long)) diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/lowlevel/scf/scf.c --- a/tools/python/xen/lowlevel/scf/scf.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/lowlevel/scf/scf.c Tue Apr 03 13:04:51 2007 -0600 @@ -26,7 +26,7 @@ #include <libscf.h> #include <stdio.h> -#define XEND_FMRI "svc:/system/xen/xend:default" +#define XEND_FMRI "svc:/system/xctl/xend:default" #define XEND_PG "config" static PyObject *scf_exc; diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xend/XendCheckpoint.py --- a/tools/python/xen/xend/XendCheckpoint.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xend/XendCheckpoint.py Tue Apr 03 13:04:51 2007 -0600 @@ -187,6 +187,7 @@ def restore(xd, fd, dominfo = None, paus assert console_port nr_pfns = (dominfo.getMemoryTarget() + 3) / 4 + max_nr_pfns = (dominfo.getMemoryMaximum() + 3) / 4 # if hvm, pass mem size to calculate the store_mfn image_cfg = dominfo.info.get('image', 
{}) @@ -203,17 +204,17 @@ def restore(xd, fd, dominfo = None, paus try: l = read_exact(fd, sizeof_unsigned_long, "not a valid guest state file: pfn count read") - max_pfn = unpack("L", l)[0] # native sizeof long - - if max_pfn > 16*1024*1024: # XXX + p2m_size = unpack("L", l)[0] # native sizeof long + + if p2m_size > 16*1024*1024: # XXX raise XendError( "not a valid guest state file: pfn count out of range") shadow = dominfo.info['shadow_memory'] log.debug("restore:shadow=0x%x, _static_max=0x%x, _static_min=0x%x, " - "nr_pfns=0x%x.", dominfo.info['shadow_memory'], + "p2m_size=0x%x.", dominfo.info['shadow_memory'], dominfo.info['memory_static_max'], - dominfo.info['memory_static_min'], nr_pfns) + dominfo.info['memory_static_min'], p2m_size) balloon.free(xc.pages_to_kib(nr_pfns) + shadow * 1024) @@ -221,7 +222,7 @@ def restore(xd, fd, dominfo = None, paus dominfo.info['shadow_memory'] = shadow_cur cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE), - fd, dominfo.getDomid(), max_pfn, + fd, dominfo.getDomid(), p2m_size, max_nr_pfns, store_port, console_port, int(is_hvm), pae, apic]) log.debug("[xc_restore]: %s", string.join(cmd)) diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xend/XendConfig.py --- a/tools/python/xen/xend/XendConfig.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xend/XendConfig.py Tue Apr 03 13:04:51 2007 -0600 @@ -118,7 +118,7 @@ LEGACY_CFG_TO_XENAPI_CFG = reverse_dict( # Platform configuration keys. 
XENAPI_PLATFORM_CFG = [ 'acpi', 'apic', 'boot', 'device_model', 'display', 'fda', 'fdb', 'keymap', 'isa', 'localtime', - 'nographic', 'pae', 'serial', 'sdl', + 'nographic', 'pae', 'rtc_timeoffset', 'serial', 'sdl', 'soundhw','stdvga', 'usb', 'usbdevice', 'vnc', 'vncconsole', 'vncdisplay', 'vnclisten', 'vncpasswd', 'vncunused', 'xauthority'] @@ -203,6 +203,7 @@ LEGACY_CFG_TYPES = { 'on_xend_stop': str, 'on_xend_start': str, 'online_vcpus': int, + 'rtc/timeoffset': str, } # Values that should be stored in xenstore's /vm/<uuid> that is used diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xend/XendDomainInfo.py Tue Apr 03 13:04:51 2007 -0600 @@ -859,7 +859,8 @@ class XendDomainInfo: # Check whether values in the configuration have # changed in Xenstore. - cfg_vm = ['name', 'on_poweroff', 'on_reboot', 'on_crash'] + cfg_vm = ['name', 'on_poweroff', 'on_reboot', 'on_crash', + 'rtc/timeoffset'] vm_details = self._readVMDetails([(k,XendConfig.LEGACY_CFG_TYPES[k]) for k in cfg_vm]) @@ -888,6 +889,11 @@ class XendDomainInfo: self.info.update_with_image_sxp(sxp.from_string(image_sxp)) changed = True + # Check if the rtc offset has changes + if vm_details.get("rtc/timeoffset", 0) != self.info["platform"].get("rtc_timeoffset", 0): + self.info["platform"]["rtc_timeoffset"] = vm_details.get("rtc/timeoffset", 0) + changed = True + if changed: # Update the domain section of the store, as this contains some # parameters derived from the VM configuration. 
diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xend/balloon.py --- a/tools/python/xen/xend/balloon.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xend/balloon.py Tue Apr 03 13:04:51 2007 -0600 @@ -25,9 +25,7 @@ import XendOptions import XendOptions from XendLogging import log from XendError import VmError - - -PROC_XEN_BALLOON = '/proc/xen/balloon' +import osdep RETRY_LIMIT = 20 RETRY_LIMIT_INCR = 5 @@ -51,19 +49,7 @@ def _get_proc_balloon(label): """Returns the value for the named label. Returns None if the label was not found or the value was non-numeric.""" - f = file(PROC_XEN_BALLOON, 'r') - try: - for line in f: - keyvalue = line.split(':') - if keyvalue[0] == label: - values = keyvalue[1].split() - if values[0].isdigit(): - return int(values[0]) - else: - return None - return None - finally: - f.close() + return osdep.lookup_balloon_stat(label) def get_dom0_current_alloc(): """Returns the current memory allocation (in KiB) of dom0.""" diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xend/image.py Tue Apr 03 13:04:51 2007 -0600 @@ -256,9 +256,12 @@ class HVMImageHandler(ImageHandler): self.xauthority = vmConfig['platform'].get('xauthority') self.vncconsole = vmConfig['platform'].get('vncconsole') + rtc_timeoffset = vmConfig['platform'].get('rtc_timeoffset') + self.vm.storeVm(("image/dmargs", " ".join(self.dmargs)), ("image/device-model", self.device_model), ("image/display", self.display)) + self.vm.storeVm(("rtc/timeoffset", rtc_timeoffset)) self.pid = None diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xend/osdep.py --- a/tools/python/xen/xend/osdep.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xend/osdep.py Tue Apr 03 13:04:51 2007 -0600 @@ -41,6 +41,55 @@ _vif_script = { "SunOS": "vif-vnic" } +def _linux_balloon_stat(label): + """Returns the value for the named label, or None if an error occurs.""" + + 
PROC_XEN_BALLOON = '/proc/xen/balloon' + f = file(PROC_XEN_BALLOON, 'r') + try: + for line in f: + keyvalue = line.split(':') + if keyvalue[0] == label: + values = keyvalue[1].split() + if values[0].isdigit(): + return int(values[0]) + else: + return None + return None + finally: + f.close() + +def _solaris_balloon_stat(label): + """Returns the value for the named label, or None if an error occurs.""" + + import fcntl + import array + DEV_XEN_BALLOON = '/dev/xen/balloon' + BLN_IOCTL_CURRENT = 0x4201 + BLN_IOCTL_TARGET = 0x4202 + BLN_IOCTL_LOW = 0x4203 + BLN_IOCTL_HIGH = 0x4204 + BLN_IOCTL_LIMIT = 0x4205 + label_to_ioctl = { 'Current allocation' : BLN_IOCTL_CURRENT, + 'Requested target' : BLN_IOCTL_TARGET, + 'Low-mem balloon' : BLN_IOCTL_LOW, + 'High-mem balloon' : BLN_IOCTL_HIGH, + 'Xen hard limit' : BLN_IOCTL_LIMIT } + + f = file(DEV_XEN_BALLOON, 'r') + try: + values = array.array('L', [0]) + if fcntl.ioctl(f.fileno(), label_to_ioctl[label], values, 1) == 0: + return values[0] + else: + return None + finally: + f.close() + +_balloon_stat = { + "SunOS": _solaris_balloon_stat +} + def _get(var, default=None): return var.get(os.uname()[0], default) @@ -49,3 +98,4 @@ pygrub_path = _get(_pygrub_path, "/usr/b pygrub_path = _get(_pygrub_path, "/usr/bin/pygrub") netback_type = _get(_netback_type, "netfront") vif_script = _get(_vif_script, "vif-bridge") +lookup_balloon_stat = _get(_balloon_stat, _linux_balloon_stat) diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xend/server/SrvServer.py --- a/tools/python/xen/xend/server/SrvServer.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xend/server/SrvServer.py Tue Apr 03 13:04:51 2007 -0600 @@ -212,8 +212,8 @@ def _loadConfig(servers, root, reload): if server_cfg[1] in [XendAPI.AUTH_PAM, XendAPI.AUTH_NONE]: auth_method = server_cfg[1] - if len(server_cfg) > 2: - hosts_allowed = server_cfg[2] or None + if len(server_cfg) > 2 and len(server_cfg[2]): + hosts_allowed = map(re.compile, server_cfg[2].split(' ')) if 
len(server_cfg) > 4: # SSL key and cert file diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xend/server/relocate.py --- a/tools/python/xen/xend/server/relocate.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xend/server/relocate.py Tue Apr 03 13:04:51 2007 -0600 @@ -106,8 +106,12 @@ class RelocationProtocol(protocol.Protoc def op_receive(self, name, _): if self.transport: self.send_reply(["ready", name]) - XendDomain.instance().domain_restore_fd( - self.transport.sock.fileno()) + try: + XendDomain.instance().domain_restore_fd( + self.transport.sock.fileno()) + except: + self.send_error() + self.close() else: log.error(name + ": no transport") raise XendError(name + ": no transport") diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xm/create.py Tue Apr 03 13:04:51 2007 -0600 @@ -185,6 +185,10 @@ gopts.var('cpus', val='CPUS', gopts.var('cpus', val='CPUS', fn=set_value, default=None, use="CPUS to run the domain on.") + +gopts.var('rtc_timeoffset', val='RTC_TIMEOFFSET', + fn=set_value, default="0", + use="Set RTC offset.") gopts.var('pae', val='PAE', fn=set_int, default=1, @@ -717,7 +721,7 @@ def configure_hvm(config_image, vals): args = [ 'device_model', 'pae', 'vcpus', 'boot', 'fda', 'fdb', 'localtime', 'serial', 'stdvga', 'isa', 'nographic', 'soundhw', 'vnc', 'vncdisplay', 'vncunused', 'vncconsole', 'vnclisten', - 'sdl', 'display', 'xauthority', + 'sdl', 'display', 'xauthority', 'rtc_timeoffset', 'acpi', 'apic', 'usb', 'usbdevice', 'keymap' ] for a in args: if a in vals.__dict__ and vals.__dict__[a] is not None: diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xm/main.py Tue Apr 03 13:04:51 2007 -0600 @@ -929,10 +929,10 @@ def xm_label_list(doms): if security.active_policy not in ['INACTIVE', 'NULL', 'DEFAULT']: if not d['seclabel']: 
d['seclabel'] = 'ERROR' - elif security.active_policy in ['DEFAULT']: - d['seclabel'] = 'DEFAULT' - else: - d['seclabel'] = 'INACTIVE' + elif security.active_policy in ['DEFAULT']: + d['seclabel'] = 'DEFAULT' + else: + d['seclabel'] = 'INACTIVE' output.append((format % d, d['seclabel'])) diff -r fc9e2f7920c9 -r f378c424e0ce tools/python/xen/xm/xenapi_create.py --- a/tools/python/xen/xm/xenapi_create.py Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/python/xen/xm/xenapi_create.py Tue Apr 03 13:04:51 2007 -0600 @@ -20,7 +20,6 @@ from xen.xm.main import server, get_default_SR from xml.dom.minidom import parse, getDOMImplementation -from xml.dom.ext import PrettyPrint from xml.parsers.xmlproc import xmlproc, xmlval, xmldtd from xen.xend import sxp from xen.xend.XendAPIConstants import XEN_API_ON_NORMAL_EXIT, \ diff -r fc9e2f7920c9 -r f378c424e0ce tools/xcutils/xc_restore.c --- a/tools/xcutils/xc_restore.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/xcutils/xc_restore.c Tue Apr 03 13:04:51 2007 -0600 @@ -18,15 +18,14 @@ int int main(int argc, char **argv) { - unsigned int xc_fd, io_fd, domid, max_pfn, store_evtchn, console_evtchn; + unsigned int xc_fd, io_fd, domid, store_evtchn, console_evtchn; unsigned int hvm, pae, apic; int ret; - unsigned long store_mfn, console_mfn; + unsigned long p2m_size, max_nr_pfns, store_mfn, console_mfn; - if (argc != 9) - errx(1, - "usage: %s iofd domid max_pfn store_evtchn console_evtchn hvm pae apic", - argv[0]); + if (argc != 10) + errx(1, "usage: %s iofd domid p2m_size max_nr_pfns store_evtchn " + "console_evtchn hvm pae apic", argv[0]); xc_fd = xc_interface_open(); if (xc_fd < 0) @@ -34,19 +33,21 @@ main(int argc, char **argv) io_fd = atoi(argv[1]); domid = atoi(argv[2]); - max_pfn = atoi(argv[3]); - store_evtchn = atoi(argv[4]); - console_evtchn = atoi(argv[5]); - hvm = atoi(argv[6]); - pae = atoi(argv[7]); - apic = atoi(argv[8]); + p2m_size = atoi(argv[3]); + max_nr_pfns = atoi(argv[4]); + store_evtchn = atoi(argv[5]); + console_evtchn = 
atoi(argv[6]); + hvm = atoi(argv[7]); + pae = atoi(argv[8]); + apic = atoi(argv[9]); if (hvm) { - ret = xc_hvm_restore(xc_fd, io_fd, domid, max_pfn, store_evtchn, + ret = xc_hvm_restore(xc_fd, io_fd, domid, max_nr_pfns, store_evtchn, &store_mfn, pae, apic); - } else - ret = xc_linux_restore(xc_fd, io_fd, domid, max_pfn, store_evtchn, - &store_mfn, console_evtchn, &console_mfn); + } else + ret = xc_linux_restore(xc_fd, io_fd, domid, p2m_size, + max_nr_pfns, store_evtchn, &store_mfn, + console_evtchn, &console_mfn); if (ret == 0) { printf("store-mfn %li\n", store_mfn); diff -r fc9e2f7920c9 -r f378c424e0ce tools/xenstat/xentop/xentop.c --- a/tools/xenstat/xentop/xentop.c Fri Mar 30 17:18:42 2007 -0600 +++ b/tools/xenstat/xentop/xentop.c Tue Apr 03 13:04:51 2007 -0600 @@ -984,6 +984,8 @@ static void top(void) if(!batch) do_bottom_line(); + + free(domains); } int main(int argc, char **argv) diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/hvm/hvm.c Tue Apr 03 13:04:51 2007 -0600 @@ -59,9 +59,6 @@ struct hvm_function_table hvm_funcs __re /* I/O permission bitmap is globally shared by all HVM guests. */ char __attribute__ ((__section__ (".bss.page_aligned"))) hvm_io_bitmap[3*PAGE_SIZE]; -/* MSR permission bitmap is globally shared by all HVM guests. */ -char __attribute__ ((__section__ (".bss.page_aligned"))) - hvm_msr_bitmap[PAGE_SIZE]; void hvm_enable(struct hvm_function_table *fns) { @@ -74,9 +71,6 @@ void hvm_enable(struct hvm_function_tabl */ memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap)); clear_bit(0x80, hvm_io_bitmap); - - /* All MSR accesses are intercepted by default. 
*/ - memset(hvm_msr_bitmap, ~0, sizeof(hvm_msr_bitmap)); hvm_funcs = *fns; hvm_enabled = 1; @@ -378,6 +372,9 @@ void hvm_send_assist_req(struct vcpu *v) void hvm_send_assist_req(struct vcpu *v) { ioreq_t *p; + + if ( unlikely(!vcpu_start_shutdown_deferral(v)) ) + return; /* implicitly bins the i/o operation */ p = &get_vio(v->domain, v->vcpu_id)->vp_ioreq; if ( unlikely(p->state != STATE_IOREQ_NONE) ) diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/hvm/intercept.c --- a/xen/arch/x86/hvm/intercept.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/hvm/intercept.c Tue Apr 03 13:04:51 2007 -0600 @@ -155,28 +155,13 @@ static inline void hvm_mmio_access(struc } } -int hvm_buffered_io_intercept(ioreq_t *p) +int hvm_buffered_io_send(ioreq_t *p) { struct vcpu *v = current; spinlock_t *buffered_io_lock; buffered_iopage_t *buffered_iopage = (buffered_iopage_t *)(v->domain->arch.hvm_domain.buffered_io_va); unsigned long tmp_write_pointer = 0; - int i; - - /* ignore READ ioreq_t! */ - if ( p->dir == IOREQ_READ ) - return 0; - - for ( i = 0; i < HVM_BUFFERED_IO_RANGE_NR; i++ ) { - if ( p->addr >= hvm_buffered_io_ranges[i]->start_addr && - p->addr + p->size - 1 < hvm_buffered_io_ranges[i]->start_addr + - hvm_buffered_io_ranges[i]->length ) - break; - } - - if ( i == HVM_BUFFERED_IO_RANGE_NR ) - return 0; buffered_io_lock = &v->domain->arch.hvm_domain.buffered_io_lock; spin_lock(buffered_io_lock); @@ -205,6 +190,27 @@ int hvm_buffered_io_intercept(ioreq_t *p return 1; } +int hvm_buffered_io_intercept(ioreq_t *p) +{ + int i; + + /* ignore READ ioreq_t! 
*/ + if ( p->dir == IOREQ_READ ) + return 0; + + for ( i = 0; i < HVM_BUFFERED_IO_RANGE_NR; i++ ) { + if ( p->addr >= hvm_buffered_io_ranges[i]->start_addr && + p->addr + p->size - 1 < hvm_buffered_io_ranges[i]->start_addr + + hvm_buffered_io_ranges[i]->length ) + break; + } + + if ( i == HVM_BUFFERED_IO_RANGE_NR ) + return 0; + + return hvm_buffered_io_send(p); +} + int hvm_mmio_intercept(ioreq_t *p) { struct vcpu *v = current; diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/hvm/io.c --- a/xen/arch/x86/hvm/io.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/hvm/io.c Tue Apr 03 13:04:51 2007 -0600 @@ -771,10 +771,11 @@ void hvm_io_assist(struct vcpu *v) struct cpu_user_regs *regs; struct hvm_io_op *io_opp; unsigned long gmfn; + struct domain *d = v->domain; io_opp = &v->arch.hvm_vcpu.io_op; regs = &io_opp->io_context; - vio = get_vio(v->domain, v->vcpu_id); + vio = get_vio(d, v->vcpu_id); p = &vio->vp_ioreq; if ( p->state != STATE_IORESP_READY ) @@ -797,11 +798,13 @@ void hvm_io_assist(struct vcpu *v) memcpy(guest_cpu_user_regs(), regs, HVM_CONTEXT_STACK_BYTES); /* Has memory been dirtied? 
*/ - if ( p->dir == IOREQ_READ && p->data_is_ptr ) + if ( (p->dir == IOREQ_READ) && p->data_is_ptr ) { gmfn = get_mfn_from_gpfn(paging_gva_to_gfn(v, p->data)); - mark_dirty(v->domain, gmfn); - } + mark_dirty(d, gmfn); + } + + vcpu_end_shutdown_deferral(v); } /* diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/hvm/platform.c --- a/xen/arch/x86/hvm/platform.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/hvm/platform.c Tue Apr 03 13:04:51 2007 -0600 @@ -921,6 +921,26 @@ static void send_mmio_req(unsigned char hvm_send_assist_req(v); } +void send_timeoffset_req(unsigned long timeoff) +{ + ioreq_t p[1]; + + if ( timeoff == 0 ) + return; + + memset(p, 0, sizeof(*p)); + + p->type = IOREQ_TYPE_TIMEOFFSET; + p->size = 4; + p->dir = IOREQ_WRITE; + p->data = timeoff; + + p->state = STATE_IOREQ_READY; + + if ( !hvm_buffered_io_send(p) ) + printk("Unsuccessful timeoffset update\n"); +} + static void mmio_operands(int type, unsigned long gpa, struct hvm_io_op *mmio_op, unsigned char op_size) diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/hvm/rtc.c --- a/xen/arch/x86/hvm/rtc.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/hvm/rtc.c Tue Apr 03 13:04:51 2007 -0600 @@ -157,6 +157,10 @@ static void rtc_set_time(RTCState *s) static void rtc_set_time(RTCState *s) { struct tm *tm = &s->current_tm; + unsigned long before, after; /* XXX s_time_t */ + + before = mktime(tm->tm_year, tm->tm_mon, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); tm->tm_sec = from_bcd(s, s->hw.cmos_data[RTC_SECONDS]); tm->tm_min = from_bcd(s, s->hw.cmos_data[RTC_MINUTES]); @@ -168,6 +172,10 @@ static void rtc_set_time(RTCState *s) tm->tm_mday = from_bcd(s, s->hw.cmos_data[RTC_DAY_OF_MONTH]); tm->tm_mon = from_bcd(s, s->hw.cmos_data[RTC_MONTH]) - 1; tm->tm_year = from_bcd(s, s->hw.cmos_data[RTC_YEAR]) + 100; + + after = mktime(tm->tm_year, tm->tm_mon, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); + send_timeoffset_req(after - before); } static void rtc_copy_date(RTCState *s) diff -r 
fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/hvm/svm/vmcb.c --- a/xen/arch/x86/hvm/svm/vmcb.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/hvm/svm/vmcb.c Tue Apr 03 13:04:51 2007 -0600 @@ -79,6 +79,30 @@ struct host_save_area *alloc_host_save_a return hsa; } +static void disable_intercept_for_msr(char *msr_bitmap, u32 msr) +{ + /* + * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address). + */ + if ( msr <= 0x1fff ) + { + __clear_bit(msr*2, msr_bitmap + 0x000); + __clear_bit(msr*2+1, msr_bitmap + 0x000); + } + else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) ) + { + msr &= 0x1fff; + __clear_bit(msr*2, msr_bitmap + 0x800); + __clear_bit(msr*2+1, msr_bitmap + 0x800); + } + else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) ) + { + msr &= 0x1fff; + __clear_bit(msr*2, msr_bitmap + 0x1000); + __clear_bit(msr*2+1, msr_bitmap + 0x1000); + } +} + static int construct_vmcb(struct vcpu *v) { struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; @@ -114,6 +138,10 @@ static int construct_vmcb(struct vcpu *v if ( arch_svm->msrpm == NULL ) return -ENOMEM; memset(arch_svm->msrpm, 0xff, MSRPM_SIZE); + + disable_intercept_for_msr((char *)arch_svm->msrpm, MSR_FS_BASE); + disable_intercept_for_msr((char *)arch_svm->msrpm, MSR_GS_BASE); + vmcb->msrpm_base_pa = (u64)virt_to_maddr(arch_svm->msrpm); vmcb->iopm_base_pa = (u64)virt_to_maddr(hvm_io_bitmap); diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/hvm/vmx/vmcs.c --- a/xen/arch/x86/hvm/vmx/vmcs.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/hvm/vmx/vmcs.c Tue Apr 03 13:04:51 2007 -0600 @@ -289,7 +289,7 @@ static void construct_vmcs(struct vcpu * v->arch.hvm_vcpu.u.vmx.exec_control = vmx_cpu_based_exec_control; if ( cpu_has_vmx_msr_bitmap ) - __vmwrite(MSR_BITMAP, virt_to_maddr(hvm_msr_bitmap)); + __vmwrite(MSR_BITMAP, virt_to_maddr(vmx_msr_bitmap)); /* I/O access bitmap. 
*/ __vmwrite(IO_BITMAP_A, virt_to_maddr(hvm_io_bitmap)); diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/hvm/vmx/vmx.c --- a/xen/arch/x86/hvm/vmx/vmx.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/hvm/vmx/vmx.c Tue Apr 03 13:04:51 2007 -0600 @@ -51,6 +51,8 @@ #include <public/hvm/save.h> #include <asm/hvm/trace.h> +char *vmx_msr_bitmap; + static void vmx_ctxt_switch_from(struct vcpu *v); static void vmx_ctxt_switch_to(struct vcpu *v); @@ -1005,14 +1007,14 @@ static void disable_intercept_for_msr(u3 */ if ( msr <= 0x1fff ) { - __clear_bit(msr, hvm_msr_bitmap + 0x000); /* read-low */ - __clear_bit(msr, hvm_msr_bitmap + 0x800); /* write-low */ + __clear_bit(msr, vmx_msr_bitmap + 0x000); /* read-low */ + __clear_bit(msr, vmx_msr_bitmap + 0x800); /* write-low */ } else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) ) { msr &= 0x1fff; - __clear_bit(msr, hvm_msr_bitmap + 0x400); /* read-high */ - __clear_bit(msr, hvm_msr_bitmap + 0xc00); /* write-high */ + __clear_bit(msr, vmx_msr_bitmap + 0x400); /* read-high */ + __clear_bit(msr, vmx_msr_bitmap + 0xc00); /* write-high */ } } @@ -1105,6 +1107,9 @@ int start_vmx(void) if ( cpu_has_vmx_msr_bitmap ) { printk("VMX: MSR intercept bitmap enabled\n"); + vmx_msr_bitmap = alloc_xenheap_page(); + BUG_ON(vmx_msr_bitmap == NULL); + memset(vmx_msr_bitmap, ~0, PAGE_SIZE); disable_intercept_for_msr(MSR_FS_BASE); disable_intercept_for_msr(MSR_GS_BASE); } diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/mm.c Tue Apr 03 13:04:51 2007 -0600 @@ -806,7 +806,8 @@ void put_page_from_l1e(l1_pgentry_t l1e, * (Note that the undestroyable active grants are not a security hole in * Xen. All active grants can safely be cleaned up when the domain dies.) 
*/ - if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) && !d->is_shutdown && !d->is_dying ) + if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) && + !d->is_shutting_down && !d->is_dying ) { MEM_LOG("Attempt to implicitly unmap a granted PTE %" PRIpte, l1e_get_intpte(l1e)); diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/mm/hap/hap.c --- a/xen/arch/x86/mm/hap/hap.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/mm/hap/hap.c Tue Apr 03 13:04:51 2007 -0600 @@ -52,7 +52,7 @@ /************************************************/ /* HAP SUPPORT FUNCTIONS */ /************************************************/ -mfn_t hap_alloc(struct domain *d, unsigned long backpointer) +mfn_t hap_alloc(struct domain *d) { struct page_info *sp = NULL; void *p; @@ -82,43 +82,43 @@ void hap_free(struct domain *d, mfn_t sm list_add_tail(&sp->list, &d->arch.paging.hap.freelists); } -static int hap_alloc_p2m_pages(struct domain *d) -{ - struct page_info *pg; - - ASSERT(hap_locked_by_me(d)); - - pg = mfn_to_page(hap_alloc(d, 0)); - d->arch.paging.hap.p2m_pages += 1; - d->arch.paging.hap.total_pages -= 1; - - page_set_owner(pg, d); - pg->count_info = 1; - list_add_tail(&pg->list, &d->arch.paging.hap.p2m_freelist); - - return 1; -} - struct page_info * hap_alloc_p2m_page(struct domain *d) { - struct list_head *entry; struct page_info *pg; mfn_t mfn; void *p; hap_lock(d); - - if ( list_empty(&d->arch.paging.hap.p2m_freelist) && - !hap_alloc_p2m_pages(d) ) { - hap_unlock(d); - return NULL; - } - entry = d->arch.paging.hap.p2m_freelist.next; - list_del(entry); - + +#if CONFIG_PAGING_LEVELS == 3 + /* Under PAE mode, top-level P2M table should be allocated below 4GB space + * because the size of h_cr3 is only 32-bit. We use alloc_domheap_pages to + * force this requirement. This page will be de-allocated in + * hap_free_p2m_page(), like other P2M pages. 
+ */ + if ( d->arch.paging.hap.p2m_pages == 0 ) + { + pg = alloc_domheap_pages(NULL, 0, MEMF_bits(32)); + d->arch.paging.hap.p2m_pages += 1; + } + else +#endif + { + pg = mfn_to_page(hap_alloc(d)); + + d->arch.paging.hap.p2m_pages += 1; + d->arch.paging.hap.total_pages -= 1; + } + + if ( pg == NULL ) { + hap_unlock(d); + return NULL; + } + hap_unlock(d); - pg = list_entry(entry, struct page_info, list); + page_set_owner(pg, d); + pg->count_info = 1; mfn = page_to_mfn(pg); p = hap_map_domain_page(mfn); clear_page(p); @@ -141,6 +141,7 @@ void hap_free_p2m_page(struct domain *d, page_set_owner(pg, NULL); free_domheap_pages(pg, 0); d->arch.paging.hap.p2m_pages--; + ASSERT( d->arch.paging.hap.p2m_pages >= 0 ); } /* Return the size of the pool, rounded up to the nearest MB */ @@ -320,7 +321,7 @@ mfn_t hap_make_monitor_table(struct vcpu #if CONFIG_PAGING_LEVELS == 4 { mfn_t m4mfn; - m4mfn = hap_alloc(d, 0); + m4mfn = hap_alloc(d); hap_install_xen_entries_in_l4(v, m4mfn, m4mfn); return m4mfn; } @@ -331,12 +332,12 @@ mfn_t hap_make_monitor_table(struct vcpu l2_pgentry_t *l2e; int i; - m3mfn = hap_alloc(d, 0); + m3mfn = hap_alloc(d); /* Install a monitor l2 table in slot 3 of the l3 table. 
* This is used for all Xen entries, including linear maps */ - m2mfn = hap_alloc(d, 0); + m2mfn = hap_alloc(d); l3e = hap_map_domain_page(m3mfn); l3e[3] = l3e_from_pfn(mfn_x(m2mfn), _PAGE_PRESENT); hap_install_xen_entries_in_l2h(v, m2mfn); @@ -357,7 +358,7 @@ mfn_t hap_make_monitor_table(struct vcpu { mfn_t m2mfn; - m2mfn = hap_alloc(d, 0); + m2mfn = hap_alloc(d); hap_install_xen_entries_in_l2(v, m2mfn, m2mfn); return m2mfn; @@ -390,7 +391,6 @@ void hap_domain_init(struct domain *d) { hap_lock_init(d); INIT_LIST_HEAD(&d->arch.paging.hap.freelists); - INIT_LIST_HEAD(&d->arch.paging.hap.p2m_freelist); } /* return 0 for success, -errno for failure */ diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/mm/shadow/multi.c --- a/xen/arch/x86/mm/shadow/multi.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/mm/shadow/multi.c Tue Apr 03 13:04:51 2007 -0600 @@ -2823,8 +2823,8 @@ static int sh_page_fault(struct vcpu *v, * are OK, this can only have been caused by a failed * shadow_set_l*e(), which will have crashed the guest. * Get out of the fault handler immediately. */ - ASSERT(d->is_shutdown); - unmap_walk(v, &gw); + ASSERT(d->is_shutting_down); + unmap_walk(v, &gw); shadow_unlock(d); return 0; } diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/setup.c Tue Apr 03 13:04:51 2007 -0600 @@ -591,8 +591,6 @@ void __init __start_xen(multiboot_info_t numa_initmem_init(0, max_page); - end_boot_allocator(); - /* Initialise the Xen heap, skipping RAM holes. 
*/ nr_pages = 0; for ( i = 0; i < e820.nr_map; i++ ) @@ -617,6 +615,8 @@ void __init __start_xen(multiboot_info_t printk("Xen heap: %luMB (%lukB)\n", nr_pages >> (20 - PAGE_SHIFT), nr_pages << (PAGE_SHIFT - 10)); + + end_boot_allocator(); early_boot = 0; diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/time.c --- a/xen/arch/x86/time.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/time.c Tue Apr 03 13:04:51 2007 -0600 @@ -573,7 +573,7 @@ static void init_platform_timer(void) * machines were long is 32-bit! (However, as time_t is signed, we * will already get problems at other places on 2038-01-19 03:14:08) */ -static inline unsigned long +unsigned long mktime (unsigned int year, unsigned int mon, unsigned int day, unsigned int hour, unsigned int min, unsigned int sec) diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/traps.c Tue Apr 03 13:04:51 2007 -0600 @@ -285,23 +285,32 @@ void show_xen_trace() show_trace(®s); } -void show_stack_overflow(unsigned long esp) +void show_stack_overflow(unsigned int cpu, unsigned long esp) { #ifdef MEMORY_GUARD - unsigned long esp_top; + unsigned long esp_top, esp_bottom; unsigned long *stack, addr; - esp_top = (esp | (STACK_SIZE - 1)) - DEBUG_STACK_SIZE; + esp_bottom = (esp | (STACK_SIZE - 1)) + 1; + esp_top = esp_bottom - DEBUG_STACK_SIZE; + + printk("Valid stack range: %p-%p, sp=%p, tss.esp0=%p\n", + (void *)esp_top, (void *)esp_bottom, (void *)esp, + (void *)init_tss[cpu].esp0); /* Trigger overflow trace if %esp is within 512 bytes of the guard page. */ if ( ((unsigned long)(esp - esp_top) > 512) && ((unsigned long)(esp_top - esp) > 512) ) + { + printk("No stack overflow detected. 
Skipping stack trace.\n"); return; + } if ( esp < esp_top ) esp = esp_top; - printk("Xen stack overflow:\n "); + printk("Xen stack overflow (dumping trace %p-%p):\n ", + (void *)esp, (void *)esp_bottom); stack = (unsigned long *)esp; while ( ((long)stack & (STACK_SIZE-BYTES_PER_LONG)) != 0 ) diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/x86_32/traps.c --- a/xen/arch/x86/x86_32/traps.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/x86_32/traps.c Tue Apr 03 13:04:51 2007 -0600 @@ -139,7 +139,7 @@ void show_page_walk(unsigned long addr) unmap_domain_page(l1t); } -#define DOUBLEFAULT_STACK_SIZE 1024 +#define DOUBLEFAULT_STACK_SIZE 2048 static struct tss_struct doublefault_tss; static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE]; @@ -167,7 +167,7 @@ asmlinkage void do_double_fault(void) tss->esi, tss->edi, tss->ebp, tss->esp); printk("ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", tss->ds, tss->es, tss->fs, tss->gs, tss->ss); - show_stack_overflow(tss->esp); + show_stack_overflow(cpu, tss->esp); panic("DOUBLE FAULT -- system shutdown\n"); } @@ -268,8 +268,7 @@ void __init percpu_traps_init(void) tss->ds = __HYPERVISOR_DS; tss->es = __HYPERVISOR_DS; tss->ss = __HYPERVISOR_DS; - tss->esp = (unsigned long) - &doublefault_stack[DOUBLEFAULT_STACK_SIZE]; + tss->esp = (unsigned long)&doublefault_stack[DOUBLEFAULT_STACK_SIZE]; tss->__cr3 = __pa(idle_pg_table); tss->cs = __HYPERVISOR_CS; tss->eip = (unsigned long)do_double_fault; diff -r fc9e2f7920c9 -r f378c424e0ce xen/arch/x86/x86_64/traps.c --- a/xen/arch/x86/x86_64/traps.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/arch/x86/x86_64/traps.c Tue Apr 03 13:04:51 2007 -0600 @@ -171,7 +171,7 @@ asmlinkage void do_double_fault(struct c printk("r12: %016lx r13: %016lx r14: %016lx\n", regs->r12, regs->r13, regs->r14); printk("r15: %016lx\n", regs->r15); - show_stack_overflow(regs->rsp); + show_stack_overflow(cpu, regs->rsp); panic("DOUBLE FAULT -- system shutdown\n"); } @@ -270,18 +270,18 @@ void __init 
percpu_traps_init(void) stack_bottom = (char *)get_stack_bottom(); stack = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1)); - /* Double-fault handler has its own per-CPU 1kB stack. */ - init_tss[cpu].ist[0] = (unsigned long)&stack[1024]; + /* Double-fault handler has its own per-CPU 2kB stack. */ + init_tss[cpu].ist[0] = (unsigned long)&stack[2048]; /* NMI handler has its own per-CPU 1kB stack. */ - init_tss[cpu].ist[1] = (unsigned long)&stack[2048]; + init_tss[cpu].ist[1] = (unsigned long)&stack[3072]; /* * Trampoline for SYSCALL entry from long mode. */ /* Skip the NMI and DF stacks. */ - stack = &stack[2048]; + stack = &stack[3072]; wrmsr(MSR_LSTAR, (unsigned long)stack, ((unsigned long)stack>>32)); /* movq %rsp, saversp(%rip) */ diff -r fc9e2f7920c9 -r f378c424e0ce xen/common/domain.c --- a/xen/common/domain.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/common/domain.c Tue Apr 03 13:04:51 2007 -0600 @@ -59,6 +59,7 @@ struct domain *alloc_domain(domid_t domi atomic_set(&d->refcnt, 1); spin_lock_init(&d->big_lock); spin_lock_init(&d->page_alloc_lock); + spin_lock_init(&d->shutdown_lock); INIT_LIST_HEAD(&d->page_list); INIT_LIST_HEAD(&d->xenpage_list); @@ -83,6 +84,45 @@ void free_domain(struct domain *d) xfree(d); } +static void __domain_finalise_shutdown(struct domain *d) +{ + struct vcpu *v; + + BUG_ON(!spin_is_locked(&d->shutdown_lock)); + + if ( d->is_shut_down ) + return; + + for_each_vcpu ( d, v ) + if ( !v->paused_for_shutdown ) + return; + + d->is_shut_down = 1; + + for_each_vcpu ( d, v ) + vcpu_sleep_nosync(v); + + send_guest_global_virq(dom0, VIRQ_DOM_EXC); +} + +static void vcpu_check_shutdown(struct vcpu *v) +{ + struct domain *d = v->domain; + + spin_lock(&d->shutdown_lock); + + if ( d->is_shutting_down ) + { + if ( !v->paused_for_shutdown ) + atomic_inc(&v->pause_count); + v->paused_for_shutdown = 1; + v->defer_shutdown = 0; + __domain_finalise_shutdown(d); + } + + spin_unlock(&d->shutdown_lock); +} + struct vcpu *alloc_vcpu( struct domain 
*d, unsigned int vcpu_id, unsigned int cpu_id) { @@ -121,6 +161,9 @@ struct vcpu *alloc_vcpu( d->vcpu[vcpu_id] = v; if ( vcpu_id != 0 ) d->vcpu[v->vcpu_id-1]->next_in_list = v; + + /* Must be called after making new vcpu visible to for_each_vcpu(). */ + vcpu_check_shutdown(v); return v; } @@ -286,7 +329,7 @@ void domain_kill(struct domain *d) void __domain_crash(struct domain *d) { - if ( d->is_shutdown ) + if ( d->is_shutting_down ) { /* Print nothing: the domain is already shutting down. */ } @@ -335,16 +378,73 @@ void domain_shutdown(struct domain *d, u if ( d->domain_id == 0 ) dom0_shutdown(reason); - atomic_inc(&d->pause_count); - if ( !xchg(&d->is_shutdown, 1) ) - d->shutdown_code = reason; - else - domain_unpause(d); + spin_lock(&d->shutdown_lock); + + if ( d->is_shutting_down ) + { + spin_unlock(&d->shutdown_lock); + return; + } + + d->is_shutting_down = 1; + d->shutdown_code = reason; + + smp_mb(); /* set shutdown status /then/ check for per-cpu deferrals */ for_each_vcpu ( d, v ) - vcpu_sleep_nosync(v); - - send_guest_global_virq(dom0, VIRQ_DOM_EXC); + { + if ( v->defer_shutdown ) + continue; + atomic_inc(&v->pause_count); + v->paused_for_shutdown = 1; + } + + __domain_finalise_shutdown(d); + + spin_unlock(&d->shutdown_lock); +} + +void domain_resume(struct domain *d) +{ + struct vcpu *v; + + /* + * Some code paths assume that shutdown status does not get reset under + * their feet (e.g., some assertions make this assumption). 
+ */ + domain_pause(d); + + spin_lock(&d->shutdown_lock); + + d->is_shutting_down = d->is_shut_down = 0; + + for_each_vcpu ( d, v ) + { + if ( v->paused_for_shutdown ) + vcpu_unpause(v); + v->paused_for_shutdown = 0; + } + + spin_unlock(&d->shutdown_lock); + + domain_unpause(d); +} + +int vcpu_start_shutdown_deferral(struct vcpu *v) +{ + v->defer_shutdown = 1; + smp_mb(); /* set deferral status /then/ check for shutdown */ + if ( unlikely(v->domain->is_shutting_down) ) + vcpu_check_shutdown(v); + return v->defer_shutdown; +} + +void vcpu_end_shutdown_deferral(struct vcpu *v) +{ + v->defer_shutdown = 0; + smp_mb(); /* clear deferral status /then/ check for shutdown */ + if ( unlikely(v->domain->is_shutting_down) ) + vcpu_check_shutdown(v); } void domain_pause_for_debugger(void) @@ -425,7 +525,6 @@ void vcpu_pause_nosync(struct vcpu *v) void vcpu_unpause(struct vcpu *v) { - ASSERT(v != current); if ( atomic_dec_and_test(&v->pause_count) ) vcpu_wake(v); } @@ -445,8 +544,6 @@ void domain_unpause(struct domain *d) void domain_unpause(struct domain *d) { struct vcpu *v; - - ASSERT(d != current->domain); if ( atomic_dec_and_test(&d->pause_count) ) for_each_vcpu( d, v ) diff -r fc9e2f7920c9 -r f378c424e0ce xen/common/domctl.c --- a/xen/common/domctl.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/common/domctl.c Tue Apr 03 13:04:51 2007 -0600 @@ -115,7 +115,7 @@ void getdomaininfo(struct domain *d, str info->flags = flags | (d->is_dying ? XEN_DOMINF_dying : 0) | - (d->is_shutdown ? XEN_DOMINF_shutdown : 0) | + (d->is_shut_down ? XEN_DOMINF_shutdown : 0) | (d->is_paused_by_controller ? 
XEN_DOMINF_paused : 0) | d->shutdown_code << XEN_DOMINF_shutdownshift; @@ -287,8 +287,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc if ( d == NULL ) break; - if ( xchg(&d->is_shutdown, 0) ) - domain_unpause(d); + domain_resume(d); rcu_unlock_domain(d); ret = 0; } diff -r fc9e2f7920c9 -r f378c424e0ce xen/common/page_alloc.c --- a/xen/common/page_alloc.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/common/page_alloc.c Tue Apr 03 13:04:51 2007 -0600 @@ -512,6 +512,14 @@ void init_heap_pages( ASSERT(zone < NR_ZONES); + if ( unlikely(avail[0] == NULL) ) + { + /* Start-of-day memory node 0 initialisation. */ + init_heap_block(&_heap0); + _heap[0] = &_heap0; + avail[0] = avail0; + } + if ( likely(page_to_mfn(pg) != 0) ) nid_prev = phys_to_nid(page_to_maddr(pg-1)); else @@ -569,10 +577,6 @@ void end_boot_allocator(void) { unsigned long i; int curr_free, next_free; - - init_heap_block(&_heap0); - _heap[0] = &_heap0; - avail[0] = avail0; /* Pages that are free now go to the domain sub-allocator. */ if ( (curr_free = next_free = !allocated_in_map(first_valid_mfn)) ) diff -r fc9e2f7920c9 -r f378c424e0ce xen/common/symbols.c --- a/xen/common/symbols.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/common/symbols.c Tue Apr 03 13:04:51 2007 -0600 @@ -16,6 +16,7 @@ #include <xen/init.h> #include <xen/lib.h> #include <xen/string.h> +#include <xen/spinlock.h> extern unsigned long symbols_addresses[]; extern unsigned long symbols_num_syms; @@ -140,12 +141,15 @@ void __print_symbol(const char *fmt, uns void __print_symbol(const char *fmt, unsigned long address) { const char *name; - unsigned long offset, size; - char namebuf[KSYM_NAME_LEN+1]; + unsigned long offset, size, flags; + static DEFINE_SPINLOCK(lock); + static char namebuf[KSYM_NAME_LEN+1]; #define BUFFER_SIZE sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN + \ 2*(BITS_PER_LONG*3/10) + 1 - char buffer[BUFFER_SIZE]; + static char buffer[BUFFER_SIZE]; + + spin_lock_irqsave(&lock, flags); name = symbols_lookup(address, &size, &offset, 
namebuf); @@ -155,4 +159,6 @@ void __print_symbol(const char *fmt, uns snprintf(buffer, BUFFER_SIZE, "%s+%#lx/%#lx", name, offset, size); printk(fmt, buffer); + + spin_unlock_irqrestore(&lock, flags); } diff -r fc9e2f7920c9 -r f378c424e0ce xen/drivers/char/console.c --- a/xen/drivers/char/console.c Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/drivers/char/console.c Tue Apr 03 13:04:51 2007 -0600 @@ -858,19 +858,20 @@ void panic(const char *fmt, ...) void panic(const char *fmt, ...) { va_list args; - char buf[128]; unsigned long flags; static DEFINE_SPINLOCK(lock); + static char buf[128]; debugtrace_dump(); + + /* Protects buf[] and ensures multi-line message prints atomically. */ + spin_lock_irqsave(&lock, flags); va_start(args, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - /* Spit out multiline message in one go. */ console_start_sync(); - spin_lock_irqsave(&lock, flags); printk("\n****************************************\n"); printk("Panic on CPU %d:\n", smp_processor_id()); printk(buf); @@ -879,6 +880,7 @@ void panic(const char *fmt, ...) 
printk("Manual reset required ('noreboot' specified)\n"); else printk("Reboot in five seconds...\n"); + spin_unlock_irqrestore(&lock, flags); debugger_trap_immediate(); diff -r fc9e2f7920c9 -r f378c424e0ce xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/include/asm-x86/domain.h Tue Apr 03 13:04:51 2007 -0600 @@ -115,7 +115,6 @@ struct hap_domain { const char *locker_function; struct list_head freelists; - struct list_head p2m_freelist; unsigned int total_pages; /* number of pages allocated */ unsigned int free_pages; /* number of pages on freelists */ unsigned int p2m_pages; /* number of pages allocates to p2m */ diff -r fc9e2f7920c9 -r f378c424e0ce xen/include/asm-x86/hvm/io.h --- a/xen/include/asm-x86/hvm/io.h Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/include/asm-x86/hvm/io.h Tue Apr 03 13:04:51 2007 -0600 @@ -127,6 +127,7 @@ static inline int hvm_portio_intercept(i } extern int hvm_mmio_intercept(ioreq_t *p); +extern int hvm_buffered_io_send(ioreq_t *p); extern int hvm_buffered_io_intercept(ioreq_t *p); static inline int register_portio_handler( @@ -145,6 +146,7 @@ static inline int irq_masked(unsigned lo extern void send_pio_req(unsigned long port, unsigned long count, int size, paddr_t value, int dir, int df, int value_is_ptr); +void send_timeoffset_req(unsigned long timeoff); extern void handle_mmio(unsigned long gpa); extern void hvm_interrupt_post(struct vcpu *v, int vector, int type); extern void hvm_io_assist(struct vcpu *v); diff -r fc9e2f7920c9 -r f378c424e0ce xen/include/asm-x86/hvm/support.h --- a/xen/include/asm-x86/hvm/support.h Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/include/asm-x86/hvm/support.h Tue Apr 03 13:04:51 2007 -0600 @@ -215,7 +215,6 @@ int hvm_load(struct domain *d, hvm_domai /* End of save/restore */ extern char hvm_io_bitmap[]; -extern char hvm_msr_bitmap[]; extern int hvm_enabled; void hvm_enable(struct hvm_function_table *); diff -r fc9e2f7920c9 -r f378c424e0ce 
xen/include/asm-x86/hvm/vmx/vmcs.h --- a/xen/include/asm-x86/hvm/vmx/vmcs.h Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h Tue Apr 03 13:04:51 2007 -0600 @@ -121,6 +121,7 @@ extern u32 vmx_vmentry_control; #define cpu_has_vmx_msr_bitmap \ (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP) +extern char *vmx_msr_bitmap; /* VMCS Encordings */ enum vmcs_field { diff -r fc9e2f7920c9 -r f378c424e0ce xen/include/asm-x86/processor.h --- a/xen/include/asm-x86/processor.h Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/include/asm-x86/processor.h Tue Apr 03 13:04:51 2007 -0600 @@ -413,9 +413,9 @@ struct tss_struct { struct tss_struct { unsigned short back_link,__blh; #ifdef __x86_64__ - u64 rsp0; - u64 rsp1; - u64 rsp2; + union { u64 rsp0, esp0; }; + union { u64 rsp1, esp1; }; + union { u64 rsp2, esp2; }; u64 reserved1; u64 ist[7]; u64 reserved2; @@ -553,7 +553,7 @@ extern always_inline void prefetchw(cons void show_stack(struct cpu_user_regs *regs); void show_xen_trace(void); -void show_stack_overflow(unsigned long esp); +void show_stack_overflow(unsigned int cpu, unsigned long esp); void show_registers(struct cpu_user_regs *regs); void show_execution_state(struct cpu_user_regs *regs); void show_page_walk(unsigned long addr); diff -r fc9e2f7920c9 -r f378c424e0ce xen/include/asm-x86/time.h --- a/xen/include/asm-x86/time.h Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/include/asm-x86/time.h Tue Apr 03 13:04:51 2007 -0600 @@ -16,4 +16,9 @@ static inline cycles_t get_cycles(void) return c; } +unsigned long +mktime (unsigned int year, unsigned int mon, + unsigned int day, unsigned int hour, + unsigned int min, unsigned int sec); + #endif /* __X86_TIME_H__ */ diff -r fc9e2f7920c9 -r f378c424e0ce xen/include/public/hvm/ioreq.h --- a/xen/include/public/hvm/ioreq.h Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/include/public/hvm/ioreq.h Tue Apr 03 13:04:51 2007 -0600 @@ -39,6 +39,7 @@ #define IOREQ_TYPE_XOR 4 #define IOREQ_TYPE_XCHG 5 #define IOREQ_TYPE_ADD 6 
+#define IOREQ_TYPE_TIMEOFFSET 7 /* * VMExit dispatcher should cooperate with instruction decoder to diff -r fc9e2f7920c9 -r f378c424e0ce xen/include/xen/sched.h --- a/xen/include/xen/sched.h Fri Mar 30 17:18:42 2007 -0600 +++ b/xen/include/xen/sched.h Tue Apr 03 13:04:51 2007 -0600 @@ -114,6 +114,10 @@ struct vcpu bool_t nmi_pending; /* Avoid NMI reentry by allowing NMIs to be masked for short periods. */ bool_t nmi_masked; + /* Require shutdown to be deferred for some asynchronous operation? */ + bool_t defer_shutdown; + /* VCPU is paused following shutdown request (d->is_shutting_down)? */ + bool_t paused_for_shutdown; unsigned long pause_flags; atomic_t pause_count; @@ -193,7 +197,9 @@ struct domain bool_t is_paused_by_controller; /* Guest has shut down (inc. reason code)? */ - bool_t is_shutdown; + spinlock_t shutdown_lock; + bool_t is_shutting_down; /* in process of shutting down? */ + bool_t is_shut_down; /* fully shut down? */ int shutdown_code; atomic_t pause_count; @@ -331,7 +337,11 @@ void domain_destroy(struct domain *d); void domain_destroy(struct domain *d); void domain_kill(struct domain *d); void domain_shutdown(struct domain *d, u8 reason); +void domain_resume(struct domain *d); void domain_pause_for_debugger(void); + +int vcpu_start_shutdown_deferral(struct vcpu *v); +void vcpu_end_shutdown_deferral(struct vcpu *v); /* * Mark specified domain as crashed. This function always returns, even if the _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |