
[Xen-changelog] [xen-unstable] merge with xen-unstable.hg



# HG changeset patch
# User Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
# Date 1214965837 -32400
# Node ID 08f77df14cba8e2dfe580779bb9ca2f64e1ae0ae
# Parent  11318234588e61b45df5a06fe6a29264854ba22a
# Parent  19970181d6a46aee1199857b6d3c6bedc7507121
merge with xen-unstable.hg
---
 docs/ChangeLog                               |    9 
 extras/mini-os/arch/x86/mm.c                 |   11 
 extras/mini-os/blkfront.c                    |    1 
 extras/mini-os/fbfront.c                     |    2 
 extras/mini-os/fs-front.c                    |   10 
 extras/mini-os/lib/sys.c                     |    2 
 extras/mini-os/netfront.c                    |    6 
 stubdom/grub.patches/99minios                |   10 
 stubdom/grub/Makefile                        |    2 
 tools/blktap/drivers/Makefile                |   10 
 tools/blktap/drivers/blktapctrl.c            |    2 
 tools/blktap/drivers/block-qcow.c            |   35 +
 tools/blktap/drivers/block-qcow2.c           |    5 
 tools/blktap/drivers/check_gcrypt            |   14 
 tools/blktap/lib/blktaplib.h                 |    2 
 tools/debugger/xenitp/xenitp.c               |   24 
 tools/examples/xend-config.sxp               |    3 
 tools/firmware/hvmloader/hvmloader.c         |   10 
 tools/firmware/rombios/rombios.c             |   35 -
 tools/ioemu/hw/xen_console.c                 |    8 
 tools/ioemu/target-i386-dm/exec-dm.c         |   17 
 tools/ioemu/xenstore.c                       |   11 
 tools/libxc/ia64/xc_ia64_hvm_build.c         |    7 
 tools/libxc/ia64/xc_ia64_linux_restore.c     |   24 
 tools/libxc/ia64/xc_ia64_linux_save.c        |   19 
 tools/libxc/xc_core.c                        |    8 
 tools/libxc/xc_core_ia64.c                   |    3 
 tools/libxc/xc_core_ia64.h                   |    2 
 tools/libxc/xc_domain.c                      |   65 --
 tools/libxc/xc_domain_restore.c              |   12 
 tools/libxc/xc_domain_save.c                 |   20 
 tools/libxc/xc_misc.c                        |   28 
 tools/libxc/xc_pagetab.c                     |    4 
 tools/libxc/xc_private.h                     |    4 
 tools/libxc/xc_ptrace.c                      |   34 -
 tools/libxc/xc_ptrace_core.c                 |    8 
 tools/libxc/xc_resume.c                      |   10 
 tools/libxc/xenctrl.h                        |   44 +
 tools/libxc/xg_save_restore.h                |   22 
 tools/python/xen/util/blkif.py               |   41 -
 tools/python/xen/xend/XendConfig.py          |    2 
 tools/python/xen/xend/XendOptions.py         |    7 
 tools/python/xen/xend/image.py               |   20 
 tools/python/xen/xend/server/blkif.py        |    6 
 tools/python/xen/xm/main.py                  |    3 
 tools/tests/test_x86_emulator.c              |    9 
 tools/xenballoon/xenballoon-monitor          |   43 +
 tools/xenballoon/xenballoon.conf             |   91 +++
 tools/xenballoon/xenballoond                 |  205 ++++++
 tools/xenballoon/xenballoond.README          |   82 ++
 tools/xenballoon/xenballoond.init            |   91 +++
 tools/xentrace/xenctx.c                      |    8 
 tools/xm-test/lib/XmTestLib/block_utils.py   |    2 
 xen/arch/ia64/vmx/vmx_hypercall.c            |   47 +
 xen/arch/ia64/xen/mm.c                       |    6 
 xen/arch/x86/acpi/cpufreq/Makefile           |    1 
 xen/arch/x86/acpi/cpufreq/cpufreq.c          |  139 +++-
 xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c |   14 
 xen/arch/x86/acpi/cpufreq/powernow.c         |  305 ++++++++++
 xen/arch/x86/acpi/cpufreq/utility.c          |  103 +++
 xen/arch/x86/acpi/pmstat.c                   |    7 
 xen/arch/x86/acpi/power.c                    |   25 
 xen/arch/x86/hvm/emulate.c                   |  113 +--
 xen/arch/x86/hvm/hvm.c                       |   60 +
 xen/arch/x86/hvm/vmx/vmcs.c                  |  100 +--
 xen/arch/x86/hvm/vmx/vmx.c                   |   11 
 xen/arch/x86/hvm/vmx/vpmu_core2.c            |   20 
 xen/arch/x86/mm.c                            |   45 +
 xen/arch/x86/mm/shadow/common.c              |  811 ++++++++++++++++++++++++++-
 xen/arch/x86/mm/shadow/multi.c               |  559 +++++++++++++++++-
 xen/arch/x86/mm/shadow/multi.h               |   14 
 xen/arch/x86/mm/shadow/private.h             |  130 ++++
 xen/arch/x86/mm/shadow/types.h               |    5 
 xen/arch/x86/platform_hypercall.c            |    7 
 xen/arch/x86/x86_emulate/x86_emulate.c       |  700 ++++++++++++++++++-----
 xen/arch/x86/x86_emulate/x86_emulate.h       |   37 -
 xen/common/domain.c                          |  259 ++++----
 xen/drivers/passthrough/vtd/dmar.c           |    3 
 xen/drivers/passthrough/vtd/dmar.h           |   16 
 xen/drivers/passthrough/vtd/intremap.c       |    7 
 xen/drivers/passthrough/vtd/iommu.c          |   16 
 xen/drivers/passthrough/vtd/qinval.c         |   16 
 xen/drivers/passthrough/vtd/utils.c          |    2 
 xen/include/acpi/cpufreq/cpufreq.h           |    3 
 xen/include/acpi/cpufreq/processor_perf.h    |   13 
 xen/include/asm-x86/domain.h                 |   14 
 xen/include/asm-x86/hvm/vmx/vmcs.h           |    8 
 xen/include/asm-x86/mm.h                     |    8 
 xen/include/asm-x86/perfc_defn.h             |   15 
 xen/include/public/hvm/hvm_op.h              |   13 
 xen/include/xen/domain.h                     |    3 
 xen/include/xen/sched.h                      |   12 
 92 files changed, 3996 insertions(+), 824 deletions(-)

diff -r 11318234588e -r 08f77df14cba docs/ChangeLog
--- a/docs/ChangeLog    Thu Jun 19 12:48:04 2008 +0900
+++ b/docs/ChangeLog    Wed Jul 02 11:30:37 2008 +0900
@@ -16,6 +16,15 @@ Xen 3.3 release
 Xen 3.3 release
 ---------------
 
+17903: Add greater than 16 xvd device availability
+http://xenbits.xensource.com/xen-unstable.hg?rev/0728459b3c8d
+
+The tools can now attach a disk of the form:
+(1<<28) | (device<<8) | partition
+to support many more xvd disks and up to 256 partitions.
+The Linux guest frontend has been extended to support
+this new construct; legacy guests simply ignore it.
+
 17538: Add XENPF_set_processor_pminfo
 http://xenbits.xensource.com/xen-unstable.hg?rev/5bb9093eb0e9
 
diff -r 11318234588e -r 08f77df14cba extras/mini-os/arch/x86/mm.c
--- a/extras/mini-os/arch/x86/mm.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/arch/x86/mm.c      Wed Jul 02 11:30:37 2008 +0900
@@ -528,18 +528,13 @@ void *map_frames_ex(unsigned long *f, un
 
 static void clear_bootstrap(void)
 {
-    xen_pfn_t mfns[] = { virt_to_mfn(&shared_info) };
-    int n = sizeof(mfns)/sizeof(*mfns);
     pte_t nullpte = { };
 
     /* Use first page as the CoW zero page */
     memset(&_text, 0, PAGE_SIZE);
-    mfn_zero = pfn_to_mfn((unsigned long) &_text);
-    if (HYPERVISOR_update_va_mapping((unsigned long) &_text, nullpte, UVMF_INVLPG))
-       printk("Unable to unmap first page\n");
-
-    if (free_physical_pages(mfns, n) != n)
-       printk("Unable to free bootstrap pages\n");
+    mfn_zero = virt_to_mfn((unsigned long) &_text);
+    if (HYPERVISOR_update_va_mapping(0, nullpte, UVMF_INVLPG))
+       printk("Unable to unmap NULL page\n");
 }
 
 void arch_init_p2m(unsigned long max_pfn)
diff -r 11318234588e -r 08f77df14cba extras/mini-os/blkfront.c
--- a/extras/mini-os/blkfront.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/blkfront.c Wed Jul 02 11:30:37 2008 +0900
@@ -125,7 +125,6 @@ struct blkfront_dev *init_blkfront(char 
 
     dev->events = NULL;
 
-    // FIXME: proper frees on failures
 again:
     err = xenbus_transaction_start(&xbt);
     if (err) {
diff -r 11318234588e -r 08f77df14cba extras/mini-os/fbfront.c
--- a/extras/mini-os/fbfront.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/fbfront.c  Wed Jul 02 11:30:37 2008 +0900
@@ -100,7 +100,6 @@ struct kbdfront_dev *init_kbdfront(char 
     s->in_cons = s->in_prod = 0;
     s->out_cons = s->out_prod = 0;
 
-    // FIXME: proper frees on failures
 again:
     err = xenbus_transaction_start(&xbt);
     if (err) {
@@ -408,7 +407,6 @@ struct fbfront_dev *init_fbfront(char *n
         s->pd[i] = 0;
 
 
-    // FIXME: proper frees on failures
 again:
     err = xenbus_transaction_start(&xbt);
     if (err) {
diff -r 11318234588e -r 08f77df14cba extras/mini-os/fs-front.c
--- a/extras/mini-os/fs-front.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/fs-front.c Wed Jul 02 11:30:37 2008 +0900
@@ -136,8 +136,8 @@ again:
 again:    
     old_id = freelist[0];
     /* Note: temporal inconsistency, since freelist[0] can be changed by someone
-     * else, but we are a sole owner of freelist[id], it's OK. */
-    freelist[id] = old_id;
+     * else, but we are a sole owner of freelist[id + 1], it's OK. */
+    freelist[id + 1] = old_id;
     new_id = id;
     if(cmpxchg(&freelist[0], old_id, new_id) != old_id)
     {
@@ -154,7 +154,7 @@ static inline unsigned short get_id_from
 
 again:    
     old_id = freelist[0];
-    new_id = freelist[old_id];
+    new_id = freelist[old_id + 1];
     if(cmpxchg(&freelist[0], old_id, new_id) != old_id)
     {
         printk("Cmpxchg on freelist remove failed.\n");
@@ -785,8 +785,8 @@ static void alloc_request_table(struct f
     printk("Allocating request array for import %d, nr_entries = %d.\n",
             import->import_id, import->nr_entries);
     requests = xmalloc_array(struct fs_request, import->nr_entries);
-    import->freelist = xmalloc_array(unsigned short, import->nr_entries);
-    memset(import->freelist, 0, sizeof(unsigned short) * import->nr_entries);
+    import->freelist = xmalloc_array(unsigned short, import->nr_entries + 1);
+    memset(import->freelist, 0, sizeof(unsigned short) * (import->nr_entries + 1));
     for(i=0; i<import->nr_entries; i++)
     {
        /* TODO: that's a lot of memory */
diff -r 11318234588e -r 08f77df14cba extras/mini-os/lib/sys.c
--- a/extras/mini-os/lib/sys.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/lib/sys.c  Wed Jul 02 11:30:37 2008 +0900
@@ -686,7 +686,7 @@ static int select_poll(int nfds, fd_set 
 #ifdef LIBC_VERBOSE
     static int nb;
     static int nbread[NOFILE], nbwrite[NOFILE], nbexcept[NOFILE];
-    static s64_t lastshown;
+    static s_time_t lastshown;
 
     nb++;
 #endif
diff -r 11318234588e -r 08f77df14cba extras/mini-os/netfront.c
--- a/extras/mini-os/netfront.c Thu Jun 19 12:48:04 2008 +0900
+++ b/extras/mini-os/netfront.c Wed Jul 02 11:30:37 2008 +0900
@@ -38,7 +38,7 @@ struct netfront_dev {
 struct netfront_dev {
     domid_t dom;
 
-    unsigned short tx_freelist[NET_TX_RING_SIZE];
+    unsigned short tx_freelist[NET_TX_RING_SIZE + 1];
     struct semaphore tx_sem;
 
     struct net_buffer rx_buffers[NET_RX_RING_SIZE];
@@ -70,14 +70,14 @@ void init_rx_buffers(struct netfront_dev
 
 static inline void add_id_to_freelist(unsigned int id,unsigned short* freelist)
 {
-    freelist[id] = freelist[0];
+    freelist[id + 1] = freelist[0];
     freelist[0]  = id;
 }
 
 static inline unsigned short get_id_from_freelist(unsigned short* freelist)
 {
     unsigned int id = freelist[0];
-    freelist[0] = freelist[id];
+    freelist[0] = freelist[id + 1];
     return id;
 }
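The fs-front.c and netfront.c hunks above fix an off-by-one in the request-id freelists: element 0 of the array is the head of the list, so the link for an id must live at freelist[id + 1] and the array needs one extra slot. A standalone sketch of the corrected layout, assuming a fixed NR_IDS and a single static array for simplicity:

    /* freelist[0] is the list head; the link for id is kept at
     * freelist[id + 1], so NR_IDS ids need NR_IDS + 1 entries. */
    #define NR_IDS 256
    static unsigned short freelist[NR_IDS + 1];

    static void add_id_to_freelist(unsigned int id)
    {
        freelist[id + 1] = freelist[0];   /* chain the old head behind id */
        freelist[0] = id;                 /* id becomes the new head */
    }

    static unsigned short get_id_from_freelist(void)
    {
        unsigned short id = freelist[0];
        freelist[0] = freelist[id + 1];   /* unlink the head */
        return id;
    }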
 
diff -r 11318234588e -r 08f77df14cba stubdom/grub.patches/99minios
--- a/stubdom/grub.patches/99minios     Thu Jun 19 12:48:04 2008 +0900
+++ b/stubdom/grub.patches/99minios     Wed Jul 02 11:30:37 2008 +0900
@@ -832,7 +832,18 @@ Index: grub/stage2/fsys_reiserfs.c
 Index: grub/stage2/fsys_reiserfs.c
 ===================================================================
 --- grub.orig/stage2/fsys_reiserfs.c   2008-06-16 15:18:03.410933000 +0100
-+++ grub/stage2/fsys_reiserfs.c        2008-06-16 15:18:14.786009000 +0100
++++ grub/stage2/fsys_reiserfs.c        2008-06-20 18:33:52.002100000 +0100
+@@ -224,8 +224,8 @@
+ 
+ struct disk_child
+ {
+-  unsigned long       dc_block_number;              /* Disk child's block number. */
+-  unsigned short      dc_size;                            /* Disk child's used space.   */
++  __u32       dc_block_number;              /* Disk child's block number. */
++  __u16      dc_size;                     /* Disk child's used space.   */
+ };
+ 
+ #define DC_SIZE (sizeof (struct disk_child))
 @@ -369,7 +369,14 @@
  static __inline__ unsigned long
  log2 (unsigned long word)
diff -r 11318234588e -r 08f77df14cba stubdom/grub/Makefile
--- a/stubdom/grub/Makefile     Thu Jun 19 12:48:04 2008 +0900
+++ b/stubdom/grub/Makefile     Wed Jul 02 11:30:37 2008 +0900
@@ -5,7 +5,7 @@ vpath %.c ../grub-cvs
 
 BOOT=boot-$(XEN_TARGET_ARCH).o
 
-DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I.
+DEF_CPPFLAGS += -I$(XEN_ROOT)/tools/libxc -I$(XEN_ROOT)/tools/include -I.
 DEF_CPPFLAGS += -I../grub-cvs/stage1
 DEF_CPPFLAGS += -I../grub-cvs/stage2
 DEF_CPPFLAGS += -I../grub-cvs/netboot
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/Makefile
--- a/tools/blktap/drivers/Makefile     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/Makefile     Wed Jul 02 11:30:37 2008 +0900
@@ -17,8 +17,16 @@ CFLAGS   += -Wp,-MD,.$(@F).d
 CFLAGS   += -Wp,-MD,.$(@F).d
 DEPS      = .*.d
 
+ifeq ($(shell . ./check_gcrypt),"yes")
+CFLAGS += -DUSE_GCRYPT
+CRYPT_LIB := -lgcrypt
+else
+CRYPT_LIB := -lcrypto
+$(warning *** libgcrypt not installed: falling back to libcrypto ***)
+endif
+
 LDFLAGS_blktapctrl := $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenstore) -L../lib -lblktap
-LDFLAGS_img := $(LIBAIO_DIR)/libaio.a -lcrypto -lpthread -lz
+LDFLAGS_img := $(LIBAIO_DIR)/libaio.a $(CRYPT_LIB) -lpthread -lz
 
 BLK-OBJS-y  := block-aio.o
 BLK-OBJS-y  += block-sync.o
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/blktapctrl.c
--- a/tools/blktap/drivers/blktapctrl.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/blktapctrl.c Wed Jul 02 11:30:37 2008 +0900
@@ -127,7 +127,7 @@ static int get_new_dev(int *major, int *
        char *devname;
        
        tr.domid = blkif->domid;
-        tr.busid = (unsigned short)blkif->be_id;
+        tr.busid = blkif->be_id;
        ret = ioctl(ctlfd, BLKTAP_IOCTL_NEWINTF, tr );
        
        if ( (ret <= 0)||(ret > MAX_TAP_DEV) ) {
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/block-qcow.c
--- a/tools/blktap/drivers/block-qcow.c Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/block-qcow.c Wed Jul 02 11:30:37 2008 +0900
@@ -33,7 +33,6 @@
 #include <zlib.h>
 #include <inttypes.h>
 #include <libaio.h>
-#include <openssl/md5.h>
 #include "bswap.h"
 #include "aes.h"
 #include "tapdisk.h"
@@ -146,6 +145,35 @@ struct tdqcow_state {
 
 static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
 
+#ifdef USE_GCRYPT
+
+#include <gcrypt.h>
+
+static uint32_t gen_cksum(char *ptr, int len)
+{
+       int i;
+       uint32_t md[4];
+
+       /* Convert L1 table to big endian */
+       for(i = 0; i < len / sizeof(uint64_t); i++) {
+               cpu_to_be64s(&((uint64_t*) ptr)[i]);
+       }
+
+       /* Generate checksum */
+       gcry_md_hash_buffer(GCRY_MD_MD5, md, ptr, len);
+
+       /* Convert L1 table back to native endianess */
+       for(i = 0; i < len / sizeof(uint64_t); i++) {
+               be64_to_cpus(&((uint64_t*) ptr)[i]);
+       }
+
+       return md[0];
+}
+
+#else /* use libcrypto */
+
+#include <openssl/md5.h>
+
 static uint32_t gen_cksum(char *ptr, int len)
 {
        int i;
@@ -153,9 +181,8 @@ static uint32_t gen_cksum(char *ptr, int
        uint32_t ret;
 
        md = malloc(MD5_DIGEST_LENGTH);
-
        if(!md) return 0;
-       
+
        /* Convert L1 table to big endian */
        for(i = 0; i < len / sizeof(uint64_t); i++) {
                cpu_to_be64s(&((uint64_t*) ptr)[i]);
@@ -175,6 +202,8 @@ static uint32_t gen_cksum(char *ptr, int
        free(md);
        return ret;
 }
+
+#endif
 
 static int get_filesize(char *filename, uint64_t *size, struct stat *st)
 {
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/block-qcow2.c
--- a/tools/blktap/drivers/block-qcow2.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/drivers/block-qcow2.c        Wed Jul 02 11:30:37 2008 +0900
@@ -254,10 +254,7 @@ static int bdrv_pread(int fd, int64_t of
  */
 static int bdrv_pwrite(int fd, int64_t offset, const void *buf, int count)
 {
-       int ret;
-       
-       ret = lseek(fd, offset, SEEK_SET);
-       if (ret != offset) {
+       if (lseek(fd, offset, SEEK_SET) == -1) {
                DPRINTF("bdrv_pwrite failed seek (%#"PRIx64").\n", offset);
                return -1;
        }
diff -r 11318234588e -r 08f77df14cba tools/blktap/drivers/check_gcrypt
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blktap/drivers/check_gcrypt Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+cat > .gcrypt.c << EOF
+#include <gcrypt.h>
+int main(void) { return 0; }
+EOF
+
+if $1 -o .gcrypt .gcrypt.c -lgcrypt 2>/dev/null ; then
+  echo "yes"
+else
+  echo "no"
+fi
+
+rm -f .gcrypt*
diff -r 11318234588e -r 08f77df14cba tools/blktap/lib/blktaplib.h
--- a/tools/blktap/lib/blktaplib.h      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/blktap/lib/blktaplib.h      Wed Jul 02 11:30:37 2008 +0900
@@ -161,7 +161,7 @@ typedef struct tapdev_info {
 
 typedef struct domid_translate {
        unsigned short domid;
-       unsigned short busid;
+       uint32_t busid;
 } domid_translate_t ;
 
 typedef struct image {
diff -r 11318234588e -r 08f77df14cba tools/debugger/xenitp/xenitp.c
--- a/tools/debugger/xenitp/xenitp.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/debugger/xenitp/xenitp.c    Wed Jul 02 11:30:37 2008 +0900
@@ -57,6 +57,16 @@ static int cur_vcpu;
 #define CFM_SOF_MASK            0x3f
 
 int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr);
+
+/* wrapper for vcpu_gest_context_any_t */
+static int xc_ia64_vcpu_getcontext(int xc_handle,
+                                   uint32_t domid,
+                                   uint32_t vcpu,
+                                   vcpu_guest_context_t *ctxt)
+{
+    return xc_vcpu_getcontext(xc_handle, domid, vcpu,
+                              (vcpu_guest_context_any_t *)ctxt);
+}
 
 static inline unsigned int ctx_slot (vcpu_guest_context_t *ctx)
 {
@@ -729,7 +739,7 @@ int wait_domain (int vcpu, vcpu_guest_co
         fflush (stdout);
         nanosleep (&ts, NULL);
     }
-    return xc_vcpu_getcontext (xc_handle, domid, vcpu, ctx);
+    return xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, ctx);
 }
 
 int virt_to_phys (int is_inst, unsigned long vaddr, unsigned long *paddr)
@@ -945,13 +955,13 @@ char *parse_arg (char **buf)
     return res;
 }
 
-vcpu_guest_context_t vcpu_ctx[MAX_VIRT_CPUS];
+vcpu_guest_context_any_t vcpu_ctx_any[MAX_VIRT_CPUS];
 
 int vcpu_setcontext (int vcpu)
 {
     int ret;
 
-    ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx[vcpu]);
+    ret = xc_vcpu_setcontext (xc_handle, domid, vcpu, &vcpu_ctx_any[vcpu]);
     if (ret < 0)
         perror ("xc_vcpu_setcontext");
 
@@ -1518,7 +1528,7 @@ enum cmd_status do_command (int vcpu, ch
     int flag_ambiguous;
 
     cur_vcpu = vcpu;
-    cur_ctx = &vcpu_ctx[vcpu];
+    cur_ctx = &vcpu_ctx_any[vcpu].c;
 
     /* Handle repeat last-command.  */
     if (*line == 0) {
@@ -1575,7 +1585,7 @@ void xenitp (int vcpu)
     int ret;
     struct sigaction sa;
 
-    cur_ctx = &vcpu_ctx[vcpu];
+    cur_ctx = &vcpu_ctx_any[vcpu].c;
 
     xc_handle = xc_interface_open (); /* for accessing control interface */
 
@@ -1588,9 +1598,9 @@ void xenitp (int vcpu)
         exit (-1);
     }
 
-    ret = xc_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx);
+    ret = xc_ia64_vcpu_getcontext (xc_handle, domid, vcpu, cur_ctx);
     if (ret < 0) {
-        perror ("xc_vcpu_getcontext");
+        perror ("xc_ia64_vcpu_getcontext");
         exit (-1);
     }
 
diff -r 11318234588e -r 08f77df14cba tools/examples/xend-config.sxp
--- a/tools/examples/xend-config.sxp    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/examples/xend-config.sxp    Wed Jul 02 11:30:37 2008 +0900
@@ -242,3 +242,6 @@
 
 # Script to run when the label of a resource has changed.
 #(resource-label-change-script '')
+
+# Rotation count of qemu-dm log file.
+#(qemu-dm-logrotate-count 10)
diff -r 11318234588e -r 08f77df14cba tools/firmware/hvmloader/hvmloader.c
--- a/tools/firmware/hvmloader/hvmloader.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/firmware/hvmloader/hvmloader.c      Wed Jul 02 11:30:37 2008 +0900
@@ -206,10 +206,12 @@ static void pci_setup(void)
             pci_writew(devfn, 0x3d, 0x0001);
             break;
         case 0x0101:
-            /* PIIX3 IDE */
-            ASSERT((vendor_id == 0x8086) && (device_id == 0x7010));
-            pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
-            pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
+            if ( vendor_id == 0x8086 )
+            {
+                /* Intel ICHs since PIIX3: enable IDE legacy mode. */
+                pci_writew(devfn, 0x40, 0x8000); /* enable IDE0 */
+                pci_writew(devfn, 0x42, 0x8000); /* enable IDE1 */
+            }
             break;
         }
 
diff -r 11318234588e -r 08f77df14cba tools/firmware/rombios/rombios.c
--- a/tools/firmware/rombios/rombios.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/firmware/rombios/rombios.c  Wed Jul 02 11:30:37 2008 +0900
@@ -9783,6 +9783,27 @@ smbios_init:
 
 #endif
 
+#if BX_TCGBIOS
+; The section between the POST entry and the NMI entry is filling up
+; and causes crashes if this code was directly there
+tcpa_post_part1:
+  call _tcpa_acpi_init
+
+  push dword #0
+  call _tcpa_initialize_tpm
+  add sp, #4
+
+  call _tcpa_do_measure_POSTs
+  call _tcpa_wake_event     /* specs: 3.2.3.7 */
+  ret
+
+tcpa_post_part2:
+  call _tcpa_calling_int19h          /* specs: 8.2.3 step 1 */
+  call _tcpa_add_event_separators    /* specs: 8.2.3 step 2 */
+  /* we do not call int 19h handler but keep following eventlog */
+  call _tcpa_returned_int19h         /* specs: 8.2.3 step 3/7 */
+  ret
+#endif
 
 
 ;; for 'C' strings and other data, insert them here with
@@ -10003,14 +10024,7 @@ post_default_ints:
   mov  0x0410, ax
 
 #if BX_TCGBIOS
-  call _tcpa_acpi_init
-
-  push dword #0
-  call _tcpa_initialize_tpm
-  add sp, #4
-
-  call _tcpa_do_measure_POSTs
-  call _tcpa_wake_event     /* specs: 3.2.3.7 */
+  call tcpa_post_part1
 #endif
 
   ;; Parallel setup
@@ -10138,10 +10152,7 @@ post_default_ints:
   call _interactive_bootkey
 
 #if BX_TCGBIOS
-  call _tcpa_calling_int19h          /* specs: 8.2.3 step 1 */
-  call _tcpa_add_event_separators    /* specs: 8.2.3 step 2 */
-  /* we do not call int 19h handler but keep following eventlog */
-  call _tcpa_returned_int19h         /* specs: 8.2.3 step 3/7 */
+  call tcpa_post_part2
 #endif
 
   ;; Start the boot sequence.   See the comments in int19_relocated 
diff -r 11318234588e -r 08f77df14cba tools/ioemu/hw/xen_console.c
--- a/tools/ioemu/hw/xen_console.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/ioemu/hw/xen_console.c      Wed Jul 02 11:30:37 2008 +0900
@@ -160,16 +160,18 @@ int xs_gather(struct xs_handle *xs, cons
 
 static int domain_create_ring(struct domain *dom)
 {
-       int err, remote_port, ring_ref, rc;
+       int err, remote_port, ring_ref, limit, rc;
 
        err = xs_gather(dom->xsh, dom->serialpath,
                        "ring-ref", "%u", &ring_ref,
                        "port", "%i", &remote_port,
+                       "limit", "%i", &limit,
                        NULL);
        if (err) {
                err = xs_gather(dom->xsh, dom->conspath,
                                "ring-ref", "%u", &ring_ref,
                                "port", "%i", &remote_port,
+                               "limit", "%i", &limit,
                                NULL);
                if (err) {
                        fprintf(stderr, "Console: failed to find ring-ref/port 
yet\n");
@@ -178,7 +180,9 @@ static int domain_create_ring(struct dom
                dom->use_consolepath = 1;
        } else
                dom->use_consolepath = 0;
-       fprintf(stderr, "Console: got ring-ref %d port %d\n", ring_ref, 
remote_port);
+       dom->buffer.max_capacity = limit;
+       fprintf(stderr, "Console: got ring-ref %d port %d limit %d\n", 
+               ring_ref, remote_port, limit);
 
        if ((ring_ref == dom->ring_ref) && (remote_port == dom->remote_port))
                goto out;
diff -r 11318234588e -r 08f77df14cba tools/ioemu/target-i386-dm/exec-dm.c
--- a/tools/ioemu/target-i386-dm/exec-dm.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/ioemu/target-i386-dm/exec-dm.c      Wed Jul 02 11:30:37 2008 +0900
@@ -483,9 +483,11 @@ static void memcpy_words(void *dst, void
 }
 #endif
 
-void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf, 
-                            int len, int is_write)
-{
+void cpu_physical_memory_rw(target_phys_addr_t _addr, uint8_t *buf, 
+                            int _len, int is_write)
+{
+    target_phys_addr_t addr = _addr;
+    int len = _len;
     int l, io_index;
     uint8_t *ptr;
     uint32_t val;
@@ -520,6 +522,7 @@ void cpu_physical_memory_rw(target_phys_
             } else if ((ptr = phys_ram_addr(addr)) != NULL) {
                 /* Writing to RAM */
                 memcpy_words(ptr, buf, l);
+#ifndef CONFIG_STUBDOM
                 if (logdirty_bitmap != NULL) {
                     /* Record that we have dirtied this frame */
                     unsigned long pfn = addr >> TARGET_PAGE_BITS;
@@ -531,6 +534,7 @@ void cpu_physical_memory_rw(target_phys_
                             |= 1UL << pfn % HOST_LONG_BITS;
                     }
                 }
+#endif
 #ifdef __ia64__
                 sync_icache(ptr, l);
 #endif 
@@ -566,6 +570,13 @@ void cpu_physical_memory_rw(target_phys_
         addr += l;
     }
 
+#ifdef CONFIG_STUBDOM
+    if (logdirty_bitmap != NULL)
+        xc_hvm_modified_memory(xc_handle, domid, _addr >> TARGET_PAGE_BITS,
+                (_addr + _len + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS
+                    - _addr >> TARGET_PAGE_BITS);
+#endif
+
     mapcache_unlock();
 }
 #endif
diff -r 11318234588e -r 08f77df14cba tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/ioemu/xenstore.c    Wed Jul 02 11:30:37 2008 +0900
@@ -260,8 +260,6 @@ void xenstore_parse_domain_config(int hv
                    /* autoguess qcow vs qcow2 */
                } else if (!strcmp(drv,"file") || !strcmp(drv,"phy")) {
                    format = &bdrv_raw;
-               } else if (!strcmp(drv,"phy")) {
-                   format = &bdrv_raw;
                } else {
                    format = bdrv_find_format(drv);
                    if (!format) {
@@ -404,6 +402,10 @@ void xenstore_process_logdirty_event(voi
             /* No key yet: wait for the next watch */
             return;
 
+#ifdef CONFIG_STUBDOM
+        /* We pass the writes to hypervisor */
+        seg = (void*)1;
+#else
         strncpy(key_terminated, key_ascii, 16);
         free(key_ascii);
         key = (key_t) strtoull(key_terminated, NULL, 16);
@@ -419,11 +421,6 @@ void xenstore_process_logdirty_event(voi
         fprintf(logfile, "%s: key=%16.16llx size=%lu\n", __FUNCTION__,
                 (unsigned long long)key, logdirty_bitmap_size);
 
-#ifdef CONFIG_STUBDOM
-        /* XXX we just can't use shm. */
-        fprintf(logfile, "Log dirty is not implemented in stub domains!\n");
-        return;
-#else
         shmid = shmget(key, 2 * logdirty_bitmap_size, S_IRUSR|S_IWUSR);
         if (shmid == -1) {
             fprintf(logfile, "Log-dirty: shmget failed: segment %16.16llx "
diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_hvm_build.c
--- a/tools/libxc/ia64/xc_ia64_hvm_build.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_hvm_build.c      Wed Jul 02 11:30:37 2008 +0900
@@ -1052,7 +1052,8 @@ int
 int
 xc_hvm_build(int xc_handle, uint32_t domid, int memsize, const char *image_name)
 {
-    vcpu_guest_context_t st_ctxt, *ctxt = &st_ctxt;
+    vcpu_guest_context_any_t st_ctxt_any;
+    vcpu_guest_context_t *ctxt = &st_ctxt_any.c;
     char *image = NULL;
     unsigned long image_size;
     unsigned long nr_pages;
@@ -1079,14 +1080,14 @@ xc_hvm_build(int xc_handle, uint32_t dom
 
     free(image);
 
-    memset(ctxt, 0, sizeof(*ctxt));
+    memset(&st_ctxt_any, 0, sizeof(st_ctxt_any));
     ctxt->regs.ip = 0x80000000ffffffb0UL;
     ctxt->regs.ar.fpsr = xc_ia64_fpsr_default();
     ctxt->regs.cr.itir = 14 << 2;
     ctxt->regs.psr = IA64_PSR_AC | IA64_PSR_BN;
     ctxt->regs.cr.dcr = 0;
     ctxt->regs.cr.pta = 15 << 2;
-    return xc_vcpu_setcontext(xc_handle, domid, 0, ctxt);
+    return xc_vcpu_setcontext(xc_handle, domid, 0, &st_ctxt_any);
 
 error_out:
     free(image);
diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_linux_restore.c
--- a/tools/libxc/ia64/xc_ia64_linux_restore.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_linux_restore.c  Wed Jul 02 11:30:37 2008 +0900
@@ -117,8 +117,9 @@ xc_ia64_recv_unallocated_list(int xc_han
 
 static int
 xc_ia64_recv_vcpu_context(int xc_handle, int io_fd, uint32_t dom,
-                          uint32_t vcpu, vcpu_guest_context_t *ctxt)
-{
+                          uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any)
+{
+    vcpu_guest_context_t *ctxt = &ctxt_any->c;
     if (read_exact(io_fd, ctxt, sizeof(*ctxt))) {
         ERROR("Error when reading ctxt");
         return -1;
@@ -128,14 +129,14 @@ xc_ia64_recv_vcpu_context(int xc_handle,
 
     /* Initialize and set registers.  */
     ctxt->flags = VGCF_EXTRA_REGS | VGCF_SET_CR_IRR | VGCF_online;
-    if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt) != 0) {
+    if (xc_vcpu_setcontext(xc_handle, dom, vcpu, ctxt_any) != 0) {
         ERROR("Couldn't set vcpu context");
         return -1;
     }
 
     /* Just a check.  */
     ctxt->flags = 0;
-    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) {
+    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) {
         ERROR("Could not get vcpu context");
         return -1;
     }
@@ -226,19 +227,20 @@ xc_ia64_pv_recv_vcpu_context(int xc_hand
     int rc = -1;
 
     /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_t ctxt;
-    
-    if (lock_pages(&ctxt, sizeof(ctxt))) {
+    vcpu_guest_context_any_t ctxt_any;
+    vcpu_guest_context_t *ctxt = &ctxt_any.c;
+
+    if (lock_pages(&ctxt_any, sizeof(ctxt_any))) {
         /* needed for build domctl, but might as well do early */
         ERROR("Unable to lock_pages ctxt");
         return -1;
     }
 
-    if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt))
+    if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, vcpu, &ctxt_any))
         goto out;
 
     /* Then get privreg page.  */
-    if (read_page(xc_handle, io_fd, dom, ctxt.privregs_pfn) < 0) {
+    if (read_page(xc_handle, io_fd, dom, ctxt->privregs_pfn) < 0) {
         ERROR("Could not read vcpu privregs");
         goto out;
     }
@@ -441,12 +443,12 @@ xc_ia64_hvm_recv_context(int xc_handle, 
     /* vcpu context */
     for (i = 0; i <= info.max_vcpu_id; i++) {
         /* A copy of the CPU context of the guest. */
-        vcpu_guest_context_t ctxt;
+        vcpu_guest_context_any_t ctxt_any;
 
         if (!__test_bit(i, vcpumap))
             continue;
 
-        if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
+        if (xc_ia64_recv_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
             goto out;
 
         /* system context of vcpu is recieved as hvm context. */
diff -r 11318234588e -r 08f77df14cba tools/libxc/ia64/xc_ia64_linux_save.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c     Wed Jul 02 11:30:37 2008 +0900
@@ -180,9 +180,10 @@ xc_ia64_send_unallocated_list(int xc_han
 
 static int
 xc_ia64_send_vcpu_context(int xc_handle, int io_fd, uint32_t dom,
-                          uint32_t vcpu, vcpu_guest_context_t *ctxt)
-{
-    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt)) {
+                          uint32_t vcpu, vcpu_guest_context_any_t *ctxt_any)
+{
+    vcpu_guest_context_t *ctxt = &ctxt_any->c;
+    if (xc_vcpu_getcontext(xc_handle, dom, vcpu, ctxt_any)) {
         ERROR("Could not get vcpu context");
         return -1;
     }
@@ -269,17 +270,19 @@ xc_ia64_pv_send_context(int xc_handle, i
     /* vcpu context */
     for (i = 0; i <= info->max_vcpu_id; i++) {
         /* A copy of the CPU context of the guest. */
-        vcpu_guest_context_t ctxt;
+        vcpu_guest_context_any_t ctxt_any;
+        vcpu_guest_context_t *ctxt = &ctxt_any.c;
+
         char *mem;
 
         if (!__test_bit(i, vcpumap))
             continue;
 
-        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
+        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
             goto out;
 
         mem = xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
-                                   PROT_READ|PROT_WRITE, ctxt.privregs_pfn);
+                                   PROT_READ|PROT_WRITE, ctxt->privregs_pfn);
         if (mem == NULL) {
             ERROR("cannot map privreg page");
             goto out;
@@ -337,12 +340,12 @@ xc_ia64_hvm_send_context(int xc_handle, 
     /* vcpu context */
     for (i = 0; i <= info->max_vcpu_id; i++) {
         /* A copy of the CPU context of the guest. */
-        vcpu_guest_context_t ctxt;
+        vcpu_guest_context_any_t ctxt_any;
 
         if (!__test_bit(i, vcpumap))
             continue;
 
-        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt))
+        if (xc_ia64_send_vcpu_context(xc_handle, io_fd, dom, i, &ctxt_any))
             goto out;
 
         /* system context of vcpu is sent as hvm context. */
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core.c
--- a/tools/libxc/xc_core.c     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_core.c     Wed Jul 02 11:30:37 2008 +0900
@@ -407,7 +407,7 @@ xc_domain_dumpcore_via_callback(int xc_h
 
     int nr_vcpus = 0;
     char *dump_mem, *dump_mem_start = NULL;
-    vcpu_guest_context_t  ctxt[MAX_VIRT_CPUS];
+    vcpu_guest_context_any_t  ctxt[MAX_VIRT_CPUS];
     struct xc_core_arch_context arch_ctxt;
     char dummy[PAGE_SIZE];
     int dummy_len;
@@ -581,10 +581,10 @@ xc_domain_dumpcore_via_callback(int xc_h
         PERROR("Could not get section header for .xen_prstatus");
         goto out;
     }
-    filesz = sizeof(ctxt[0]) * nr_vcpus;
+    filesz = sizeof(ctxt[0].c) * nr_vcpus;
     sts = xc_core_shdr_set(shdr, strtab, XEN_DUMPCORE_SEC_PRSTATUS,
                            SHT_PROGBITS, offset, filesz,
-                           __alignof__(ctxt[0]), sizeof(ctxt[0]));
+                           __alignof__(ctxt[0].c), sizeof(ctxt[0].c));
     if ( sts != 0 )
         goto out;
     offset += filesz;
@@ -707,7 +707,7 @@ xc_domain_dumpcore_via_callback(int xc_h
         goto out;
 
     /* prstatus: .xen_prstatus */
-    sts = dump_rtn(args, (char *)&ctxt, sizeof(ctxt[0]) * nr_vcpus);
+    sts = dump_rtn(args, (char *)&ctxt[0].c, sizeof(ctxt[0].c) * nr_vcpus);
     if ( sts != 0 )
         goto out;
 
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core_ia64.c
--- a/tools/libxc/xc_core_ia64.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_core_ia64.c        Wed Jul 02 11:30:37 2008 +0900
@@ -308,9 +308,10 @@ xc_core_arch_context_free(struct xc_core
 
 int
 xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt,
-                         vcpu_guest_context_t* ctxt,
+                         vcpu_guest_context_any_t* ctxt_any,
                          int xc_handle, uint32_t domid)
 {
+    vcpu_guest_context_t *ctxt = &ctxt_any->c;
     mapped_regs_t* mapped_regs;
 
     if ( ctxt->privregs_pfn == VGC_PRIVREGS_HVM )
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_core_ia64.h
--- a/tools/libxc/xc_core_ia64.h        Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_core_ia64.h        Wed Jul 02 11:30:37 2008 +0900
@@ -40,7 +40,7 @@ xc_core_arch_context_free(struct xc_core
 xc_core_arch_context_free(struct xc_core_arch_context* arch_ctxt);
 int
 xc_core_arch_context_get(struct xc_core_arch_context* arch_ctxt,
-                         vcpu_guest_context_t* ctxt,
+                         vcpu_guest_context_any_t* ctxt,
                          int xc_handle, uint32_t domid);
 int
 xc_core_arch_context_get_shdr(struct xc_core_arch_context* arch_ctxt, 
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_domain.c   Wed Jul 02 11:30:37 2008 +0900
@@ -298,30 +298,21 @@ int xc_vcpu_getcontext(int xc_handle,
 int xc_vcpu_getcontext(int xc_handle,
                        uint32_t domid,
                        uint32_t vcpu,
-                       vcpu_guest_context_t *ctxt)
-{
-    int rc;
-    DECLARE_DOMCTL;
-    size_t sz = sizeof(vcpu_guest_context_either_t);
+                       vcpu_guest_context_any_t *ctxt)
+{
+    int rc;
+    DECLARE_DOMCTL;
+    size_t sz = sizeof(vcpu_guest_context_any_t);
 
     domctl.cmd = XEN_DOMCTL_getvcpucontext;
     domctl.domain = (domid_t)domid;
     domctl.u.vcpucontext.vcpu   = (uint16_t)vcpu;
-    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
-
-    /*
-     * We may be asked to lock either a 32-bit or a 64-bit context. Lock the
-     * larger of the two if possible, otherwise fall back to native size.
-     */
+    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c);
+
+    
     if ( (rc = lock_pages(ctxt, sz)) != 0 )
-    {
-        sz = sizeof(*ctxt);
-        if ( (rc = lock_pages(ctxt, sz)) != 0 )
-            return rc;
-    }
-
+        return rc;
     rc = do_domctl(xc_handle, &domctl);
-
     unlock_pages(ctxt, sz);
 
     return rc;
@@ -626,32 +617,28 @@ int xc_vcpu_setcontext(int xc_handle,
 int xc_vcpu_setcontext(int xc_handle,
                        uint32_t domid,
                        uint32_t vcpu,
-                       vcpu_guest_context_t *ctxt)
-{
-    DECLARE_DOMCTL;
-    int rc;
-    size_t sz = sizeof(vcpu_guest_context_either_t);
+                       vcpu_guest_context_any_t *ctxt)
+{
+    DECLARE_DOMCTL;
+    int rc;
+    size_t sz = sizeof(vcpu_guest_context_any_t);
+
+    if (ctxt == NULL)
+    {
+        errno = EINVAL;
+        return -1;
+    }
 
     domctl.cmd = XEN_DOMCTL_setvcpucontext;
     domctl.domain = domid;
     domctl.u.vcpucontext.vcpu = vcpu;
-    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, ctxt);
-
-    /*
-     * We may be asked to lock either a 32-bit or a 64-bit context. Lock the
-     * larger of the two if possible, otherwise fall back to native size.
-     */
-    if ( (ctxt != NULL) && (rc = lock_pages(ctxt, sz)) != 0 )
-    {
-        sz = sizeof(*ctxt);
-        if ( (rc = lock_pages(ctxt, sz)) != 0 )
-            return rc;
-    }
-
+    set_xen_guest_handle(domctl.u.vcpucontext.ctxt, &ctxt->c);
+
+    if ( (rc = lock_pages(ctxt, sz)) != 0 )
+        return rc;
     rc = do_domctl(xc_handle, &domctl);
-
-    if ( ctxt != NULL )
-        unlock_pages(ctxt, sz);
+    
+    unlock_pages(ctxt, sz);
 
     return rc;
 }
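xc_vcpu_getcontext() and xc_vcpu_setcontext() now take a vcpu_guest_context_any_t (the union added to xenctrl.h below) instead of a plain vcpu_guest_context_t. A minimal caller sketch, assuming an already-open xc_handle and a valid domid; the .c member gives the native view, .x32/.x64 the explicit-width ones:

    vcpu_guest_context_any_t ctxt_any;
    vcpu_guest_context_t *ctxt = &ctxt_any.c;    /* native view of the union */

    if (xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt_any) == 0)
    {
        ctxt->user_regs.eflags |= 0x100;         /* e.g. set the trap flag */
        if (xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt_any))
            perror("xc_vcpu_setcontext");
    }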
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_domain_restore.c   Wed Jul 02 11:30:37 2008 +0900
@@ -153,7 +153,7 @@ static xen_pfn_t *load_p2m_frame_list(
     int io_fd, int *pae_extended_cr3, int *ext_vcpucontext)
 {
     xen_pfn_t *p2m_frame_list;
-    vcpu_guest_context_either_t ctxt;
+    vcpu_guest_context_any_t ctxt;
     xen_pfn_t p2m_fl_zero;
 
     /* Read first entry of P2M list, or extended-info signature (~0UL). */
@@ -284,12 +284,12 @@ int xc_domain_restore(int xc_handle, int
     /* The new domain's shared-info frame number. */
     unsigned long shared_info_frame;
     unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
-    shared_info_either_t *old_shared_info = 
-        (shared_info_either_t *)shared_info_page;
-    shared_info_either_t *new_shared_info;
+    shared_info_any_t *old_shared_info = 
+        (shared_info_any_t *)shared_info_page;
+    shared_info_any_t *new_shared_info;
 
     /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_either_t ctxt;
+    vcpu_guest_context_any_t ctxt;
 
     /* A table containing the type of each PFN (/not/ MFN!). */
     unsigned long *pfn_type = NULL;
@@ -304,7 +304,7 @@ int xc_domain_restore(int xc_handle, int
     xen_pfn_t *p2m_frame_list = NULL;
     
     /* A temporary mapping of the guest's start_info page. */
-    start_info_either_t *start_info;
+    start_info_any_t *start_info;
 
     /* Our mapping of the current region (batch) */
     char *region_base;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_domain_save.c      Wed Jul 02 11:30:37 2008 +0900
@@ -412,7 +412,7 @@ static int suspend_and_state(int (*suspe
 ** it to update the MFN to a reasonable value.
 */
 static void *map_frame_list_list(int xc_handle, uint32_t dom,
-                                 shared_info_either_t *shinfo)
+                                 shared_info_any_t *shinfo)
 {
     int count = 100;
     void *p;
@@ -628,9 +628,9 @@ static xen_pfn_t *map_and_save_p2m_table
                                          int io_fd, 
                                          uint32_t dom,
                                          unsigned long p2m_size,
-                                         shared_info_either_t *live_shinfo)
-{
-    vcpu_guest_context_either_t ctxt;
+                                         shared_info_any_t *live_shinfo)
+{
+    vcpu_guest_context_any_t ctxt;
 
     /* Double and single indirect references to the live P2M table */
     void *live_p2m_frame_list_list = NULL;
@@ -735,7 +735,7 @@ static xen_pfn_t *map_and_save_p2m_table
         p2m_frame_list[i/FPP] = mfn_to_pfn(p2m_frame_list[i/FPP]);
     }
 
-    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
     {
         ERROR("Could not get vcpu context");
         goto out;
@@ -814,7 +814,7 @@ int xc_domain_save(int xc_handle, int io
     unsigned long shared_info_frame;
 
     /* A copy of the CPU context of the guest. */
-    vcpu_guest_context_either_t ctxt;
+    vcpu_guest_context_any_t ctxt;
 
     /* A table containing the type of each PFN (/not/ MFN!). */
     unsigned long *pfn_type = NULL;
@@ -824,7 +824,7 @@ int xc_domain_save(int xc_handle, int io
     char page[PAGE_SIZE];
 
     /* Live mapping of shared info structure */
-    shared_info_either_t *live_shinfo = NULL;
+    shared_info_any_t *live_shinfo = NULL;
 
     /* base of the region in which domain memory is mapped */
     unsigned char *region_base = NULL;
@@ -1536,7 +1536,7 @@ int xc_domain_save(int xc_handle, int io
         }
     }
 
-    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt.c) )
+    if ( xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt) )
     {
         ERROR("Could not get vcpu context");
         goto out;
@@ -1556,7 +1556,7 @@ int xc_domain_save(int xc_handle, int io
         if ( !(vcpumap & (1ULL << i)) )
             continue;
 
-        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt.c) )
+        if ( (i != 0) && xc_vcpu_getcontext(xc_handle, dom, i, &ctxt) )
         {
             ERROR("No context for VCPU%d", i);
             goto out;
@@ -1624,7 +1624,7 @@ int xc_domain_save(int xc_handle, int io
      * Reset the MFN to be a known-invalid value. See map_frame_list_list().
      */
     memcpy(page, live_shinfo, PAGE_SIZE);
-    SET_FIELD(((shared_info_either_t *)page), 
+    SET_FIELD(((shared_info_any_t *)page), 
               arch.pfn_to_mfn_frame_list_list, 0);
     if ( write_exact(io_fd, page, PAGE_SIZE) )
     {
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_misc.c     Wed Jul 02 11:30:37 2008 +0900
@@ -253,6 +253,34 @@ int xc_hvm_track_dirty_vram(
     arg.first_pfn = first_pfn;
     arg.nr        = nr;
     set_xen_guest_handle(arg.dirty_bitmap, (uint8_t *)dirty_bitmap);
+
+    if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 )
+    {
+        PERROR("Could not lock memory");
+        return rc;
+    }
+
+    rc = do_xen_hypercall(xc_handle, &hypercall);
+
+    unlock_pages(&arg, sizeof(arg));
+
+    return rc;
+}
+
+int xc_hvm_modified_memory(
+    int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr)
+{
+    DECLARE_HYPERCALL;
+    struct xen_hvm_modified_memory arg;
+    int rc;
+
+    hypercall.op     = __HYPERVISOR_hvm_op;
+    hypercall.arg[0] = HVMOP_modified_memory;
+    hypercall.arg[1] = (unsigned long)&arg;
+
+    arg.domid     = dom;
+    arg.first_pfn = first_pfn;
+    arg.nr        = nr;
 
     if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 )
     {
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_pagetab.c
--- a/tools/libxc/xc_pagetab.c  Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_pagetab.c  Wed Jul 02 11:30:37 2008 +0900
@@ -48,7 +48,7 @@ unsigned long xc_translate_foreign_addre
 unsigned long xc_translate_foreign_address(int xc_handle, uint32_t dom,
                                            int vcpu, unsigned long long virt )
 {
-    vcpu_guest_context_t ctx;
+    vcpu_guest_context_any_t ctx;
     unsigned long long cr3;
     void *pd, *pt, *pdppage = NULL, *pdp, *pml = NULL;
     unsigned long long pde, pte, pdpe, pmle;
@@ -78,7 +78,7 @@ unsigned long xc_translate_foreign_addre
         DPRINTF("failed to retreive vcpu context\n");
         goto out;
     }
-    cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.ctrlreg[3])) << PAGE_SHIFT;
+    cr3 = ((unsigned long long)xen_cr3_to_pfn(ctx.c.ctrlreg[3])) << PAGE_SHIFT;
 
     /* Page Map Level 4 */
 
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_private.h
--- a/tools/libxc/xc_private.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_private.h  Wed Jul 02 11:30:37 2008 +0900
@@ -188,9 +188,9 @@ int xc_map_foreign_ranges(int xc_handle,
                           privcmd_mmap_entry_t *entries, int nr);
 
 void *map_domain_va_core(unsigned long domfd, int cpu, void *guest_va,
-                         vcpu_guest_context_t *ctxt);
+                         vcpu_guest_context_any_t *ctxt);
 int xc_waitdomain_core(int xc_handle, int domain, int *status,
-    int options, vcpu_guest_context_t *ctxt);
+    int options, vcpu_guest_context_any_t *ctxt);
 
 void bitmap_64_to_byte(uint8_t *bp, const uint64_t *lp, int nbits);
 void bitmap_byte_to_64(uint64_t *lp, const uint8_t *bp, int nbits);
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_ptrace.c
--- a/tools/libxc/xc_ptrace.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_ptrace.c   Wed Jul 02 11:30:37 2008 +0900
@@ -40,9 +40,9 @@ static int current_isfile;
 static int current_isfile;
 static int current_is_hvm;
 
-static uint64_t                 online_cpumap;
-static uint64_t                 regs_valid;
-static vcpu_guest_context_t     ctxt[MAX_VIRT_CPUS];
+static uint64_t                         online_cpumap;
+static uint64_t                         regs_valid;
+static vcpu_guest_context_any_t      ctxt[MAX_VIRT_CPUS];
 
 extern int ffsll(long long int);
 #define FOREACH_CPU(cpumap, i)  for ( cpumap = online_cpumap; (i = ffsll(cpumap)); cpumap &= ~(1 << (index - 1)) )
@@ -96,9 +96,9 @@ xc_register_event_handler(thr_ev_handler
 }
 
 static inline int
-paging_enabled(vcpu_guest_context_t *v)
-{
-    unsigned long cr0 = v->ctrlreg[0];
+paging_enabled(vcpu_guest_context_any_t *v)
+{
+    unsigned long cr0 = v->c.ctrlreg[0];
     return (cr0 & X86_CR0_PE) && (cr0 & X86_CR0_PG);
 }
 
@@ -174,7 +174,7 @@ map_domain_va_32(
 
     l2 = xc_map_foreign_range(
          xc_handle, current_domid, PAGE_SIZE, PROT_READ,
-         xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
+         xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
     if ( l2 == NULL )
         return NULL;
 
@@ -216,7 +216,7 @@ map_domain_va_pae(
 
     l3 = xc_map_foreign_range(
         xc_handle, current_domid, PAGE_SIZE, PROT_READ,
-        xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
+        xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
     if ( l3 == NULL )
         return NULL;
 
@@ -264,12 +264,12 @@ map_domain_va_64(
     uint64_t *l4, *l3, *l2, *l1;
     static void *v[MAX_VIRT_CPUS];
 
-    if ((ctxt[cpu].ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */
+    if ((ctxt[cpu].c.ctrlreg[4] & 0x20) == 0 ) /* legacy ia32 mode */
         return map_domain_va_32(xc_handle, cpu, guest_va, perm);
 
     l4 = xc_map_foreign_range(
         xc_handle, current_domid, PAGE_SIZE, PROT_READ,
-        xen_cr3_to_pfn(ctxt[cpu].ctrlreg[3]));
+        xen_cr3_to_pfn(ctxt[cpu].c.ctrlreg[3]));
     if ( l4 == NULL )
         return NULL;
 
@@ -494,26 +494,26 @@ xc_ptrace(
     case PTRACE_GETREGS:
         if (!current_isfile && fetch_regs(xc_handle, cpu, NULL))
             goto out_error;
-        SET_PT_REGS(pt, ctxt[cpu].user_regs);
+        SET_PT_REGS(pt, ctxt[cpu].c.user_regs);
         memcpy(data, &pt, sizeof(struct gdb_regs));
         break;
 
     case PTRACE_GETFPREGS:
         if (!current_isfile && fetch_regs(xc_handle, cpu, NULL)) 
                 goto out_error;
-        memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof (elf_fpregset_t));
+        memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof (elf_fpregset_t));
         break;
 
     case PTRACE_GETFPXREGS:
         if (!current_isfile && fetch_regs(xc_handle, cpu, NULL))
                 goto out_error;
-        memcpy(data, &ctxt[cpu].fpu_ctxt, sizeof(ctxt[cpu].fpu_ctxt));
+        memcpy(data, &ctxt[cpu].c.fpu_ctxt, sizeof(ctxt[cpu].c.fpu_ctxt));
         break;
 
     case PTRACE_SETREGS:
         if (current_isfile)
                 goto out_unsupported; /* XXX not yet supported */
-        SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].user_regs);
+        SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].c.user_regs);
         if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
                                 &ctxt[cpu])))
             goto out_error_domctl;
@@ -525,7 +525,7 @@ xc_ptrace(
         /*  XXX we can still have problems if the user switches threads
          *  during single-stepping - but that just seems retarded
          */
-        ctxt[cpu].user_regs.eflags |= PSL_T;
+        ctxt[cpu].c.user_regs.eflags |= PSL_T;
         if ((retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu,
                                 &ctxt[cpu])))
             goto out_error_domctl;
@@ -542,9 +542,9 @@ xc_ptrace(
                 if (fetch_regs(xc_handle, cpu, NULL))
                     goto out_error;
                 /* Clear trace flag */
-                if ( ctxt[cpu].user_regs.eflags & PSL_T )
+                if ( ctxt[cpu].c.user_regs.eflags & PSL_T )
                 {
-                    ctxt[cpu].user_regs.eflags &= ~PSL_T;
+                    ctxt[cpu].c.user_regs.eflags &= ~PSL_T;
                     if ((retval = xc_vcpu_setcontext(xc_handle, current_domid,
                                                 cpu, &ctxt[cpu])))
                         goto out_error_domctl;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_ptrace_core.c
--- a/tools/libxc/xc_ptrace_core.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_ptrace_core.c      Wed Jul 02 11:30:37 2008 +0900
@@ -641,24 +641,24 @@ static const struct xc_core_format_type*
 
 void *
 map_domain_va_core(unsigned long domfd, int cpu, void *guest_va,
-                   vcpu_guest_context_t *ctxt)
+                   vcpu_guest_context_any_t *ctxt)
 {
     if (current_format_type == NULL)
         return NULL;
     return (current_format_type->map_domain_va_core)(domfd, cpu, guest_va,
-                                                     ctxt);
+                                                     &ctxt->c);
 }
 
 int
 xc_waitdomain_core(int xc_handle, int domfd, int *status, int options,
-                   vcpu_guest_context_t *ctxt)
+                   vcpu_guest_context_any_t *ctxt)
 {
     int ret;
     int i;
 
     for (i = 0; i < NR_FORMAT_TYPE; i++) {
         ret = (format_type[i].waitdomain_core)(xc_handle, domfd, status,
-                                               options, ctxt);
+                                               options, &ctxt->c);
         if (ret == 0) {
             current_format_type = &format_type[i];
             break;
diff -r 11318234588e -r 08f77df14cba tools/libxc/xc_resume.c
--- a/tools/libxc/xc_resume.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xc_resume.c   Wed Jul 02 11:30:37 2008 +0900
@@ -13,7 +13,7 @@
 
 static int modify_returncode(int xc_handle, uint32_t domid)
 {
-    vcpu_guest_context_either_t ctxt;
+    vcpu_guest_context_any_t ctxt;
     xc_dominfo_t info;
     xen_capabilities_info_t caps;
     int rc;
@@ -39,7 +39,7 @@ static int modify_returncode(int xc_hand
         return -1;
     }
 
-    if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt.c)) != 0 )
+    if ( (rc = xc_vcpu_getcontext(xc_handle, domid, 0, &ctxt)) != 0 )
         return rc;
 
     if ( !info.hvm )
@@ -49,7 +49,7 @@ static int modify_returncode(int xc_hand
     else
         ctxt.x32.user_regs.eax = 1;
 
-    if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt.c)) != 0 )
+    if ( (rc = xc_vcpu_setcontext(xc_handle, domid, 0, &ctxt)) != 0 )
         return rc;
 
     return 0;
@@ -89,7 +89,7 @@ static int xc_domain_resume_any(int xc_h
     int i, rc = -1;
 #if defined(__i386__) || defined(__x86_64__)
     unsigned long mfn, p2m_size = 0;
-    vcpu_guest_context_t ctxt;
+    vcpu_guest_context_any_t ctxt;
     start_info_t *start_info;
     shared_info_t *shinfo = NULL;
     xen_pfn_t *p2m_frame_list_list = NULL;
@@ -167,7 +167,7 @@ static int xc_domain_resume_any(int xc_h
         goto out;
     }
 
-    mfn = ctxt.user_regs.edx;
+    mfn = ctxt.c.user_regs.edx;
 
     start_info = xc_map_foreign_range(xc_handle, domid, PAGE_SIZE,
                                       PROT_READ | PROT_WRITE, mfn);
diff -r 11318234588e -r 08f77df14cba tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xenctrl.h     Wed Jul 02 11:30:37 2008 +0900
@@ -30,6 +30,11 @@
 #include <xen/xsm/acm.h>
 #include <xen/xsm/acm_ops.h>
 #include <xen/xsm/flask_op.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <xen/foreign/x86_32.h>
+#include <xen/foreign/x86_64.h>
+#endif
 
 #ifdef __ia64__
 #define XC_PAGE_SHIFT           14
@@ -162,6 +167,35 @@ typedef struct xc_dominfo {
 } xc_dominfo_t;
 
 typedef xen_domctl_getdomaininfo_t xc_domaininfo_t;
+
+typedef union 
+{
+#if defined(__i386__) || defined(__x86_64__)
+    vcpu_guest_context_x86_64_t x64;
+    vcpu_guest_context_x86_32_t x32;   
+#endif
+    vcpu_guest_context_t c;
+} vcpu_guest_context_any_t;
+
+typedef union
+{
+#if defined(__i386__) || defined(__x86_64__)
+    shared_info_x86_64_t x64;
+    shared_info_x86_32_t x32;
+#endif
+    shared_info_t s;
+} shared_info_any_t;
+
+typedef union
+{
+#if defined(__i386__) || defined(__x86_64__)
+    start_info_x86_64_t x64;
+    start_info_x86_32_t x32;
+#endif
+    start_info_t s;
+} start_info_any_t;
+
+
 int xc_domain_create(int xc_handle,
                      uint32_t ssidref,
                      xen_domain_handle_t handle,
@@ -307,7 +341,7 @@ int xc_vcpu_setcontext(int xc_handle,
 int xc_vcpu_setcontext(int xc_handle,
                        uint32_t domid,
                        uint32_t vcpu,
-                       vcpu_guest_context_t *ctxt);
+                       vcpu_guest_context_any_t *ctxt);
 /**
  * This function will return information about one or more domains, using a
  * single hypercall.  The domain information will be stored into the supplied
@@ -368,7 +402,7 @@ int xc_vcpu_getcontext(int xc_handle,
 int xc_vcpu_getcontext(int xc_handle,
                        uint32_t domid,
                        uint32_t vcpu,
-                       vcpu_guest_context_t *ctxt);
+                       vcpu_guest_context_any_t *ctxt);
 
 typedef xen_domctl_getvcpuinfo_t xc_vcpuinfo_t;
 int xc_vcpu_getinfo(int xc_handle,
@@ -894,6 +928,12 @@ int xc_hvm_track_dirty_vram(
     int xc_handle, domid_t dom,
     uint64_t first_pfn, uint64_t nr,
     unsigned long *bitmap);
+
+/*
+ * Notify that some pages got modified by the Device Model
+ */
+int xc_hvm_modified_memory(
+    int xc_handle, domid_t dom, uint64_t first_pfn, uint64_t nr);
 
 typedef enum {
   XC_ERROR_NONE = 0,
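The new xc_hvm_modified_memory() declared above is what the stubdom ioemu path in exec-dm.c uses instead of the shared-memory log-dirty bitmap. A minimal caller sketch, assuming 4K guest pages and an open xc_handle; the helper name report_dirty is hypothetical:

    #include <stdio.h>
    #include <stdint.h>
    #include <xenctrl.h>

    #define GUEST_PAGE_SHIFT 12   /* assumption: 4K guest pages */

    /* Tell Xen which guest frames the device model has written,
     * as the CONFIG_STUBDOM path in exec-dm.c now does. */
    static void report_dirty(int xc_handle, domid_t domid,
                             uint64_t addr, uint64_t len)
    {
        uint64_t first_pfn = addr >> GUEST_PAGE_SHIFT;
        uint64_t last_pfn  = (addr + len - 1) >> GUEST_PAGE_SHIFT;

        if (xc_hvm_modified_memory(xc_handle, domid, first_pfn,
                                   last_pfn - first_pfn + 1))
            fprintf(stderr, "xc_hvm_modified_memory failed\n");
    }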
diff -r 11318234588e -r 08f77df14cba tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/libxc/xg_save_restore.h     Wed Jul 02 11:30:37 2008 +0900
@@ -112,28 +112,6 @@ static inline int get_platform_info(int 
 #define is_mapped(pfn_type) (!((pfn_type) & 0x80000000UL))
 
 
-/* 32-on-64 support: saving 32bit guests from 64bit tools and vice versa */
-typedef union 
-{
-    vcpu_guest_context_x86_64_t x64;
-    vcpu_guest_context_x86_32_t x32;   
-    vcpu_guest_context_t c;
-} vcpu_guest_context_either_t;
-
-typedef union 
-{
-    shared_info_x86_64_t x64;
-    shared_info_x86_32_t x32;   
-    shared_info_t s;
-} shared_info_either_t;
-
-typedef union 
-{
-    start_info_x86_64_t x64;
-    start_info_x86_32_t x32;   
-    start_info_t s;
-} start_info_either_t;
-
 #define GET_FIELD(_p, _f) ((guest_width==8) ? ((_p)->x64._f) : ((_p)->x32._f))
 
 #define SET_FIELD(_p, _f, _v) do {              \
diff -r 11318234588e -r 08f77df14cba tools/python/xen/util/blkif.py
--- a/tools/python/xen/util/blkif.py    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/util/blkif.py    Wed Jul 02 11:30:37 2008 +0900
@@ -16,8 +16,11 @@ def blkdev_name_to_number(name):
 
     n = expand_dev_name(name)
 
+    devname = 'virtual-device'
+    devnum = None
+
     try:
-        return os.stat(n).st_rdev
+        return (devname, os.stat(n).st_rdev)
     except Exception, ex:
         pass
 
@@ -25,28 +28,30 @@ def blkdev_name_to_number(name):
     if re.match( '/dev/sd[a-z]([1-9]|1[0-5])?$', n):
         major = scsi_major[(ord(n[7:8]) - ord('a')) / 16]
         minor = ((ord(n[7:8]) - ord('a')) % 16) * 16 + int(n[8:] or 0)
-        return major * 256 + minor
-    if re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n):
+        devnum = major * 256 + minor
+    elif re.match( '/dev/sd[a-i][a-z]([1-9]|1[0-5])?$', n):
         major = scsi_major[((ord(n[7:8]) - ord('a') + 1) * 26 + (ord(n[8:9]) - ord('a'))) / 16 ]
         minor = (((ord(n[7:8]) - ord('a') + 1 ) * 26 + (ord(n[8:9]) - ord('a'))) % 16) * 16 + int(n[9:] or 0)
-        return major * 256 + minor
-
-    if re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n):
+        devnum = major * 256 + minor
+    elif re.match( '/dev/hd[a-t]([1-9]|[1-5][0-9]|6[0-3])?', n):
         ide_majors = [ 3, 22, 33, 34, 56, 57, 88, 89, 90, 91 ]
         major = ide_majors[(ord(n[7:8]) - ord('a')) / 2]
         minor = ((ord(n[7:8]) - ord('a')) % 2) * 64 + int(n[8:] or 0)
-        return major * 256 + minor
+        devnum = major * 256 + minor
+    elif re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?$', n):
+        devnum = (202 << 8) + ((ord(n[8:9]) - ord('a')) << 4) + int(n[9:] or 0)
+    elif re.match('/dev/xvd[q-z]([1-9]|1[0-5])?$', n):
+        devname = 'virtual-device-ext'
+        devnum = (1 << 28) + ((ord(n[8:9]) - ord('a')) << 8) + int(n[9:] or 0)
+    elif re.match('/dev/xvd[a-i][a-z]([1-9]|1[0-5])?$', n):
+        devname = 'virtual-device-ext'
+        devnum = (1 << 28) + (((ord(n[8:9]) - ord('a') + 1) * 26 + (ord(n[9:10]) - ord('a'))) << 8) + int(n[10:] or 0)
+    elif re.match( '^(0x)[0-9a-fA-F]+$', name ):
+        devnum = string.atoi(name, 16)
+    elif re.match('^[0-9]+$', name):
+        devnum = string.atoi(name, 10)
 
-    if re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?', n):
-        return 202 * 256 + 16 * (ord(n[8:9]) - ord('a')) + int(n[9:] or 0)
-
-    if re.match( '^(0x)[0-9a-fA-F]+$', name ):
-        return string.atoi(name,16)
-
-    if re.match('^[0-9]+$', name):
-        return string.atoi(name, 10)
-
-    return None
+    return (devname, devnum)
 
 def blkdev_segment(name):
     """Take the given block-device name (e.g. '/dev/sda1', 'hda')
@@ -58,7 +63,7 @@ def blkdev_segment(name):
         type:         'Disk' or identifying name for partition type
     """
     val = None
-    n = blkdev_name_to_number(name)
+    (name, n) = blkdev_name_to_number(name)
     if not n is None:
         val = { 'device'       : n,
                 'start_sector' : long(0),
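
The rewritten blkdev_name_to_number() above now returns a (xenstore key, device number) pair, with names past /dev/xvdp landing in the new 'virtual-device-ext' space. A minimal standalone sketch of the xvd encodings it uses (single-letter names only; the helper and values here are worked examples, not part of the changeset):

    # Sketch of the xvd numbering scheme implemented above (illustrative only).
    def xvd_number(disk_index, partition):
        if disk_index < 16:
            # classic space /dev/xvda../dev/xvdp: major 202, 4-bit partition field
            return ('virtual-device', (202 << 8) | (disk_index << 4) | partition)
        # extended space /dev/xvdq and beyond: bit 28 set, 8-bit partition field
        return ('virtual-device-ext', (1 << 28) | (disk_index << 8) | partition)

    print(xvd_number(0, 1))    # /dev/xvda1 -> ('virtual-device', 51713)
    print(xvd_number(16, 1))   # /dev/xvdq1 -> ('virtual-device-ext', 268439553)

The first element of the pair is the xenstore key the frontend entry is written under, which is why server/blkif.py further below stops hard-coding 'virtual-device'.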
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/XendConfig.py       Wed Jul 02 11:30:37 2008 +0900
@@ -1123,7 +1123,7 @@ class XendConfig(dict):
             try:
                 devid = int(dev2)
             except ValueError:
-                devid = blkdev_name_to_number(dev2)
+                (xenbus, devid) = blkdev_name_to_number(dev2)
                 if devid == None:
                     log.debug("The device %s is not device name", dev2)
                     return None
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/XendOptions.py
--- a/tools/python/xen/xend/XendOptions.py      Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/XendOptions.py      Wed Jul 02 11:30:37 2008 +0900
@@ -132,6 +132,9 @@ class XendOptions:
     """Default script to configure a backend network interface"""
     vif_script = osdep.vif_script
 
+    """Default rotation count of qemu-dm log file."""
+    qemu_dm_logrotate_count = 10
+
     def __init__(self):
         self.configure()
 
@@ -350,6 +353,10 @@ class XendOptions:
 
     def get_vnc_x509_verify(self):
         return self.get_config_string('vnc-x509-verify', self.xend_vnc_x509_verify)
+
+    def get_qemu_dm_logrotate_count(self):
+        return self.get_config_int("qemu-dm-logrotate-count",
+                                   self.qemu_dm_logrotate_count)
 
 
 class XendOptionsFile(XendOptions):
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/image.py    Wed Jul 02 11:30:37 2008 +0900
@@ -378,13 +378,23 @@ class ImageHandler:
         # keep track of pid and spawned options to kill it later
 
         self.logfile = "/var/log/xen/qemu-dm-%s.log" % str(self.vm.info['name_label'])
-        if os.path.exists(self.logfile):
-            if os.path.exists(self.logfile + ".1"):
-                os.unlink(self.logfile + ".1")
-            os.rename(self.logfile, self.logfile + ".1")
+
+        # rotate log
+        logfile_mode = os.O_WRONLY|os.O_CREAT|os.O_APPEND
+        logrotate_count = XendOptions.instance().get_qemu_dm_logrotate_count()
+        if logrotate_count > 0:
+            logfile_mode |= os.O_TRUNC
+            if os.path.exists("%s.%d" % (self.logfile, logrotate_count)):
+                os.unlink("%s.%d" % (self.logfile, logrotate_count))
+            for n in range(logrotate_count - 1, 0, -1):
+                if os.path.exists("%s.%d" % (self.logfile, n)):
+                    os.rename("%s.%d" % (self.logfile, n),
+                              "%s.%d" % (self.logfile, (n + 1)))
+            if os.path.exists(self.logfile):
+                os.rename(self.logfile, self.logfile + ".1")
 
         null = os.open("/dev/null", os.O_RDONLY)
-        logfd = os.open(self.logfile, os.O_WRONLY|os.O_CREAT|os.O_TRUNC|os.O_APPEND)
+        logfd = os.open(self.logfile, logfile_mode)
         
         sys.stderr.flush()
         pid = os.fork()
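
The rotation above keeps at most qemu-dm-logrotate-count old logs per domain. A standalone restatement of the same shuffle (illustrative only; rotate() is a hypothetical helper, not part of xend):

    import os

    def rotate(logfile, count):
        # drop the oldest log, shift .n to .(n+1), then the current log becomes .1
        if os.path.exists("%s.%d" % (logfile, count)):
            os.unlink("%s.%d" % (logfile, count))
        for n in range(count - 1, 0, -1):
            if os.path.exists("%s.%d" % (logfile, n)):
                os.rename("%s.%d" % (logfile, n), "%s.%d" % (logfile, n + 1))
        if os.path.exists(logfile):
            os.rename(logfile, logfile + ".1")

    # e.g. rotate("/var/log/xen/qemu-dm-guest.log", 10) before reopening the log with O_TRUNC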
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xend/server/blkif.py
--- a/tools/python/xen/xend/server/blkif.py     Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xend/server/blkif.py     Wed Jul 02 11:30:37 2008 +0900
@@ -81,11 +81,11 @@ class BlkifController(DevController):
         if security.on() == xsconstants.XS_POLICY_ACM:
             self.do_access_control(config, uname)
 
-        devid = blkif.blkdev_name_to_number(dev)
+        (device_path, devid) = blkif.blkdev_name_to_number(dev)
         if devid is None:
             raise VmError('Unable to find number for device (%s)' % (dev))
 
-        front = { 'virtual-device' : "%i" % devid,
+        front = { device_path : "%i" % devid,
                   'device-type' : dev_type
                 }
 
@@ -204,5 +204,5 @@ class BlkifController(DevController):
                 dev = devid.split('/')[-1]
                 dev = int(dev)
             except ValueError:
-                dev = blkif.blkdev_name_to_number(dev)
+                (device_path, dev) = blkif.blkdev_name_to_number(dev)
         return dev
diff -r 11318234588e -r 08f77df14cba tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/python/xen/xm/main.py       Wed Jul 02 11:30:37 2008 +0900
@@ -2022,8 +2022,7 @@ def xm_block_list(args):
             map(server.xenapi.VBD.get_runtime_properties, vbd_refs)
         vbd_devs = \
             map(server.xenapi.VBD.get_device, vbd_refs)
-        vbd_devids = \
-            map(blkdev_name_to_number, vbd_devs)
+        vbd_devids = [blkdev_name_to_number(x)[1] for x in vbd_devs]
         devs = map(lambda (devid, prop): [devid, map2sxp(prop)],
                    zip(vbd_devids, vbd_properties))
     else:
diff -r 11318234588e -r 08f77df14cba tools/tests/test_x86_emulator.c
--- a/tools/tests/test_x86_emulator.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/tests/test_x86_emulator.c   Wed Jul 02 11:30:37 2008 +0900
@@ -22,23 +22,22 @@ static int read(
 static int read(
     unsigned int seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    *val = 0;
-    memcpy(val, (void *)offset, bytes);
+    memcpy(p_data, (void *)offset, bytes);
     return X86EMUL_OKAY;
 }
 
 static int write(
     unsigned int seg,
     unsigned long offset,
-    unsigned long val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    memcpy((void *)offset, &val, bytes);
+    memcpy((void *)offset, p_data, bytes);
     return X86EMUL_OKAY;
 }
 
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoon-monitor
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoon-monitor       Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# xenballoon-monitor - monitor certain stats from xenballoond
+#   (run in dom0 with "watch -d xenballoon-monitor" for xentop-like output)
+#
+# Copyright (C) 2009 Oracle Corporation and/or its affiliates.
+# All rights reserved
+# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+#
+# Hint: Use "xm sched-credit -d 0 -w 2000" to watch on heavily loaded machines
+#
+echo "id   mem-kb  tgt-kb  commit   swapin  swapout      pgin     pgout 
active(sec)"
+for i in `xenstore-list /local/domain`; do
+ if [ "$i" -ne 0 ]; then
+ tot=0; tgt=0; sin=0; sout=0; pgin=0; pgout=0; cmt=0; up=0; idle=0; act=0;
+ if xenstore-exists /local/domain/$i/memory/meminfo; then
+  tot=`xenstore-read /local/domain/$i/memory/meminfo | grep MemTotal \
+   | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'`
+  cmt=`xenstore-read /local/domain/$i/memory/meminfo | grep Committed_AS \
+   | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'`
+ fi
+ if xenstore-exists /local/domain/$i/memory/selftarget; then
+  tgt=`xenstore-read /local/domain/$i/memory/selftarget`
+ fi
+ if xenstore-exists /local/domain/$i/memory/vmstat; then
+  sin=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpin \
+       | cut -d" " -f2`
+  sout=`xenstore-read /local/domain/$i/memory/vmstat | grep pswpout \
+       | cut -d" " -f2`
+  pgin=`xenstore-read /local/domain/$i/memory/vmstat | grep pgpgin \
+       | cut -d" " -f2`
+  pgout=`xenstore-read /local/domain/$i/memory/vmstat | grep pgout \
+       | cut -d" " -f2`
+ fi
+ if xenstore-exists /local/domain/$i/memory/uptime; then
+  up=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f1`
+  idle=`xenstore-read /local/domain/$i/memory/uptime | cut -d" " -f2`
+  act=`echo $up - $idle | bc -iq`
+ fi
+ printf "%2d %8d%8d%8d%9d%9d%10d%10d%10.2f\n" $i $tot $tgt $cmt $sin $sout 
$pgin $pgout $act
+ fi
+done
+echo Free memory: `xm info | grep free | sed 's/[^1-9]*\([1-9][0-9]*\).*/\1/'` MB
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoon.conf
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoon.conf  Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,91 @@
+## Path: System/xen
+## Description: xen domain start/stop on boot
+## Type: string
+## Default: 
+
+# NOTE: "xenbus is enabled" means not only that /proc/xen/xenbus exists
+# but also that /usr/bin/xenstore-* tools are installed.
+
+## Type: boolean
+## Default: false
+#
+# If XENBALLOON_SELF is true, selfballooning will occur, meaning the
+# balloon driver will grow and shrink according to available memory.
+# If xenbus is enabled, may be overridden by {memory/selfballoon}==0
+# If false but xenballoond is able to communicate with domain0 via
+# xenbus, balloon targets will be set by domain0
+# 
+XENBALLOON_SELF=false
+
+## Type: integer (must be > 0)
+## Default: 1
+#
+# If self-ballooning, number of seconds between checks/adjustments.
+# If xenbus is enabled, may be overridden by {memory/interval}
+XENBALLOON_SELF_INTERVAL=1
+
+## Type: integer (must be > 0)
+## Default: 1
+#
+# If NOT self-ballooning but xenbus is enabled, number of seconds between
+# checks/adjustments. May be overridden by {memory/interval}
+XENBALLOON_INTERVAL=1
+
+## Type: integer (must be > 0)
+## Default: 10
+#
+# When current > target, reduces rate at which target memory is ballooned
+# out.  For a value of n, 1/n of the difference will be ballooned.
+# This value applies both to selfballooning and directed ballooning.
+# May be overridden by {memory/downhysteresis}
+XENBALLOON_AUTO_DOWNHYSTERESIS=10
+
+## Type: integer (must be > 0)
+## Default: 1
+#
+# When current < target, reduces rate at which target memory is reclaimed
+# (if available).  For a value of n, 1/n of the difference will be ballooned.
+# This value applies both to selfballooning and directed ballooning.
+# May be overridden by {memory/uphysteresis}
+XENBALLOON_AUTO_UPHYSTERESIS=1
+
+## Type: integer (must be >= 0)
+## Default: 0
+#
+# In order to avoid ballooning so much memory that a guest experiences
+# out-of-memory errors (OOMs), memory will not be ballooned out below
+# a minimum target, in MB.  If this value is 0 (default), a heuristic
+# based on the maximum amount of memory will be used.  (The heuristic
+# provides the same minimum as recent versions of the balloon driver but
+# early versions of the balloon driver did not enforce a minimum.)
+XENBALLOON_MINMEM=0
+
+## Type: string
+## Default: "/var/run/xenballoon-maxmem"
+#
+# Location where memory high-water mark is stored; if a guest supports
+# hot-add memory, maxmem might increase across time and the minimum
+# target heuristic is based on max memory. NOTE: Reboot after changing
+# this variable, else overballooning may occur.
+XENBALLOON_MAXMEMFILE=/var/run/xenballoon-maxmem
+
+## Type: integer (0 or 1)
+## Default: 1
+#
+# If xenbus is enabled, whether selfballooning or directed ballooning,
+# place the result of 'cat /proc/meminfo' on xenbus at memory/meminfo
+XENBALLOON_SEND_MEMINFO=1
+
+## Type: integer (0 or 1)
+## Default: 1
+#
+# If xenbus is enabled, whether selfballooning or directed ballooning,
+# place the result of 'cat /proc/vmstat' on xenbus at memory/vmstat
+XENBALLOON_SEND_VMSTAT=1
+
+## Type: integer (0 or 1)
+## Default: 1
+#
+# If xenbus is enabled, whether selfballooning or directed ballooning,
+# place the result of 'cat /proc/uptime' on xenbus at memory/uptime
+XENBALLOON_SEND_UPTIME=1
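
To make the two hysteresis settings concrete: on each pass only 1/n of the gap between current and target memory is ballooned, so with the defaults above a surplus drains gradually while a shortfall is reclaimed at once. A small worked example (Python, purely illustrative; balloon_to_target in the daemon below performs the same arithmetic in shell):

    # One ballooning step with down/up hysteresis (illustrative only).
    def balloon_step(cur_bytes, tgt_bytes, downhys=10, uphys=1):
        if cur_bytes > tgt_bytes and downhys != 0:
            # release only 1/downhys of the surplus this interval
            return cur_bytes - (cur_bytes - tgt_bytes) // downhys
        if cur_bytes < tgt_bytes and uphys != 0:
            # reclaim only 1/uphys of the shortfall this interval
            return cur_bytes + (tgt_bytes - cur_bytes) // uphys
        return tgt_bytes

    MB = 1024 * 1024
    print(balloon_step(1024 * MB, 512 * MB) // MB)   # 972: about 51 of the 512 MB surplus released
    print(balloon_step(512 * MB, 1024 * MB) // MB)   # 1024: full shortfall reclaimed at uphys=1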
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoond      Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,205 @@
+#!/bin/bash
+#
+# Copyright (C) 2008 Oracle Corporation and/or its affiliates.
+# All rights reserved.
+# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+#
+# xenballoond - In-guest engine for Xen memory ballooning
+# Version: 080630
+#
+# Two "policies" are implemented:
+# - Selfballooning: Adjust memory periodically, with no (or little) input
+#     from domain0.  Target memory is determined solely by the
+#     Committed_AS line in /proc/meminfo, but parameters may adjust
+#     the rate at which the target is achieved.
+# - Directed ballooning: Adjust memory solely as directed by domain0
+#
+# Under some circumstances, "output" may also be generated; the contents
+# of /proc/meminfo and /proc/vmstat may be periodically placed on xenbus.
+#
+# If xenbus is running and the /usr/bin/xenstore-* tools are installed,
+# "xenbus is enabled".
+#
+# Parameters are documented in /etc/sysconfig/xenballoon.conf. Although 
+# some are not used with directed ballooning, all must be set properly.
+# If xenbus is enabled, some of these parameters may be overridden by values
+# set by domain0 via xenbus.
+
+minmb() {
+       RETVAL=$XENBALLOON_MINMEM
+       if [ $RETVAL -ne 0 ]; then
+               return $RETVAL
+       fi
+       kb=`cat $XENBALLOON_MAXMEMFILE`
+       let "mb=$kb/1024"
+       let "pages=$kb/4"
+       # this algorithm from drivers/xen/balloon/balloon.c:minimum_target()
+       # which was added to balloon.c in 2008 to avoid ballooning too small
+       # it is unnecessary here except to accommodate pre-2008 balloon drivers
+       # note that ranges are adjusted because a VM with "memory=1024"
+       # gets somewhat less than 1024MB
+       if [ $mb -lt 125 ]; then
+               let RETVAL="$(( 8 + ($pages >> 9) ))"
+       elif [ $mb -lt 500 ]; then
+               let RETVAL="$(( 40 + ($pages >> 10) ))"
+       elif [ $mb -lt 2000 ]; then
+               let RETVAL="$(( 104 + ($pages >> 11) ))"
+       else
+               let RETVAL="$(( 296 + ($pages >> 13) ))"
+       fi
+       return  # value returned in RETVAL in MB
+}
+
+curkb() {
+       kb=`grep MemTotal /proc/meminfo | sed 's/  */ /' | \
+               cut -f2 -d' '`
+       RETVAL=$kb
+       return  # value returned in RETVAL in kB
+}
+
+downhysteresis() {
+       RETVAL=$XENBALLOON_AUTO_DOWNHYSTERESIS
+       if [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/downhysteresis ; then
+                       RETVAL=`xenstore-read memory/downhysteresis`
+               fi
+       fi
+       return
+}
+
+uphysteresis() {
+       RETVAL=$XENBALLOON_AUTO_UPHYSTERESIS
+       if [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/uphysteresis ; then
+                       RETVAL=`xenstore-read memory/uphysteresis`
+               fi
+       fi
+       return
+}
+
+selfballoon_eval() {
+       if [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/selfballoon; then
+                       RETVAL=`xenstore-read memory/selfballoon`
+                       if [ $RETVAL -eq 1 ]; then
+                               selfballoon_enabled=true
+                               return
+                       fi
+               fi
+       fi
+       selfballoon_enabled=$XENBALLOON_SELF
+       return
+}
+
+selftarget() {
+       tgtkb=`grep Committed_AS /proc/meminfo | sed 's/  */ /' | cut -f2 -d' '`
+       minmb
+       let "minbytes=$RETVAL*1024*1024"
+       let "tgtbytes=$tgtkb*1024"
+       if [ $tgtbytes -lt $minbytes ]; then
+               let "tgtbytes=$minbytes"
+       fi
+       RETVAL=$tgtbytes  # value returned in RETVAL in bytes
+       return
+}
+
+# $1 == 1 means use selftarget, else target in kB
+balloon_to_target() {
+       if [ "$1" -eq 1 ]; then
+               selftarget
+               tgtbytes=$RETVAL
+       else
+               let "tgtbytes=$(( $1 * 1024 ))"
+       fi
+       curkb
+       let "curbytes=$RETVAL*1024"
+       if [ $curbytes -gt $tgtbytes ]; then
+               downhysteresis
+               downhys=$RETVAL
+               if [ $downhys -ne 0 ]; then
+                       let "tgtbytes=$(( $curbytes - \
+                               ( ( $curbytes - $tgtbytes ) / $downhys ) ))"
+               fi
+       else if [ $curbytes -lt $tgtbytes ]; then
+               uphysteresis
+               uphys=$RETVAL
+               let "tgtbytes=$(( $curbytes + \
+                               ( ( $tgtbytes - $curbytes ) / $uphys ) ))"
+               fi
+       fi
+       echo $tgtbytes > /proc/xen/balloon
+       if [ $xenstore_enabled = "true" ]; then
+               let "tgtkb=$(( $tgtbytes/1024 ))"
+               xenstore-write memory/selftarget $tgtkb
+       fi
+}
+
+send_memory_stats() {
+       if [ ! $xenstore_enabled = "true" ]; then
+               return
+       fi
+       if [ $XENBALLOON_SEND_MEMINFO ]; then
+               xenstore-write memory/meminfo "`cat /proc/meminfo`"
+       fi
+       if [ $XENBALLOON_SEND_VMSTAT ]; then
+               xenstore-write memory/vmstat "`cat /proc/vmstat`"
+       fi
+       if [ $XENBALLOON_SEND_UPTIME ]; then
+               xenstore-write memory/uptime "`cat /proc/uptime`"
+       fi
+}
+
+if [ ! -f /proc/xen/balloon ]; then
+       echo "$0: no balloon driver installed"
+       exit 0
+fi
+if [ ! -f /proc/meminfo ]; then
+       echo "$0: can't read /proc/meminfo"
+       exit 0
+fi
+xenstore_enabled=true
+if [ -f /usr/bin/xenstore-exists -a -f /usr/bin/xenstore-read -a \
+     -f /usr/bin/xenstore-write ]; then
+       xenstore_enabled=true
+else
+       echo "$0: missing /usr/bin/xenstore-* tools, disabling directed ballooning"
+       xenstore_enabled=false
+fi
+
+. /etc/sysconfig/xenballoon.conf
+
+while true;
+do
+       # handle special case for PV domains with hot-add memory
+       if [ ! -f $XENBALLOON_MAXMEMFILE ]; then
+               maxkb=0
+       else
+               maxkb=`cat $XENBALLOON_MAXMEMFILE`
+       fi
+       curkb=`grep MemTotal /proc/meminfo | sed 's/  */ /' | cut -f2 -d' '`
+       if [ $curkb -gt $maxkb ]; then
+               echo $curkb > $XENBALLOON_MAXMEMFILE
+       fi
+       interval=$XENBALLOON_INTERVAL
+       # do self-ballooning
+       selfballoon_eval
+       if [ $selfballoon_enabled = "true" ]; then
+               balloon_to_target 1
+               interval=$XENBALLOON_SELF_INTERVAL
+       # or do directed ballooning
+       elif [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/target ; then
+                       tgtkb=`xenstore-read memory/target`
+                       balloon_to_target $tgtkb
+               fi
+               interval=$XENBALLOON_INTERVAL
+       fi
+       send_memory_stats
+       if [ $xenstore_enabled = "true" ]; then
+               if xenstore-exists memory/interval ; then
+                       interval=`xenstore-read memory/interval`
+               fi
+       fi
+       sleep $interval
+done &
+
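
The minmb() floor above follows the minimum_target() heuristic from the kernel balloon driver, as its comment notes. Restated in Python for illustration (the helper name and figures here are only examples):

    # Piecewise minimum-target heuristic mirrored from minmb() above (illustrative only).
    def min_target_mb(max_kb):
        mb = max_kb // 1024
        pages = max_kb // 4              # 4 kB pages
        if mb < 125:
            return 8 + (pages >> 9)
        elif mb < 500:
            return 40 + (pages >> 10)
        elif mb < 2000:
            return 104 + (pages >> 11)
        return 296 + (pages >> 13)

    print(min_target_mb(256 * 1024))     # 256 MB guest -> never balloon below 104 MB
    print(min_target_mb(4096 * 1024))    # 4 GB guest  -> never balloon below 424 MB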
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond.README
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoond.README       Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,82 @@
+Xenballoond.README
+Preliminary version 0.1, 2008/06/30
+
+Copyright (C) 2008 Oracle Corporation and/or its affiliates.
+All rights reserved.
+Written by Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+
+INTRODUCTION
+
+Xenballoond runs in guest domains and both implements selfballooning and
+provides metrics to dom0 for (future) directed ballooning.  Both capabilities
+provide a foundation for basic "memory overcommit" functionality.
+
+With selfballooning enabled, xenballoond uses the Committed_AS value found
+in /proc/meminfo as a first approximation of how much memory is required
+by the guest and feeds this statistic back to the balloon driver to inflate
+or deflate the balloon as required to achieve the target guest memory size.
+Hysteresis parameters may be adjusted to rate-limit balloon inflation
+and deflation.
+
+If configured, certain selfballooning parameters -- including notably
+enabling/disabling of self-ballooning -- can be controlled from domain0.
+(These are fully documented in xenballoon.conf.)
+
+If configured, the following guest statistics are sent back to domain0:
+- /proc/meminfo
+- /proc/vmstat
+- /proc/uptime
+In a future release, some of these values will be used by a policy module
+in domain0 to control guest balloon size and provide memory balancing
+across all guests on a given system.
+
+Note that no page sharing (content-based or otherwise) is implemented
+and no VMM-based swapping is necessary.
+
+For more information, see:
+http://www.xen.org/files/xensummitboston08/MemoryOvercommit-XenSummit2008.pdf
+http://wiki.xensource.com/xenwiki/Open_Topics_For_Discussion?action=AttachFile&do=get&target=Memory+Overcommit.pdf
+
+INSTALLATION AND DEPLOYMENT
+
+In this preliminary release:
+- directed ballooning is not implemented, though a monitor is provided
+- only Redhat-based guests are supported
+
+Guest prerequisites to use xenballoond:
+- each guest must be configured with adequate[1] swap space
+- each guest must have the balloon driver installed (/proc/xen/balloon exists) 
+- if directed ballooning (or monitoring) is desired, xenstore tools must be
+  installed in each guest in /usr/bin [2]
+
+[1] for best results, for a guest that is configured with maxmem=N and
+    requires Z MB of swap space without xenballoond, available swap should
+    be increased to N+Z MB when xenballoond is running
+[2] specifically xenstore-read, xenstore-exists, and xenstore-write must
+    be installed.  Binaries can be obtained, for example, by building
+    xen-vvv.gz/tools in a guest-binary-compatible development tree
+
+Instructions to install/deploy xenballoond (in Redhat-based system):
+- in each guest:
+  - ensure pre-requisites are met (see above)
+  - place xenballoon.conf in /etc/sysconfig
+  - place xenballoond in /usr/sbin
+  - copy xenballoond.init to /etc/rc.d/init.d/xenballoond (note file rename)
+  - edit /etc/sysconfig/xenballoon.conf as desired (especially note that
+    selfballooning defaults to off)
+  - start xenballoond with "service xenballoond start", and/or configure
+    xenballoond to start at init (e.g. "chkconfig xenballoond on")
+- in domain0:
+  - if monitoring is desired, xenballoon-monitor may be installed in /usr/sbin
+- note that certain xenballoond.conf variables may be overridden by domain0
+  if xenstore is running in the guest; these are fully documented in
+  xenballoond.conf
+
+TODO:
+080630 modifications to support SUSE-based and debian-based guests
+080630 domain0 ballooning policy module
+080630 experiment with more aggressive (optionally) memory minimum targets
+080630 BUG: xenballoond doesn't properly record the fact that it's running;
+       e.g. flipping between run levels 5 and 3 launches additional daemons
+080630 BUG: reports of possible incompatibilities between ballooning and
+       save/restore/migrate have not been duplicated
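
The domain0 policy module is still on the TODO list above, but xenballoond itself already honours a memory/target key. A hypothetical dom0-side helper (the xenstore path and helper name are assumptions, not part of this changeset) could direct a guest with the xenstore-write tool listed in the prerequisites:

    # Hypothetical dom0 helper: ask domain <domid> to balloon to <target_kb>
    # by writing the memory/target key that xenballoond polls.
    import subprocess

    def set_balloon_target(domid, target_kb):
        path = "/local/domain/%d/memory/target" % domid
        subprocess.check_call(["xenstore-write", path, str(target_kb)])

    set_balloon_target(3, 512 * 1024)    # e.g. shrink domain 3 to 512 MB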
diff -r 11318234588e -r 08f77df14cba tools/xenballoon/xenballoond.init
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenballoon/xenballoond.init Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# xenballoond  Script to start and stop Xen ballooning daemon.
+#
+# Copyright (C) 2008 Oracle Corporation and/or its affiliates.
+# All rights reserved.
+# Written by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
+#
+# chkconfig: 2345 98 01
+# description: Starts and stops the Xen ballooning daemon.
+### BEGIN INIT INFO
+# Provides:          xenballoond
+# Required-Start:    $syslog $remote_fs
+# Should-Start:
+# Required-Stop:     $syslog $remote_fs
+# Should-Stop:
+# Default-Start:     3 4 5
+# Default-Stop:      0 1 2 6
+# Default-Enabled:   yes
+# Short-Description: Start/stop xenballoond
+# Description:       Starts and stops the Xen ballooning daemon.
+### END INIT INFO
+
+# Source function library
+. /etc/init.d/functions
+
+#don't use in domain0
+[ -f /proc/xen/capabilities ] && \
+       grep -q "control_d" /proc/xen/capabilities && exit 0
+
+if [ -f /etc/sysconfig/xenballoon.conf ]; then
+       . /etc/sysconfig/xenballoon.conf
+fi
+
+# Check that balloon driver is present
+[ ! -f /proc/xen/balloon ] && exit 0
+
+# Record original memory (in kB)
+[ -z "$XENBALLOON_MAXMEMFILE" ] && exit 0
+let maxmem=`grep MemTotal /proc/meminfo | sed 's/  */ /' | cut -f2 -d' '`
+if [ -f "$XENBALLOON_MAXMEMFILE" ]; then
+       let oldmax=`cat $XENBALLOON_MAXMEMFILE`
+       if [ $oldmax -gt $maxmem ]; then
+               let maxmem=oldmax
+       fi
+fi
+echo $maxmem > $XENBALLOON_MAXMEMFILE
+
+RETVAL=0
+prog="xenballoond"
+
+start() {
+        # Start daemons.
+        echo -n $"Starting $prog: "
+        daemon xenballoond $OPTIONS
+       RETVAL=$?
+        echo
+       return $RETVAL
+}
+
+stop() {
+        echo -n $"Shutting down $prog: "
+       killproc xenballoond
+       RETVAL=$?
+        echo
+       return $RETVAL
+}
+
+# See how we were called.
+case "$1" in
+  start)
+       start
+        ;;
+  stop)
+       stop
+        ;;
+  status)
+       status xenballoond
+       RETVAL=$?
+       ;;
+  restart|reload)
+       stop
+       start
+       RETVAL=$?
+       ;;
+  *)
+        echo $"Usage: $0 {start|stop|restart|status}"
+        exit 1
+esac
+
+exit $RETVAL
diff -r 11318234588e -r 08f77df14cba tools/xentrace/xenctx.c
--- a/tools/xentrace/xenctx.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/xentrace/xenctx.c   Wed Jul 02 11:30:37 2008 +0900
@@ -702,7 +702,7 @@ void dump_ctx(int vcpu)
 void dump_ctx(int vcpu)
 {
     int ret;
-    vcpu_guest_context_t ctx;
+    vcpu_guest_context_any_t ctx;
     xc_dominfo_t dominfo;
 
     xc_handle = xc_interface_open(); /* for accessing control interface */
@@ -727,10 +727,10 @@ void dump_ctx(int vcpu)
         exit(-1);
     }
 
-    print_ctx(&ctx);
+    print_ctx(&ctx.c);
 #ifndef NO_TRANSLATION
-    if (is_kernel_text(INSTR_POINTER((&ctx.user_regs))))
-        print_stack(&ctx, vcpu);
+    if (is_kernel_text(INSTR_POINTER((&ctx.c.user_regs))))
+        print_stack(&ctx.c, vcpu);
 #endif
 
     if (!dominfo.paused) {
diff -r 11318234588e -r 08f77df14cba tools/xm-test/lib/XmTestLib/block_utils.py
--- a/tools/xm-test/lib/XmTestLib/block_utils.py        Thu Jun 19 12:48:04 2008 +0900
+++ b/tools/xm-test/lib/XmTestLib/block_utils.py        Wed Jul 02 11:30:37 2008 +0900
@@ -15,7 +15,7 @@ __all__ = [ "block_attach", "block_detac
 
 
 def get_state(domain, devname):
-    number = xen.util.blkif.blkdev_name_to_number(devname)
+    (path, number) = xen.util.blkif.blkdev_name_to_number(devname)
     s, o = traceCommand("xm block-list %s | awk '/^%d/ {print $4}'" %
                         (domain.getName(), number))
     if s != 0:
diff -r 11318234588e -r 08f77df14cba xen/arch/ia64/vmx/vmx_hypercall.c
--- a/xen/arch/ia64/vmx/vmx_hypercall.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/ia64/vmx/vmx_hypercall.c Wed Jul 02 11:30:37 2008 +0900
@@ -204,6 +204,53 @@ do_hvm_op(unsigned long op, XEN_GUEST_HA
         rc = -ENOSYS;
         break;
 
+    case HVMOP_modified_memory:
+    {
+        struct xen_hvm_modified_memory a;
+        struct domain *d;
+        unsigned long pfn;
+
+        if ( copy_from_guest(&a, arg, 1) )
+            return -EFAULT;
+
+        if ( a.domid == DOMID_SELF )
+        {
+            d = rcu_lock_current_domain();
+        }
+        else
+        {
+            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
+                return -ESRCH;
+            if ( !IS_PRIV_FOR(current->domain, d) )
+            {
+                rc = -EPERM;
+                goto param_fail3;
+            }
+        }
+
+        rc = -EINVAL;
+        if ( !is_hvm_domain(d) )
+            goto param_fail3;
+
+        rc = -EINVAL;
+        if ( a.first_pfn > domain_get_maximum_gpfn(d)
+                || a.first_pfn + a.nr - 1 < a.first_pfn
+                || a.first_pfn + a.nr - 1 > domain_get_maximum_gpfn(d))
+            goto param_fail3;
+
+        rc = 0;
+        if ( !d->arch.shadow_bitmap )
+            goto param_fail3;
+
+        for (pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++)
+            if (pfn < d->arch.shadow_bitmap_size)
+                set_bit(pfn, d->arch.shadow_bitmap);
+
+    param_fail3:
+        rcu_unlock_domain(d);
+        break;
+    }
+
     default:
         gdprintk(XENLOG_INFO, "Bad HVM op %ld.\n", op);
         rc = -ENOSYS;
diff -r 11318234588e -r 08f77df14cba xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/ia64/xen/mm.c    Wed Jul 02 11:30:37 2008 +0900
@@ -207,7 +207,7 @@ alloc_dom_xen_and_dom_io(void)
      * Any Xen-heap pages that we will allow to be mapped will have
      * their domain field set to dom_xen.
      */
-    dom_xen = alloc_domain(DOMID_XEN);
+    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
     BUG_ON(dom_xen == NULL);
 
     /*
@@ -215,7 +215,7 @@ alloc_dom_xen_and_dom_io(void)
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
      */
-    dom_io = alloc_domain(DOMID_IO);
+    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
     BUG_ON(dom_io == NULL);
 }
 
@@ -1553,7 +1553,7 @@ expose_p2m_init(void)
      * Initialise our DOMID_P2M domain.
      * This domain owns m2p table pages.
      */
-    dom_p2m = alloc_domain(DOMID_P2M);
+    dom_p2m = domain_create(DOMID_P2M, DOMCRF_dummy, 0);
     BUG_ON(dom_p2m == NULL);
     dom_p2m->max_pages = ~0U;
 
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/Makefile
--- a/xen/arch/x86/acpi/cpufreq/Makefile        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/Makefile        Wed Jul 02 11:30:37 2008 +0900
@@ -1,3 +1,4 @@ obj-y += cpufreq.o
 obj-y += cpufreq.o
 obj-y += utility.o
 obj-y += cpufreq_ondemand.o
+obj-y += powernow.o
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/cpufreq.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c       Wed Jul 02 11:30:37 2008 +0900
@@ -47,6 +47,10 @@ struct processor_pminfo processor_pminfo
 struct processor_pminfo processor_pminfo[NR_CPUS];
 struct cpufreq_policy xen_px_policy[NR_CPUS];
 
+static cpumask_t *cpufreq_dom_pt;
+static cpumask_t cpufreq_dom_mask;
+static unsigned int cpufreq_dom_max;
+
 enum {
     UNDEFINED_CAPABLE = 0,
     SYSTEM_INTEL_MSR_CAPABLE,
@@ -60,7 +64,6 @@ struct acpi_cpufreq_data {
     struct processor_performance *acpi_data;
     struct cpufreq_frequency_table *freq_table;
     unsigned int max_freq;
-    unsigned int resume;
     unsigned int cpu_feature;
 };
 
@@ -328,14 +331,16 @@ static int acpi_cpufreq_target(struct cp
 
     next_perf_state = data->freq_table[next_state].index;
     if (perf->state == next_perf_state) {
-        if (unlikely(data->resume)) {
-            printk("xen_pminfo: @acpi_cpufreq_target, "
-                "Called after resume, resetting to P%d\n", 
+        if (unlikely(policy->resume)) {
+            printk(KERN_INFO "Called after resume, resetting to P%d\n", 
                 next_perf_state);
-            data->resume = 0;
+            policy->resume = 0;
         }
-        else
+        else {
+            printk(KERN_INFO "Already at target state (P%d)\n", 
+                next_perf_state);
             return 0;
+        }
     }
 
     switch (data->cpu_feature) {
@@ -531,7 +536,7 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
      * the first call to ->target() should result in us actually
      * writing something to the appropriate registers.
      */
-    data->resume = 1;
+    policy->resume = 1;
 
     return result;
 
@@ -549,61 +554,101 @@ static struct cpufreq_driver acpi_cpufre
     .init   = acpi_cpufreq_cpu_init,
 };
 
-int acpi_cpufreq_init(void)
-{
-    unsigned int i, ret = 0;
-    unsigned int dom, max_dom = 0;
-    cpumask_t *pt, dom_mask;
-
-    cpus_clear(dom_mask);
+void cpufreq_dom_exit(void)
+{
+    cpufreq_dom_max = 0;
+    cpus_clear(cpufreq_dom_mask);
+    if (cpufreq_dom_pt)
+        xfree(cpufreq_dom_pt);
+}
+
+int cpufreq_dom_init(void)
+{
+    unsigned int i;
+
+    cpufreq_dom_max = 0;
+    cpus_clear(cpufreq_dom_mask);
 
     for_each_online_cpu(i) {
-        cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask);
-        if (max_dom < processor_pminfo[i].perf.domain_info.domain)
-            max_dom = processor_pminfo[i].perf.domain_info.domain;
-    }
-    max_dom++;
-
-    pt = xmalloc_array(cpumask_t, max_dom);
-    if (!pt)
+        cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
+        if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain)
+            cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain;
+    }
+    cpufreq_dom_max++;
+
+    cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max);
+    if (!cpufreq_dom_pt)
         return -ENOMEM;
-    memset(pt, 0, max_dom * sizeof(cpumask_t));
-
-    /* get cpumask of each psd domain */
+    memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t));
+
     for_each_online_cpu(i)
-        cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]);
+        cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]);
 
     for_each_online_cpu(i)
-        processor_pminfo[i].perf.shared_cpu_map = 
-            pt[processor_pminfo[i].perf.domain_info.domain];
-
-    cpufreq_driver = &acpi_cpufreq_driver;
-
-    /* setup cpufreq infrastructure */
+        processor_pminfo[i].perf.shared_cpu_map =
+            cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain];
+
+    return 0;
+}
+
+static int cpufreq_cpu_init(void)
+{
+    int i, ret = 0;
+
     for_each_online_cpu(i) {
         xen_px_policy[i].cpu = i;
 
         ret = px_statistic_init(i);
         if (ret)
-            goto out;
+            return ret;
 
         ret = acpi_cpufreq_cpu_init(&xen_px_policy[i]);
         if (ret)
-            goto out;
-    }
-
-    /* setup ondemand cpufreq */
-    for (dom=0; dom<max_dom; dom++) {
-        if (!cpu_isset(dom, dom_mask))
+            return ret;
+    }
+    return ret;
+}
+
+int cpufreq_dom_dbs(unsigned int event)
+{
+    int cpu, dom, ret = 0;
+
+    for (dom=0; dom<cpufreq_dom_max; dom++) {
+        if (!cpu_isset(dom, cpufreq_dom_mask))
             continue;
-        i = first_cpu(pt[dom]);
-        ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
+        cpu = first_cpu(cpufreq_dom_pt[dom]);
+        ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event);
         if (ret)
-            goto out;
-    }
-
-out:
-    xfree(pt);
-   
+            return ret;
+    }
     return ret;
 }
+
+int acpi_cpufreq_init(void)
+{
+    int ret = 0;
+    
+    /* setup cpumask of psd dom and shared cpu map of cpu */
+    ret = cpufreq_dom_init();
+    if (ret)
+        goto err;
+
+    /* setup cpufreq driver */
+    cpufreq_driver = &acpi_cpufreq_driver;
+
+    /* setup cpufreq infrastructure */
+    ret = cpufreq_cpu_init();
+    if (ret)
+        goto err;
+
+    /* setup cpufreq dbs according to dom coordination */
+    ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
+    if (ret)
+        goto err;
+
+    return ret;
+
+err:
+    cpufreq_dom_exit();
+    return ret;
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq_ondemand.c      Wed Jul 02 11:30:37 2008 +0900
@@ -52,7 +52,7 @@ static struct dbs_tuners {
 
 static struct timer dbs_timer[NR_CPUS];
 
-static inline uint64_t get_cpu_idle_time(unsigned int cpu)
+inline uint64_t get_cpu_idle_time(unsigned int cpu)
 {
     uint64_t idle_ns;
     struct vcpu *v;
@@ -79,6 +79,12 @@ static void dbs_check_cpu(struct cpu_dbs
         return;
 
     policy = this_dbs_info->cur_policy;
+
+    if (unlikely(policy->resume)) {
+        __cpufreq_driver_target(policy, policy->max,CPUFREQ_RELATION_H);
+        return;
+    }
+
     cur_ns = NOW();
     total_ns = cur_ns - this_dbs_info->prev_cpu_wall;
     this_dbs_info->prev_cpu_wall = NOW();
@@ -217,8 +223,7 @@ int cpufreq_governor_dbs(struct cpufreq_
         break;
 
     case CPUFREQ_GOV_STOP:
-        if (this_dbs_info->enable)
-            dbs_timer_exit(this_dbs_info);
+        dbs_timer_exit(this_dbs_info);
         dbs_enable--;
 
         break;
@@ -233,5 +238,4 @@ int cpufreq_governor_dbs(struct cpufreq_
         break;
     }
     return 0;
-}
-             
+} 
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/powernow.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c      Wed Jul 02 11:30:37 2008 +0900
@@ -0,0 +1,305 @@
+/*
+ *  powernow - AMD Architectural P-state Driver ($Revision: 1.4 $)
+ *
+ *  Copyright (C) 2008 Mark Langsdorf <mark.langsdorf@xxxxxxx>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/types.h>
+#include <xen/errno.h>
+#include <xen/delay.h>
+#include <xen/cpumask.h>
+#include <xen/timer.h>
+#include <xen/xmalloc.h>
+#include <asm/bug.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/config.h>
+#include <asm/processor.h>
+#include <asm/percpu.h>
+#include <asm/cpufeature.h>
+#include <acpi/acpi.h>
+#include <acpi/cpufreq/cpufreq.h>
+
+#define CPUID_FREQ_VOLT_CAPABILITIES    0x80000007
+#define USE_HW_PSTATE           0x00000080
+#define HW_PSTATE_MASK          0x00000007
+#define HW_PSTATE_VALID_MASK    0x80000000
+#define HW_PSTATE_MAX_MASK      0x000000f0
+#define HW_PSTATE_MAX_SHIFT     4
+#define MSR_PSTATE_DEF_BASE     0xc0010064 /* base of Pstate MSRs */
+#define MSR_PSTATE_STATUS       0xc0010063 /* Pstate Status MSR */
+#define MSR_PSTATE_CTRL         0xc0010062 /* Pstate control MSR */
+#define MSR_PSTATE_CUR_LIMIT    0xc0010061 /* pstate current limit MSR */
+
+extern struct processor_pminfo processor_pminfo[NR_CPUS];
+extern struct cpufreq_policy xen_px_policy[NR_CPUS];
+
+struct powernow_cpufreq_data {
+    struct processor_performance *acpi_data;
+    struct cpufreq_frequency_table *freq_table;
+    unsigned int max_freq;
+    unsigned int resume;
+    unsigned int cpu_feature;
+};
+
+static struct powernow_cpufreq_data *drv_data[NR_CPUS];
+
+struct drv_cmd {
+    unsigned int type;
+    cpumask_t mask;
+    u64 addr;
+    u32 val;
+};
+
+static void transition_pstate(void *drvcmd)
+{
+    struct drv_cmd *cmd;
+    cmd = (struct drv_cmd *) drvcmd;
+
+    wrmsr(MSR_PSTATE_CTRL, cmd->val, 0);
+}
+
+static int powernow_cpufreq_target(struct cpufreq_policy *policy,
+                               unsigned int target_freq, unsigned int relation)
+{
+    struct powernow_cpufreq_data *data = drv_data[policy->cpu];
+    struct processor_performance *perf;
+    struct cpufreq_freqs freqs;
+    cpumask_t online_policy_cpus;
+    struct drv_cmd cmd;
+    unsigned int next_state = 0; /* Index into freq_table */
+    unsigned int next_perf_state = 0; /* Index into perf table */
+    int result = 0;
+
+    if (unlikely(data == NULL ||
+        data->acpi_data == NULL || data->freq_table == NULL)) {
+        return -ENODEV;
+    }
+
+    perf = data->acpi_data;
+    result = cpufreq_frequency_table_target(policy,
+                                            data->freq_table,
+                                            target_freq,
+                                            relation, &next_state);
+    if (unlikely(result))
+        return -ENODEV;
+
+    online_policy_cpus = policy->cpus;
+
+    next_perf_state = data->freq_table[next_state].index;
+    if (perf->state == next_perf_state) {
+        if (unlikely(data->resume)) 
+            data->resume = 0;
+        else
+            return 0;
+    }
+
+    cpus_clear(cmd.mask);
+
+    if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
+        cmd.mask = online_policy_cpus;
+    else
+        cpu_set(policy->cpu, cmd.mask);
+
+    freqs.old = perf->states[perf->state].core_frequency * 1000;
+    freqs.new = data->freq_table[next_state].frequency;
+
+    cmd.val = next_perf_state;
+
+    on_selected_cpus( cmd.mask, transition_pstate, (void *) &cmd, 0, 0);
+
+    perf->state = next_perf_state;
+    policy->cur = freqs.new;
+
+    return result;
+}
+
+static int powernow_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+    unsigned int i;
+    unsigned int valid_states = 0;
+    unsigned int cpu = policy->cpu;
+    struct powernow_cpufreq_data *data;
+    unsigned int result = 0;
+    struct processor_performance *perf;
+    u32 max_hw_pstate, hi = 0, lo = 0;
+
+    data = xmalloc(struct powernow_cpufreq_data);
+    if (!data)
+        return -ENOMEM;
+    memset(data, 0, sizeof(struct powernow_cpufreq_data));
+
+    drv_data[cpu] = data;
+
+    data->acpi_data = &processor_pminfo[cpu].perf;
+
+    perf = data->acpi_data;
+    policy->shared_type = perf->shared_type;
+
+    /*
+     * Will let policy->cpus know about dependency only when software
+     * coordination is required.
+     */
+    if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
+        policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
+        policy->cpus = perf->shared_cpu_map;
+    } else {
+        policy->cpus = cpumask_of_cpu(cpu);    
+    }
+
+    /* capability check */
+    if (perf->state_count <= 1) {
+        printk("No P-States\n");
+        result = -ENODEV;
+        goto err_unreg;
+    }
+    rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
+    max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
+
+    if (perf->control_register.space_id != perf->status_register.space_id) {
+        result = -ENODEV;
+        goto err_unreg;
+    }
+
+    data->freq_table = xmalloc_array(struct cpufreq_frequency_table, 
+                                    (perf->state_count+1));
+    if (!data->freq_table) {
+        result = -ENOMEM;
+        goto err_unreg;
+    }
+
+    /* detect transition latency */
+    policy->cpuinfo.transition_latency = 0;
+    for (i=0; i<perf->state_count; i++) {
+        if ((perf->states[i].transition_latency * 1000) >
+            policy->cpuinfo.transition_latency)
+            policy->cpuinfo.transition_latency =
+                perf->states[i].transition_latency * 1000;
+    }
+
+    data->max_freq = perf->states[0].core_frequency * 1000;
+    /* table init */
+    for (i=0; i<perf->state_count && i<max_hw_pstate; i++) {
+        if (i>0 && perf->states[i].core_frequency >=
+            data->freq_table[valid_states-1].frequency / 1000)
+            continue;
+
+        data->freq_table[valid_states].index = perf->states[i].control & HW_PSTATE_MASK;
+        data->freq_table[valid_states].frequency =
+            perf->states[i].core_frequency * 1000;
+        valid_states++;
+    }
+    data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
+    perf->state = 0;
+
+    result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
+    if (result)
+        goto err_freqfree;
+
+    /*
+     * the first call to ->target() should result in us actually
+     * writing something to the appropriate registers.
+     */
+    data->resume = 1;
+
+    policy->cur = data->freq_table[i].frequency;
+    return result;
+
+err_freqfree:
+    xfree(data->freq_table);
+err_unreg:
+    xfree(data);
+    drv_data[cpu] = NULL;
+
+    return result;
+}
+
+static struct cpufreq_driver powernow_cpufreq_driver = {
+    .target = powernow_cpufreq_target,
+    .init   = powernow_cpufreq_cpu_init,
+};
+
+int powernow_cpufreq_init(void)
+{
+    unsigned int i, ret = 0;
+    unsigned int dom, max_dom = 0;
+    cpumask_t *pt, dom_mask;
+
+    cpus_clear(dom_mask);
+
+    for_each_online_cpu(i) {
+        struct cpuinfo_x86 *c = &cpu_data[i];
+       if (c->x86_vendor != X86_VENDOR_AMD)
+            ret = -ENODEV;
+        else 
+        {
+            u32 eax, ebx, ecx, edx;
+            cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
+            if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE)
+                ret = -ENODEV;
+       }
+        if (ret)
+            return ret;
+        cpu_set(processor_pminfo[i].perf.domain_info.domain, dom_mask);
+        if (max_dom < processor_pminfo[i].perf.domain_info.domain)
+            max_dom = processor_pminfo[i].perf.domain_info.domain;
+    }
+    max_dom++;
+
+    pt = xmalloc_array(cpumask_t, max_dom);
+    if (!pt)
+        return -ENOMEM;
+    memset(pt, 0, max_dom * sizeof(cpumask_t));
+
+    /* get cpumask of each psd domain */
+    for_each_online_cpu(i)
+        cpu_set(i, pt[processor_pminfo[i].perf.domain_info.domain]);
+
+    for_each_online_cpu(i)
+        processor_pminfo[i].perf.shared_cpu_map = 
+            pt[processor_pminfo[i].perf.domain_info.domain];
+
+    cpufreq_driver = &powernow_cpufreq_driver;
+
+    /* setup cpufreq infrastructure */
+    for_each_online_cpu(i) {
+        xen_px_policy[i].cpu = i;
+
+        ret = powernow_cpufreq_cpu_init(&xen_px_policy[i]);
+        if (ret)
+            goto cpufreq_init_out;
+    }
+
+    /* setup ondemand cpufreq */
+    for (dom=0; dom<max_dom; dom++) {
+        if (!cpu_isset(dom, dom_mask))
+            continue;
+        i = first_cpu(pt[dom]);
+        ret = cpufreq_governor_dbs(&xen_px_policy[i], CPUFREQ_GOV_START);
+        if (ret)
+            goto cpufreq_init_out;
+    }
+
+cpufreq_init_out:
+    xfree(pt);
+   
+    return ret;
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/cpufreq/utility.c
--- a/xen/arch/x86/acpi/cpufreq/utility.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/utility.c       Wed Jul 02 11:30:37 2008 +0900
@@ -37,6 +37,41 @@ struct cpufreq_driver *cpufreq_driver;
  *                    Px STATISTIC INFO                              *
  *********************************************************************/
 
+void px_statistic_suspend(void)
+{
+    int cpu;
+    uint64_t now;
+
+    now = NOW();
+
+    for_each_online_cpu(cpu) {
+        struct pm_px *pxpt = &px_statistic_data[cpu];
+        uint64_t total_idle_ns;
+        uint64_t tmp_idle_ns;
+
+        total_idle_ns = get_cpu_idle_time(cpu);
+        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
+
+        pxpt->u.pt[pxpt->u.cur].residency +=
+                    now - pxpt->prev_state_wall;
+        pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
+    }
+}
+
+void px_statistic_resume(void)
+{
+    int cpu;
+    uint64_t now;
+
+    now = NOW();
+
+    for_each_online_cpu(cpu) {
+        struct pm_px *pxpt = &px_statistic_data[cpu];
+        pxpt->prev_state_wall = now;
+        pxpt->prev_idle_wall = get_cpu_idle_time(cpu);
+    }
+}
+
 void px_statistic_update(cpumask_t cpumask, uint8_t from, uint8_t to)
 {
     uint32_t i;
@@ -47,15 +82,22 @@ void px_statistic_update(cpumask_t cpuma
     for_each_cpu_mask(i, cpumask) {
         struct pm_px *pxpt = &px_statistic_data[i];
         uint32_t statnum = processor_pminfo[i].perf.state_count;
+        uint64_t total_idle_ns;
+        uint64_t tmp_idle_ns;
+
+        total_idle_ns = get_cpu_idle_time(i);
+        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
 
         pxpt->u.last = from;
         pxpt->u.cur = to;
         pxpt->u.pt[to].count++;
         pxpt->u.pt[from].residency += now - pxpt->prev_state_wall;
+        pxpt->u.pt[from].residency -= tmp_idle_ns;
 
         (*(pxpt->u.trans_pt + from*statnum + to))++;
 
         pxpt->prev_state_wall = now;
+        pxpt->prev_idle_wall = total_idle_ns;
     }
 }
 
@@ -87,6 +129,7 @@ int px_statistic_init(int cpuid)
         pxpt->u.pt[i].freq = pmpt->perf.states[i].core_frequency;
 
     pxpt->prev_state_wall = NOW();
+    pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
 
     return 0;
 }
@@ -107,6 +150,7 @@ void px_statistic_reset(int cpuid)
     }
 
     pxpt->prev_state_wall = NOW();
+    pxpt->prev_idle_wall = get_cpu_idle_time(cpuid);
 }
 
 
@@ -242,3 +286,62 @@ int __cpufreq_driver_getavg(struct cpufr
 
     return ret;
 }
+
+
+/*********************************************************************
+ *               CPUFREQ SUSPEND/RESUME                              *
+ *********************************************************************/
+
+void cpufreq_suspend(void)
+{
+    int cpu;
+
+    /* to protect the case when Px was controlled by dom0-kernel */
+    /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */
+    for_each_online_cpu(cpu) {
+        struct processor_performance *perf = &processor_pminfo[cpu].perf;
+
+        if (!perf->init)
+            return;
+    }
+
+    cpufreq_dom_dbs(CPUFREQ_GOV_STOP);
+
+    cpufreq_dom_exit();
+
+    px_statistic_suspend();
+}
+
+int cpufreq_resume(void)
+{
+    int cpu, ret = 0;
+
+    /* 1. to protect the case when Px was controlled by dom0-kernel */
+    /* or when CPU_FREQ not set in which case ACPI Px objects not parsed */
+    /* 2. set state and resume flag to sync cpu to right state and freq */
+    for_each_online_cpu(cpu) {
+        struct processor_performance *perf = &processor_pminfo[cpu].perf;
+        struct cpufreq_policy *policy = &xen_px_policy[cpu];
+
+        if (!perf->init)
+            goto err;
+        perf->state = 0;
+        policy->resume = 1;
+    }
+
+    px_statistic_resume();
+
+    ret = cpufreq_dom_init();
+    if (ret)
+        goto err;
+
+    ret = cpufreq_dom_dbs(CPUFREQ_GOV_START);
+    if (ret)
+        goto err;
+
+    return ret;
+
+err:
+    cpufreq_dom_exit();
+    return ret;
+}
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/pmstat.c
--- a/xen/arch/x86/acpi/pmstat.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/pmstat.c        Wed Jul 02 11:30:37 2008 +0900
@@ -71,11 +71,18 @@ int do_get_pm_info(struct xen_sysctl_get
     case PMSTAT_get_pxstat:
     {
         uint64_t now, ct;
+        uint64_t total_idle_ns;
+        uint64_t tmp_idle_ns;
+
+        total_idle_ns = get_cpu_idle_time(op->cpuid);
+        tmp_idle_ns = total_idle_ns - pxpt->prev_idle_wall;
 
         now = NOW();
         pxpt->u.usable = pmpt->perf.state_count - pmpt->perf.ppc;
         pxpt->u.pt[pxpt->u.cur].residency += now - pxpt->prev_state_wall;
+        pxpt->u.pt[pxpt->u.cur].residency -= tmp_idle_ns;
         pxpt->prev_state_wall = now;
+        pxpt->prev_idle_wall = total_idle_ns;
 
         ct = pmpt->perf.state_count;
         if ( copy_to_guest(op->u.getpx.trans_pt, pxpt->u.trans_pt, ct*ct) )
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/acpi/power.c
--- a/xen/arch/x86/acpi/power.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/acpi/power.c Wed Jul 02 11:30:37 2008 +0900
@@ -27,7 +27,7 @@
 #include <public/platform.h>
 #include <asm/tboot.h>
 
-#define pmprintk(_l, _f, _a...) printk(_l "<PM> " _f "\n", ## _a )
+#include <acpi/cpufreq/cpufreq.h>
 
 static char opt_acpi_sleep[20];
 string_param("acpi_sleep", opt_acpi_sleep);
@@ -124,9 +124,11 @@ static int enter_state(u32 state)
     if ( !spin_trylock(&pm_lock) )
         return -EBUSY;
 
-    pmprintk(XENLOG_INFO, "Preparing system for ACPI S%d state.", state);
+    printk(XENLOG_INFO "Preparing system for ACPI S%d state.", state);
 
     freeze_domains();
+
+    cpufreq_suspend();
 
     disable_nonboot_cpus();
     if ( num_online_cpus() != 1 )
@@ -139,11 +141,14 @@ static int enter_state(u32 state)
 
     acpi_sleep_prepare(state);
 
+    console_start_sync();
+    printk("Entering ACPI S%d state.\n", state);
+
     local_irq_save(flags);
 
     if ( (error = device_power_down()) )
     {
-        pmprintk(XENLOG_ERR, "Some devices failed to power down.");
+        printk(XENLOG_ERR "Some devices failed to power down.");
         goto done;
     }
 
@@ -162,8 +167,6 @@ static int enter_state(u32 state)
         break;
     }
 
-    pmprintk(XENLOG_DEBUG, "Back to C.");
-
     /* Restore CR4 and EFER from cached values. */
     write_cr4(read_cr4());
     if ( cpu_has_efer )
@@ -171,16 +174,18 @@ static int enter_state(u32 state)
 
     device_power_up();
 
-    pmprintk(XENLOG_INFO, "Finishing wakeup from ACPI S%d state.", state);
+    printk(XENLOG_INFO "Finishing wakeup from ACPI S%d state.", state);
 
  done:
     local_irq_restore(flags);
+    console_end_sync();
     acpi_sleep_post(state);
     if ( !hvm_cpu_up() )
         BUG();
 
  enable_cpu:
     enable_nonboot_cpus();
+    cpufreq_resume();
     thaw_domains();
     spin_unlock(&pm_lock);
     return error;
@@ -206,7 +211,7 @@ int acpi_enter_sleep(struct xenpf_enter_
          ((sleep->pm1a_cnt_val ^ sleep->pm1b_cnt_val) &
           ACPI_BITMASK_SLEEP_ENABLE) )
     {
-        pmprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting.");
+        gdprintk(XENLOG_ERR, "Mismatched pm1a/pm1b setting.");
         return -EINVAL;
     }
 
@@ -278,7 +283,7 @@ acpi_status asmlinkage acpi_enter_sleep_
     if ( tboot_in_measured_env() )
     {
         tboot_sleep(sleep_state);
-        pmprintk(XENLOG_ERR, "TBOOT failed entering s3 state\n");
+        printk(XENLOG_ERR "TBOOT failed entering s3 state\n");
         return_ACPI_STATUS(AE_ERROR);
     }
 
@@ -320,7 +325,7 @@ static int __init acpi_sleep_init(void)
             p += strspn(p, ", \t");
     }
 
-    printk(XENLOG_INFO "<PM> ACPI (supports");
+    printk(XENLOG_INFO "ACPI sleep modes:");
     for ( i = 0; i < ACPI_S_STATE_COUNT; i++ )
     {
         if ( i == ACPI_STATE_S3 )
@@ -331,7 +336,7 @@ static int __init acpi_sleep_init(void)
         else
             sleep_states[i] = 0;
     }
-    printk(")\n");
+    printk("\n");
 
     return 0;
 }
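
The enter_state() changes above also settle the ordering around S3: cpufreq governors are stopped after domains are frozen but before the non-boot CPUs are taken offline, and restarted only once those CPUs are back; the console is switched to synchronous mode across the sleep. A condensed sketch of that sequence, with error handling and the ACPI specifics elided (the helper names are the ones visible in the hunk; the wrapper itself is illustrative):

    /* Condensed ordering sketch only - not the real enter_state(). */
    static int s3_ordering_sketch(void)
    {
        freeze_domains();
        cpufreq_suspend();        /* quiesce dbs governors before CPUs vanish */
        disable_nonboot_cpus();

        console_start_sync();     /* keep console output synchronous in S3    */
        /* ... device_power_down(), ACPI sleep, wakeup, device_power_up() ... */
        console_end_sync();

        enable_nonboot_cpus();
        cpufreq_resume();         /* re-create per-domain policies/governors  */
        thaw_domains();
        return 0;
    }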
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/emulate.c
--- a/xen/arch/x86/hvm/emulate.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/emulate.c        Wed Jul 02 11:30:37 2008 +0900
@@ -21,15 +21,33 @@
 
 static int hvmemul_do_io(
     int is_mmio, paddr_t addr, unsigned long *reps, int size,
-    paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
-{
+    paddr_t ram_gpa, int dir, int df, void *p_data)
+{
+    paddr_t value = ram_gpa;
+    int value_is_ptr = (p_data == NULL);
     struct vcpu *curr = current;
     vcpu_iodata_t *vio = get_ioreq(curr);
     ioreq_t *p = &vio->vp_ioreq;
     int rc;
 
-    /* Only retrieve the value from singleton (non-REP) reads. */
-    ASSERT((val == NULL) || ((dir == IOREQ_READ) && !value_is_ptr));
+    /*
+     * Weird-sized accesses have undefined behaviour: we discard writes
+     * and read all-ones.
+     */
+    if ( unlikely((size > sizeof(long)) || (size & (size - 1))) )
+    {
+        gdprintk(XENLOG_WARNING, "bad mmio size %d\n", size);
+        ASSERT(p_data != NULL); /* cannot happen with a REP prefix */
+        if ( dir == IOREQ_READ )
+            memset(p_data, ~0, size);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    if ( (p_data != NULL) && (dir == IOREQ_WRITE) )
+    {
+        memcpy(&value, p_data, size);
+        p_data = NULL;
+    }
 
     if ( is_mmio && !value_is_ptr )
     {
@@ -47,8 +65,7 @@ static int hvmemul_do_io(
             unsigned int bytes = curr->arch.hvm_vcpu.mmio_large_read_bytes;
             if ( (addr >= pa) && ((addr + size) <= (pa + bytes)) )
             {
-                *val = 0;
-                memcpy(val, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
+                memcpy(p_data, &curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
                        size);
                 return X86EMUL_OKAY;
             }
@@ -61,7 +78,7 @@ static int hvmemul_do_io(
         break;
     case HVMIO_completed:
         curr->arch.hvm_vcpu.io_state = HVMIO_none;
-        if ( val == NULL )
+        if ( p_data == NULL )
             return X86EMUL_UNHANDLEABLE;
         goto finish_access;
     case HVMIO_dispatched:
@@ -82,7 +99,7 @@ static int hvmemul_do_io(
     }
 
     curr->arch.hvm_vcpu.io_state =
-        (val == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
+        (p_data == NULL) ? HVMIO_dispatched : HVMIO_awaiting_completion;
 
     p->dir = dir;
     p->data_is_ptr = value_is_ptr;
@@ -116,7 +133,7 @@ static int hvmemul_do_io(
         break;
     case X86EMUL_UNHANDLEABLE:
         hvm_send_assist_req(curr);
-        rc = (val != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY;
+        rc = (p_data != NULL) ? X86EMUL_RETRY : X86EMUL_OKAY;
         break;
     default:
         BUG();
@@ -126,8 +143,8 @@ static int hvmemul_do_io(
         return rc;
 
  finish_access:
-    if ( val != NULL )
-        *val = curr->arch.hvm_vcpu.io_data;
+    if ( p_data != NULL )
+        memcpy(p_data, &curr->arch.hvm_vcpu.io_data, size);
 
     if ( is_mmio && !value_is_ptr )
     {
@@ -152,7 +169,7 @@ static int hvmemul_do_io(
                   sizeof(curr->arch.hvm_vcpu.mmio_large_read)) )
             {
                 memcpy(&curr->arch.hvm_vcpu.mmio_large_read[addr - pa],
-                       val, size);
+                       p_data, size);
                 curr->arch.hvm_vcpu.mmio_large_read_bytes += size;
             }
         }
@@ -163,18 +180,16 @@ static int hvmemul_do_io(
 
 static int hvmemul_do_pio(
     unsigned long port, unsigned long *reps, int size,
-    paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
-{
-    return hvmemul_do_io(0, port, reps, size, value,
-                         dir, df, value_is_ptr, val);
+    paddr_t ram_gpa, int dir, int df, void *p_data)
+{
+    return hvmemul_do_io(0, port, reps, size, ram_gpa, dir, df, p_data);
 }
 
 static int hvmemul_do_mmio(
     paddr_t gpa, unsigned long *reps, int size,
-    paddr_t value, int dir, int df, int value_is_ptr, unsigned long *val)
-{
-    return hvmemul_do_io(1, gpa, reps, size, value,
-                         dir, df, value_is_ptr, val);
+    paddr_t ram_gpa, int dir, int df, void *p_data)
+{
+    return hvmemul_do_io(1, gpa, reps, size, ram_gpa, dir, df, p_data);
 }
 
 /*
@@ -287,7 +302,7 @@ static int __hvmemul_read(
 static int __hvmemul_read(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     enum hvm_access_type access_type,
     struct hvm_emulate_ctxt *hvmemul_ctxt)
@@ -302,8 +317,6 @@ static int __hvmemul_read(
         seg, offset, bytes, access_type, hvmemul_ctxt, &addr);
     if ( rc != X86EMUL_OKAY )
         return rc;
-
-    *val = 0;
 
     if ( unlikely(curr->arch.hvm_vcpu.mmio_gva == (addr & PAGE_MASK)) &&
          curr->arch.hvm_vcpu.mmio_gva )
@@ -314,7 +327,7 @@ static int __hvmemul_read(
         gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off);
         if ( (off + bytes) <= PAGE_SIZE )
             return hvmemul_do_mmio(gpa, &reps, bytes, 0,
-                                   IOREQ_READ, 0, 0, val);
+                                   IOREQ_READ, 0, p_data);
     }
 
     if ( (seg != x86_seg_none) &&
@@ -322,15 +335,13 @@ static int __hvmemul_read(
         pfec |= PFEC_user_mode;
 
     rc = ((access_type == hvm_access_insn_fetch) ?
-          hvm_fetch_from_guest_virt(val, addr, bytes, pfec) :
-          hvm_copy_from_guest_virt(val, addr, bytes, pfec));
+          hvm_fetch_from_guest_virt(p_data, addr, bytes, pfec) :
+          hvm_copy_from_guest_virt(p_data, addr, bytes, pfec));
     if ( rc == HVMCOPY_bad_gva_to_gfn )
         return X86EMUL_EXCEPTION;
 
     if ( rc == HVMCOPY_bad_gfn_to_mfn )
     {
-        unsigned long reps = 1;
-
         if ( access_type == hvm_access_insn_fetch )
             return X86EMUL_UNHANDLEABLE;
 
@@ -339,7 +350,7 @@ static int __hvmemul_read(
         if ( rc != X86EMUL_OKAY )
             return rc;
 
-        return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, 0, val);
+        return hvmemul_do_mmio(gpa, &reps, bytes, 0, IOREQ_READ, 0, p_data);
     }
 
     return X86EMUL_OKAY;
@@ -348,19 +359,19 @@ static int hvmemul_read(
 static int hvmemul_read(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
     return __hvmemul_read(
-        seg, offset, val, bytes, hvm_access_read,
+        seg, offset, p_data, bytes, hvm_access_read,
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt));
 }
 
 static int hvmemul_insn_fetch(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
@@ -371,19 +382,18 @@ static int hvmemul_insn_fetch(
     /* Fall back if requested bytes are not in the prefetch cache. */
     if ( unlikely((insn_off + bytes) > hvmemul_ctxt->insn_buf_bytes) )
         return __hvmemul_read(
-            seg, offset, val, bytes,
+            seg, offset, p_data, bytes,
             hvm_access_insn_fetch, hvmemul_ctxt);
 
     /* Hit the cache. Simple memcpy. */
-    *val = 0;
-    memcpy(val, &hvmemul_ctxt->insn_buf[insn_off], bytes);
+    memcpy(p_data, &hvmemul_ctxt->insn_buf[insn_off], bytes);
     return X86EMUL_OKAY;
 }
 
 static int hvmemul_write(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
@@ -406,29 +416,27 @@ static int hvmemul_write(
         unsigned int off = addr & (PAGE_SIZE - 1);
         gpa = (((paddr_t)curr->arch.hvm_vcpu.mmio_gpfn << PAGE_SHIFT) | off);
         if ( (off + bytes) <= PAGE_SIZE )
-            return hvmemul_do_mmio(gpa, &reps, bytes, val,
-                                   IOREQ_WRITE, 0, 0, NULL);
+            return hvmemul_do_mmio(gpa, &reps, bytes, 0,
+                                   IOREQ_WRITE, 0, p_data);
     }
 
     if ( (seg != x86_seg_none) &&
          (hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.dpl == 3) )
         pfec |= PFEC_user_mode;
 
-    rc = hvm_copy_to_guest_virt(addr, &val, bytes, pfec);
+    rc = hvm_copy_to_guest_virt(addr, p_data, bytes, pfec);
     if ( rc == HVMCOPY_bad_gva_to_gfn )
         return X86EMUL_EXCEPTION;
 
     if ( rc == HVMCOPY_bad_gfn_to_mfn )
     {
-        unsigned long reps = 1;
-
         rc = hvmemul_linear_to_phys(
             addr, &gpa, bytes, &reps, pfec, hvmemul_ctxt);
         if ( rc != X86EMUL_OKAY )
             return rc;
 
-        return hvmemul_do_mmio(gpa, &reps, bytes, val,
-                               IOREQ_WRITE, 0, 0, NULL);
+        return hvmemul_do_mmio(gpa, &reps, bytes, 0,
+                               IOREQ_WRITE, 0, p_data);
     }
 
     return X86EMUL_OKAY;
@@ -442,12 +450,8 @@ static int hvmemul_cmpxchg(
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
-    unsigned long new = 0;
-    if ( bytes > sizeof(new) )
-        return X86EMUL_UNHANDLEABLE;
-    memcpy(&new, p_new, bytes);
     /* Fix this in case the guest is really relying on r-m-w atomicity. */
-    return hvmemul_write(seg, offset, new, bytes, ctxt);
+    return hvmemul_write(seg, offset, p_new, bytes, ctxt);
 }
 
 static int hvmemul_rep_ins(
@@ -480,7 +484,7 @@ static int hvmemul_rep_ins(
         return rc;
 
     return hvmemul_do_pio(src_port, reps, bytes_per_rep, gpa, IOREQ_READ,
-                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
 }
 
 static int hvmemul_rep_outs(
@@ -513,7 +517,7 @@ static int hvmemul_rep_outs(
         return rc;
 
     return hvmemul_do_pio(dst_port, reps, bytes_per_rep, gpa, IOREQ_WRITE,
-                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+                          !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
 }
 
 static int hvmemul_rep_movs(
@@ -563,14 +567,14 @@ static int hvmemul_rep_movs(
     if ( !p2m_is_ram(p2mt) )
         return hvmemul_do_mmio(
             sgpa, reps, bytes_per_rep, dgpa, IOREQ_READ,
-            !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+            !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
 
     (void)gfn_to_mfn_current(dgpa >> PAGE_SHIFT, &p2mt);
     if ( p2m_is_ram(p2mt) )
         return X86EMUL_UNHANDLEABLE;
     return hvmemul_do_mmio(
         dgpa, reps, bytes_per_rep, sgpa, IOREQ_WRITE,
-        !!(ctxt->regs->eflags & X86_EFLAGS_DF), 1, NULL);
+        !!(ctxt->regs->eflags & X86_EFLAGS_DF), NULL);
 }
 
 static int hvmemul_read_segment(
@@ -607,7 +611,8 @@ static int hvmemul_read_io(
     struct x86_emulate_ctxt *ctxt)
 {
     unsigned long reps = 1;
-    return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, 0, val);
+    *val = 0;
+    return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_READ, 0, val);
 }
 
 static int hvmemul_write_io(
@@ -617,7 +622,7 @@ static int hvmemul_write_io(
     struct x86_emulate_ctxt *ctxt)
 {
     unsigned long reps = 1;
-    return hvmemul_do_pio(port, &reps, bytes, val, IOREQ_WRITE, 0, 0, NULL);
+    return hvmemul_do_pio(port, &reps, bytes, 0, IOREQ_WRITE, 0, &val);
 }
 
 static int hvmemul_read_cr(
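
The emulate.c rework above replaces the unsigned long value/val pair with a single void *p_data buffer: reads of 1/2/4/8 bytes are memcpy'd into the caller's buffer on completion, writes are packed from the buffer into the ioreq data field, and REP operations pass p_data == NULL to say the data lives at a guest physical address (ram_gpa). A hedged sketch of a caller driving the new interface (only hvmemul_do_pio's signature is taken from the patch; the surrounding function is illustrative):

    /* Illustrative caller: one 2-byte port read and one 2-byte port write. */
    static int pio_example(void)
    {
        unsigned long reps = 1;
        uint16_t in = 0, out = 0x1234;
        int rc;

        /* Read: the value is copied into *p_data when the I/O completes. */
        rc = hvmemul_do_pio(0x60, &reps, 2, 0, IOREQ_READ, 0, &in);
        if ( rc != X86EMUL_OKAY )
            return rc;

        /* Write: the two bytes at p_data are packed into the ioreq value. */
        return hvmemul_do_pio(0x64, &reps, 2, 0, IOREQ_WRITE, 0, &out);
    }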
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/hvm.c    Wed Jul 02 11:30:37 2008 +0900
@@ -2529,6 +2529,66 @@ long do_hvm_op(unsigned long op, XEN_GUE
         break;
     }
 
+    case HVMOP_modified_memory:
+    {
+        struct xen_hvm_modified_memory a;
+        struct domain *d;
+        unsigned long pfn;
+
+        if ( copy_from_guest(&a, arg, 1) )
+            return -EFAULT;
+
+        if ( a.domid == DOMID_SELF )
+        {
+            d = rcu_lock_current_domain();
+        }
+        else
+        {
+            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
+                return -ESRCH;
+            if ( !IS_PRIV_FOR(current->domain, d) )
+            {
+                rc = -EPERM;
+                goto param_fail3;
+            }
+        }
+
+        rc = -EINVAL;
+        if ( !is_hvm_domain(d) )
+            goto param_fail3;
+
+        rc = xsm_hvm_param(d, op);
+        if ( rc )
+            goto param_fail3;
+
+        rc = -EINVAL;
+        if ( (a.first_pfn > domain_get_maximum_gpfn(d)) ||
+             ((a.first_pfn + a.nr - 1) < a.first_pfn) ||
+             ((a.first_pfn + a.nr - 1) > domain_get_maximum_gpfn(d)) )
+            goto param_fail3;
+
+        rc = 0;
+        if ( !paging_mode_log_dirty(d) )
+            goto param_fail3;
+
+        for ( pfn = a.first_pfn; pfn < a.first_pfn + a.nr; pfn++ )
+        {
+            p2m_type_t t;
+            mfn_t mfn = gfn_to_mfn(d, pfn, &t);
+            if ( mfn_x(mfn) != INVALID_MFN )
+            {
+                paging_mark_dirty(d, mfn_x(mfn));
+                /* These are most probably not page tables any more */
+                /* don't take a long time and don't die either */
+                sh_remove_shadows(d->vcpu[0], mfn, 1, 0);
+            }
+        }
+
+    param_fail3:
+        rcu_unlock_domain(d);
+        break;
+    }
+
     default:
     {
         gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
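
The pfn range validation in HVMOP_modified_memory is written to be overflow-safe: "first_pfn + nr - 1 < first_pfn" can only hold when the addition wrapped (or nr is zero), so oversized ranges cannot slip past the maximum-gpfn check. A stand-alone restatement of those checks (the helper name is hypothetical; the comparisons mirror the hunk):

    /* Hypothetical helper reproducing the overflow-safe range check above. */
    static int pfn_range_ok(unsigned long first_pfn, unsigned long nr,
                            unsigned long max_gpfn)
    {
        unsigned long last_pfn = first_pfn + nr - 1;

        return (first_pfn <= max_gpfn) &&
               (last_pfn >= first_pfn) &&   /* rejects wraparound */
               (last_pfn <= max_gpfn);
    }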
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Wed Jul 02 11:30:37 2008 +0900
@@ -677,10 +677,11 @@ static int construct_vmcs(struct vcpu *v
     return 0;
 }
 
-int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val)
-{
-    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
-    const struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
+int vmx_read_guest_msr(u32 msr, u64 *val)
+{
+    struct vcpu *curr = current;
+    unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
+    const struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
 
     for ( i = 0; i < msr_count; i++ )
     {
@@ -694,10 +695,11 @@ int vmx_read_guest_msr(struct vcpu *v, u
     return -ESRCH;
 }
 
-int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val)
-{
-    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
-    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
+int vmx_write_guest_msr(u32 msr, u64 val)
+{
+    struct vcpu *curr = current;
+    unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
+    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
 
     for ( i = 0; i < msr_count; i++ )
     {
@@ -711,10 +713,20 @@ int vmx_write_guest_msr(struct vcpu *v, 
     return -ESRCH;
 }
 
-int vmx_add_guest_msr(struct vcpu *v, u32 msr)
-{
-    unsigned int i, msr_count = v->arch.hvm_vmx.msr_count;
-    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.msr_area;
+int vmx_add_guest_msr(u32 msr)
+{
+    struct vcpu *curr = current;
+    unsigned int i, msr_count = curr->arch.hvm_vmx.msr_count;
+    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.msr_area;
+
+    if ( msr_area == NULL )
+    {
+        if ( (msr_area = alloc_xenheap_page()) == NULL )
+            return -ENOMEM;
+        curr->arch.hvm_vmx.msr_area = msr_area;
+        __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
+        __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
+    }
 
     for ( i = 0; i < msr_count; i++ )
         if ( msr_area[i].index == msr )
@@ -723,29 +735,29 @@ int vmx_add_guest_msr(struct vcpu *v, u3
     if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
         return -ENOSPC;
 
-    if ( msr_area == NULL )
-    {
-        if ( (msr_area = alloc_xenheap_page()) == NULL )
-            return -ENOMEM;
-        v->arch.hvm_vmx.msr_area = msr_area;
-        __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(msr_area));
-        __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
-    }
-
     msr_area[msr_count].index = msr;
     msr_area[msr_count].mbz   = 0;
     msr_area[msr_count].data  = 0;
-    v->arch.hvm_vmx.msr_count = ++msr_count;
+    curr->arch.hvm_vmx.msr_count = ++msr_count;
     __vmwrite(VM_EXIT_MSR_STORE_COUNT, msr_count);
     __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, msr_count);
 
     return 0;
 }
 
-int vmx_add_host_load_msr(struct vcpu *v, u32 msr)
-{
-    unsigned int i, msr_count = v->arch.hvm_vmx.host_msr_count;
-    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.host_msr_area;
+int vmx_add_host_load_msr(u32 msr)
+{
+    struct vcpu *curr = current;
+    unsigned int i, msr_count = curr->arch.hvm_vmx.host_msr_count;
+    struct vmx_msr_entry *msr_area = curr->arch.hvm_vmx.host_msr_area;
+
+    if ( msr_area == NULL )
+    {
+        if ( (msr_area = alloc_xenheap_page()) == NULL )
+            return -ENOMEM;
+        curr->arch.hvm_vmx.host_msr_area = msr_area;
+        __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
+    }
 
     for ( i = 0; i < msr_count; i++ )
         if ( msr_area[i].index == msr )
@@ -754,18 +766,10 @@ int vmx_add_host_load_msr(struct vcpu *v
     if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
         return -ENOSPC;
 
-    if ( msr_area == NULL )
-    {
-        if ( (msr_area = alloc_xenheap_page()) == NULL )
-            return -ENOMEM;
-        v->arch.hvm_vmx.host_msr_area = msr_area;
-        __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
-    }
-
     msr_area[msr_count].index = msr;
     msr_area[msr_count].mbz   = 0;
     rdmsrl(msr, msr_area[msr_count].data);
-    v->arch.hvm_vmx.host_msr_count = ++msr_count;
+    curr->arch.hvm_vmx.host_msr_count = ++msr_count;
     __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count);
 
     return 0;
@@ -776,21 +780,17 @@ int vmx_create_vmcs(struct vcpu *v)
     struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
     int rc;
 
-    if ( arch_vmx->vmcs == NULL )
-    {
-        if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
-            return -ENOMEM;
-
-        INIT_LIST_HEAD(&arch_vmx->active_list);
-        __vmpclear(virt_to_maddr(arch_vmx->vmcs));
-        arch_vmx->active_cpu = -1;
-        arch_vmx->launched   = 0;
-    }
+    if ( (arch_vmx->vmcs = vmx_alloc_vmcs()) == NULL )
+        return -ENOMEM;
+
+    INIT_LIST_HEAD(&arch_vmx->active_list);
+    __vmpclear(virt_to_maddr(arch_vmx->vmcs));
+    arch_vmx->active_cpu = -1;
+    arch_vmx->launched   = 0;
 
     if ( (rc = construct_vmcs(v)) != 0 )
     {
         vmx_free_vmcs(arch_vmx->vmcs);
-        arch_vmx->vmcs = NULL;
         return rc;
     }
 
@@ -801,13 +801,13 @@ void vmx_destroy_vmcs(struct vcpu *v)
 {
     struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
 
-    if ( arch_vmx->vmcs == NULL )
-        return;
-
     vmx_clear_vmcs(v);
 
     vmx_free_vmcs(arch_vmx->vmcs);
-    arch_vmx->vmcs = NULL;
+
+    free_xenheap_page(v->arch.hvm_vmx.host_msr_area);
+    free_xenheap_page(v->arch.hvm_vmx.msr_area);
+    free_xenheap_page(v->arch.hvm_vmx.msr_bitmap);
 }
 
 void vm_launch_fail(void)
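
Two things change in the vmcs.c hunk above: the MSR-area helpers now implicitly operate on current instead of taking a vcpu argument, and the backing page is allocated (and its address programmed into the VMCS) up front, before the duplicate and space checks, with the pages freed in vmx_destroy_vmcs(). A generic sketch of that allocate-on-first-add pattern, using stand-in names rather than the real struct vmx_msr_entry plumbing:

    /* Stand-in sketch of the "allocate backing page on first add" pattern. */
    struct msr_entry { uint32_t index, mbz; uint64_t data; };

    static int add_msr(struct msr_entry **area, unsigned int *count, uint32_t msr)
    {
        unsigned int i;

        if ( *area == NULL )
        {
            if ( (*area = alloc_xenheap_page()) == NULL )
                return -ENOMEM;
            /* The real code also points the VMCS at the new page here. */
        }

        for ( i = 0; i < *count; i++ )
            if ( (*area)[i].index == msr )
                return 0;                   /* already tracked */

        if ( *count == (PAGE_SIZE / sizeof(struct msr_entry)) )
            return -ENOSPC;

        (*area)[*count].index = msr;
        (*area)[*count].mbz   = 0;
        (*area)[*count].data  = 0;
        (*count)++;
        return 0;
    }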
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Jul 02 11:30:37 2008 +0900
@@ -1523,7 +1523,8 @@ static int vmx_cr_access(unsigned long e
         break;
     case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
         value = v->arch.hvm_vcpu.guest_cr[0];
-        value = (value & ~0xFFFF) | ((exit_qualification >> 16) & 0xFFFF);
+        /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
+        value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
         HVMTRACE_LONG_1D(LMSW, current, value);
         return !hvm_set_cr0(value);
     default:
@@ -1655,7 +1656,7 @@ static int vmx_msr_read_intercept(struct
                 goto done;
         }
 
-        if ( vmx_read_guest_msr(v, ecx, &msr_content) == 0 )
+        if ( vmx_read_guest_msr(ecx, &msr_content) == 0 )
             break;
 
         if ( is_last_branch_msr(ecx) )
@@ -1817,12 +1818,12 @@ static int vmx_msr_write_intercept(struc
 
             for ( ; (rc == 0) && lbr->count; lbr++ )
                 for ( i = 0; (rc == 0) && (i < lbr->count); i++ )
-                    if ( (rc = vmx_add_guest_msr(v, lbr->base + i)) == 0 )
+                    if ( (rc = vmx_add_guest_msr(lbr->base + i)) == 0 )
                         vmx_disable_intercept_for_msr(v, lbr->base + i);
         }
 
         if ( (rc < 0) ||
-             (vmx_add_host_load_msr(v, ecx) < 0) )
+             (vmx_add_host_load_msr(ecx) < 0) )
             vmx_inject_hw_exception(v, TRAP_machine_check, 0);
         else
         {
@@ -1842,7 +1843,7 @@ static int vmx_msr_write_intercept(struc
         switch ( long_mode_do_msr_write(regs) )
         {
             case HNDL_unhandled:
-                if ( (vmx_write_guest_msr(v, ecx, msr_content) != 0) &&
+                if ( (vmx_write_guest_msr(ecx, msr_content) != 0) &&
                      !is_last_branch_msr(ecx) )
                     wrmsr_hypervisor_regs(ecx, regs->eax, regs->edx);
                 break;
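
The LMSW fix above restricts the emulation to what the instruction can architecturally do: it may set any of CR0 bits 0-3 but may only clear bits 1-3 (it can never clear PE). The expression (value & ~0xe) | (src & 0xf) keeps the old PE bit unless the source also sets it. A worked example of that mask:

    /* Worked example: old CR0 low nibble = 0xb (PE|MP|TS), LMSW source = 0x4 (EM). */
    unsigned long old_cr0  = 0xbUL;
    unsigned long lmsw_src = 0x4UL;
    unsigned long new_cr0  = (old_cr0 & ~0xeUL) | (lmsw_src & 0xfUL);
    /* new_cr0 == 0x5: PE survives (bit 0 preserved by ~0xe), MP and TS are
     * cleared, EM is set - i.e. "set bits 0-3, clear bits 1-3" semantics.   */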
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/hvm/vmx/vpmu_core2.c
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c Wed Jul 02 11:30:37 2008 +0900
@@ -219,12 +219,12 @@ static int core2_vpmu_alloc_resource(str
         return 0;
 
     wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-    if ( vmx_add_host_load_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
-        return 0;
-
-    if ( vmx_add_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL) )
-        return 0;
-    vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, -1ULL);
+    if ( vmx_add_host_load_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+        return 0;
+
+    if ( vmx_add_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL) )
+        return 0;
+    vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, -1ULL);
 
     pmu_enable = xmalloc_bytes(sizeof(struct core2_pmu_enable) +
                  (core2_get_pmc_count()-1)*sizeof(char));
@@ -347,7 +347,7 @@ static int core2_vpmu_do_wrmsr(struct cp
         break;
     case MSR_CORE_PERF_FIXED_CTR_CTRL:
         non_global_ctrl = msr_content;
-        vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
+        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
         global_ctrl >>= 32;
         for ( i = 0; i < 3; i++ )
         {
@@ -359,7 +359,7 @@ static int core2_vpmu_do_wrmsr(struct cp
         break;
     default:
         tmp = ecx - MSR_P6_EVNTSEL0;
-        vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
+        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &global_ctrl);
         if ( tmp >= 0 && tmp < core2_get_pmc_count() )
             core2_vpmu_cxt->pmu_enable->arch_pmc_enable[tmp] =
                 (global_ctrl >> tmp) & (msr_content >> 22) & 1;
@@ -385,7 +385,7 @@ static int core2_vpmu_do_wrmsr(struct cp
     if ( type != MSR_TYPE_GLOBAL )
         wrmsrl(ecx, msr_content);
     else
-        vmx_write_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
+        vmx_write_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, msr_content);
 
     return 1;
 }
@@ -410,7 +410,7 @@ static int core2_vpmu_do_rdmsr(struct cp
         msr_content = core2_vpmu_cxt->global_ovf_status;
         break;
     case MSR_CORE_PERF_GLOBAL_CTRL:
-        vmx_read_guest_msr(v, MSR_CORE_PERF_GLOBAL_CTRL, &msr_content);
+        vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, &msr_content);
         break;
     default:
         rdmsrl(regs->ecx, msr_content);
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm.c Wed Jul 02 11:30:37 2008 +0900
@@ -219,7 +219,7 @@ void __init arch_init_memory(void)
      * Any Xen-heap pages that we will allow to be mapped will have
      * their domain field set to dom_xen.
      */
-    dom_xen = alloc_domain(DOMID_XEN);
+    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
     BUG_ON(dom_xen == NULL);
 
     /*
@@ -227,7 +227,7 @@ void __init arch_init_memory(void)
      * This domain owns I/O pages that are within the range of the page_info
      * array. Mappings occur at the priv of the caller.
      */
-    dom_io = alloc_domain(DOMID_IO);
+    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
     BUG_ON(dom_io == NULL);
 
     /* First 1MB of RAM is historically marked as I/O. */
@@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page
         {
             struct domain *d = page_get_owner(page);
 
-            /* Never allow a shadowed frame to go from type count 0 to 1 */
-            if ( d && shadow_mode_enabled(d) )
-                shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
+            /* Normally we should never let a page go from type count 0
+             * to type count 1 when it is shadowed. One exception:
+             * out-of-sync shadowed pages are allowed to become
+             * writeable. */
+            if ( d && shadow_mode_enabled(d)
+                 && (page->count_info & PGC_page_table)
+                 && !((page->shadow_flags & (1u<<29))
+                      && type == PGT_writable_page) )
+               shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
 
             ASSERT(!(x & PGT_pae_xen_l2));
             if ( (x & PGT_type_mask) != type )
@@ -3533,15 +3539,14 @@ static int ptwr_emulated_read(
 static int ptwr_emulated_read(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long *val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
     unsigned int rc;
     unsigned long addr = offset;
 
-    *val = 0;
-    if ( (rc = copy_from_user((void *)val, (void *)addr, bytes)) != 0 )
+    if ( (rc = copy_from_user(p_data, (void *)addr, bytes)) != 0 )
     {
         propagate_page_fault(addr + bytes - rc, 0); /* read fault */
         return X86EMUL_EXCEPTION;
@@ -3568,7 +3573,7 @@ static int ptwr_emulated_update(
     /* Only allow naturally-aligned stores within the original %cr2 page. */
     if ( unlikely(((addr^ptwr_ctxt->cr2) & PAGE_MASK) || (addr & (bytes-1))) )
     {
-        MEM_LOG("Bad ptwr access (cr2=%lx, addr=%lx, bytes=%u)",
+        MEM_LOG("ptwr_emulate: bad access (cr2=%lx, addr=%lx, bytes=%u)",
                 ptwr_ctxt->cr2, addr, bytes);
         return X86EMUL_UNHANDLEABLE;
     }
@@ -3676,10 +3681,21 @@ static int ptwr_emulated_write(
 static int ptwr_emulated_write(
     enum x86_segment seg,
     unsigned long offset,
-    unsigned long val,
+    void *p_data,
     unsigned int bytes,
     struct x86_emulate_ctxt *ctxt)
 {
+    paddr_t val = 0;
+
+    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
+    {
+        MEM_LOG("ptwr_emulate: bad write size (addr=%lx, bytes=%u)",
+                offset, bytes);
+        return X86EMUL_UNHANDLEABLE;
+    }
+
+    memcpy(&val, p_data, bytes);
+
     return ptwr_emulated_update(
         offset, 0, val, bytes, 0,
         container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
@@ -3694,10 +3710,17 @@ static int ptwr_emulated_cmpxchg(
     struct x86_emulate_ctxt *ctxt)
 {
     paddr_t old = 0, new = 0;
-    if ( bytes > sizeof(paddr_t) )
+
+    if ( (bytes > sizeof(paddr_t)) || (bytes & (bytes -1)) )
+    {
+        MEM_LOG("ptwr_emulate: bad cmpxchg size (addr=%lx, bytes=%u)",
+                offset, bytes);
         return X86EMUL_UNHANDLEABLE;
+    }
+
     memcpy(&old, p_old, bytes);
     memcpy(&new, p_new, bytes);
+
     return ptwr_emulated_update(
         offset, old, new, bytes, 1,
         container_of(ctxt, struct ptwr_emulate_ctxt, ctxt));
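
With the buffer-based write hook, ptwr_emulated_write() has to do its own size validation before packing the guest's bytes into a paddr_t: anything wider than sizeof(paddr_t) or not a power of two is refused, much as the MMIO emulation path does. A small stand-alone illustration of that check and packing (hypothetical helper; uint64_t stands in for paddr_t):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper mirroring the size check and byte packing above. */
    static int pack_pte_write(const void *p_data, unsigned int bytes,
                              uint64_t *out)
    {
        uint64_t val = 0;

        /* Reject anything wider than a PTE or of non-power-of-two size. */
        if ( (bytes > sizeof(val)) || (bytes & (bytes - 1)) )
            return 0;

        memcpy(&val, p_data, bytes);    /* little-endian packing, as above */
        *out = val;
        return 1;
    }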
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c   Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/common.c   Wed Jul 02 11:30:37 2008 +0900
@@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d
     /* Use shadow pagetables for log-dirty support */
     paging_log_dirty_init(d, shadow_enable_log_dirty, 
                           shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    d->arch.paging.shadow.oos_active = 0;
+#endif
 }
 
 /* Setup the shadow-specfic parts of a vcpu struct. Note: The most important
@@ -64,6 +68,16 @@ void shadow_domain_init(struct domain *d
  */
 void shadow_vcpu_init(struct vcpu *v)
 {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    int i;
+
+    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
+    {
+        v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
+        v->arch.paging.shadow.oos_snapshot[i] = _mfn(INVALID_MFN);
+    }
+#endif
+
     v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
 }
 
@@ -131,7 +145,7 @@ static int
 static int
 hvm_read(enum x86_segment seg,
          unsigned long offset,
-         unsigned long *val,
+         void *p_data,
          unsigned int bytes,
          enum hvm_access_type access_type,
          struct sh_emulate_ctxt *sh_ctxt)
@@ -144,12 +158,10 @@ hvm_read(enum x86_segment seg,
     if ( rc )
         return rc;
 
-    *val = 0;
-
     if ( access_type == hvm_access_insn_fetch )
-        rc = hvm_fetch_from_guest_virt(val, addr, bytes, 0);
+        rc = hvm_fetch_from_guest_virt(p_data, addr, bytes, 0);
     else
-        rc = hvm_copy_from_guest_virt(val, addr, bytes, 0);
+        rc = hvm_copy_from_guest_virt(p_data, addr, bytes, 0);
 
     switch ( rc )
     {
@@ -167,20 +179,20 @@ static int
 static int
 hvm_emulate_read(enum x86_segment seg,
                  unsigned long offset,
-                 unsigned long *val,
+                 void *p_data,
                  unsigned int bytes,
                  struct x86_emulate_ctxt *ctxt)
 {
     if ( !is_x86_user_segment(seg) )
         return X86EMUL_UNHANDLEABLE;
-    return hvm_read(seg, offset, val, bytes, hvm_access_read,
+    return hvm_read(seg, offset, p_data, bytes, hvm_access_read,
                     container_of(ctxt, struct sh_emulate_ctxt, ctxt));
 }
 
 static int
 hvm_emulate_insn_fetch(enum x86_segment seg,
                        unsigned long offset,
-                       unsigned long *val,
+                       void *p_data,
                        unsigned int bytes,
                        struct x86_emulate_ctxt *ctxt)
 {
@@ -192,19 +204,18 @@ hvm_emulate_insn_fetch(enum x86_segment 
 
     /* Fall back if requested bytes are not in the prefetch cache. */
     if ( unlikely((insn_off + bytes) > sh_ctxt->insn_buf_bytes) )
-        return hvm_read(seg, offset, val, bytes,
+        return hvm_read(seg, offset, p_data, bytes,
                         hvm_access_insn_fetch, sh_ctxt);
 
     /* Hit the cache. Simple memcpy. */
-    *val = 0;
-    memcpy(val, &sh_ctxt->insn_buf[insn_off], bytes);
+    memcpy(p_data, &sh_ctxt->insn_buf[insn_off], bytes);
     return X86EMUL_OKAY;
 }
 
 static int
 hvm_emulate_write(enum x86_segment seg,
                   unsigned long offset,
-                  unsigned long val,
+                  void *p_data,
                   unsigned int bytes,
                   struct x86_emulate_ctxt *ctxt)
 {
@@ -227,7 +238,7 @@ hvm_emulate_write(enum x86_segment seg,
         return rc;
 
     return v->arch.paging.mode->shadow.x86_emulate_write(
-        v, addr, &val, bytes, sh_ctxt);
+        v, addr, p_data, bytes, sh_ctxt);
 }
 
 static int 
@@ -279,7 +290,7 @@ static int
 static int
 pv_emulate_read(enum x86_segment seg,
                 unsigned long offset,
-                unsigned long *val,
+                void *p_data,
                 unsigned int bytes,
                 struct x86_emulate_ctxt *ctxt)
 {
@@ -288,8 +299,7 @@ pv_emulate_read(enum x86_segment seg,
     if ( !is_x86_user_segment(seg) )
         return X86EMUL_UNHANDLEABLE;
 
-    *val = 0;
-    if ( (rc = copy_from_user((void *)val, (void *)offset, bytes)) != 0 )
+    if ( (rc = copy_from_user(p_data, (void *)offset, bytes)) != 0 )
     {
         propagate_page_fault(offset + bytes - rc, 0); /* read fault */
         return X86EMUL_EXCEPTION;
@@ -301,7 +311,7 @@ static int
 static int
 pv_emulate_write(enum x86_segment seg,
                  unsigned long offset,
-                 unsigned long val,
+                 void *p_data,
                  unsigned int bytes,
                  struct x86_emulate_ctxt *ctxt)
 {
@@ -311,7 +321,7 @@ pv_emulate_write(enum x86_segment seg,
     if ( !is_x86_user_segment(seg) )
         return X86EMUL_UNHANDLEABLE;
     return v->arch.paging.mode->shadow.x86_emulate_write(
-        v, offset, &val, bytes, sh_ctxt);
+        v, offset, p_data, bytes, sh_ctxt);
 }
 
 static int 
@@ -427,6 +437,585 @@ void shadow_continue_emulation(struct sh
         }
     }
 }
+ 
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Out-of-sync shadows. */ 
+
+/* From time to time, we let a shadowed pagetable page go out of sync 
+ * with its shadow: the guest is allowed to write directly to the page, 
+ * and those writes are not synchronously reflected in the shadow.
+ * This lets us avoid many emulations if the guest is writing a lot to a 
+ * pagetable, but it relaxes a pretty important invariant in the shadow 
+ * pagetable design.  Therefore, some rules:
+ *
+ * 1. Only L1 pagetables may go out of sync: any page that is shadowed
+ *    at at higher level must be synchronously updated.  This makes
+ *    using linear shadow pagetables much less dangerous.
+ *    That means that: (a) unsyncing code needs to check for higher-level
+ *    shadows, and (b) promotion code needs to resync.
+ * 
+ * 2. All shadow operations on a guest page require the page to be brought
+ *    back into sync before proceeding.  This must be done under the
+ *    shadow lock so that the page is guaranteed to remain synced until
+ *    the operation completes.
+ *
+ *    Exceptions to this rule: the pagefault and invlpg handlers may 
+ *    update only one entry on an out-of-sync page without resyncing it. 
+ *
+ * 3. Operations on shadows that do not start from a guest page need to
+ *    be aware that they may be handling an out-of-sync shadow.
+ *
+ * 4. Operations that do not normally take the shadow lock (fast-path 
+ *    #PF handler, INVLPG) must fall back to a locking, syncing version 
+ *    if they see an out-of-sync table. 
+ *
+ * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
+ *    must explicitly resync all relevant pages or update their
+ *    shadows.
+ *
+ * Currently out-of-sync pages are listed in a simple open-addressed
+ * hash table with a second chance (must resist temptation to radically
+ * over-engineer hash tables...)  The virtual address of the access
+ * which caused us to unsync the page is also kept in the hash table, as
+ * a hint for finding the writable mappings later.
+ *
+ * We keep a hash per vcpu, because we want as much as possible to do
+ * the re-sync on the same vcpu we did the unsync on, so the VA hint
+ * will be valid.
+ */
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+static void sh_oos_audit(struct domain *d) 
+{
+    int idx, expected_idx, expected_idx_alt;
+    struct page_info *pg;
+    struct vcpu *v;
+    
+    for_each_vcpu(d, v) 
+    {
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+        {
+            mfn_t *oos = v->arch.paging.shadow.oos;
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+            
+            expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
+            expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
+            if ( idx != expected_idx && idx != expected_idx_alt )
+            {
+                printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
+                       __func__, idx, mfn_x(oos[idx]), 
+                       expected_idx, expected_idx_alt);
+                BUG();
+            }
+            pg = mfn_to_page(oos[idx]);
+            if ( !(pg->count_info & PGC_page_table) )
+            {
+                printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->count_info);
+                BUG();
+            }
+            if ( !(pg->shadow_flags & SHF_out_of_sync) )
+            {
+                printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+            if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
+            {
+                printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+        }
+    }
+}
+#endif
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn) 
+{
+    int idx;
+    struct vcpu *v;
+    mfn_t *oos;
+
+    ASSERT(mfn_is_out_of_sync(gmfn));
+    
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+            return;
+    }
+
+    SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+#endif
+
+/* Update the shadow, but keep the page out of sync. */
+static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn, mfn_t snpmfn)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(mfn_valid(gmfn));
+    ASSERT(page_is_out_of_sync(pg));
+
+    /* Call out to the appropriate per-mode resyncing function */
+    if ( pg->shadow_flags & SHF_L1_32 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn, snpmfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn, snpmfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn, snpmfn);
+#endif
+}
+
+#define _FIXUP_IDX(_b, _i) ((_b) * SHADOW_OOS_FT_HASH + (_i))
+
+void oos_fixup_add(struct vcpu *v, mfn_t gmfn,
+                   mfn_t smfn, unsigned long off)
+{
+    int idx, i, free = 0, free_slot = 0;
+    struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
+    for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
+    {
+        if ( !mfn_valid(fixups[_FIXUP_IDX(idx, i)].gmfn)
+             || !mfn_is_out_of_sync(fixups[_FIXUP_IDX(idx, i)].gmfn) )
+        {
+            free = 1;
+            free_slot = _FIXUP_IDX(idx, i);
+        }
+        else if ( (mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn))
+                  && (mfn_x(fixups[_FIXUP_IDX(idx, i)].smfn) == mfn_x(smfn))
+                  && (fixups[_FIXUP_IDX(idx, i)].off == off) )
+        {
+            perfc_incr(shadow_oos_fixup_no_add);
+            return;
+        }
+    }
+
+    if ( free )
+    {
+        if ( !v->arch.paging.shadow.oos_fixup_used )
+            v->arch.paging.shadow.oos_fixup_used = 1;
+        fixups[free_slot].gmfn = gmfn;
+        fixups[free_slot].smfn = smfn;
+        fixups[free_slot].off = off;
+        perfc_incr(shadow_oos_fixup_add_ok);
+        return;
+    }
+
+
+    perfc_incr(shadow_oos_fixup_add_fail);
+}
+
+void oos_fixup_remove(struct vcpu *v, mfn_t gmfn)
+{
+    int idx, i;
+    struct domain *d = v->domain;
+
+    perfc_incr(shadow_oos_fixup_remove);
+
+    /* If the domain is dying we might get called when deallocating
+     * the shadows. Fixup tables are already freed so exit now. */
+    if ( d->is_dying )
+        return;
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
+    for_each_vcpu(d, v)
+    {
+        struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+        for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
+            if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) == mfn_x(gmfn) )
+                fixups[_FIXUP_IDX(idx, i)].gmfn = _mfn(INVALID_MFN);
+    }
+}
+
+int oos_fixup_flush(struct vcpu *v)
+{
+    int i, rc = 0;
+    struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+
+    perfc_incr(shadow_oos_fixup_flush);
+
+    if ( !v->arch.paging.shadow.oos_fixup_used )
+        return 0;
+
+    for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
+    {
+        if ( mfn_valid(fixups[i].gmfn) )
+        {
+            if ( mfn_is_out_of_sync(fixups[i].gmfn) )
+                rc |= sh_remove_write_access_from_sl1p(v, fixups[i].gmfn,
+                                                       fixups[i].smfn,
+                                                       fixups[i].off);
+            fixups[i].gmfn = _mfn(INVALID_MFN);
+        }
+    }
+
+    v->arch.paging.shadow.oos_fixup_used = 0;
+
+    return rc;
+}
+
+int oos_fixup_flush_gmfn(struct vcpu *v, mfn_t gmfn)
+{
+    int idx, i, rc = 0;
+    struct domain *d = v->domain;
+
+    perfc_incr(shadow_oos_fixup_flush_gmfn);
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_FT_HASH;
+    for_each_vcpu(d, v)
+    {
+        struct oos_fixup *fixups = v->arch.paging.shadow.oos_fixups;
+
+        for ( i = 0; i < SHADOW_OOS_FT_ENTRIES; i++ )
+        {
+            if ( mfn_x(fixups[_FIXUP_IDX(idx, i)].gmfn) != mfn_x(gmfn) )
+                continue;
+
+            rc |= sh_remove_write_access_from_sl1p(v, 
+                                                   fixups[_FIXUP_IDX(idx,i)].gmfn,
+                                                   fixups[_FIXUP_IDX(idx,i)].smfn,
+                                                   fixups[_FIXUP_IDX(idx,i)].off);
+
+            fixups[_FIXUP_IDX(idx,i)].gmfn = _mfn(INVALID_MFN);
+        }
+    }
+
+    return rc;
+}
+
+static int oos_remove_write_access(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    int ftlb = 0;
+
+    ftlb |= oos_fixup_flush_gmfn(v, gmfn);
+
+    switch ( sh_remove_write_access(v, gmfn, 0, va) )
+    {
+    default:
+    case 0:
+        break;
+
+    case 1:
+        ftlb |= 1;
+        break;
+
+    case -1:
+        /* An unfindable writeable typecount has appeared, probably via a
+         * grant table entry: can't shoot the mapping, so try to unshadow 
+         * the page.  If that doesn't work either, the guest is granting
+         * his pagetables and must be killed after all.
+         * This will flush the tlb, so we can return with no worries. */
+        sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
+        return 1;
+    }
+
+    if ( ftlb )
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);
+
+    return 0;
+}
+
+
+/* Pull all the entries on an out-of-sync page back into sync. */
+static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va, mfn_t snp)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(shadow_locked_by_me(v->domain));
+    ASSERT(mfn_is_out_of_sync(gmfn));
+    /* Guest page must be shadowed *only* as L1 when out of sync. */
+    ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask 
+             & ~SHF_L1_ANY));
+    ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    /* Need to pull write access so the page *stays* in sync. */
+    if ( oos_remove_write_access(v, gmfn, va) )
+    {
+        /* Page has been unshadowed. */
+        return;
+    }
+
+    /* No more writable mappings of this page, please */
+    pg->shadow_flags &= ~SHF_oos_may_write;
+
+    /* Update the shadows with current guest entries. */
+    _sh_resync_l1(v, gmfn, snp);
+
+    /* Now we know all the entries are synced, and will stay that way */
+    pg->shadow_flags &= ~SHF_out_of_sync;
+    perfc_incr(shadow_resync);
+}
+
+
+/* Add an MFN to the list of out-of-sync guest pagetables */
+static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    int idx, oidx, swap = 0;
+    void *gptr, *gsnpptr;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+    mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+    oidx = idx;
+
+    if ( mfn_valid(oos[idx]) 
+         && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
+    {
+        /* Punt the current occupant into the next slot */
+        SWAP(oos[idx], gmfn);
+        SWAP(oos_va[idx], va);
+        swap = 1;
+        idx = (idx + 1) % SHADOW_OOS_PAGES;
+    }
+    if ( mfn_valid(oos[idx]) )
+   {
+        /* Crush the current occupant. */
+        _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
+        perfc_incr(shadow_unsync_evict);
+    }
+    oos[idx] = gmfn;
+    oos_va[idx] = va;
+
+    if ( swap )
+        SWAP(oos_snapshot[idx], oos_snapshot[oidx]);
+
+    gptr = sh_map_domain_page(oos[oidx]);
+    gsnpptr = sh_map_domain_page(oos_snapshot[oidx]);
+    memcpy(gsnpptr, gptr, PAGE_SIZE);
+    sh_unmap_domain_page(gptr);
+    sh_unmap_domain_page(gsnpptr);
+}
+
+/* Remove an MFN from the list of out-of-sync guest pagetables */
+static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    struct domain *d = v->domain;
+
+    SHADOW_PRINTK("D%dV%d gmfn %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); 
+
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    mfn_t *oos_snapshot;
+    struct domain *d = v->domain;
+    
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            return oos_snapshot[idx];
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+    return _mfn(INVALID_MFN);
+}
+
+/* Pull a single guest page back into sync */
+void sh_resync(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    unsigned long *oos_va;
+    mfn_t *oos_snapshot;
+    struct domain *d = v->domain;
+
+    for_each_vcpu(d, v) 
+    {
+        oos = v->arch.paging.shadow.oos;
+        oos_va = v->arch.paging.shadow.oos_va;
+        oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            _sh_resync(v, gmfn, oos_va[idx], oos_snapshot[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table,
+ * by making a call out to the mode in which that shadow was made. */
+static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct page_info *pg = mfn_to_page(gl1mfn);
+    if ( pg->shadow_flags & SHF_L1_32 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
+#endif
+    SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n", 
+                 mfn_x(gl1mfn));
+    BUG();
+    return 0; /* BUG() is no longer __attribute__((noreturn)). */
+}
+
+
+/* Pull all out-of-sync pages back into sync.  Pages brought out of sync
+ * on other vcpus are allowed to remain out of sync, but their contents
+ * will be made safe (TLB flush semantics); pages unsynced by this vcpu
+ * are brought back into sync and write-protected.  If skip != 0, we try
+ * to avoid resyncing at all if we think we can get away with it. */
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
+{
+    int idx;
+    struct vcpu *other;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+    mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+
+    SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
+
+    ASSERT(do_locking || shadow_locked_by_me(v->domain));
+
+    if ( !this )
+        goto resync_others;
+
+    if ( do_locking )
+        shadow_lock(v->domain);
+
+    if ( oos_fixup_flush(v) )
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);    
+
+    /* First: resync all of this vcpu's oos pages */
+    for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
+        if ( mfn_valid(oos[idx]) )
+        {
+            /* Write-protect and sync contents */
+            _sh_resync(v, oos[idx], oos_va[idx], oos_snapshot[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+        }
+
+    if ( do_locking )
+        shadow_unlock(v->domain);
+
+ resync_others:
+    if ( !others )
+        return;
+
+    /* Second: make all *other* vcpus' oos pages safe. */
+    for_each_vcpu(v->domain, other)
+    {
+        if ( v == other ) 
+            continue;
+
+        if ( do_locking )
+            shadow_lock(v->domain);
+
+        oos = other->arch.paging.shadow.oos;
+        oos_va = other->arch.paging.shadow.oos_va;
+        oos_snapshot = other->arch.paging.shadow.oos_snapshot;
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ ) 
+        {
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+
+            if ( skip )
+            {
+                /* Update the shadows and leave the page OOS. */
+                if ( sh_skip_sync(v, oos[idx]) )
+                    continue;
+                _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
+            }
+            else
+            {
+                /* Write-protect and sync contents */
+                _sh_resync(other, oos[idx], oos_va[idx], oos_snapshot[idx]);
+                oos[idx] = _mfn(INVALID_MFN);
+            }
+        }
+        
+        if ( do_locking )
+            shadow_unlock(v->domain);
+    }
+}
+
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    struct page_info *pg;
+    
+    ASSERT(shadow_locked_by_me(v->domain));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    pg = mfn_to_page(gmfn);
+ 
+    /* Guest page must be shadowed *only* as L1 and *only* once when out
+     * of sync.  Also, get out now if it's already out of sync. 
+     * Also, can't safely unsync if some vcpus have paging disabled.*/
+    if ( pg->shadow_flags & 
+         ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync) 
+         || sh_page_has_multiple_shadows(pg)
+         || !is_hvm_domain(v->domain)
+         || !v->domain->arch.paging.shadow.oos_active )
+        return 0;
+
+    pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
+    oos_hash_add(v, gmfn, va);
+    perfc_incr(shadow_unsync);
+    return 1;
+}
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
 
 /**************************************************************************/
 /* Code for "promoting" a guest page to the point where the shadow code is
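
The out-of-sync tracking added above is the open-addressed, two-slot ("second chance") hash that the comment block describes: a gmfn's home slot is mfn % SHADOW_OOS_PAGES, and it may instead occupy the following slot if its home was taken; oos_hash_add() punts a same-home occupant into the next slot and resyncs whatever that displaces. A minimal lookup sketch of the probing scheme, in the same terms as oos_hash_remove()/oos_snapshot_lookup() (the helper itself is illustrative):

    /* Illustrative two-slot probe over one vcpu's oos[] array. */
    static int oos_find(const mfn_t *oos, mfn_t gmfn)
    {
        int idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;

        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
            return idx;                             /* home slot          */

        idx = (idx + 1) % SHADOW_OOS_PAGES;         /* second-chance slot */
        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
            return idx;

        return -1;                                  /* not out of sync    */
    }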
@@ -440,6 +1029,12 @@ void shadow_promote(struct vcpu *v, mfn_
 
     ASSERT(mfn_valid(gmfn));
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Is the page already shadowed and out of sync? */
+    if ( page_is_out_of_sync(page) ) 
+        sh_resync(v, gmfn);
+#endif
+
     /* We should never try to promote a gmfn that has writeable mappings */
     ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
            || (page->u.inuse.type_info & PGT_count_mask) == 0
@@ -463,7 +1058,17 @@ void shadow_demote(struct vcpu *v, mfn_t
     clear_bit(type, &page->shadow_flags);
 
     if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
+    {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        /* Was the page out of sync? */
+        if ( page_is_out_of_sync(page) ) 
+        {
+            oos_hash_remove(v, gmfn);
+            oos_fixup_remove(v, gmfn);
+        }
+#endif 
         clear_bit(_PGC_page_table, &page->count_info);
+    }
 }
 
 /**************************************************************************/
@@ -674,7 +1279,8 @@ shadow_order(unsigned int shadow_type)
         0, /* SH_type_l3_64_shadow   */
         0, /* SH_type_l4_64_shadow   */
         2, /* SH_type_p2m_table      */
-        0  /* SH_type_monitor_table  */
+        0, /* SH_type_monitor_table  */
+        0  /* SH_type_oos_snapshot   */
         };
     ASSERT(shadow_type < SH_type_unused);
     return type_to_order[shadow_type];
@@ -1220,6 +1826,14 @@ static unsigned int sh_set_allocation(st
             sp = list_entry(d->arch.paging.shadow.freelists[order].next,
                             struct shadow_page_info, list);
             list_del(&sp->list);
+#if defined(__x86_64__)
+            /*
+             * Re-instate lock field which we overwrite with shadow_page_info.
+             * This was safe, since the lock is only used on guest pages.
+             */
+            for ( j = 0; j < 1U << order; j++ )
+                spin_lock_init(&((struct page_info *)sp)[j].lock);
+#endif
             d->arch.paging.shadow.free_pages -= 1 << order;
             d->arch.paging.shadow.total_pages -= 1 << order;
             free_domheap_pages((struct page_info *)sp, order);
@@ -1297,6 +1911,27 @@ static void sh_hash_audit_bucket(struct 
             /* Bad shadow flags on guest page? */
             BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
             /* Bad type count on guest page? */
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+            if ( sp->type == SH_type_l1_32_shadow
+                 || sp->type == SH_type_l1_pae_shadow
+                 || sp->type == SH_type_l1_64_shadow )
+            {
+                if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
+                     && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
+                {
+                    if ( !page_is_out_of_sync(gpg) )
+                    {
+                        SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
+                                     " and not OOS but has typecount %#lx\n",
+                                     sp->backpointer, 
+                                     mfn_x(shadow_page_to_mfn(sp)), 
+                                     gpg->u.inuse.type_info);
+                        BUG();
+                    }
+                }
+            }
+            else /* Not an l1 */
+#endif
             if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page 
                  && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
             {
@@ -1608,7 +2243,8 @@ void sh_destroy_shadow(struct vcpu *v, m
 /* Remove all writeable mappings of a guest frame from the shadow tables 
  * Returns non-zero if we need to flush TLBs. 
  * level and fault_addr desribe how we found this to be a pagetable;
- * level==0 means we have some other reason for revoking write access.*/
+ * level==0 means we have some other reason for revoking write access.
+ * If level==0 we are allowed to fail, returning -1. */
 
 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn, 
                            unsigned int level,
@@ -1659,7 +2295,12 @@ int sh_remove_write_access(struct vcpu *
         return 0;
 
     /* Early exit if it's already a pagetable, or otherwise not writeable */
-    if ( sh_mfn_is_a_page_table(gmfn) 
+    if ( (sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+         /* Unless they've been allowed to go out of sync with their shadows */
+           && !mfn_oos_may_write(gmfn)
+#endif
+         )
          || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
         return 0;
 
@@ -1676,7 +2317,7 @@ int sh_remove_write_access(struct vcpu *
     }
 
 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
-    if ( v == current && level != 0 )
+    if ( v == current )
     {
         unsigned long gfn;
         /* Heuristic: there is likely to be only one writeable mapping,
@@ -1690,6 +2331,8 @@ int sh_remove_write_access(struct vcpu *
                 return 1;                                                 \
         } while (0)
 
+        if ( level == 0 && fault_addr )
+            GUESS(fault_addr, 6);
         
         if ( v->arch.paging.mode->guest_levels == 2 )
         {
@@ -1773,13 +2416,19 @@ int sh_remove_write_access(struct vcpu *
 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
     
     /* Brute-force search of all the shadows, by walking the hash */
-    perfc_incr(shadow_writeable_bf);
+    if ( level == 0 )
+        perfc_incr(shadow_writeable_bf_1);
+    else
+        perfc_incr(shadow_writeable_bf);
     hash_foreach(v, callback_mask, callbacks, gmfn);
 
     /* If that didn't catch the mapping, then there's some non-pagetable
      * mapping -- ioreq page, grant mapping, &c. */
     if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
     {
+        if ( level == 0 )
+            return -1;
+
         SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
                       "%lu special-use mappings of it\n", mfn_x(gmfn),
                       (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
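
For illustration only (not in the patch): with the change above, a level==0 caller sees a tri-state result from sh_remove_write_access() -- -1 when write access could not be fully revoked, 0 when nothing needed doing, positive when a TLB flush is required. A hypothetical caller on the unsync path might look like this; the surrounding function and control flow are illustrative assumptions:

    /* Hypothetical caller sketch. */
    int rc = sh_remove_write_access(v, gmfn, 0 /* level */, va);
    if ( rc < 0 )
        return 0;           /* couldn't drop all writable mappings: don't unsync */
    if ( rc )
        flush_tlb_mask(v->domain->domain_dirty_cpumask);
    /* Safe to let gmfn go out of sync now. */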
@@ -1790,7 +2439,34 @@ int sh_remove_write_access(struct vcpu *
     return 1;
 }
 
-
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+                                     mfn_t smfn, unsigned long off)
+{
+    struct shadow_page_info *sp = mfn_to_shadow_page(smfn);
+    
+    ASSERT(mfn_valid(smfn));
+    ASSERT(mfn_valid(gmfn));
+    
+    if ( sp->type == SH_type_l1_32_shadow )
+    {
+        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,2)
+            (v, gmfn, smfn, off);
+    }
+#if CONFIG_PAGING_LEVELS >= 3
+    else if ( sp->type == SH_type_l1_pae_shadow )
+        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,3)
+            (v, gmfn, smfn, off);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( sp->type == SH_type_l1_64_shadow )
+        return SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p,4)
+            (v, gmfn, smfn, off);
+#endif
+#endif
+    
+    return 0;
+}
+#endif 
 
 /**************************************************************************/
 /* Remove all mappings of a guest frame from the shadow tables.
@@ -2127,6 +2803,36 @@ static void sh_update_paging_modes(struc
     }
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    if ( v->arch.paging.shadow.oos_fixups == NULL )
+    {
+        int i;
+        v->arch.paging.shadow.oos_fixups =
+            alloc_xenheap_pages(SHADOW_OOS_FT_ORDER);
+        if ( v->arch.paging.shadow.oos_fixups == NULL )
+        {
+            SHADOW_ERROR("Could not allocate OOS fixup table"
+                         " for dom %u vcpu %u\n",
+                         v->domain->domain_id, v->vcpu_id);
+            domain_crash(v->domain);
+            return;
+        }
+        for ( i = 0; i < SHADOW_OOS_FT_HASH * SHADOW_OOS_FT_ENTRIES; i++ )
+            v->arch.paging.shadow.oos_fixups[i].gmfn = _mfn(INVALID_MFN);
+    }
+     
+    if ( mfn_x(v->arch.paging.shadow.oos_snapshot[0]) == INVALID_MFN )
+    {
+        int i;
+        for(i = 0; i < SHADOW_OOS_PAGES; i++)
+        {
+            shadow_prealloc(d, SH_type_oos_snapshot, 1);
+            v->arch.paging.shadow.oos_snapshot[i] =
+                shadow_alloc(d, SH_type_oos_snapshot, 0);
+        }
+    }
+#endif /* OOS */
+
     // Valid transitions handled by this function:
     // - For PV guests:
     //     - after a shadow mode has been changed
@@ -2158,6 +2864,13 @@ static void sh_update_paging_modes(struc
         ///
         ASSERT(shadow_mode_translate(d));
         ASSERT(shadow_mode_external(d));
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        /* Need to resync all our pages now, because if a page goes out
+         * of sync with paging enabled and is resynced with paging
+         * disabled, the resync will go wrong. */
+        shadow_resync_all(v, 0);
+#endif /* OOS */
 
         if ( !hvm_paging_enabled(v) )
         {
@@ -2254,6 +2967,27 @@ static void sh_update_paging_modes(struc
         //        This *does* happen, at least for CR4.PGE...
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* We need to check that all the vcpus have paging enabled to
+     * unsync PTs. */
+    if ( is_hvm_domain(d) )
+    {
+        int pe = 1;
+        struct vcpu *vptr;
+
+        for_each_vcpu(d, vptr)
+        {
+            if ( !hvm_paging_enabled(vptr) )
+            {
+                pe = 0;
+                break;
+            }
+        }
+
+        d->arch.paging.shadow.oos_active = pe;
+    }
+#endif /* OOS */
+
     v->arch.paging.mode->update_cr3(v, 0);
 }
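
For illustration (not part of the changeset): the per-domain oos_active computation above amounts to "OOS is only active once every vcpu of an HVM domain has paging enabled". A hypothetical helper expressing the same rule; the function name is an assumption, for_each_vcpu() and hvm_paging_enabled() are the real interfaces used above:

    /* Sketch only: the value assigned to d->arch.paging.shadow.oos_active
     * above, for HVM domains. */
    static int oos_allowed(struct domain *d)
    {
        struct vcpu *vptr;

        for_each_vcpu(d, vptr)
            if ( !hvm_paging_enabled(vptr) )
                return 0;
        return 1;
    }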
 
@@ -2426,17 +3160,36 @@ void shadow_teardown(struct domain *d)
         }
     }
 
-#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) 
+#if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC))
     /* Free the virtual-TLB array attached to each vcpu */
     for_each_vcpu(d, v)
     {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
         if ( v->arch.paging.vtlb )
         {
             xfree(v->arch.paging.vtlb);
             v->arch.paging.vtlb = NULL;
         }
-    }
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+        if ( v->arch.paging.shadow.oos_fixups )
+        {
+            free_xenheap_pages(v->arch.paging.shadow.oos_fixups,
+                               SHADOW_OOS_FT_ORDER);
+            v->arch.paging.shadow.oos_fixups = NULL;
+        }
+
+        {
+            int i;
+            mfn_t *oos_snapshot = v->arch.paging.shadow.oos_snapshot;
+            for(i = 0; i < SHADOW_OOS_PAGES; i++)
+                if ( mfn_valid(oos_snapshot[i]) )
+                    shadow_free(d, oos_snapshot[i]);
+        }
+#endif /* OOS */
+    }
+#endif /* (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) */
 
     list_for_each_safe(entry, n, &d->arch.paging.shadow.p2m_freelist)
     {
@@ -3044,7 +3797,11 @@ void shadow_audit_tables(struct vcpu *v)
 
     if ( !(SHADOW_AUDIT_ENABLE) )
         return;
-    
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    sh_oos_audit(v->domain);
+#endif
+
     if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
         mask = ~1; /* Audit every table in the system */
     else 
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/multi.c    Wed Jul 02 11:30:37 2008 +0900
@@ -305,22 +305,54 @@ shadow_check_gwalk(struct vcpu *v, unsig
 }
 
 /* Remove write access permissions from a gwalk_t in a batch, and
- * return OR-ed result for TLB flush hint
+ * return OR-ed result for TLB flush hint and need to rewalk the guest
+ * pages.
+ *
+ * Syncing pages will remove write access to that page; but it may
+ * also give write access to other pages in the path. If we resync any
+ * pages, re-walk from the beginning.
  */
+#define GW_RMWR_FLUSHTLB 1
+#define GW_RMWR_REWALK   2
+
 static inline uint32_t
 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
 {
-    int rc = 0;
+    uint32_t rc = 0;
 
 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
-#endif
-    rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
-#endif
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l3mfn) )
+    {
+        sh_resync(v, gw->l3mfn);
+        rc = GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+     if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
+         rc = GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l2mfn) )
+    {
+        sh_resync(v, gw->l2mfn);
+        rc |= GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+    if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
+        rc |= GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
     if ( !(guest_supports_superpages(v) &&
-           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
-        rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+         && !mfn_is_out_of_sync(gw->l1mfn)
+#endif /* OOS */
+         && sh_remove_write_access(v, gw->l1mfn, 1, va) )
+        rc |= GW_RMWR_FLUSHTLB;
 
     return rc;
 }
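
For illustration: the two result bits are consumed in sh_page_fault() further down in this patch -- GW_RMWR_FLUSHTLB triggers a TLB shootdown, GW_RMWR_REWALK restarts the guest walk after a resync:

    /* Mirrors the caller logic added to sh_page_fault() below. */
    rc = gw_remove_write_accesses(v, va, &gw);
    if ( rc & GW_RMWR_FLUSHTLB )
        flush_tlb_mask(d->domain_dirty_cpumask);
    if ( rc & GW_RMWR_REWALK )
    {
        shadow_unlock(d);
        goto rewalk;
    }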
@@ -882,7 +914,12 @@ _sh_propagate(struct vcpu *v,
     
     // protect guest page tables
     //
-    if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
+    if ( unlikely((level == 1) 
+                  && sh_mfn_is_a_page_table(target_mfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+                  && !mfn_oos_may_write(target_mfn)
+#endif /* OOS */
+                  ) )
     {
         if ( shadow_mode_trap_reads(d) )
         {
@@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
         }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
     }
 
     /* Write the new entry */
@@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v
              | (((unsigned long)sl3e) & ~PAGE_MASK));
     
     if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+    {
         /* About to install a new reference */        
         if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        } 
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
+    }
 
     /* Write the new entry */
     shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
@@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v
              | (((unsigned long)sl2e) & ~PAGE_MASK));
 
     if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) 
+    {
+        mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
+
         /* About to install a new reference */
-        if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
+        if ( !sh_get_ref(v, sl1mfn, paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        } 
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        {
+            struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
+            mfn_t gl1mfn = _mfn(sp->backpointer);
+
+            /* If the shadow is a fl1 then the backpointer contains
+               the GFN instead of the GMFN, and it's definitely not
+               OOS. */
+            if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
+                 && mfn_is_out_of_sync(gl1mfn) )
+                sh_resync(v, gl1mfn);
+        }
+#endif
+    }
 
     /* Write the new entry */
 #if GUEST_PAGING_LEVELS == 2
@@ -1347,6 +1409,9 @@ static int shadow_set_l1e(struct vcpu *v
     int flags = 0;
     struct domain *d = v->domain;
     shadow_l1e_t old_sl1e;
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+    mfn_t new_gmfn = shadow_l1e_get_mfn(new_sl1e);
+#endif
     ASSERT(sl1e != NULL);
     
     old_sl1e = *sl1e;
@@ -1363,8 +1428,18 @@ static int shadow_set_l1e(struct vcpu *v
                 /* Doesn't look like a pagetable. */
                 flags |= SHADOW_SET_ERROR;
                 new_sl1e = shadow_l1e_empty();
-            } else {
+            }
+            else
+            {
                 shadow_vram_get_l1e(new_sl1e, sl1e, sl1mfn, d);
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+                if ( mfn_valid(new_gmfn) && mfn_oos_may_write(new_gmfn)
+                     && (shadow_l1e_get_flags(new_sl1e) & _PAGE_RW) )
+                {
+                    oos_fixup_add(v, new_gmfn, sl1mfn, pgentry_ptr_to_slot(sl1e));
+                }
+#endif
+
             }
         }
     } 
@@ -2532,6 +2607,9 @@ static int validate_gl1e(struct vcpu *v,
     mfn_t gmfn;
     p2m_type_t p2mt;
     int result = 0;
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    mfn_t gl1mfn;
+#endif /* OOS */
 
     perfc_incr(shadow_validate_gl1e_calls);
 
@@ -2539,10 +2617,138 @@ static int validate_gl1e(struct vcpu *v,
     gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
 
     l1e_propagate_from_guest(v, new_gl1e, gmfn, &new_sl1e, ft_prefetch, p2mt);
+    result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+    if ( mfn_valid(gl1mfn) 
+         && mfn_is_out_of_sync(gl1mfn) )
+    {
+        /* Update the OOS snapshot. */
+        mfn_t snpmfn = oos_snapshot_lookup(v, gl1mfn);
+        guest_l1e_t *snp;
+
+        ASSERT(mfn_valid(snpmfn));
+
+        snp = sh_map_domain_page(snpmfn);
+        snp[guest_index(new_ge)] = new_gl1e;
+        sh_unmap_domain_page(snp);
+    }
+#endif /* OOS */
+
+    return result;
+}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Special validation function for re-syncing out-of-sync shadows. 
+ * Walks the *shadow* page, and for every entry that it finds,
+ * revalidates the guest entry that corresponds to it.
+ * N.B. This function is called with the vcpu that unsynced the page,
+ *      *not* the one that is causing it to be resynced. */
+void sh_resync_l1(struct vcpu *v, mfn_t gl1mfn, mfn_t snpmfn)
+{
+    mfn_t sl1mfn;
+    shadow_l1e_t *sl1p;
+    guest_l1e_t *gl1p, *gp, *snp;
+    int rc = 0;
+
+    ASSERT(mfn_valid(snpmfn));
+
+    sl1mfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
+
+    snp = sh_map_domain_page(snpmfn);
+    gp = sh_map_domain_page(gl1mfn);
+    gl1p = gp;
+
+   SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
+        guest_l1e_t gl1e = *gl1p;
+        guest_l1e_t *snpl1p = (guest_l1e_t *)snp + guest_index(gl1p);
+
+        if ( memcmp(snpl1p, &gl1e, sizeof(gl1e)) )
+        {
+            gfn_t gfn;
+            mfn_t gmfn;
+            p2m_type_t p2mt;
+            shadow_l1e_t nsl1e;
+
+            gfn = guest_l1e_get_gfn(gl1e);
+            gmfn = gfn_to_mfn(v->domain, gfn, &p2mt);
+            l1e_propagate_from_guest(v, gl1e, gmfn, &nsl1e, ft_prefetch, p2mt);
+            rc |= shadow_set_l1e(v, sl1p, nsl1e, sl1mfn);
+            
+            *snpl1p = gl1e;
+        }
+    });
+
+    sh_unmap_domain_page(gp);
+    sh_unmap_domain_page(snp);
+
+    /* Setting shadow L1 entries should never need us to flush the TLB */
+    ASSERT(!(rc & SHADOW_SET_FLUSH));
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table. 
+ * That is: if we can tell that it's only used once, and that the 
+ * toplevel shadow responsible is not one of ours. 
+ * N.B. This function is called with the vcpu that required the resync, 
+ *      *not* the one that originally unsynced the page, but it is
+ *      called in the *mode* of the vcpu that unsynced it.  Clear?  Good. */
+int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct shadow_page_info *sp;
+    mfn_t smfn;
+
+    smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
     
-    result |= shadow_set_l1e(v, sl1p, new_sl1e, sl1mfn);
-    return result;
-}
+    /* Up to l2 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (SHADOW_PAGING_LEVELS == 4) 
+    /* up to l3 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+    /* up to l4 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 
+         || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (GUEST_PAGING_LEVELS == 2)
+    /* In 2-on-3 shadow mode the up pointer contains the link to the
+     * shadow page, but the shadow_table contains only the first of the
+     * four pages that make up the PAE top shadow tables. */
+    smfn = _mfn(mfn_x(smfn) & ~0x3UL);
+#endif
+
+#endif
+
+    if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
+#if (SHADOW_PAGING_LEVELS == 3) 
+         || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn) 
+#endif
+        )
+        return 0;
+    
+    /* Only in use in one toplevel shadow, and it's not the one we're 
+     * running on */
+    return 1;
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
 
 /**************************************************************************/
@@ -2725,6 +2931,10 @@ static void sh_prefetch(struct vcpu *v, 
     shadow_l1e_t sl1e;
     u32 gflags;
     p2m_type_t p2mt;
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    guest_l1e_t *snpl1p = NULL;
+#endif /* OOS */
+
 
     /* Prefetch no further than the end of the _shadow_ l1 MFN */
     dist = (PAGE_SIZE - ((unsigned long)ptr_sl1e & ~PAGE_MASK)) / sizeof sl1e;
@@ -2737,6 +2947,17 @@ static void sh_prefetch(struct vcpu *v, 
         /* Normal guest page; grab the next guest entry */
         gl1p = sh_map_domain_page(gw->l1mfn);
         gl1p += guest_l1_table_offset(gw->va);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        if ( mfn_is_out_of_sync(gw->l1mfn) )
+        {
+            mfn_t snpmfn = oos_snapshot_lookup(v, gw->l1mfn);
+
+            ASSERT(mfn_valid(snpmfn));
+            snpl1p = sh_map_domain_page(snpmfn);
+            snpl1p += guest_l1_table_offset(gw->va);
+        }
+#endif /* OOS */
     }
 
     for ( i = 1; i < dist ; i++ ) 
@@ -2774,9 +2995,18 @@ static void sh_prefetch(struct vcpu *v, 
         /* Propagate the entry.  */
         l1e_propagate_from_guest(v, gl1e, gmfn, &sl1e, ft_prefetch, p2mt);
         (void) shadow_set_l1e(v, ptr_sl1e + i, sl1e, sl1mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        if ( snpl1p != NULL )
+            snpl1p[i] = gl1e;
+#endif /* OOS */
     }
     if ( gl1p != NULL )
         sh_unmap_domain_page(gl1p);
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( snpl1p != NULL )
+        sh_unmap_domain_page(snpl1p);
+#endif /* OOS */
 }
 
 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
@@ -2805,6 +3035,7 @@ static int sh_page_fault(struct vcpu *v,
     int r;
     fetch_type_t ft = 0;
     p2m_type_t p2mt;
+    uint32_t rc;
 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
     int fast_emul = 0;
 #endif
@@ -2830,6 +3061,17 @@ static int sh_page_fault(struct vcpu *v,
         {
             fast_emul = 1;
             gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+            /* Fall back to the slow path if we're trying to emulate
+               writes to an out of sync page. */
+            if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
+            {
+                v->arch.paging.last_write_emul_ok = 0;
+                goto page_fault_slow_path;
+            }
+#endif /* OOS */
+
             perfc_incr(shadow_fault_fast_emulate);
             goto early_emulation;
         }
@@ -2855,6 +3097,31 @@ static int sh_page_fault(struct vcpu *v,
                                       sizeof(sl1e)) == 0)
                     && sh_l1e_is_magic(sl1e)) )
         {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+             /* First, need to check that this isn't an out-of-sync
+              * shadow l1e.  If it is, we fall back to the slow path, which
+              * will sync it up again. */
+            {
+                shadow_l2e_t sl2e;
+                mfn_t gl1mfn;
+               if ( (__copy_from_user(&sl2e,
+                                       (sh_linear_l2_table(v)
+                                        + shadow_l2_linear_offset(va)),
+                                       sizeof(sl2e)) != 0)
+                     || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
+                     || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
+                                      shadow_l2e_get_mfn(sl2e))->backpointer))
+                     || unlikely(mfn_is_out_of_sync(gl1mfn)) )
+               {
+                   /* Hit the slow path as if there had been no 
+                    * shadow entry at all, and let it tidy up */
+                   ASSERT(regs->error_code & PFEC_page_present);
+                   regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
+                   goto page_fault_slow_path;
+               }
+            }
+#endif /* SHOPT_OUT_OF_SYNC */
+
             if ( sh_l1e_is_gnp(sl1e) )
             {
                 /* Not-present in a guest PT: pass to the guest as
@@ -2890,6 +3157,10 @@ static int sh_page_fault(struct vcpu *v,
             return EXCRET_fault_fixed;
         }
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+ page_fault_slow_path:
+#endif
 #endif /* SHOPT_FAST_FAULT_PATH */
 
     /* Detect if this page fault happened while we were already in Xen
@@ -2904,7 +3175,21 @@ static int sh_page_fault(struct vcpu *v,
         return 0;
     }
 
-    if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
+ rewalk:
+    rc = guest_walk_tables(v, va, &gw, regs->error_code);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( !(rc & _PAGE_PRESENT) )
+        regs->error_code |= PFEC_page_present;
+    else if ( regs->error_code & PFEC_page_present )
+    {
+            SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB"
+                         " flushing. Have fun debugging it.\n");
+            regs->error_code &= ~PFEC_page_present;
+    }
+#endif
+
+    if ( rc != 0 )
     {
         perfc_incr(shadow_fault_bail_real_fault);
         SHADOW_PRINTK("not a shadow fault\n");
@@ -2948,7 +3233,10 @@ static int sh_page_fault(struct vcpu *v,
 
     shadow_lock(d);
 
-    if ( gw_remove_write_accesses(v, va, &gw) )
+    rc = gw_remove_write_accesses(v, va, &gw);
+
+    /* First bit set: Removed write access to a page. */
+    if ( rc & GW_RMWR_FLUSHTLB )
     {
         /* Write permission removal is also a hint that other gwalks
          * overlapping with this one may be inconsistent
@@ -2958,11 +3246,20 @@ static int sh_page_fault(struct vcpu *v,
         flush_tlb_mask(d->domain_dirty_cpumask);
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Second bit set: Resynced a page. Re-walk needed. */
+    if ( rc & GW_RMWR_REWALK )
+    {
+        shadow_unlock(d);
+        goto rewalk;
+    }
+#endif /* OOS */
+
     if ( !shadow_check_gwalk(v, va, &gw) )
     {
         perfc_incr(shadow_inconsistent_gwalk);
         shadow_unlock(d);
-        return EXCRET_fault_fixed;
+        goto rewalk;
     }
 
     shadow_audit_tables(v);
@@ -2991,17 +3288,45 @@ static int sh_page_fault(struct vcpu *v,
         return 0;
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Always unsync when writing to L1 page tables. */
+    if ( sh_mfn_is_a_page_table(gmfn)
+         && ft == ft_demand_write )
+        sh_unsync(v, gmfn, va);
+#endif /* OOS */
+
     /* Calculate the shadow entry and write it */
     l1e_propagate_from_guest(v, gw.l1e, gmfn, &sl1e, ft, p2mt);
     r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_valid(gw.l1mfn) 
+         && mfn_is_out_of_sync(gw.l1mfn) )
+    {
+        /* Update the OOS snapshot. */
+        mfn_t snpmfn = oos_snapshot_lookup(v, gw.l1mfn);
+        guest_l1e_t *snp;
+        
+        ASSERT(mfn_valid(snpmfn));
+        
+        snp = sh_map_domain_page(snpmfn);
+        snp[guest_l1_table_offset(va)] = gw.l1e;
+        sh_unmap_domain_page(snp);
+    }
+#endif /* OOS */
+
 #if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
     /* Prefetch some more shadow entries */
     sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
 #endif
 
     /* Need to emulate accesses to page tables */
-    if ( sh_mfn_is_a_page_table(gmfn) )
+    if ( sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+         /* Unless they've been allowed to go out of sync with their shadows */
+         && !mfn_is_out_of_sync(gmfn)
+#endif
+         )
     {
         if ( ft == ft_demand_write )
         {
@@ -3215,6 +3540,7 @@ sh_invlpg(struct vcpu *v, unsigned long 
  * instruction should be issued on the hardware, or 0 if it's safe not
  * to do so. */
 {
+    mfn_t sl1mfn;
     shadow_l2e_t sl2e;
     
     perfc_incr(shadow_invlpg);
@@ -3278,12 +3604,64 @@ sh_invlpg(struct vcpu *v, unsigned long 
     // If so, then we'll need to flush the entire TLB (because that's
     // easier than invalidating all of the individual 4K pages).
     //
-    if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
+    sl1mfn = shadow_l2e_get_mfn(sl2e);
+    if ( mfn_to_shadow_page(sl1mfn)->type
          == SH_type_fl1_shadow )
     {
         flush_tlb_local();
         return 0;
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Check to see if the SL1 is out of sync. */
+    {
+        mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+        struct page_info *pg = mfn_to_page(gl1mfn);
+        if ( mfn_valid(gl1mfn) 
+             && page_is_out_of_sync(pg) )
+        {
+            /* The test above may give false positives, since we don't
+             * hold the shadow lock yet.  Check again with the lock held. */
+            shadow_lock(v->domain);
+
+            /* This must still be a copy-from-user because we didn't
+             * have the shadow lock last time we checked, and the
+             * higher-level shadows might have disappeared under our
+             * feet. */
+            if ( __copy_from_user(&sl2e, 
+                                  sh_linear_l2_table(v)
+                                  + shadow_l2_linear_offset(va),
+                                  sizeof (sl2e)) != 0 )
+            {
+                perfc_incr(shadow_invlpg_fault);
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
+            {
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            sl1mfn = shadow_l2e_get_mfn(sl2e);
+            gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+            pg = mfn_to_page(gl1mfn);
+            
+            if ( likely(sh_mfn_is_a_page_table(gl1mfn)
+                        && page_is_out_of_sync(pg) ) )
+            {
+                shadow_l1e_t *sl1;
+                sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
+                /* Remove the shadow entry that maps this VA */
+                (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
+            }
+            shadow_unlock(v->domain);
+            /* Need the invlpg to pick up the disappearance of the sl1e */
+            return 1;
+        }
+    }
+#endif
 
     return 1;
 }
@@ -3710,6 +4088,13 @@ sh_update_cr3(struct vcpu *v, int do_loc
         return;
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush.  Resync
+     * the current vcpu's OOS pages before switching to the new shadow
+     * tables so that the VA hint is still valid.  */
+    shadow_resync_current_vcpu(v, do_locking);
+#endif
+
     if ( do_locking ) shadow_lock(v->domain);
 
     ASSERT(shadow_locked_by_me(v->domain));
@@ -3938,11 +4323,70 @@ sh_update_cr3(struct vcpu *v, int do_loc
 
     /* Release the lock, if we took it (otherwise it's the caller's problem) */
     if ( do_locking ) shadow_unlock(v->domain);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush. We only
+     * update the shadows, leaving the pages out of sync. Also, we try
+     * to skip synchronization of shadows not mapped in the new
+     * tables. */
+    shadow_sync_other_vcpus(v, do_locking);
+#endif
+
 }
 
 
 /**************************************************************************/
 /* Functions to revoke guest rights */
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+int sh_rm_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn, 
+                                 mfn_t smfn, unsigned long off)
+{
+    int r;
+    shadow_l1e_t *sl1p, sl1e;
+    struct shadow_page_info *sp;
+
+    ASSERT(mfn_valid(gmfn));
+    ASSERT(mfn_valid(smfn));
+
+    sp = mfn_to_shadow_page(smfn);
+
+    if ( sp->mbz != 0 ||
+#if GUEST_PAGING_LEVELS == 4
+         (sp->type != SH_type_l1_64_shadow)
+#elif GUEST_PAGING_LEVELS == 3
+         (sp->type != SH_type_l1_pae_shadow)
+#elif GUEST_PAGING_LEVELS == 2
+         (sp->type != SH_type_l1_32_shadow)
+#endif
+       )
+        goto fail;
+
+    sl1p = sh_map_domain_page(smfn);
+    sl1p += off;
+    sl1e = *sl1p;
+    if ( ((shadow_l1e_get_flags(sl1e) & (_PAGE_PRESENT|_PAGE_RW))
+          != (_PAGE_PRESENT|_PAGE_RW))
+         || (mfn_x(shadow_l1e_get_mfn(sl1e)) != mfn_x(gmfn)) )
+    {
+        sh_unmap_domain_page(sl1p);
+        goto fail;
+    }
+
+    /* Found it!  Need to remove its write permissions. */
+    sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
+    r = shadow_set_l1e(v, sl1p, sl1e, smfn);
+    ASSERT( !(r & SHADOW_SET_ERROR) );
+
+    sh_unmap_domain_page(sl1p);
+    perfc_incr(shadow_writeable_h_7);
+    return 1;
+
+ fail:
+    perfc_incr(shadow_writeable_h_8);
+    return 0;
+}
+#endif /* OOS */
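
For illustration only: this function is the consumer side of the write-access fixup table. When shadow_set_l1e() (above) installs a writable l1e pointing at an OOS page it records the location via oos_fixup_add(); at resync time that record lets write access be revoked at the exact shadow slot instead of brute-forcing the hash table. A hypothetical resync-side replay -- the smfn and off field names are assumptions, only the gmfn field of struct oos_fixup is visible in this excerpt:

    /* Sketch only: replaying a recorded fixup entry at resync time. */
    if ( !sh_remove_write_access_from_sl1p(v, fixup->gmfn,
                                           fixup->smfn, fixup->off) )
    {
        /* Entry was stale: fall back to sh_remove_write_access(). */
    }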
 
 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
 static int sh_guess_wrmap(struct vcpu *v, unsigned long vaddr, mfn_t gmfn)
@@ -4437,23 +4881,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
 
 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
 
-#define AUDIT_FAIL(_level, _fmt, _a...) do {                               \
-    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"         \
-           "gl" #_level "mfn = %" PRI_mfn                              \
-           " sl" #_level "mfn = %" PRI_mfn                             \
-           " &gl" #_level "e = %p &sl" #_level "e = %p"                    \
-           " gl" #_level "e = %" SH_PRI_gpte                              \
-           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",        \
-           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                      \
-           _level, guest_index(gl ## _level ## e),                         \
-           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),         \
-           gl ## _level ## e, sl ## _level ## e,                           \
-           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
-           ##_a);                                                          \
-    BUG();                                                                 \
-    done = 1;                                                              \
+#define AUDIT_FAIL(_level, _fmt, _a...) do {                            \
+    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"       \
+           "gl" #_level "mfn = %" PRI_mfn                               \
+           " sl" #_level "mfn = %" PRI_mfn                              \
+           " &gl" #_level "e = %p &sl" #_level "e = %p"                 \
+           " gl" #_level "e = %" SH_PRI_gpte                            \
+           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",      \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                   \
+               _level, guest_index(gl ## _level ## e),                  \
+               mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),  \
+               gl ## _level ## e, sl ## _level ## e,                    \
+               gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level, \
+               ##_a);                                                   \
+        BUG();                                                          \
+        done = 1;                                                       \
 } while (0)
 
+#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do {                        \
+    printk("Shadow %u-on-%u audit failed at level %i\n"                 \
+           "gl" #_level "mfn = %" PRI_mfn                               \
+           " sl" #_level "mfn = %" PRI_mfn                              \
+           " Error: " _fmt "\n",                                        \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                   \
+           _level,                                                      \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),      \
+           ##_a);                                                       \
+    BUG();                                                              \
+    done = 1;                                                           \
+} while (0)
 
 static char * sh_audit_flags(struct vcpu *v, int level,
                               int gflags, int sflags) 
@@ -4494,6 +4950,16 @@ int sh_audit_l1_table(struct vcpu *v, mf
     
     /* Follow the backpointer */
     gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
+    if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
+    {
+        oos_audit_hash_is_present(v->domain, gl1mfn);
+        return 0;
+    }
+#endif
+
     gl1e = gp = sh_map_domain_page(gl1mfn);
     SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
 
@@ -4574,6 +5040,13 @@ int sh_audit_l2_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
+        AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
+#endif
+
     gl2e = gp = sh_map_domain_page(gl2mfn);
     SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
 
@@ -4616,6 +5089,13 @@ int sh_audit_l3_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
+        AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
+#endif
+
     gl3e = gp = sh_map_domain_page(gl3mfn);
     SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
 
@@ -4656,6 +5136,13 @@ int sh_audit_l4_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
+        AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
+#endif
+
     gl4e = gp = sh_map_domain_page(gl4mfn);
     SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain,
     {
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/multi.h
--- a/xen/arch/x86/mm/shadow/multi.h    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/multi.h    Wed Jul 02 11:30:37 2008 +0900
@@ -115,3 +115,17 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_
 
 extern struct paging_mode 
 SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+extern void 
+SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS)
+     (struct vcpu *v, mfn_t gmfn, mfn_t snpmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS)
+     (struct vcpu*v, mfn_t gmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_rm_write_access_from_sl1p, GUEST_LEVELS)
+     (struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
+#endif
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/private.h  Wed Jul 02 11:30:37 2008 +0900
@@ -63,8 +63,9 @@ extern int shadow_audit_enable;
 #define SHOPT_SKIP_VERIFY         0x20  /* Skip PTE v'fy when safe to do so */
 #define SHOPT_VIRTUAL_TLB         0x40  /* Cache guest v->p translations */
 #define SHOPT_FAST_EMULATION      0x80  /* Fast write emulation */
-
-#define SHADOW_OPTIMIZATIONS      0xff
+#define SHOPT_OUT_OF_SYNC        0x100  /* Allow guest writes to L1 PTs */
+
+#define SHADOW_OPTIMIZATIONS     0x1ff
 
 
 /******************************************************************************
@@ -195,9 +196,9 @@ struct shadow_page_info
         u32 tlbflush_timestamp;
     };
     struct {
-        unsigned int type:4;      /* What kind of shadow is this? */
+        unsigned int type:5;      /* What kind of shadow is this? */
         unsigned int pinned:1;    /* Is the shadow pinned? */
-        unsigned int count:27;    /* Reference count */
+        unsigned int count:26;    /* Reference count */
         u32 mbz;                  /* Must be zero: this is where the owner 
                                    * field lives in a non-shadow page */
     } __attribute__((packed));
@@ -242,7 +243,8 @@ static inline void shadow_check_page_str
 #define SH_type_max_shadow    (13U)
 #define SH_type_p2m_table     (14U) /* in use as the p2m table */
 #define SH_type_monitor_table (15U) /* in use as a monitor table */
-#define SH_type_unused        (16U)
+#define SH_type_oos_snapshot  (16U) /* in use as OOS snapshot */
+#define SH_type_unused        (17U)
 
 /* 
  * What counts as a pinnable shadow?
@@ -301,6 +303,72 @@ static inline int sh_type_is_pinnable(st
 #define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE)
 #define SHF_64  (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64)
 
+#define SHF_L1_ANY  (SHF_L1_32|SHF_L1_PAE|SHF_L1_64)
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+/* Marks a guest L1 page table which is shadowed but not write-protected.
+ * If set, then *only* L1 shadows (SHF_L1_*) are allowed. 
+ *
+ * out_of_sync indicates that the shadow tables may not reflect the
+ * guest tables.  If it is clear, then the shadow tables *must* reflect
+ * the guest tables.
+ *
+ * oos_may_write indicates that a page may have writable mappings.
+ *
+ * Most of the time the flags are synonymous.  There is a short period of time 
+ * during resync that oos_may_write is clear but out_of_sync is not.  If a 
+ * codepath is called during that time and is sensitive to oos issues, it may 
+ * need to use the second flag.
+ */
+#define SHF_out_of_sync (1u<<30)
+#define SHF_oos_may_write (1u<<29)
+
+/* Fixup tables are a non-complete writable-mappings reverse map for
+   OOS pages. This lets us quickly resync pages (avoiding brute-force
+   search of the shadows) when the va hint is not sufficient (i.e.,
+   the pagetable is mapped in multiple places and in multiple
+   shadows.) */
+#define SHADOW_OOS_FT_ENTRIES                           \
+    ((PAGE_SIZE << SHADOW_OOS_FT_ORDER)                 \
+     / (SHADOW_OOS_FT_HASH * sizeof(struct oos_fixup)))
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
+static inline int sh_page_has_multiple_shadows(struct page_info *pg)
+{
+    u32 shadows;
+    if ( !(pg->count_info & PGC_page_table) )
+        return 0;
+    shadows = pg->shadow_flags & SHF_page_type_mask;
+    /* More than one type bit set in shadow-flags? */
+    return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 );
+}
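
For illustration: the test above clears the lowest set type bit and checks whether anything remains. A worked example, using two of the SHF_* type bits defined above:

    /* Illustration only. */
    u32 shadows = SHF_L1_PAE | SHF_L2_PAE;                   /* two type bits set */
    u32 rest = shadows & ~(1UL << find_first_set_bit(shadows));
    /* The remaining type bit keeps rest nonzero, so the page has multiple
     * shadows; with a single type bit set, rest would be 0. */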
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) 
+/* The caller must verify this is reasonable to call; i.e., valid mfn,
+ * domain is translated, &c */
+static inline int page_is_out_of_sync(struct page_info *p) 
+{
+    return (p->count_info & PGC_page_table)
+        && (p->shadow_flags & SHF_out_of_sync);
+}
+
+static inline int mfn_is_out_of_sync(mfn_t gmfn) 
+{
+    return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn)));
+}
+
+static inline int page_oos_may_write(struct page_info *p) 
+{
+    return (p->count_info & PGC_page_table)
+        && (p->shadow_flags & SHF_oos_may_write);
+}
+
+static inline int mfn_oos_may_write(mfn_t gmfn) 
+{
+    return page_oos_may_write(mfn_to_page(mfn_x(gmfn)));
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
 /******************************************************************************
  * Various function declarations 
@@ -351,7 +419,57 @@ int shadow_cmpxchg_guest_entry(struct vc
 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
                                intpte_t *old, intpte_t new, mfn_t gmfn);
 
-
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va);
+
+/* Pull an out-of-sync page back into sync. */
+void sh_resync(struct vcpu *v, mfn_t gmfn);
+
+void oos_fixup_add(struct vcpu *v, mfn_t gmfn, mfn_t smfn, unsigned long off);
+
+int sh_remove_write_access_from_sl1p(struct vcpu *v, mfn_t gmfn,
+                                     mfn_t smfn, unsigned long offset);
+
+/* Pull all out-of-sync shadows back into sync.  If skip != 0, we try
+ * to avoid resyncing where we think we can get away with it. */
+
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking);
+
+static inline void
+shadow_resync_all(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  0 /* skip */,
+                  1 /* this */,
+                  1 /* others */,
+                  do_locking);
+}
+
+static inline void
+shadow_resync_current_vcpu(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  0 /* skip */,
+                  1 /* this */, 
+                  0 /* others */,
+                  do_locking);
+}
+
+static inline void
+shadow_sync_other_vcpus(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  1 /* skip */, 
+                  0 /* this */,
+                  1 /* others */,
+                  do_locking);
+}
+
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn);
+mfn_t oos_snapshot_lookup(struct vcpu *v, mfn_t gmfn);
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
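
For illustration: the three inline wrappers above select which vcpus get resynced via the skip/this/others arguments; two of them bracket the table switch in sh_update_cr3() later in this patch:

    /* Mirrors the call sites added to sh_update_cr3() below. */
    shadow_resync_current_vcpu(v, do_locking);  /* this vcpu only, before switch */
    /* ... switch v->arch.shadow_table ... */
    shadow_sync_other_vcpus(v, do_locking);     /* others only, skipping where safe */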
 
 /******************************************************************************
  * Flags used in the return value of the shadow_set_lXe() functions...
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/mm/shadow/types.h    Wed Jul 02 11:30:37 2008 +0900
@@ -438,6 +438,11 @@ struct shadow_walk_t
 #define sh_guess_wrmap             INTERNAL_NAME(sh_guess_wrmap)
 #define sh_clear_shadow_entry      INTERNAL_NAME(sh_clear_shadow_entry)
 
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+#define sh_resync_l1               INTERNAL_NAME(sh_resync_l1)
+#define sh_safe_not_to_sync        INTERNAL_NAME(sh_safe_not_to_sync)
+#define sh_rm_write_access_from_sl1p INTERNAL_NAME(sh_rm_write_access_from_sl1p)
+#endif
 
 /* The sh_guest_(map|get)_* functions depends on Xen's paging levels */
 #define sh_guest_map_l1e \
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/platform_hypercall.c Wed Jul 02 11:30:37 2008 +0900
@@ -408,7 +408,12 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
                 cpu_count++;
             }
             if ( cpu_count == num_online_cpus() )
-                ret = acpi_cpufreq_init();
+            {
+                if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
+                    ret = powernow_cpufreq_init();
+                else
+                    ret = acpi_cpufreq_init();
+            }
             break;
         }
  
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/x86_emulate/x86_emulate.c
--- a/xen/arch/x86/x86_emulate/x86_emulate.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c    Wed Jul 02 11:30:37 2008 +0900
@@ -142,12 +142,14 @@ static uint8_t opcode_table[256] = {
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0xD0 - 0xD7 */
-    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 
-    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM, 
+    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
+    ByteOp|DstMem|SrcImplicit|ModRM, DstMem|SrcImplicit|ModRM,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0xD8 - 0xDF */
-    0, ImplicitOps|ModRM|Mov, 0, ImplicitOps|ModRM|Mov,
-    0, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
+    ImplicitOps|ModRM|Mov, ImplicitOps|ModRM|Mov,
     /* 0xE0 - 0xE7 */
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
     ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
@@ -216,7 +218,7 @@ static uint8_t twobyte_table[256] = {
     ByteOp|DstMem|SrcNone|ModRM|Mov, ByteOp|DstMem|SrcNone|ModRM|Mov,
     /* 0xA0 - 0xA7 */
     ImplicitOps, ImplicitOps, ImplicitOps, DstBitBase|SrcReg|ModRM,
-    DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0, 
+    DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, 0,
     /* 0xA8 - 0xAF */
     ImplicitOps, ImplicitOps, 0, DstBitBase|SrcReg|ModRM,
     DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM, 0, DstReg|SrcMem|ModRM,
@@ -246,8 +248,20 @@ static uint8_t twobyte_table[256] = {
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
     enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
-    unsigned int  bytes;
-    unsigned long val, orig_val;
+    unsigned int bytes;
+
+    /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. */
+    union {
+        unsigned long val;
+        uint32_t bigval[4];
+    };
+
+    /* Up to 128-byte operand value, addressable as ulong or uint32_t[]. */
+    union {
+        unsigned long orig_val;
+        uint32_t orig_bigval[4];
+    };
+
     union {
         /* OP_REG: Pointer to register field. */
         unsigned long *reg;
@@ -466,7 +480,7 @@ do{ asm volatile (                      
 
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch_bytes(_size)                                         \
-({ unsigned long _x, _eip = _regs.eip;                                  \
+({ unsigned long _x = 0, _eip = _regs.eip;                              \
    if ( !mode_64bit() ) _eip = (uint32_t)_eip; /* ignore upper dword */ \
    _regs.eip += (_size); /* real hardware doesn't truncate */           \
    generate_exception_if((uint8_t)(_regs.eip - ctxt->regs->eip) > 15,   \
@@ -594,6 +608,18 @@ do{ struct fpu_insn_ctxt fic;           
     put_fpu(&fic);                                      \
 } while (0)
 
+#define emulate_fpu_insn_memsrc(_op, _arg)              \
+do{ struct fpu_insn_ctxt fic;                           \
+    get_fpu(X86EMUL_FPU_fpu, &fic);                     \
+    asm volatile (                                      \
+        "movb $2f-1f,%0 \n"                             \
+        "1: " _op " %1  \n"                             \
+        "2:             \n"                             \
+        : "=m" (fic.insn_bytes)                         \
+        : "m" (_arg) : "memory" );                      \
+    put_fpu(&fic);                                      \
+} while (0)
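
For illustration (not in this excerpt): the new helper emits one FPU instruction whose source operand is memory fetched by the emulator. A hypothetical use -- the actual call sites are elsewhere in the changeset, and the "flds" mnemonic here is only an assumed example of an instruction taking a single memory operand:

    /* Hypothetical usage: execute an FPU load from the emulated source value. */
    emulate_fpu_insn_memsrc("flds", src.val);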
+
 #define emulate_fpu_insn_stub(_bytes...)                                \
 do{ uint8_t stub[] = { _bytes, 0xc3 };                                  \
     struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };        \
@@ -654,6 +680,19 @@ static void __put_rep_prefix(
     if ( rep_prefix )                                                   \
         __put_rep_prefix(&_regs, ctxt->regs, ad_bytes, reps_completed); \
 })
+
+/* Compatibility function: read guest memory, zero-extend result to a ulong. */
+static int read_ulong(
+        enum x86_segment seg,
+        unsigned long offset,
+        unsigned long *val,
+        unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt,
+        struct x86_emulate_ops *ops)
+{
+    *val = 0;
+    return ops->read(seg, offset, val, bytes, ctxt);
+}
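
For illustration: the helper exists because ops->read() now fills only `bytes` bytes of the destination buffer (operands can be wider than a long after the union change above), so the upper bytes of an unsigned long destination must be cleared explicitly. A call modelled on the far-pointer loads converted below:

    /* Read a 16-bit selector; the upper bytes of 'sel' are guaranteed zero
     * regardless of how many bytes ops->read() wrote. */
    unsigned long sel;
    rc = read_ulong(src.mem.seg, src.mem.off + src.bytes, &sel, 2, ctxt, ops);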
 
 /*
  * Unsigned multiplication with double-word result.
@@ -841,7 +880,8 @@ static int ioport_access_check(
          (tr.limit < 0x67) )
         goto raise_exception;
 
-    if ( (rc = ops->read(x86_seg_none, tr.base + 0x66, &iobmp, 2, ctxt)) )
+    if ( (rc = read_ulong(x86_seg_none, tr.base + 0x66,
+                          &iobmp, 2, ctxt, ops)) )
         return rc;
 
     /* Ensure TSS includes two bytes including byte containing first port. */
@@ -849,7 +889,8 @@ static int ioport_access_check(
     if ( tr.limit <= iobmp )
         goto raise_exception;
 
-    if ( (rc = ops->read(x86_seg_none, tr.base + iobmp, &iobmp, 2, ctxt)) )
+    if ( (rc = read_ulong(x86_seg_none, tr.base + iobmp,
+                          &iobmp, 2, ctxt, ops)) )
         return rc;
     if ( (iobmp & (((1<<bytes)-1) << (first_port&7))) != 0 )
         goto raise_exception;
@@ -941,12 +982,12 @@ protmode_load_seg(
         goto raise_exn;
 
     do {
-        if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8),
-                             &val, 4, ctxt)) )
+        if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8),
+                              &val, 4, ctxt, ops)) )
             return rc;
         desc.a = val;
-        if ( (rc = ops->read(x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
-                             &val, 4, ctxt)) )
+        if ( (rc = read_ulong(x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
+                              &val, 4, ctxt, ops)) )
             return rc;
         desc.b = val;
 
@@ -992,14 +1033,15 @@ protmode_load_seg(
             if ( (desc.b & (5u<<9)) == (4u<<9) )
                 goto raise_exn;
             /* Non-conforming segment: check DPL against RPL and CPL. */
-            if ( ((desc.b & (6u<<9)) != (6u<<9)) && ((dpl < cpl) || (dpl < rpl)) )
+            if ( ((desc.b & (6u<<9)) != (6u<<9)) &&
+                 ((dpl < cpl) || (dpl < rpl)) )
                 goto raise_exn;
             break;
         }
 
         /* Ensure Accessed flag is set. */
         new_desc_b = desc.b | 0x100;
-        rc = ((desc.b & 0x100) ? X86EMUL_OKAY : 
+        rc = ((desc.b & 0x100) ? X86EMUL_OKAY :
               ops->cmpxchg(
                   x86_seg_none, desctab.base + (sel & 0xfff8) + 4,
                   &desc.b, &new_desc_b, 4, ctxt));
@@ -1061,16 +1103,16 @@ decode_register(
     case  2: p = &regs->edx; break;
     case  3: p = &regs->ebx; break;
     case  4: p = (highbyte_regs ?
-                  ((unsigned char *)&regs->eax + 1) : 
+                  ((unsigned char *)&regs->eax + 1) :
                   (unsigned char *)&regs->esp); break;
     case  5: p = (highbyte_regs ?
-                  ((unsigned char *)&regs->ecx + 1) : 
+                  ((unsigned char *)&regs->ecx + 1) :
                   (unsigned char *)&regs->ebp); break;
     case  6: p = (highbyte_regs ?
-                  ((unsigned char *)&regs->edx + 1) : 
+                  ((unsigned char *)&regs->edx + 1) :
                   (unsigned char *)&regs->esi); break;
     case  7: p = (highbyte_regs ?
-                  ((unsigned char *)&regs->ebx + 1) : 
+                  ((unsigned char *)&regs->ebx + 1) :
                   (unsigned char *)&regs->edi); break;
 #if defined(__x86_64__)
     case  8: p = &regs->r8;  break;
@@ -1402,8 +1444,8 @@ x86_emulate(
             case 8: src.val = *(uint64_t *)src.reg; break;
             }
         }
-        else if ( (rc = ops->read(src.mem.seg, src.mem.off,
-                                  &src.val, src.bytes, ctxt)) )
+        else if ( (rc = read_ulong(src.mem.seg, src.mem.off,
+                                   &src.val, src.bytes, ctxt, ops)) )
             goto done;
         break;
     case SrcImm:
@@ -1494,8 +1536,8 @@ x86_emulate(
         }
         else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
         {
-            if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
-                                 &dst.val, dst.bytes, ctxt)) )
+            if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
+                                  &dst.val, dst.bytes, ctxt, ops)) )
                 goto done;
             dst.orig_val = dst.val;
         }
@@ -1571,8 +1613,8 @@ x86_emulate(
         int lb, ub, idx;
         generate_exception_if(mode_64bit() || (src.type != OP_MEM),
                               EXC_UD, -1);
-        if ( (rc = ops->read(src.mem.seg, src.mem.off + op_bytes,
-                             &src_val2, op_bytes, ctxt)) )
+        if ( (rc = read_ulong(src.mem.seg, src.mem.off + op_bytes,
+                              &src_val2, op_bytes, ctxt, ops)) )
             goto done;
         ub  = (op_bytes == 2) ? (int16_t)src_val2 : (int32_t)src_val2;
         lb  = (op_bytes == 2) ? (int16_t)src.val  : (int32_t)src.val;
@@ -1588,8 +1630,8 @@ x86_emulate(
             /* movsxd */
             if ( src.type == OP_REG )
                 src.val = *(int32_t *)src.reg;
-            else if ( (rc = ops->read(src.mem.seg, src.mem.off,
-                                      &src.val, 4, ctxt)) )
+            else if ( (rc = read_ulong(src.mem.seg, src.mem.off,
+                                       &src.val, 4, ctxt, ops)) )
                 goto done;
             dst.val = (int32_t)src.val;
         }
@@ -1613,8 +1655,8 @@ x86_emulate(
         unsigned long src1; /* ModR/M source operand */
         if ( ea.type == OP_REG )
             src1 = *ea.reg;
-        else if ( (rc = ops->read(ea.mem.seg, ea.mem.off,
-                                  &src1, op_bytes, ctxt)) )
+        else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off,
+                                   &src1, op_bytes, ctxt, ops)) )
             goto done;
         _regs.eflags &= ~(EFLG_OF|EFLG_CF);
         switch ( dst.bytes )
@@ -1720,8 +1762,8 @@ x86_emulate(
         /* 64-bit mode: POP defaults to a 64-bit operand. */
         if ( mode_64bit() && (dst.bytes == 4) )
             dst.bytes = 8;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
-                             &dst.val, dst.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
+                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
             goto done;
         break;
 
@@ -1773,8 +1815,8 @@ x86_emulate(
         dst.val = x86_seg_es;
     les: /* dst.val identifies the segment */
         generate_exception_if(src.type != OP_MEM, EXC_UD, -1);
-        if ( (rc = ops->read(src.mem.seg, src.mem.off + src.bytes,
-                             &sel, 2, ctxt)) != 0 )
+        if ( (rc = read_ulong(src.mem.seg, src.mem.off + src.bytes,
+                              &sel, 2, ctxt, ops)) != 0 )
             goto done;
         if ( (rc = load_seg(dst.val, (uint16_t)sel, ctxt, ops)) != 0 )
             goto done;
@@ -2020,8 +2062,8 @@ x86_emulate(
                 dst.bytes = op_bytes = 8;
                 if ( dst.type == OP_REG )
                     dst.val = *dst.reg;
-                else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
-                                          &dst.val, 8, ctxt)) != 0 )
+                else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
+                                           &dst.val, 8, ctxt, ops)) != 0 )
                     goto done;
             }
             src.val = _regs.eip;
@@ -2036,8 +2078,8 @@ x86_emulate(
 
             generate_exception_if(dst.type != OP_MEM, EXC_UD, -1);
 
-            if ( (rc = ops->read(dst.mem.seg, dst.mem.off+dst.bytes,
-                                 &sel, 2, ctxt)) )
+            if ( (rc = read_ulong(dst.mem.seg, dst.mem.off+dst.bytes,
+                                  &sel, 2, ctxt, ops)) )
                 goto done;
 
             if ( (modrm_reg & 7) == 3 ) /* call */
@@ -2046,9 +2088,9 @@ x86_emulate(
                 fail_if(ops->read_segment == NULL);
                 if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
                      (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                                      reg.sel, op_bytes, ctxt)) ||
+                                      &reg.sel, op_bytes, ctxt)) ||
                      (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                                      _regs.eip, op_bytes, ctxt)) )
+                                      &_regs.eip, op_bytes, ctxt)) )
                     goto done;
             }
 
@@ -2066,12 +2108,12 @@ x86_emulate(
                 dst.bytes = 8;
                 if ( dst.type == OP_REG )
                     dst.val = *dst.reg;
-                else if ( (rc = ops->read(dst.mem.seg, dst.mem.off,
-                                          &dst.val, 8, ctxt)) != 0 )
+                else if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
+                                           &dst.val, 8, ctxt, ops)) != 0 )
                     goto done;
             }
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
-                                  dst.val, dst.bytes, ctxt)) != 0 )
+                                  &dst.val, dst.bytes, ctxt)) != 0 )
                 goto done;
             dst.type = OP_NONE;
             break;
@@ -2106,7 +2148,7 @@ x86_emulate(
                 &dst.val, dst.bytes, ctxt);
         else
             rc = ops->write(
-                dst.mem.seg, dst.mem.off, dst.val, dst.bytes, ctxt);
+                dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
         if ( rc != 0 )
             goto done;
     default:
@@ -2153,7 +2195,7 @@ x86_emulate(
         if ( mode_64bit() && (op_bytes == 4) )
             op_bytes = 8;
         if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                              reg.sel, op_bytes, ctxt)) != 0 )
+                              &reg.sel, op_bytes, ctxt)) != 0 )
             goto done;
         break;
     }
@@ -2165,8 +2207,8 @@ x86_emulate(
         /* 64-bit mode: POP defaults to a 64-bit operand. */
         if ( mode_64bit() && (op_bytes == 4) )
             op_bytes = 8;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &dst.val, op_bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &dst.val, op_bytes, ctxt, ops)) != 0 )
             goto done;
         if ( (rc = load_seg(src.val, (uint16_t)dst.val, ctxt, ops)) != 0 )
             return rc;
@@ -2275,8 +2317,8 @@ x86_emulate(
         dst.bytes = op_bytes;
         if ( mode_64bit() && (dst.bytes == 4) )
             dst.bytes = 8;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
-                             &dst.val, dst.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
+                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
             goto done;
         break;
 
@@ -2288,7 +2330,7 @@ x86_emulate(
         generate_exception_if(mode_64bit(), EXC_UD, -1);
         for ( i = 0; i < 8; i++ )
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                                  regs[i], op_bytes, ctxt)) != 0 )
+                                  &regs[i], op_bytes, ctxt)) != 0 )
             goto done;
         break;
     }
@@ -2303,8 +2345,8 @@ x86_emulate(
         generate_exception_if(mode_64bit(), EXC_UD, -1);
         for ( i = 0; i < 8; i++ )
         {
-            if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                                 &dst.val, op_bytes, ctxt)) != 0 )
+            if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                                  &dst.val, op_bytes, ctxt, ops)) != 0 )
                 goto done;
             switch ( op_bytes )
             {
@@ -2382,8 +2424,8 @@ x86_emulate(
         }
         else
         {
-            if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
-                                 &dst.val, dst.bytes, ctxt)) != 0 )
+            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
                 goto done;
             fail_if(ops->write_io == NULL);
             if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
@@ -2455,9 +2497,9 @@ x86_emulate(
 
         if ( (rc = ops->read_segment(x86_seg_cs, &reg, ctxt)) ||
              (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                              reg.sel, op_bytes, ctxt)) ||
+                              &reg.sel, op_bytes, ctxt)) ||
              (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
-                              _regs.eip, op_bytes, ctxt)) )
+                              &_regs.eip, op_bytes, ctxt)) )
             goto done;
 
         if ( (rc = load_seg(x86_seg_cs, sel, ctxt, ops)) != 0 )
@@ -2483,8 +2525,8 @@ x86_emulate(
         /* 64-bit mode: POP defaults to a 64-bit operand. */
         if ( mode_64bit() && (op_bytes == 4) )
             op_bytes = 8;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &dst.val, op_bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &dst.val, op_bytes, ctxt, ops)) != 0 )
             goto done;
         if ( op_bytes == 2 )
             dst.val = (uint16_t)dst.val | (_regs.eflags & 0xffff0000u);
@@ -2507,8 +2549,8 @@ x86_emulate(
         dst.type  = OP_REG;
         dst.reg   = (unsigned long *)&_regs.eax;
         dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-        if ( (rc = ops->read(ea.mem.seg, insn_fetch_bytes(ad_bytes),
-                             &dst.val, dst.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(ea.mem.seg, insn_fetch_bytes(ad_bytes),
+                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
             goto done;
         break;
 
@@ -2536,8 +2578,8 @@ x86_emulate(
         }
         else
         {
-            if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
-                                 &dst.val, dst.bytes, ctxt)) != 0 )
+            if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+                                  &dst.val, dst.bytes, ctxt, ops)) != 0 )
                 goto done;
             dst.type = OP_MEM;
             nr_reps = 1;
@@ -2556,10 +2598,10 @@ x86_emulate(
         unsigned long next_eip = _regs.eip;
         get_rep_prefix();
         src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes;
-        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
-                             &dst.val, dst.bytes, ctxt)) ||
-             (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi),
-                             &src.val, src.bytes, ctxt)) )
+        if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+                              &dst.val, dst.bytes, ctxt, ops)) ||
+             (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi),
+                              &src.val, src.bytes, ctxt, ops)) )
             goto done;
         register_address_increment(
             _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -2592,8 +2634,8 @@ x86_emulate(
         dst.type  = OP_REG;
         dst.bytes = (d & ByteOp) ? 1 : op_bytes;
         dst.reg   = (unsigned long *)&_regs.eax;
-        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.esi),
-                             &dst.val, dst.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
+                              &dst.val, dst.bytes, ctxt, ops)) != 0 )
             goto done;
         register_address_increment(
             _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes);
@@ -2606,8 +2648,8 @@ x86_emulate(
         get_rep_prefix();
         src.bytes = dst.bytes = (d & ByteOp) ? 1 : op_bytes;
         dst.val = _regs.eax;
-        if ( (rc = ops->read(x86_seg_es, truncate_ea(_regs.edi),
-                             &src.val, src.bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_es, truncate_ea(_regs.edi),
+                              &src.val, src.bytes, ctxt, ops)) != 0 )
             goto done;
         register_address_increment(
             _regs.edi, (_regs.eflags & EFLG_DF) ? -src.bytes : src.bytes);
@@ -2624,8 +2666,8 @@ x86_emulate(
     case 0xc3: /* ret (near) */ {
         int offset = (b == 0xc2) ? insn_fetch_type(uint16_t) : 0;
         op_bytes = mode_64bit() ? 8 : op_bytes;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
-                             &dst.val, op_bytes, ctxt)) != 0 )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset),
+                              &dst.val, op_bytes, ctxt, ops)) != 0 )
             goto done;
         _regs.eip = dst.val;
         break;
@@ -2640,7 +2682,7 @@ x86_emulate(
         dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
         dst.reg = (unsigned long *)&_regs.ebp;
         if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
-                              _regs.ebp, dst.bytes, ctxt)) )
+                              &_regs.ebp, dst.bytes, ctxt)) )
             goto done;
         dst.val = _regs.esp;
 
@@ -2650,14 +2692,14 @@ x86_emulate(
             {
                 unsigned long ebp, temp_data;
                 ebp = truncate_word(_regs.ebp - i*dst.bytes, ctxt->sp_size/8);
-                if ( (rc = ops->read(x86_seg_ss, ebp,
-                                     &temp_data, dst.bytes, ctxt)) ||
+                if ( (rc = read_ulong(x86_seg_ss, ebp,
+                                      &temp_data, dst.bytes, ctxt, ops)) ||
                      (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
-                                      temp_data, dst.bytes, ctxt)) )
+                                      &temp_data, dst.bytes, ctxt)) )
                     goto done;
             }
             if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
-                                  dst.val, dst.bytes, ctxt)) )
+                                  &dst.val, dst.bytes, ctxt)) )
                 goto done;
         }
 
@@ -2683,8 +2725,8 @@ x86_emulate(
 
         /* Second writeback, to %%ebp. */
         dst.reg = (unsigned long *)&_regs.ebp;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(dst.bytes),
-                             &dst.val, dst.bytes, ctxt)) )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(dst.bytes),
+                              &dst.val, dst.bytes, ctxt, ops)) )
             goto done;
         break;
 
@@ -2692,10 +2734,10 @@ x86_emulate(
     case 0xcb: /* ret (far) */ {
         int offset = (b == 0xca) ? insn_fetch_type(uint16_t) : 0;
         op_bytes = mode_64bit() ? 8 : op_bytes;
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &dst.val, op_bytes, ctxt)) || 
-             (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes + offset),
-                             &src.val, op_bytes, ctxt)) ||
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &dst.val, op_bytes, ctxt, ops)) ||
+             (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes + offset),
+                              &src.val, op_bytes, ctxt, ops)) ||
              (rc = load_seg(x86_seg_cs, (uint16_t)src.val, ctxt, ops)) )
             goto done;
         _regs.eip = dst.val;
@@ -2729,12 +2771,12 @@ x86_emulate(
         if ( !mode_iopl() )
             mask |= EFLG_IF;
         fail_if(!in_realmode(ctxt, ops));
-        if ( (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &eip, op_bytes, ctxt)) ||
-             (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &cs, op_bytes, ctxt)) ||
-             (rc = ops->read(x86_seg_ss, sp_post_inc(op_bytes),
-                             &eflags, op_bytes, ctxt)) )
+        if ( (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &eip, op_bytes, ctxt, ops)) ||
+             (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &cs, op_bytes, ctxt, ops)) ||
+             (rc = read_ulong(x86_seg_ss, sp_post_inc(op_bytes),
+                              &eflags, op_bytes, ctxt, ops)) )
             goto done;
         if ( op_bytes == 2 )
             eflags = (uint16_t)eflags | (_regs.eflags & 0xffff0000u);
@@ -2779,12 +2821,64 @@ x86_emulate(
 
     case 0xd7: /* xlat */ {
         unsigned long al = (uint8_t)_regs.eax;
-        if ( (rc = ops->read(ea.mem.seg, truncate_ea(_regs.ebx + al),
-                             &al, 1, ctxt)) != 0 )
+        if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.ebx + al),
+                              &al, 1, ctxt, ops)) != 0 )
             goto done;
         *(uint8_t *)&_regs.eax = al;
         break;
     }
+
+    case 0xd8: /* FPU 0xd8 */
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* fadd %stN,%stN */
+        case 0xc8 ... 0xcf: /* fmul %stN,%stN */
+        case 0xd0 ... 0xd7: /* fcom %stN,%stN */
+        case 0xd8 ... 0xdf: /* fcomp %stN,%stN */
+        case 0xe0 ... 0xe7: /* fsub %stN,%stN */
+        case 0xe8 ... 0xef: /* fsubr %stN,%stN */
+        case 0xf0 ... 0xf7: /* fdiv %stN,%stN */
+        case 0xf8 ... 0xff: /* fdivr %stN,%stN */
+            emulate_fpu_insn_stub(0xd8, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            ea.bytes = 4;
+            src = ea;
+            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                 src.bytes, ctxt)) != 0 )
+                goto done;
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fadd */
+                emulate_fpu_insn_memsrc("fadds", src.val);
+                break;
+            case 1: /* fmul */
+                emulate_fpu_insn_memsrc("fmuls", src.val);
+                break;
+            case 2: /* fcom */
+                emulate_fpu_insn_memsrc("fcoms", src.val);
+                break;
+            case 3: /* fcomp */
+                emulate_fpu_insn_memsrc("fcomps", src.val);
+                break;
+            case 4: /* fsub */
+                emulate_fpu_insn_memsrc("fsubs", src.val);
+                break;
+            case 5: /* fsubr */
+                emulate_fpu_insn_memsrc("fsubrs", src.val);
+                break;
+            case 6: /* fdiv */
+                emulate_fpu_insn_memsrc("fdivs", src.val);
+                break;
+            case 7: /* fdivr */
+                emulate_fpu_insn_memsrc("fdivrs", src.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
+        break;
 
     case 0xd9: /* FPU 0xd9 */
         switch ( modrm )
@@ -2822,28 +2916,269 @@ x86_emulate(
             emulate_fpu_insn_stub(0xd9, modrm);
             break;
         default:
-            fail_if((modrm_reg & 7) != 7);
             fail_if(modrm >= 0xc0);
-            /* fnstcw m2byte */
-            ea.bytes = 2;
-            dst = ea;
-            emulate_fpu_insn_memdst("fnstcw", dst.val);
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fld m32fp */
+                ea.bytes = 4;
+                src = ea;
+                if ( (rc = ops->read(ea.mem.seg, ea.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("flds", src.val);
+                break;
+            case 2: /* fst m32fp */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fsts", dst.val);
+                break;
+            case 3: /* fstp m32fp */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fstps", dst.val);
+                break;
+                /* case 4: fldenv - TODO */
+            case 5: /* fldcw m2byte */
+                ea.bytes = 2;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fldcw", src.val);
+                break;
+                /* case 6: fstenv - TODO */
+            case 7: /* fnstcw m2byte */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fnstcw", dst.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
+        break;
+
+    case 0xda: /* FPU 0xda */
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* fcmovb %stN */
+        case 0xc8 ... 0xcf: /* fcmove %stN */
+        case 0xd0 ... 0xd7: /* fcmovbe %stN */
+        case 0xd8 ... 0xdf: /* fcmovu %stN */
+        case 0xe9:          /* fucompp */
+            emulate_fpu_insn_stub(0xda, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            ea.bytes = 4;
+            src = ea;
+            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                 src.bytes, ctxt)) != 0 )
+                goto done;
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fiadd m32i */
+                emulate_fpu_insn_memsrc("fiaddl", src.val);
+                break;
+            case 1: /* fimul m32i */
+                emulate_fpu_insn_memsrc("fimull", src.val);
+                break;
+            case 2: /* ficom m32i */
+                emulate_fpu_insn_memsrc("ficoml", src.val);
+                break;
+            case 3: /* ficomp m32i */
+                emulate_fpu_insn_memsrc("ficompl", src.val);
+                break;
+            case 4: /* fisub m32i */
+                emulate_fpu_insn_memsrc("fisubl", src.val);
+                break;
+            case 5: /* fisubr m32i */
+                emulate_fpu_insn_memsrc("fisubrl", src.val);
+                break;
+            case 6: /* fidiv m32i */
+                emulate_fpu_insn_memsrc("fidivl", src.val);
+                break;
+            case 7: /* fidivr m32i */
+                emulate_fpu_insn_memsrc("fidivrl", src.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
         }
         break;
 
     case 0xdb: /* FPU 0xdb */
-        fail_if(modrm != 0xe3);
-        /* fninit */
-        emulate_fpu_insn("fninit");
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* fcmovnb %stN */
+        case 0xc8 ... 0xcf: /* fcmovne %stN */
+        case 0xd0 ... 0xd7: /* fcmovnbe %stN */
+        case 0xd8 ... 0xdf: /* fcmovnu %stN */
+            emulate_fpu_insn_stub(0xdb, modrm);
+            break;
+        case 0xe2: /* fnclex */
+            emulate_fpu_insn("fnclex");
+            break;
+        case 0xe3: /* fninit */
+            emulate_fpu_insn("fninit");
+            break;
+        case 0xe4: /* fsetpm - 287 only, ignored by 387 */
+            break;
+        case 0xe8 ... 0xef: /* fucomi %stN */
+        case 0xf0 ... 0xf7: /* fcomi %stN */
+            emulate_fpu_insn_stub(0xdb, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fild m32i */
+                ea.bytes = 4;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fildl", src.val);
+                break;
+            case 1: /* fisttp m32i */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fisttpl", dst.val);
+                break;
+            case 2: /* fist m32i */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fistl", dst.val);
+                break;
+            case 3: /* fistp m32i */
+                ea.bytes = 4;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fistpl", dst.val);
+                break;
+            case 5: /* fld m80fp */
+                ea.bytes = 10;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off,
+                                     &src.val, src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fldt", src.val);
+                break;
+            case 7: /* fstp m80fp */
+                ea.bytes = 10;
+                dst.type = OP_MEM;
+                dst = ea;
+                emulate_fpu_insn_memdst("fstpt", dst.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
+        break;
+
+    case 0xdc: /* FPU 0xdc */
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* fadd %stN */
+        case 0xc8 ... 0xcf: /* fmul %stN */
+        case 0xe0 ... 0xe7: /* fsubr %stN */
+        case 0xe8 ... 0xef: /* fsub %stN */
+        case 0xf0 ... 0xf7: /* fdivr %stN */
+        case 0xf8 ... 0xff: /* fdiv %stN */
+            emulate_fpu_insn_stub(0xdc, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            ea.bytes = 8;
+            src = ea;
+            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                 src.bytes, ctxt)) != 0 )
+                goto done;
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fadd m64fp */
+                emulate_fpu_insn_memsrc("faddl", src.val);
+                break;
+            case 1: /* fmul m64fp */
+                emulate_fpu_insn_memsrc("fmull", src.val);
+                break;
+            case 2: /* fcom m64fp */
+                emulate_fpu_insn_memsrc("fcoml", src.val);
+                break;
+            case 3: /* fcomp m64fp */
+                emulate_fpu_insn_memsrc("fcompl", src.val);
+                break;
+            case 4: /* fsub m64fp */
+                emulate_fpu_insn_memsrc("fsubl", src.val);
+                break;
+            case 5: /* fsubr m64fp */
+                emulate_fpu_insn_memsrc("fsubrl", src.val);
+                break;
+            case 6: /* fdiv m64fp */
+                emulate_fpu_insn_memsrc("fdivl", src.val);
+                break;
+            case 7: /* fdivr m64fp */
+                emulate_fpu_insn_memsrc("fdivrl", src.val);
+                break;
+            }
+        }
         break;
 
     case 0xdd: /* FPU 0xdd */
-        fail_if((modrm_reg & 7) != 7);
-        fail_if(modrm >= 0xc0);
-        /* fnstsw m2byte */
-        ea.bytes = 2;
-        dst = ea;
-        emulate_fpu_insn_memdst("fnstsw", dst.val);
+        switch ( modrm )
+        {
+        case 0xc0 ... 0xc7: /* ffree %stN */
+        case 0xd0 ... 0xd7: /* fst %stN */
+        case 0xd8 ... 0xdf: /* fstp %stN */
+        case 0xe0 ... 0xe7: /* fucom %stN */
+        case 0xe8 ... 0xef: /* fucomp %stN */
+            emulate_fpu_insn_stub(0xdd, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fld m64fp */
+                ea.bytes = 8;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fldl", src.val);
+                break;
+            case 1: /* fisttp m64i */
+                ea.bytes = 8;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fisttpll", dst.val);
+                break;
+            case 2: /* fst m64fp */
+                ea.bytes = 8;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fstl", dst.val);
+                break;
+            case 3: /* fstp m64fp */
+                ea.bytes = 8;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fstpl", dst.val);
+                break;
+            case 7: /* fnstsw m2byte */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fnstsw", dst.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
         break;
 
     case 0xde: /* FPU 0xde */
@@ -2859,17 +3194,120 @@ x86_emulate(
             emulate_fpu_insn_stub(0xde, modrm);
             break;
         default:
-            goto cannot_emulate;
+            fail_if(modrm >= 0xc0);
+            ea.bytes = 2;
+            src = ea;
+            if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                 src.bytes, ctxt)) != 0 )
+                goto done;
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fiadd m16i */
+                emulate_fpu_insn_memsrc("fiadd", src.val);
+                break;
+            case 1: /* fimul m16i */
+                emulate_fpu_insn_memsrc("fimul", src.val);
+                break;
+            case 2: /* ficom m16i */
+                emulate_fpu_insn_memsrc("ficom", src.val);
+                break;
+            case 3: /* ficomp m16i */
+                emulate_fpu_insn_memsrc("ficomp", src.val);
+                break;
+            case 4: /* fisub m16i */
+                emulate_fpu_insn_memsrc("fisub", src.val);
+                break;
+            case 5: /* fisubr m16i */
+                emulate_fpu_insn_memsrc("fisubr", src.val);
+                break;
+            case 6: /* fidiv m16i */
+                emulate_fpu_insn_memsrc("fidiv", src.val);
+                break;
+            case 7: /* fidivr m16i */
+                emulate_fpu_insn_memsrc("fidivr", src.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
         }
         break;
 
     case 0xdf: /* FPU 0xdf */
-        fail_if(modrm != 0xe0);
-        /* fnstsw %ax */
-        dst.bytes = 2;
-        dst.type = OP_REG;
-        dst.reg = (unsigned long *)&_regs.eax;
-        emulate_fpu_insn_memdst("fnstsw", dst.val);
+        switch ( modrm )
+        {
+        case 0xe0:
+            /* fnstsw %ax */
+            dst.bytes = 2;
+            dst.type = OP_REG;
+            dst.reg = (unsigned long *)&_regs.eax;
+            emulate_fpu_insn_memdst("fnstsw", dst.val);
+            break;
+        case 0xf0 ... 0xf7: /* fcomip %stN */
+        case 0xf8 ... 0xff: /* fucomip %stN */
+            emulate_fpu_insn_stub(0xdf, modrm);
+            break;
+        default:
+            fail_if(modrm >= 0xc0);
+            switch ( modrm_reg & 7 )
+            {
+            case 0: /* fild m16i */
+                ea.bytes = 2;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fild", src.val);
+                break;
+            case 1: /* fisttp m16i */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fisttp", dst.val);
+                break;
+            case 2: /* fist m16i */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fist", dst.val);
+                break;
+            case 3: /* fistp m16i */
+                ea.bytes = 2;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fistp", dst.val);
+                break;
+            case 4: /* fbld m80dec */
+                ea.bytes = 10;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off,
+                                     &src.val, src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fbld", src.val);
+                break;
+            case 5: /* fild m64i */
+                ea.bytes = 8;
+                src = ea;
+                if ( (rc = ops->read(src.mem.seg, src.mem.off, &src.val,
+                                     src.bytes, ctxt)) != 0 )
+                    goto done;
+                emulate_fpu_insn_memsrc("fildll", src.val);
+                break;
+            case 6: /* fbstp packed bcd */
+                ea.bytes = 10;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fbstp", dst.val);
+                break;
+            case 7: /* fistp m64i */
+                ea.bytes = 8;
+                dst = ea;
+                dst.type = OP_MEM;
+                emulate_fpu_insn_memdst("fistpll", dst.val);
+                break;
+            default:
+                goto cannot_emulate;
+            }
+        }
         break;
 
     case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
@@ -2924,7 +3362,6 @@ x86_emulate(
             /* out */
             fail_if(ops->write_io == NULL);
             rc = ops->write_io(port, op_bytes, _regs.eax, ctxt);
-            
         }
         else
         {
@@ -3242,9 +3679,9 @@ x86_emulate(
             if ( op_bytes == 2 )
                 reg.base &= 0xffffff;
             if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0,
-                                  reg.limit, 2, ctxt)) ||
+                                  &reg.limit, 2, ctxt)) ||
                  (rc = ops->write(ea.mem.seg, ea.mem.off+2,
-                                  reg.base, mode_64bit() ? 8 : 4, ctxt)) )
+                                  &reg.base, mode_64bit() ? 8 : 4, ctxt)) )
                 goto done;
             break;
         case 2: /* lgdt */
@@ -3252,10 +3689,10 @@ x86_emulate(
             generate_exception_if(ea.type != OP_MEM, EXC_UD, -1);
             fail_if(ops->write_segment == NULL);
             memset(&reg, 0, sizeof(reg));
-            if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0,
-                                 &limit, 2, ctxt)) ||
-                 (rc = ops->read(ea.mem.seg, ea.mem.off+2,
-                                 &base, mode_64bit() ? 8 : 4, ctxt)) )
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0,
+                                  &limit, 2, ctxt, ops)) ||
+                 (rc = read_ulong(ea.mem.seg, ea.mem.off+2,
+                                  &base, mode_64bit() ? 8 : 4, ctxt, ops)) )
                 goto done;
             reg.base = base;
             reg.limit = limit;
@@ -3267,7 +3704,8 @@ x86_emulate(
                 goto done;
             break;
         case 4: /* smsw */
-            ea.bytes = 2;
+            if ( ea.type == OP_MEM )
+                ea.bytes = 2;
             dst = ea;
             fail_if(ops->read_cr == NULL);
             if ( (rc = ops->read_cr(0, &dst.val, ctxt)) )
@@ -3281,11 +3719,11 @@ x86_emulate(
                 goto done;
             if ( ea.type == OP_REG )
                 cr0w = *ea.reg;
-            else if ( (rc = ops->read(ea.mem.seg, ea.mem.off,
-                                      &cr0w, 2, ctxt)) )
+            else if ( (rc = read_ulong(ea.mem.seg, ea.mem.off,
+                                       &cr0w, 2, ctxt, ops)) )
                 goto done;
-            cr0 &= 0xffff0000;
-            cr0 |= (uint16_t)cr0w;
+            /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
+            cr0 = (cr0 & ~0xe) | (cr0w & 0xf);
             if ( (rc = ops->write_cr(0, cr0, ctxt)) )
                 goto done;
             break;
@@ -3404,8 +3842,10 @@ x86_emulate(
         if ( ea.type == OP_MEM )
         {
             unsigned long lval, hval;
-            if ( (rc = ops->read(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) ||
-                 (rc = ops->read(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) )
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off+0,
+                                  &lval, 4, ctxt, ops)) ||
+                 (rc = read_ulong(ea.mem.seg, ea.mem.off+4,
+                                  &hval, 4, ctxt, ops)) )
                 goto done;
             val = ((uint64_t)hval << 32) | (uint32_t)lval;
             stub[2] = modrm & 0x38; /* movq (%eax),%mmN */
@@ -3428,8 +3868,8 @@ x86_emulate(
         if ( ea.type == OP_MEM )
         {
             unsigned long lval = (uint32_t)val, hval = (uint32_t)(val >> 32);
-            if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, lval, 4, ctxt)) ||
-                 (rc = ops->write(ea.mem.seg, ea.mem.off+4, hval, 4, ctxt)) )
+            if ( (rc = ops->write(ea.mem.seg, ea.mem.off+0, &lval, 4, ctxt)) ||
+                 (rc = ops->write(ea.mem.seg, ea.mem.off+4, &hval, 4, ctxt)) )
                 goto done;
         }
         break;
@@ -3481,8 +3921,8 @@ x86_emulate(
 
         /* Get actual old value. */
         for ( i = 0; i < (op_bytes/sizeof(long)); i++ )
-            if ( (rc = ops->read(ea.mem.seg, ea.mem.off + i*sizeof(long),
-                                 &old[i], sizeof(long), ctxt)) != 0 )
+            if ( (rc = read_ulong(ea.mem.seg, ea.mem.off + i*sizeof(long),
+                                  &old[i], sizeof(long), ctxt, ops)) != 0 )
                 goto done;
 
         /* Get expected and proposed values. */
diff -r 11318234588e -r 08f77df14cba xen/arch/x86/x86_emulate/x86_emulate.h
--- a/xen/arch/x86/x86_emulate/x86_emulate.h    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h    Wed Jul 02 11:30:37 2008 +0900
@@ -102,7 +102,8 @@ enum x86_emulate_fpu_type {
 };
 
 /*
- * These operations represent the instruction emulator's interface to memory.
+ * These operations represent the instruction emulator's interface to memory,
+ * I/O ports, privileged state... pretty much everything other than GPRs.
  * 
  * NOTES:
  *  1. If the access fails (cannot emulate, or a standard access faults) then
@@ -110,8 +111,7 @@ enum x86_emulate_fpu_type {
  *     some out-of-band mechanism, unknown to the emulator. The memop signals
  *     failure by returning X86EMUL_EXCEPTION to the emulator, which will
  *     then immediately bail.
- *  2. Valid access sizes are 1, 2, 4 and 8 (x86/64 only) bytes.
- *  3. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
+ *  2. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
  */
 struct x86_emulate_ops
 {
@@ -121,19 +121,25 @@ struct x86_emulate_ops
      * All memory-access functions:
      *  @seg:   [IN ] Segment being dereferenced (specified as x86_seg_??).
      *  @offset:[IN ] Offset within segment.
+     *  @p_data:[IN ] Pointer to i/o data buffer (length is @bytes)
      * Read functions:
      *  @val:   [OUT] Value read, zero-extended to 'ulong'.
      * Write functions:
      *  @val:   [IN ] Value to write (low-order bytes used as req'd).
      * Variable-length access functions:
-     *  @bytes: [IN ] Number of bytes to read or write.
-     */
-
-    /* read: Emulate a memory read. */
+     *  @bytes: [IN ] Number of bytes to read or write. Valid access sizes are
+     *                1, 2, 4 and 8 (x86/64 only) bytes, unless otherwise
+     *                stated.
+     */
+
+    /*
+     * read: Emulate a memory read.
+     *  @bytes: Access length (0 < @bytes < 4096).
+     */
     int (*read)(
         enum x86_segment seg,
         unsigned long offset,
-        unsigned long *val,
+        void *p_data,
         unsigned int bytes,
         struct x86_emulate_ctxt *ctxt);
 
@@ -144,15 +150,18 @@ struct x86_emulate_ops
     int (*insn_fetch)(
         enum x86_segment seg,
         unsigned long offset,
-        unsigned long *val,
-        unsigned int bytes,
-        struct x86_emulate_ctxt *ctxt);
-
-    /* write: Emulate a memory write. */
+        void *p_data,
+        unsigned int bytes,
+        struct x86_emulate_ctxt *ctxt);
+
+    /*
+     * write: Emulate a memory write.
+     *  @bytes: Access length (0 < @bytes < 4096).
+     */
     int (*write)(
         enum x86_segment seg,
         unsigned long offset,
-        unsigned long val,
+        void *p_data,
         unsigned int bytes,
         struct x86_emulate_ctxt *ctxt);
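
With read(), insn_fetch() and write() now taking void *p_data rather than an unsigned long value, a backend can move operands of any supported length with a plain buffer copy. As a rough, self-contained illustration of the new callback shapes only -- guest_ram and the flat_* names are made up for this sketch and are not symbols from the tree:

/* Assumes x86_emulate.h is included for the types and X86EMUL_OKAY. */
#include <string.h>   /* memcpy */

static unsigned char guest_ram[0x10000];   /* hypothetical test buffer */

static int flat_read(
    enum x86_segment seg, unsigned long offset,
    void *p_data, unsigned int bytes,
    struct x86_emulate_ctxt *ctxt)
{
    /* Illustration only: no segmentation or bounds checking. */
    memcpy(p_data, &guest_ram[offset], bytes);
    return X86EMUL_OKAY;
}

static int flat_write(
    enum x86_segment seg, unsigned long offset,
    void *p_data, unsigned int bytes,
    struct x86_emulate_ctxt *ctxt)
{
    memcpy(&guest_ram[offset], p_data, bytes);
    return X86EMUL_OKAY;
}

These two functions would simply be plugged into a struct x86_emulate_ops instance alongside the other callbacks.
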
 
diff -r 11318234588e -r 08f77df14cba xen/common/domain.c
--- a/xen/common/domain.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/common/domain.c       Wed Jul 02 11:30:37 2008 +0900
@@ -73,21 +73,133 @@ int current_domain_id(void)
     return current->domain->domain_id;
 }
 
-struct domain *alloc_domain(domid_t domid)
+static struct domain *alloc_domain_struct(void)
+{
+    return xmalloc(struct domain);
+}
+
+static void free_domain_struct(struct domain *d)
+{
+    xfree(d);
+}
+
+static void __domain_finalise_shutdown(struct domain *d)
+{
+    struct vcpu *v;
+
+    BUG_ON(!spin_is_locked(&d->shutdown_lock));
+
+    if ( d->is_shut_down )
+        return;
+
+    for_each_vcpu ( d, v )
+        if ( !v->paused_for_shutdown )
+            return;
+
+    d->is_shut_down = 1;
+    send_guest_global_virq(dom0, VIRQ_DOM_EXC);
+}
+
+static void vcpu_check_shutdown(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+
+    spin_lock(&d->shutdown_lock);
+
+    if ( d->is_shutting_down )
+    {
+        if ( !v->paused_for_shutdown )
+            vcpu_pause_nosync(v);
+        v->paused_for_shutdown = 1;
+        v->defer_shutdown = 0;
+        __domain_finalise_shutdown(d);
+    }
+
+    spin_unlock(&d->shutdown_lock);
+}
+
+struct vcpu *alloc_vcpu(
+    struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
+{
+    struct vcpu *v;
+
+    BUG_ON(d->vcpu[vcpu_id] != NULL);
+
+    if ( (v = alloc_vcpu_struct()) == NULL )
+        return NULL;
+
+    v->domain = d;
+    v->vcpu_id = vcpu_id;
+
+    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
+    v->runstate.state_entry_time = NOW();
+
+    if ( !is_idle_domain(d) )
+    {
+        set_bit(_VPF_down, &v->pause_flags);
+        v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]);
+    }
+
+    if ( sched_init_vcpu(v, cpu_id) != 0 )
+    {
+        free_vcpu_struct(v);
+        return NULL;
+    }
+
+    if ( vcpu_initialise(v) != 0 )
+    {
+        sched_destroy_vcpu(v);
+        free_vcpu_struct(v);
+        return NULL;
+    }
+
+    d->vcpu[vcpu_id] = v;
+    if ( vcpu_id != 0 )
+        d->vcpu[v->vcpu_id-1]->next_in_list = v;
+
+    /* Must be called after making new vcpu visible to for_each_vcpu(). */
+    vcpu_check_shutdown(v);
+
+    return v;
+}
+
+struct vcpu *alloc_idle_vcpu(unsigned int cpu_id)
 {
     struct domain *d;
-
-    if ( (d = xmalloc(struct domain)) == NULL )
+    struct vcpu *v;
+    unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;
+
+    if ( (v = idle_vcpu[cpu_id]) != NULL )
+        return v;
+
+    d = (vcpu_id == 0) ?
+        domain_create(IDLE_DOMAIN_ID, 0, 0) :
+        idle_vcpu[cpu_id - vcpu_id]->domain;
+    BUG_ON(d == NULL);
+
+    v = alloc_vcpu(d, vcpu_id, cpu_id);
+    idle_vcpu[cpu_id] = v;
+
+    return v;
+}
+
+struct domain *domain_create(
+    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
+{
+    struct domain *d, **pd;
+    enum { INIT_xsm = 1u<<0, INIT_rangeset = 1u<<1, INIT_evtchn = 1u<<2,
+           INIT_gnttab = 1u<<3, INIT_arch = 1u<<4 };
+    int init_status = 0;
+
+    if ( (d = alloc_domain_struct()) == NULL )
         return NULL;
 
     memset(d, 0, sizeof(*d));
     d->domain_id = domid;
 
     if ( xsm_alloc_security_domain(d) != 0 )
-    {
-        free_domain(d);
-        return NULL;
-    }
+        goto fail;
+    init_status |= INIT_xsm;
 
     atomic_set(&d->refcnt, 1);
     spin_lock_init(&d->domain_lock);
@@ -97,132 +209,17 @@ struct domain *alloc_domain(domid_t domi
     INIT_LIST_HEAD(&d->page_list);
     INIT_LIST_HEAD(&d->xenpage_list);
 
-    return d;
-}
-
-void free_domain(struct domain *d)
-{
-    xsm_free_security_domain(d);
-    xfree(d);
-}
-
-static void __domain_finalise_shutdown(struct domain *d)
-{
-    struct vcpu *v;
-
-    BUG_ON(!spin_is_locked(&d->shutdown_lock));
-
-    if ( d->is_shut_down )
-        return;
-
-    for_each_vcpu ( d, v )
-        if ( !v->paused_for_shutdown )
-            return;
-
-    d->is_shut_down = 1;
-    send_guest_global_virq(dom0, VIRQ_DOM_EXC);
-}
-
-static void vcpu_check_shutdown(struct vcpu *v)
-{
-    struct domain *d = v->domain;
-
-    spin_lock(&d->shutdown_lock);
-
-    if ( d->is_shutting_down )
-    {
-        if ( !v->paused_for_shutdown )
-            vcpu_pause_nosync(v);
-        v->paused_for_shutdown = 1;
-        v->defer_shutdown = 0;
-        __domain_finalise_shutdown(d);
-    }
-
-    spin_unlock(&d->shutdown_lock);
-}
-
-struct vcpu *alloc_vcpu(
-    struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
-{
-    struct vcpu *v;
-
-    BUG_ON(d->vcpu[vcpu_id] != NULL);
-
-    if ( (v = alloc_vcpu_struct()) == NULL )
-        return NULL;
-
-    v->domain = d;
-    v->vcpu_id = vcpu_id;
-
-    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
-    v->runstate.state_entry_time = NOW();
-
-    if ( !is_idle_domain(d) )
-    {
-        set_bit(_VPF_down, &v->pause_flags);
-        v->vcpu_info = (void *)&shared_info(d, vcpu_info[vcpu_id]);
-    }
-
-    if ( sched_init_vcpu(v, cpu_id) != 0 )
-    {
-        free_vcpu_struct(v);
-        return NULL;
-    }
-
-    if ( vcpu_initialise(v) != 0 )
-    {
-        sched_destroy_vcpu(v);
-        free_vcpu_struct(v);
-        return NULL;
-    }
-
-    d->vcpu[vcpu_id] = v;
-    if ( vcpu_id != 0 )
-        d->vcpu[v->vcpu_id-1]->next_in_list = v;
-
-    /* Must be called after making new vcpu visible to for_each_vcpu(). */
-    vcpu_check_shutdown(v);
-
-    return v;
-}
-
-struct vcpu *alloc_idle_vcpu(unsigned int cpu_id)
-{
-    struct domain *d;
-    struct vcpu *v;
-    unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;
-
-    if ( (v = idle_vcpu[cpu_id]) != NULL )
-        return v;
-
-    d = (vcpu_id == 0) ?
-        domain_create(IDLE_DOMAIN_ID, 0, 0) :
-        idle_vcpu[cpu_id - vcpu_id]->domain;
-    BUG_ON(d == NULL);
-
-    v = alloc_vcpu(d, vcpu_id, cpu_id);
-    idle_vcpu[cpu_id] = v;
-
-    return v;
-}
-
-struct domain *domain_create(
-    domid_t domid, unsigned int domcr_flags, ssidref_t ssidref)
-{
-    struct domain *d, **pd;
-    enum { INIT_evtchn = 1, INIT_gnttab = 2, INIT_arch = 8 }; 
-    int init_status = 0;
-
-    if ( (d = alloc_domain(domid)) == NULL )
-        return NULL;
-
     if ( domcr_flags & DOMCRF_hvm )
         d->is_hvm = 1;
 
     if ( (domid == 0) && opt_dom0_vcpus_pin )
         d->is_pinned = 1;
 
+    if ( domcr_flags & DOMCRF_dummy )
+        return d;
+
     rangeset_domain_initialise(d);
+    init_status |= INIT_rangeset;
 
     if ( !is_idle_domain(d) )
     {
@@ -278,8 +275,11 @@ struct domain *domain_create(
         grant_table_destroy(d);
     if ( init_status & INIT_evtchn )
         evtchn_destroy(d);
-    rangeset_domain_destroy(d);
-    free_domain(d);
+    if ( init_status & INIT_rangeset )
+        rangeset_domain_destroy(d);
+    if ( init_status & INIT_xsm )
+        xsm_free_security_domain(d);
+    free_domain_struct(d);
     return NULL;
 }
 
@@ -535,7 +535,8 @@ static void complete_domain_destroy(stru
     if ( d->target != NULL )
         put_domain(d->target);
 
-    free_domain(d);
+    xsm_free_security_domain(d);
+    free_domain_struct(d);
 
     send_guest_global_virq(dom0, VIRQ_DOM_EXC);
 }
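
The domain.c rework above folds alloc_domain()/free_domain() into domain_create() and replaces the single catch-all teardown with an init_status bitmask (INIT_xsm, INIT_rangeset, INIT_evtchn, INIT_gnttab, INIT_arch), so the failure path only undoes the stages that actually completed. Stripped of the Xen specifics, the pattern looks roughly like this (stage names and flags here are hypothetical; only the structure mirrors the patch):

enum { DID_A = 1u << 0, DID_B = 1u << 1 };

static int init_stage_a(void) { return 0; }  /* stand-ins for rangeset/evtchn/... setup */
static int init_stage_b(void) { return 0; }
static void undo_stage_a(void) { }
static void undo_stage_b(void) { }

static int create_object(void)
{
    int done = 0;

    if ( init_stage_a() != 0 )
        goto fail;
    done |= DID_A;

    if ( init_stage_b() != 0 )
        goto fail;
    done |= DID_B;

    return 0;

 fail:
    /* Unwind only what was actually set up, in reverse order. */
    if ( done & DID_B )
        undo_stage_b();
    if ( done & DID_A )
        undo_stage_a();
    return -1;
}

The same bookkeeping is what lets the DOMCRF_dummy case return early with almost nothing to tear down, and why complete_domain_destroy() now calls xsm_free_security_domain() and free_domain_struct() explicitly instead of the old free_domain().
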
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/dmar.c
--- a/xen/drivers/passthrough/vtd/dmar.c        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/dmar.c        Wed Jul 02 11:30:37 2008 +0900
@@ -383,7 +383,8 @@ acpi_parse_one_drhd(struct acpi_dmar_ent
     dmaru->address = drhd->address;
     dmaru->include_all = drhd->flags & 1; /* BIT0: INCLUDE_ALL */
     INIT_LIST_HEAD(&dmaru->ioapic_list);
-    dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %lx\n", dmaru->address);
+    dprintk(XENLOG_INFO VTDPREFIX, "dmaru->address = %"PRIx64"\n",
+            dmaru->address);
 
     dev_scope_start = (void *)(drhd + 1);
     dev_scope_end   = ((void *)drhd) + header->length;
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/dmar.h
--- a/xen/drivers/passthrough/vtd/dmar.h        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/dmar.h        Wed Jul 02 11:30:37 2008 +0900
@@ -42,28 +42,28 @@ struct acpi_ioapic_unit {
 
 struct acpi_drhd_unit {
     struct list_head list;
-    unsigned long    address; /* register base address of the unit */
-    struct    pci_dev *devices; /* target devices */
+    u64    address; /* register base address of the unit */
+    struct pci_dev *devices; /* target devices */
     int    devices_cnt;
-    u8    include_all:1;
+    u8     include_all:1;
     struct iommu *iommu;
     struct list_head ioapic_list;
 };
 
 struct acpi_rmrr_unit {
     struct list_head list;
-    unsigned long base_address;
-    unsigned long end_address;
+    u64    base_address;
+    u64    end_address;
     struct pci_dev *devices; /* target devices */
     int    devices_cnt;
-    u8    allow_all:1;
+    u8     allow_all:1;
 };
 
 struct acpi_atsr_unit {
     struct list_head list;
-    struct    pci_dev *devices; /* target devices */
+    struct pci_dev *devices; /* target devices */
     int    devices_cnt;
-    u8    all_ports:1;
+    u8     all_ports:1;
 };
 
 #define for_each_iommu(domain, iommu) \
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/intremap.c
--- a/xen/drivers/passthrough/vtd/intremap.c    Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/intremap.c    Wed Jul 02 11:30:37 2008 +0900
@@ -52,7 +52,7 @@ static void remap_entry_to_ioapic_rte(
     unsigned long flags;
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
 
-    if ( ir_ctrl == NULL || ir_ctrl->iremap_index < 0 )
+    if ( ir_ctrl == NULL )
     {
         dprintk(XENLOG_ERR VTDPREFIX,
                 "remap_entry_to_ioapic_rte: ir_ctl is not ready\n");
@@ -153,6 +153,7 @@ static void ioapic_rte_to_remap_entry(st
     }
 
     memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+    iommu_flush_cache_entry(iremap_entry);
     iommu_flush_iec_index(iommu, 0, index);
     invalidate_sync(iommu);
 
@@ -170,7 +171,8 @@ unsigned int io_apic_read_remap_rte(
     struct iommu *iommu = ioapic_to_iommu(mp_ioapics[apic].mpc_apicid);
     struct ir_ctrl *ir_ctrl = iommu_ir_ctrl(iommu);
 
-    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 )
+    if ( !iommu || !ir_ctrl || ir_ctrl->iremap_maddr == 0 ||
+         ir_ctrl->iremap_index == -1 )
     {
         *IO_APIC_BASE(apic) = reg;
         return *(IO_APIC_BASE(apic)+4);
@@ -377,6 +379,7 @@ static void msi_msg_to_remap_entry(
     remap_rte->data = 0;
 
     memcpy(iremap_entry, &new_ire, sizeof(struct iremap_entry));
+    iommu_flush_cache_entry(iremap_entry);
     iommu_flush_iec_index(iommu, 0, index);
     invalidate_sync(iommu);
 
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/iommu.c       Wed Jul 02 11:30:37 2008 +0900
@@ -1269,7 +1269,6 @@ static int domain_context_mapping(
 }
 
 static int domain_context_unmap_one(
-    struct domain *domain,
     struct iommu *iommu,
     u8 bus, u8 devfn)
 {
@@ -1300,7 +1299,6 @@ static int domain_context_unmap_one(
 }
 
 static int domain_context_unmap(
-    struct domain *domain,
     struct iommu *iommu,
     struct pci_dev *pdev)
 {
@@ -1320,14 +1318,13 @@ static int domain_context_unmap(
             PCI_FUNC(pdev->devfn), PCI_SUBORDINATE_BUS);
         break;
     case DEV_TYPE_PCIe_ENDPOINT:
-        ret = domain_context_unmap_one(domain, iommu,
+        ret = domain_context_unmap_one(iommu,
                                        (u8)(pdev->bus), (u8)(pdev->devfn));
         break;
     case DEV_TYPE_PCI:
         if ( pdev->bus == 0 )
             ret = domain_context_unmap_one(
-                domain, iommu,
-                (u8)(pdev->bus), (u8)(pdev->devfn));
+                iommu, (u8)(pdev->bus), (u8)(pdev->devfn));
         else
         {
             if ( bus2bridge[pdev->bus].bus != 0 )
@@ -1335,7 +1332,7 @@ static int domain_context_unmap(
                          "domain_context_unmap:"
                          "bus2bridge[%d].bus != 0\n", pdev->bus);
 
-            ret = domain_context_unmap_one(domain, iommu,
+            ret = domain_context_unmap_one(iommu,
                                            (u8)(bus2bridge[pdev->bus].bus),
                                            (u8)(bus2bridge[pdev->bus].devfn));
 
@@ -1345,8 +1342,7 @@ static int domain_context_unmap(
                 for ( func = 0; func < 8; func++ )
                 {
                     ret = domain_context_unmap_one(
-                        domain, iommu,
-                        pdev->bus, (u8)PCI_DEVFN(dev, func));
+                        iommu, pdev->bus, (u8)PCI_DEVFN(dev, func));
                     if ( ret )
                         return ret;
                 }
@@ -1389,7 +1385,7 @@ void reassign_device_ownership(
  found:
     drhd = acpi_find_matched_drhd_unit(pdev);
     iommu = drhd->iommu;
-    domain_context_unmap(source, iommu, pdev);
+    domain_context_unmap(iommu, pdev);
 
     /* Move pci device from the source domain to target domain. */
     spin_lock_irqsave(&source_hd->iommu_list_lock, flags);
@@ -1589,7 +1585,7 @@ static int iommu_prepare_rmrr_dev(
     struct pci_dev *pdev)
 {
     struct acpi_drhd_unit *drhd;
-    unsigned long size;
+    u64 size;
     int ret;
 
     /* page table init */
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/qinval.c
--- a/xen/drivers/passthrough/vtd/qinval.c      Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/qinval.c      Wed Jul 02 11:30:37 2008 +0900
@@ -222,7 +222,7 @@ int invalidate_sync(struct iommu *iommu)
     int ret = -1;
     struct qi_ctrl *qi_ctrl = iommu_qi_ctrl(iommu);
 
-    if ( qi_ctrl->qinval_maddr == 0 )
+    if ( qi_ctrl->qinval_maddr != 0 )
     {
         ret = queue_invalidate_wait(iommu,
             0, 1, 1, 1, &qi_ctrl->qinval_poll_status);
@@ -416,7 +416,6 @@ int qinval_setup(struct iommu *iommu)
 int qinval_setup(struct iommu *iommu)
 {
     s_time_t start_time;
-    u32 status = 0;
     struct qi_ctrl *qi_ctrl;
     struct iommu_flush *flush;
 
@@ -450,15 +449,12 @@ int qinval_setup(struct iommu *iommu)
 
     /* Make sure hardware complete it */
     start_time = NOW();
-    for ( ; ; )
-    {
-        status = dmar_readl(iommu->reg, DMAR_GSTS_REG);
-        if ( status & DMA_GSTS_QIES )
-            break;
+    while ( !(dmar_readl(iommu->reg, DMAR_GSTS_REG) & DMA_GSTS_QIES) )
+    {
         if ( NOW() > (start_time + DMAR_OPERATION_TIMEOUT) )
             panic("Cannot set QIE field for queue invalidation\n");
         cpu_relax();
     }
-    status = 0;
-    return status;
-}
+
+    return 0;
+}
diff -r 11318234588e -r 08f77df14cba xen/drivers/passthrough/vtd/utils.c
--- a/xen/drivers/passthrough/vtd/utils.c       Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/drivers/passthrough/vtd/utils.c       Wed Jul 02 11:30:37 2008 +0900
@@ -166,7 +166,7 @@ void print_iommu_regs(struct acpi_drhd_u
     struct iommu *iommu = drhd->iommu;
 
     printk("---- print_iommu_regs ----\n");
-    printk("print_iommu_regs: drhd->address = %lx\n", drhd->address);
+    printk("print_iommu_regs: drhd->address = %"PRIx64"\n", drhd->address);
     printk("print_iommu_regs: DMAR_VER_REG = %x\n",
            dmar_readl(iommu->reg,DMAR_VER_REG));
     printk("print_iommu_regs: DMAR_CAP_REG = %"PRIx64"\n",
diff -r 11318234588e -r 08f77df14cba xen/include/acpi/cpufreq/cpufreq.h
--- a/xen/include/acpi/cpufreq/cpufreq.h        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/acpi/cpufreq/cpufreq.h        Wed Jul 02 11:30:37 2008 +0900
@@ -36,7 +36,10 @@ struct cpufreq_policy {
     unsigned int        max;    /* in kHz */
     unsigned int        cur;    /* in kHz, only needed if cpufreq
                                  * governors are used */
+    unsigned int        resume; /* flag for cpufreq 1st run
+                                 * S3 wakeup, hotplug cpu, etc */
 };
+extern struct cpufreq_policy xen_px_policy[NR_CPUS];
 
 #define CPUFREQ_SHARED_TYPE_NONE (0) /* None */
 #define CPUFREQ_SHARED_TYPE_HW   (1) /* HW does needed coordination */
diff -r 11318234588e -r 08f77df14cba xen/include/acpi/cpufreq/processor_perf.h
--- a/xen/include/acpi/cpufreq/processor_perf.h Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/acpi/cpufreq/processor_perf.h Wed Jul 02 11:30:37 2008 +0900
@@ -6,9 +6,21 @@
 
 int get_cpu_id(u8);
 int acpi_cpufreq_init(void);
+int powernow_cpufreq_init(void);
+
 void px_statistic_update(cpumask_t, uint8_t, uint8_t);
 int  px_statistic_init(int);
 void px_statistic_reset(int);
+void px_statistic_suspend(void);
+void px_statistic_resume(void);
+
+void cpufreq_dom_exit(void);
+int  cpufreq_dom_init(void);
+int  cpufreq_dom_dbs(unsigned int);
+void cpufreq_suspend(void);
+int  cpufreq_resume(void);
+
+inline uint64_t get_cpu_idle_time(unsigned int);
 
 struct processor_performance {
     uint32_t state;
@@ -44,6 +56,7 @@ struct pm_px {
 struct pm_px {
     struct px_stat u;
     uint64_t prev_state_wall;
+    uint64_t prev_idle_wall;
 };
 
 extern struct pm_px px_statistic_data[NR_CPUS];
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/domain.h      Wed Jul 02 11:30:37 2008 +0900
@@ -103,6 +103,9 @@ struct shadow_domain {
      * emulation and remove write permission
      */
     atomic_t          gtable_dirty_version;
+
+    /* OOS */
+    int oos_active;
 };
 
 struct shadow_vcpu {
@@ -122,6 +125,17 @@ struct shadow_vcpu {
     unsigned long last_emulated_frame;
     /* Last MFN that we emulated a write successfully */
     unsigned long last_emulated_mfn;
+
+    /* Shadow out-of-sync: pages that this vcpu has let go out of sync */
+    mfn_t oos[SHADOW_OOS_PAGES];
+    unsigned long oos_va[SHADOW_OOS_PAGES];
+    mfn_t oos_snapshot[SHADOW_OOS_PAGES];
+    struct oos_fixup {
+        mfn_t gmfn;
+        mfn_t smfn;
+        unsigned long off;
+    } *oos_fixups;
+    int oos_fixup_used;
 };
 
 /************************************************/
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h        Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h        Wed Jul 02 11:30:37 2008 +0900
@@ -333,10 +333,10 @@ enum vmcs_field {
 #define VMCS_VPID_WIDTH 16
 
 void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr);
-int vmx_read_guest_msr(struct vcpu *v, u32 msr, u64 *val);
-int vmx_write_guest_msr(struct vcpu *v, u32 msr, u64 val);
-int vmx_add_guest_msr(struct vcpu *v, u32 msr);
-int vmx_add_host_load_msr(struct vcpu *v, u32 msr);
+int vmx_read_guest_msr(u32 msr, u64 *val);
+int vmx_write_guest_msr(u32 msr, u64 val);
+int vmx_add_guest_msr(u32 msr);
+int vmx_add_host_load_msr(u32 msr);
 
 #endif /* ASM_X86_HVM_VMX_VMCS_H__ */
 
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/mm.h  Wed Jul 02 11:30:37 2008 +0900
@@ -130,6 +130,14 @@ static inline u32 pickle_domptr(struct d
 /* The order of the largest allocation unit we use for shadow pages */
 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 
+/* The number of out-of-sync shadows we allow per vcpu (prime, please) */
+#define SHADOW_OOS_PAGES 3
+
+/* The allocation order of the OOS fixup tables, per vcpu */
+#define SHADOW_OOS_FT_ORDER 1
+/* Number of hash entries in the OOS fixup tables */
+#define SHADOW_OOS_FT_HASH 13
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
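
For a rough sense of scale: the fixup table gets 2^SHADOW_OOS_FT_ORDER pages
per vcpu, hashed over SHADOW_OOS_FT_HASH buckets. Back-of-envelope arithmetic
only, assuming the three-word struct oos_fixup added to shadow_vcpu above on
x86-64; none of this is code from the patch:

    /* 2 pages (order 1) of 24-byte entries on x86-64 gives
     * (2 * 4096) / 24 = 341 fixup entries per vcpu,
     * spread over 13 hash buckets (~26 entries each). */
    #define OOS_FT_ENTRIES_SKETCH \
        ((PAGE_SIZE << SHADOW_OOS_FT_ORDER) / sizeof(struct oos_fixup))
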
 
diff -r 11318234588e -r 08f77df14cba xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/asm-x86/perfc_defn.h  Wed Jul 02 11:30:37 2008 +0900
@@ -80,7 +80,11 @@ PERFCOUNTER(shadow_writeable_h_3,  "shad
 PERFCOUNTER(shadow_writeable_h_3,  "shadow writeable: 64b w2k3")
 PERFCOUNTER(shadow_writeable_h_4,  "shadow writeable: linux low/solaris")
 PERFCOUNTER(shadow_writeable_h_5,  "shadow writeable: linux high")
+PERFCOUNTER(shadow_writeable_h_6,  "shadow writeable: unsync va")
+PERFCOUNTER(shadow_writeable_h_7,  "shadow writeable: sl1p")
+PERFCOUNTER(shadow_writeable_h_8,  "shadow writeable: sl1p failed")
 PERFCOUNTER(shadow_writeable_bf,   "shadow writeable brute-force")
+PERFCOUNTER(shadow_writeable_bf_1, "shadow writeable resync bf")
 PERFCOUNTER(shadow_mappings,       "shadow removes all mappings")
 PERFCOUNTER(shadow_mappings_bf,    "shadow rm-mappings brute-force")
 PERFCOUNTER(shadow_early_unshadow, "shadow unshadows for fork/exit")
@@ -101,4 +105,15 @@ PERFCOUNTER(shadow_em_ex_non_pt,   "shad
 PERFCOUNTER(shadow_em_ex_non_pt,   "shadow extra non-pt-write op")
 PERFCOUNTER(shadow_em_ex_fail,     "shadow extra emulation failed")
 
+PERFCOUNTER(shadow_oos_fixup_add_ok,    "shadow OOS fixups adds")
+PERFCOUNTER(shadow_oos_fixup_no_add,    "shadow OOS fixups no adds")
+PERFCOUNTER(shadow_oos_fixup_add_fail,  "shadow OOS fixups adds failed")
+PERFCOUNTER(shadow_oos_fixup_remove,    "shadow OOS fixups removes")
+PERFCOUNTER(shadow_oos_fixup_flush,     "shadow OOS fixups flushes")
+PERFCOUNTER(shadow_oos_fixup_flush_gmfn,"shadow OOS fixups gmfn flushes")
+
+PERFCOUNTER(shadow_unsync,         "shadow OOS unsyncs")
+PERFCOUNTER(shadow_unsync_evict,   "shadow OOS evictions")
+PERFCOUNTER(shadow_resync,         "shadow OOS resyncs")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
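
Each of the new counters would be bumped with Xen's standard perfc_incr()
macro at the corresponding point in the shadow code, for example when a page
is first allowed to go out of sync:

    perfc_incr(shadow_unsync);   /* one more page allowed to go out of sync */
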
diff -r 11318234588e -r 08f77df14cba xen/include/public/hvm/hvm_op.h
--- a/xen/include/public/hvm/hvm_op.h   Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/public/hvm/hvm_op.h   Wed Jul 02 11:30:37 2008 +0900
@@ -92,6 +92,19 @@ typedef struct xen_hvm_track_dirty_vram 
 typedef struct xen_hvm_track_dirty_vram xen_hvm_track_dirty_vram_t;
 DEFINE_XEN_GUEST_HANDLE(xen_hvm_track_dirty_vram_t);
 
+/* Notify that some pages got modified by the Device Model. */
+#define HVMOP_modified_memory    7
+struct xen_hvm_modified_memory {
+    /* Domain to be updated. */
+    domid_t  domid;
+    /* First pfn. */
+    uint64_aligned_t first_pfn;
+    /* Number of pages. */
+    uint64_aligned_t nr;
+};
+typedef struct xen_hvm_modified_memory xen_hvm_modified_memory_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_modified_memory_t);
+
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
 
 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
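
A device model or toolstack caller fills the new structure and issues it as
HVMOP_modified_memory through the HVM-op hypercall. A sketch of the argument
setup; the variable names are illustrative:

    /* Tell Xen that nr_pages guest pages starting at first_pfn were written
     * by the device model, so log-dirty tracking for live migration sees
     * them.  domid/first_pfn/nr_pages stand in for the caller's values. */
    struct xen_hvm_modified_memory arg = {
        .domid     = domid,
        .first_pfn = first_pfn,
        .nr        = nr_pages,
    };
    /* ... issued via __HYPERVISOR_hvm_op with cmd == HVMOP_modified_memory */
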
diff -r 11318234588e -r 08f77df14cba xen/include/xen/domain.h
--- a/xen/include/xen/domain.h  Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/xen/domain.h  Wed Jul 02 11:30:37 2008 +0900
@@ -15,9 +15,6 @@ int boot_vcpu(
     struct domain *d, int vcpuid, vcpu_guest_context_u ctxt);
 struct vcpu *alloc_idle_vcpu(unsigned int cpu_id);
 void vcpu_reset(struct vcpu *v);
-
-struct domain *alloc_domain(domid_t domid);
-void free_domain(struct domain *d);
 
 struct xen_domctl_getdomaininfo;
 void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info);
diff -r 11318234588e -r 08f77df14cba xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Thu Jun 19 12:48:04 2008 +0900
+++ b/xen/include/xen/sched.h   Wed Jul 02 11:30:37 2008 +0900
@@ -315,10 +315,14 @@ struct domain *domain_create(
 struct domain *domain_create(
     domid_t domid, unsigned int domcr_flags, ssidref_t ssidref);
  /* DOMCRF_hvm: Create an HVM domain, as opposed to a PV domain. */
-#define _DOMCRF_hvm 0
-#define DOMCRF_hvm  (1U<<_DOMCRF_hvm)
-#define _DOMCRF_hap 1
-#define DOMCRF_hap  (1U<<_DOMCRF_hap)
+#define _DOMCRF_hvm   0
+#define DOMCRF_hvm    (1U<<_DOMCRF_hvm)
+ /* DOMCRF_hap: Create a domain with hardware-assisted paging. */
+#define _DOMCRF_hap   1
+#define DOMCRF_hap    (1U<<_DOMCRF_hap)
+ /* DOMCRF_dummy: Create a dummy domain (not scheduled; not on domain list) */
+#define _DOMCRF_dummy 2
+#define DOMCRF_dummy  (1U<<_DOMCRF_dummy)
 
 int construct_dom0(
     struct domain *d,
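
The new DOMCRF_dummy flag requests a domain that is never scheduled and never
appears on the domain list. A usage sketch; the domid and ssidref arguments
are illustrative, not taken from this patch:

    /* Create a placeholder domain with no scheduling presence and no entry
     * on the domain list, purely as an internal container. */
    static struct domain *make_dummy_domain_sketch(void)
    {
        return domain_create(DOMID_INVALID, DOMCRF_dummy, 0);
    }
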

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 

