
[Xen-changelog] [xen-unstable] merge with xen-unstable.hg



# HG changeset patch
# User Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
# Date 1225770199 -32400
# Node ID e75cb35c798beabee0b0ed4025ef82a39c702279
# Parent  10f0e1bb8e5e9a28e1ebe3fbb9291fb8114ef4bc
# Parent  43a079fd50fdab01cd2be443bfef011b3b0495ae
merge with xen-unstable.hg
---
 xen/common/xmalloc.c                          |  286 --------------
 .hgignore                                     |    1 
 extras/mini-os/include/sched.h                |    3 
 extras/mini-os/include/wait.h                 |   10 
 extras/mini-os/minios.mk                      |    3 
 tools/Makefile                                |    1 
 tools/blktap/drivers/block-qcow.c             |   24 -
 tools/firmware/hvmloader/acpi/static_tables.c |    2 
 tools/firmware/rombios/rombios.c              |    4 
 tools/flask/policy/policy/modules/xen/xen.te  |    3 
 tools/python/xen/util/diagnose.py             |    4 
 tools/python/xen/xend/XendConfig.py           |   17 
 tools/python/xen/xend/XendDomainInfo.py       |   73 ++-
 tools/python/xen/xend/server/DevConstants.py  |   45 ++
 tools/python/xen/xend/server/DevController.py |   31 -
 tools/python/xen/xend/server/iopif.py         |   20 -
 tools/python/xen/xend/server/irqif.py         |   19 
 tools/python/xen/xend/server/pciif.py         |    3 
 tools/python/xen/xend/server/vscsiif.py       |   15 
 tools/python/xen/xm/create.py                 |   14 
 tools/python/xen/xm/main.py                   |    5 
 tools/xenpmd/Makefile                         |   20 +
 tools/xenpmd/xenpmd.c                         |  520 ++++++++++++++++++++++++++
 xen/arch/ia64/xen/cpufreq/cpufreq.c           |   15 
 xen/arch/ia64/xen/irq.c                       |    2 
 xen/arch/x86/acpi/cpu_idle.c                  |  103 ++---
 xen/arch/x86/acpi/cpufreq/cpufreq.c           |   14 
 xen/arch/x86/acpi/cpufreq/powernow.c          |   14 
 xen/arch/x86/acpi/cpuidle_menu.c              |   14 
 xen/arch/x86/domain.c                         |  116 ++++-
 xen/arch/x86/domain_build.c                   |   34 +
 xen/arch/x86/hpet.c                           |    7 
 xen/arch/x86/hvm/emulate.c                    |   30 +
 xen/arch/x86/hvm/hpet.c                       |  339 +++++++++-------
 xen/arch/x86/hvm/hvm.c                        |    1 
 xen/arch/x86/hvm/i8254.c                      |    4 
 xen/arch/x86/hvm/rtc.c                        |    4 
 xen/arch/x86/hvm/svm/entry.S                  |    3 
 xen/arch/x86/hvm/vlapic.c                     |   10 
 xen/arch/x86/hvm/vmx/entry.S                  |    6 
 xen/arch/x86/hvm/vmx/vmx.c                    |   81 ++--
 xen/arch/x86/hvm/vmx/vpmu_core2.c             |   20 +
 xen/arch/x86/hvm/vpt.c                        |   18 
 xen/arch/x86/irq.c                            |    6 
 xen/arch/x86/mm.c                             |  251 +++++++++---
 xen/arch/x86/mm/hap/p2m-ept.c                 |    8 
 xen/arch/x86/mm/p2m.c                         |   17 
 xen/arch/x86/msi.c                            |   69 +--
 xen/arch/x86/oprofile/nmi_int.c               |   51 ++
 xen/arch/x86/oprofile/op_model_ppro.c         |  103 +++++
 xen/arch/x86/oprofile/op_x86_model.h          |    5 
 xen/arch/x86/setup.c                          |    1 
 xen/arch/x86/smpboot.c                        |   14 
 xen/arch/x86/time.c                           |    4 
 xen/arch/x86/traps.c                          |   29 -
 xen/arch/x86/x86_32/domain_page.c             |   10 
 xen/arch/x86/x86_64/compat/mm.c               |    5 
 xen/arch/x86/x86_64/cpufreq.c                 |   33 -
 xen/common/event_channel.c                    |    2 
 xen/common/kernel.c                           |    3 
 xen/common/keyhandler.c                       |    4 
 xen/common/spinlock.c                         |   69 +++
 xen/common/timer.c                            |  125 +++---
 xen/common/xenoprof.c                         |    2 
 xen/drivers/char/serial.c                     |    7 
 xen/drivers/cpufreq/cpufreq.c                 |  149 ++++++-
 xen/include/asm-x86/config.h                  |    8 
 xen/include/asm-x86/event.h                   |   32 -
 xen/include/asm-x86/fixmap.h                  |    1 
 xen/include/asm-x86/hvm/vmx/vpmu.h            |    2 
 xen/include/asm-x86/hvm/vmx/vpmu_core2.h      |   22 -
 xen/include/asm-x86/hvm/vpt.h                 |   70 +--
 xen/include/asm-x86/mm.h                      |   30 +
 xen/include/asm-x86/page.h                    |    3 
 xen/include/asm-x86/softirq.h                 |    3 
 xen/include/asm-x86/x86_32/page.h             |    3 
 xen/include/asm-x86/x86_64/page.h             |    5 
 xen/include/asm-x86/xenoprof.h                |    3 
 xen/include/public/features.h                 |    3 
 xen/include/public/trace.h                    |    2 
 xen/include/public/xen.h                      |   14 
 xen/include/xen/cpuidle.h                     |    8 
 xen/include/xen/domain_page.h                 |    6 
 xen/include/xen/spinlock.h                    |   23 +
 xen/include/xen/time.h                        |    1 
 xen/include/xen/timer.h                       |    3 
 xen/include/xlat.lst                          |    2 
 87 files changed, 2085 insertions(+), 1084 deletions(-)

diff -r 10f0e1bb8e5e -r e75cb35c798b .hgignore
--- a/.hgignore Tue Nov 04 12:07:22 2008 +0900
+++ b/.hgignore Tue Nov 04 12:43:19 2008 +0900
@@ -211,6 +211,7 @@
 ^tools/xenfb/vncfb$
 ^tools/xenmon/xentrace_setmask$
 ^tools/xenmon/xenbaked$
+^tools/xenpmd/xenpmd$
 ^tools/xenstat/xentop/xentop$
 ^tools/xenstore/testsuite/tmp/.*$
 ^tools/xenstore/xen$
diff -r 10f0e1bb8e5e -r e75cb35c798b extras/mini-os/include/sched.h
--- a/extras/mini-os/include/sched.h    Tue Nov 04 12:07:22 2008 +0900
+++ b/extras/mini-os/include/sched.h    Tue Nov 04 12:43:19 2008 +0900
@@ -48,8 +48,9 @@ void exit_thread(void) __attribute__((no
 void exit_thread(void) __attribute__((noreturn));
 void schedule(void);
 
+#ifdef __INSIDE_MINIOS__
 #define current get_current()
-
+#endif
 
 void wake(struct thread *thread);
 void block(struct thread *thread);
diff -r 10f0e1bb8e5e -r e75cb35c798b extras/mini-os/include/wait.h
--- a/extras/mini-os/include/wait.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/extras/mini-os/include/wait.h     Tue Nov 04 12:43:19 2008 +0900
@@ -7,7 +7,7 @@
 
 #define DEFINE_WAIT(name)                               \
 struct wait_queue name = {                              \
-    .thread       = current,                            \
+    .thread       = get_current(),                            \
     .thread_list  = MINIOS_LIST_HEAD_INIT((name).thread_list), \
 }
 
@@ -53,7 +53,7 @@ static inline void wake_up(struct wait_q
     unsigned long flags;        \
     local_irq_save(flags);      \
     add_wait_queue(&wq, &w);    \
-    block(current);             \
+    block(get_current());       \
     local_irq_restore(flags);   \
 } while (0)
 
@@ -74,8 +74,8 @@ static inline void wake_up(struct wait_q
         /* protect the list */                                  \
         local_irq_save(flags);                                  \
         add_wait_queue(&wq, &__wait);                           \
-        current->wakeup_time = deadline;                        \
-        clear_runnable(current);                                \
+        get_current()->wakeup_time = deadline;                  \
+        clear_runnable(get_current());                          \
         local_irq_restore(flags);                               \
         if((condition) || (deadline && NOW() >= deadline))      \
             break;                                              \
@@ -83,7 +83,7 @@ static inline void wake_up(struct wait_q
     }                                                           \
     local_irq_save(flags);                                      \
     /* need to wake up */                                       \
-    wake(current);                                              \
+    wake(get_current());                                        \
     remove_wait_queue(&__wait);                                 \
     local_irq_restore(flags);                                   \
 } while(0) 
diff -r 10f0e1bb8e5e -r e75cb35c798b extras/mini-os/minios.mk
--- a/extras/mini-os/minios.mk  Tue Nov 04 12:07:22 2008 +0900
+++ b/extras/mini-os/minios.mk  Tue Nov 04 12:43:19 2008 +0900
@@ -25,6 +25,9 @@ else
 else
 DEF_CFLAGS += -O3
 endif
+
+# Make the headers define our internal stuff
+DEF_CFLAGS += -D__INSIDE_MINIOS__
 
 # Build the CFLAGS and ASFLAGS for compiling and assembling.
 # DEF_... flags are the common mini-os flags,
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/Makefile
--- a/tools/Makefile    Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/Makefile    Tue Nov 04 12:43:19 2008 +0900
@@ -24,6 +24,7 @@ SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
 SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen
 SUBDIRS-y += fs-back
 SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir
+SUBDIRS-y += xenpmd
 
 # These don't cross-compile
 ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH))
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/blktap/drivers/block-qcow.c
--- a/tools/blktap/drivers/block-qcow.c Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/blktap/drivers/block-qcow.c Tue Nov 04 12:43:19 2008 +0900
@@ -722,11 +722,11 @@ static inline void init_fds(struct disk_
 /* Open the disk file and initialize qcow state. */
 static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags)
 {
-       int fd, len, i, shift, ret, size, l1_table_size, o_flags;
+       int fd, len, i, shift, ret, size, l1_table_size, o_flags, l1_table_block;
        int max_aio_reqs;
        struct td_state     *bs = dd->td_state;
        struct tdqcow_state *s  = (struct tdqcow_state *)dd->private;
-       char *buf;
+       char *buf, *buf2;
        QCowHeader *header;
        QCowHeader_ext *exthdr;
        uint32_t cksum;
@@ -734,8 +734,8 @@ static int tdqcow_open (struct disk_driv
 
        DPRINTF("QCOW: Opening %s\n",name);
 
-       /* Since we don't handle O_DIRECT correctly, don't use it */
-       o_flags = O_LARGEFILE | ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
+       o_flags = O_DIRECT | O_LARGEFILE | 
+               ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR);
        fd = open(name, o_flags);
        if (fd < 0) {
                DPRINTF("Unable to open %s (%d)\n",name,0 - errno);
@@ -819,9 +819,14 @@ static int tdqcow_open (struct disk_driv
                (int) (s->l1_size * sizeof(uint64_t)), 
                l1_table_size);
 
-       lseek(fd, s->l1_table_offset, SEEK_SET);
-       if (read(fd, s->l1_table, l1_table_size) != l1_table_size)
+       lseek(fd, 0, SEEK_SET);
+       l1_table_block = l1_table_size + s->l1_table_offset;
+       l1_table_block = l1_table_block + 512 - (l1_table_block % 512); 
+       ret = posix_memalign((void **)&buf2, 4096, l1_table_block);
+       if (ret != 0) goto fail;
+       if (read(fd, buf2, l1_table_block) != l1_table_block)
                goto fail;
+       memcpy(s->l1_table, buf2 + s->l1_table_offset, l1_table_size);
 
        for(i = 0; i < s->l1_size; i++) {
                be64_to_cpus(&s->l1_table[i]);
@@ -871,8 +876,9 @@ static int tdqcow_open (struct disk_driv
 
                        DPRINTF("qcow: Converting image to big endian L1 
table\n");
 
-                       lseek(fd, s->l1_table_offset, SEEK_SET);
-                       if (write(fd, s->l1_table, l1_table_size) != l1_table_size) {
+                       memcpy(buf2 + s->l1_table_offset, s->l1_table, l1_table_size);
+                       lseek(fd, 0, SEEK_SET);
+                       if (write(fd, buf2, l1_table_block) != l1_table_block) {
                                DPRINTF("qcow: Failed to write new L1 table\n");
                                goto fail;
                        }
@@ -917,7 +923,7 @@ static int tdqcow_open (struct disk_driv
        init_fds(dd);
 
        if (!final_cluster)
-               s->fd_end = s->l1_table_offset + l1_table_size;
+               s->fd_end = l1_table_block;
        else {
                s->fd_end = lseek(fd, 0, SEEK_END);
                if (s->fd_end == (off_t)-1)
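
The qcow hunks above re-enable O_DIRECT, which requires sector-aligned file
offsets, transfer lengths and buffers; the L1 table read is therefore rounded
up to a 512-byte boundary and staged through a 4096-byte-aligned bounce buffer
allocated with posix_memalign(). A minimal sketch of the rounding (hypothetical
helper name, mirroring the arithmetic in the hunk above):

    def l1_read_span(l1_table_offset, l1_table_size, sector=512):
        # End of the L1 table, measured from the start of the image file...
        span = l1_table_offset + l1_table_size
        # ...rounded up past the next sector boundary, as the patch does.
        return span + sector - (span % sector)

    # A 488-byte table at offset 512 ends at byte 1000; O_DIRECT needs a
    # multiple of 512, so the first 1024 bytes are read in one transfer and
    # the table is then copied out of the aligned bounce buffer.
    assert l1_read_span(512, 488) == 1024
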
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/firmware/hvmloader/acpi/static_tables.c
--- a/tools/firmware/hvmloader/acpi/static_tables.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/firmware/hvmloader/acpi/static_tables.c     Tue Nov 04 12:43:19 2008 +0900
@@ -67,7 +67,7 @@ struct acpi_20_fadt Fadt = {
 
     .p_lvl2_lat = 0x0fff, /* >100,  means we do not support C2 state */
     .p_lvl3_lat = 0x0fff, /* >1000, means we do not support C3 state */
-    .iapc_boot_arch = ACPI_LEGACY_DEVICES | ACPI_8042,
+    .iapc_boot_arch = ACPI_8042,
     .flags = (ACPI_PROC_C1 | ACPI_SLP_BUTTON |
               ACPI_WBINVD | ACPI_PWR_BUTTON |
               ACPI_FIX_RTC | ACPI_TMR_VAL_EXT),
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/firmware/rombios/rombios.c
--- a/tools/firmware/rombios/rombios.c  Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/firmware/rombios/rombios.c  Tue Nov 04 12:43:19 2008 +0900
@@ -7216,7 +7216,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n
         outb(0x03f5, head);
         outb(0x03f5, sector);
         outb(0x03f5, 2); // 512 byte sector size
-        outb(0x03f5, 0); // last sector number possible on track
+        outb(0x03f5, sector + num_sectors - 1); // last sector to read on track
         outb(0x03f5, 0); // Gap length
         outb(0x03f5, 0xff); // Gap length
 
@@ -7364,7 +7364,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n
         outb(0x03f5, head);
         outb(0x03f5, sector);
         outb(0x03f5, 2); // 512 byte sector size
-        outb(0x03f5, 0); // last sector number possible on track
+        outb(0x03f5, sector + num_sectors - 1); // last sector to write on track
         outb(0x03f5, 0); // Gap length
         outb(0x03f5, 0xff); // Gap length
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/flask/policy/policy/modules/xen/xen.te
--- a/tools/flask/policy/policy/modules/xen/xen.te      Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/flask/policy/policy/modules/xen/xen.te      Tue Nov 04 12:43:19 2008 +0900
@@ -74,7 +74,7 @@ allow dom0_t pirq_t:event {vector};
 allow dom0_t pirq_t:event {vector};
 allow dom0_t xen_t:mmu {memorymap};
 
-allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust};
+allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust updatemp};
 allow dom0_t dom0_t:grant {query setup};
 allow dom0_t dom0_t:domain {scheduler getdomaininfo getvcpuinfo getvcpuaffinity};
 
@@ -112,6 +112,7 @@ allow domU_t evchnU-0_t:event {send};
 
 allow dom0_t dom0_t:event {send};
 allow dom0_t domU_t:grant {copy};
+allow domU_t domU_t:grant {copy};
 
 manage_domain(dom0_t, domU_t)
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/util/diagnose.py
--- a/tools/python/xen/util/diagnose.py Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/util/diagnose.py Tue Nov 04 12:43:19 2008 +0900
@@ -23,7 +23,7 @@ from xen.xend.XendClient import server
 from xen.xend.XendClient import server
 from xen.xend.XendError import XendError
 from xen.xend.xenstore.xstransact import xstransact
-from xen.xend.server import DevController
+from xen.xend.server import DevConstants
 
 import xen.xend.XendProtocol
 
@@ -169,7 +169,7 @@ def diagnose_hotplugging():
 
 
 def stateString(state):
-    return state and DevController.xenbusState[int(state)] or '<None>'
+    return state and DevConstants.xenbusState[int(state)] or '<None>'
 
 
 def main(argv = None):
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/XendConfig.py       Tue Nov 04 12:43:19 2008 +0900
@@ -1602,21 +1602,21 @@ class XendConfig(dict):
         #   [vscsi,
         #     [dev,
         #       [devid, 0], [p-devname, sdb], [p-dev, 1:0:0:1],
-        #       [v-dev, 0:0:0:0], [state, Initialising]
+        #       [v-dev, 0:0:0:0], [state, 1]
         #     ],
         #     [dev,
         #       [devid, 0], [p-devname, sdc], [p-dev, 1:0:0:2],
-        #       [v-dev, 0:0:0:1], [satet, Initialising]
+        #       [v-dev, 0:0:0:1], [state, 1]
         #     ]
         #   ],
         #   [vscsi,
         #     [dev,
         #       [devid, 1], [p-devname, sdg], [p-dev, 2:0:0:0],
-        #       [v-dev, 1:0:0:0], [state, Initialising]
+        #       [v-dev, 1:0:0:0], [state, 1]
         #     ],
         #     [dev,
         #       [devid, 1], [p-devname, sdh], [p-dev, 2:0:0:1],
-        #       [v-dev, 1:0:0:1], [satet, Initialising]
+        #       [v-dev, 1:0:0:1], [state, 1]
         #     ]
         #   ]
         # ]
@@ -1632,18 +1632,19 @@ class XendConfig(dict):
         #   [vscsi,
         #     [dev,
         #       [devid, 0], [p-devname, sdd], [p-dev, 1:0:0:3],
-        #       [v-dev, 0:0:0:2], [state, Initialising]
+        #       [v-dev, 0:0:0:2], [state, 1]
         #     ]
         #   ]
         # ]
         #
-        # state 'Initialising' indicates that the device is being attached,
-        # while state 'Closing' indicates that the device is being detached.
+        # state xenbusState['Initialising'] indicates that the device is 
+        # being attached, while state xenbusState['Closing'] indicates 
+        # that the device is being detached.
         #
         # The Dict looks like this:
         #
         # { devs: [ {devid: 0, p-devname: sdd, p-dev: 1:0:0:3,
-        #            v-dev: 0:0:0:2, state: Initialising} ] }
+        #            v-dev: 0:0:0:2, state: 1} ] }
 
         dev_config = {}
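
The comment block above documents that vscsi device state now travels as a
numeric xenbusState value rather than a name string. An illustrative build of
the resulting dict for one attach request (not code from this patch; the keys
mirror the comment):

    from xen.xend.server.DevConstants import xenbusState

    dev_config = {
        'devs': [
            {'devid': 0, 'p-devname': 'sdd', 'p-dev': '1:0:0:3',
             'v-dev': '0:0:0:2', 'state': xenbusState['Initialising']},  # 1
        ]
    }
    # A detach request would carry xenbusState['Closing'] (5) instead.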
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/XendDomainInfo.py   Tue Nov 04 12:43:19 2008 +0900
@@ -52,6 +52,7 @@ from xen.xend.xenstore.xswatch import xs
 from xen.xend.xenstore.xswatch import xswatch
 from xen.xend.XendConstants import *
 from xen.xend.XendAPIConstants import *
+from xen.xend.server.DevConstants import xenbusState
 
 from xen.xend.XendVMMetrics import XendVMMetrics
 
@@ -797,7 +798,7 @@ class XendDomainInfo:
         existing_dev_info = self._getDeviceInfo_vscsi(req_devid, dev['v-dev'])
         state = dev['state']
 
-        if state == 'Initialising':
+        if state == xenbusState['Initialising']:
             # new create
             # If request devid does not exist, create and exit.
             if existing_dev_info is None:
@@ -806,25 +807,48 @@ class XendDomainInfo:
             elif existing_dev_info == "exists":
                 raise XendError("The virtual device %s is already defined" % 
dev['v-dev'])
 
-        elif state == 'Closing':
+        elif state == xenbusState['Closing']:
             if existing_dev_info is None:
                 raise XendError("Cannot detach vscsi device does not exist")
 
-        # use DevController.reconfigureDevice to change device config
-        dev_control = self.getDeviceController(dev_class)
-        dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
-        dev_control.waitForDevice_reconfigure(req_devid)
-        num_devs = dev_control.cleanupDevice(req_devid)
-
-        # update XendConfig with new device info
-        if dev_uuid:
-            new_dev_sxp = dev_control.configuration(req_devid)
+        if self.domid is not None:
+            # use DevController.reconfigureDevice to change device config
+            dev_control = self.getDeviceController(dev_class)
+            dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config)
+            dev_control.waitForDevice_reconfigure(req_devid)
+            num_devs = dev_control.cleanupDevice(req_devid)
+
+            # update XendConfig with new device info
+            if dev_uuid:
+                new_dev_sxp = dev_control.configuration(req_devid)
+                self.info.device_update(dev_uuid, new_dev_sxp)
+
+            # If there is no device left, destroy vscsi and remove config.
+            if num_devs == 0:
+                self.destroyDevice('vscsi', req_devid)
+                del self.info['devices'][dev_uuid]
+
+        else:
+            cur_dev_sxp = self._getDeviceInfo_vscsi(req_devid, None)
+            new_dev_sxp = ['vscsi']
+            for cur_dev in sxp.children(cur_dev_sxp, 'dev'):
+                if state == xenbusState['Closing']:
+                    cur_dev_vdev = sxp.child_value(cur_dev, 'v-dev')
+                    if cur_dev_vdev == dev['v-dev']:
+                        continue
+                new_dev_sxp.append(cur_dev)
+
+            if state == xenbusState['Initialising']:
+                new_dev_sxp.append(sxp.child0(dev_sxp, 'dev'))
+
+            dev_uuid = sxp.child_value(cur_dev_sxp, 'uuid')
             self.info.device_update(dev_uuid, new_dev_sxp)
 
-        # If there is no device left, destroy vscsi and remove config.
-        if num_devs == 0:
-            self.destroyDevice('vscsi', req_devid)
-            del self.info['devices'][dev_uuid]
+            # If there is only 'vscsi' in new_dev_sxp, remove the config.
+            if len(sxp.children(new_dev_sxp, 'dev')) == 0:
+                del self.info['devices'][dev_uuid]
+
+        xen.xend.XendDomain.instance().managed_config_save(self)
 
         return True
 
@@ -986,7 +1010,17 @@ class XendDomainInfo:
             sxprs = []
             dev_num = 0
             for dev_type, dev_info in self.info.all_devices_sxpr():
-                if dev_type == deviceClass:
+                if dev_type != deviceClass:
+                    continue
+
+                if deviceClass == 'vscsi':
+                    vscsi_devs = ['devs', []]
+                    for vscsi_dev in sxp.children(dev_info, 'dev'):
+                        vscsi_dev.append(['frontstate', None])
+                        vscsi_devs[1].append(vscsi_dev)
+                        dev_num = int(sxp.child_value(vscsi_dev, 'devid'))
+                    sxprs.append([dev_num, [vscsi_devs]])
+                else:
                     sxprs.append([dev_num, dev_info])
                     dev_num += 1
             return sxprs
@@ -2380,11 +2414,10 @@ class XendDomainInfo:
             time.sleep(2)
         for paths in plist:
             if paths.find('backend') != -1:
-                from xen.xend.server import DevController
                 # Modify online status /before/ updating state (latter is watched by
                 # drivers, so this ordering avoids a race).
                 xstransact.Write(paths, 'online', "0")
-                xstransact.Write(paths, 'state', str(DevController.xenbusState['Closing']))
+                xstransact.Write(paths, 'state', str(xenbusState['Closing']))
             # force
             xstransact.Remove(paths)
 
@@ -3439,7 +3472,7 @@ class XendDomainInfo:
                     ['p-devname', pscsi.get_dev_name()],
                     ['p-dev', pscsi.get_physical_HCTL()],
                     ['v-dev', xenapi_dscsi.get('virtual_HCTL')],
-                    ['state', 'Initialising'],
+                    ['state', xenbusState['Initialising']],
                     ['uuid', dscsi_uuid]
                 ]
             ]
@@ -3558,7 +3591,7 @@ class XendDomainInfo:
         if target_dev is None:
             raise XendError('Failed to destroy device')
 
-        target_dev.append(['state', 'Closing'])
+        target_dev.append(['state', xenbusState['Closing']])
         target_vscsi_sxp = ['vscsi', target_dev]
 
         if self._stateGet() != XEN_API_VM_POWER_STATE_RUNNING:
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/DevConstants.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/python/xen/xend/server/DevConstants.py      Tue Nov 04 12:43:19 2008 +0900
@@ -0,0 +1,45 @@
+#============================================================================
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#============================================================================
+# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx>
+# Copyright (C) 2005 XenSource Ltd
+#============================================================================
+
+DEVICE_CREATE_TIMEOUT  = 100
+DEVICE_DESTROY_TIMEOUT = 100
+HOTPLUG_STATUS_NODE = "hotplug-status"
+HOTPLUG_ERROR_NODE  = "hotplug-error"
+HOTPLUG_STATUS_ERROR = "error"
+HOTPLUG_STATUS_BUSY  = "busy"
+
+Connected    = 1
+Error        = 2
+Missing      = 3
+Timeout      = 4
+Busy         = 5
+Disconnected = 6
+
+xenbusState = {
+    'Unknown'       : 0,
+    'Initialising'  : 1,
+    'InitWait'      : 2,
+    'Initialised'   : 3,
+    'Connected'     : 4,
+    'Closing'       : 5,
+    'Closed'        : 6,
+    'Reconfiguring' : 7,
+    'Reconfigured'  : 8,
+    }
+xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
+
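
The final update() line folds the reverse mapping into the same dict, so
xenbusState translates names to numbers and numbers back to names. For
example (illustrative usage, not part of the patch):

    from xen.xend.server.DevConstants import xenbusState

    xenbusState['Closing']  # -> 5, the value written to the xenstore state node
    xenbusState[5]          # -> 'Closing', handy for error messages such as
                            #    the one in the vscsiif.py hunk further down
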
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/DevController.py
--- a/tools/python/xen/xend/server/DevController.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/DevController.py     Tue Nov 04 12:43:19 2008 +0900
@@ -23,41 +23,14 @@ from xen.xend.XendError import VmError
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
 import xen.xend.XendConfig
+from xen.xend.server.DevConstants import *
 
 from xen.xend.xenstore.xstransact import xstransact, complete
 from xen.xend.xenstore.xswatch import xswatch
 
 import os
 
-DEVICE_CREATE_TIMEOUT  = 100
-DEVICE_DESTROY_TIMEOUT = 100
-HOTPLUG_STATUS_NODE = "hotplug-status"
-HOTPLUG_ERROR_NODE  = "hotplug-error"
-HOTPLUG_STATUS_ERROR = "error"
-HOTPLUG_STATUS_BUSY  = "busy"
-
-Connected    = 1
-Error        = 2
-Missing      = 3
-Timeout      = 4
-Busy         = 5
-Disconnected = 6
-
-xenbusState = {
-    'Unknown'      : 0,
-    'Initialising' : 1,
-    'InitWait'     : 2,
-    'Initialised'  : 3,
-    'Connected'    : 4,
-    'Closing'      : 5,
-    'Closed'       : 6,
-    'Reconfiguring': 7,
-    'Reconfigured' : 8,
-    }
-
 xoptions = XendOptions.instance()
-
-xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
 
 
 class DevController:
@@ -569,7 +542,7 @@ class DevController:
             xswatch(statusPath, hotplugStatusCallback, ev, result)
             ev.wait(DEVICE_CREATE_TIMEOUT)
             err = xstransact.Read(statusPath, HOTPLUG_ERROR_NODE)
-            if result['status'] != 'Connected':
+            if result['status'] != Connected:
                 return (result['status'], err)
             
         backpath = self.readVm(devid, "backend")
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/iopif.py
--- a/tools/python/xen/xend/server/iopif.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/iopif.py     Tue Nov 04 12:43:19 2008 +0900
@@ -45,8 +45,21 @@ def parse_ioport(val):
 
 class IOPortsController(DevController):
 
+    valid_cfg = ['to', 'from', 'uuid']
+
     def __init__(self, vm):
         DevController.__init__(self, vm)
+
+    def getDeviceConfiguration(self, devid, transaction = None):
+        result = DevController.getDeviceConfiguration(self, devid, transaction)
+        if transaction is None:
+            devinfo = self.readBackend(devid, *self.valid_cfg)
+        else:
+            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
+        config = dict(zip(self.valid_cfg, devinfo))
+        config = dict([(key, val) for key, val in config.items()
+                       if val != None])
+        return config
 
     def getDeviceDetails(self, config):
         """@see DevController.getDeviceDetails"""
@@ -81,4 +94,9 @@ class IOPortsController(DevController):
                 'ioports: Failed to configure legacy i/o range: %s - %s' %
                 (io_from, io_to))
 
-        return (None, {}, {})
+        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
+        return (self.allocateDeviceID(), back, {})
+
+    def waitForDevice(self, devid):
+        # don't wait for hotplug
+        return
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/irqif.py
--- a/tools/python/xen/xend/server/irqif.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/irqif.py     Tue Nov 04 12:43:19 2008 +0900
@@ -39,6 +39,18 @@ class IRQController(DevController):
     def __init__(self, vm):
         DevController.__init__(self, vm)
 
+    valid_cfg = ['irq', 'uuid']
+
+    def getDeviceConfiguration(self, devid, transaction = None):
+        result = DevController.getDeviceConfiguration(self, devid, transaction)
+        if transaction is None:
+            devinfo = self.readBackend(devid, *self.valid_cfg)
+        else:
+            devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg)
+        config = dict(zip(self.valid_cfg, devinfo))
+        config = dict([(key, val) for key, val in config.items()
+                       if val != None])
+        return config
 
     def getDeviceDetails(self, config):
         """@see DevController.getDeviceDetails"""
@@ -75,4 +87,9 @@ class IRQController(DevController):
         if rc < 0:
             raise VmError(
                 'irq: Failed to map irq %x' % (pirq))
-        return (None, {}, {})
+        back = dict([(k, config[k]) for k in self.valid_cfg if k in config])
+        return (self.allocateDeviceID(), back, {})
+
+    def waitForDevice(self, devid):
+        # don't wait for hotplug
+        return
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/pciif.py
--- a/tools/python/xen/xend/server/pciif.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/pciif.py     Tue Nov 04 12:43:19 2008 +0900
@@ -25,7 +25,8 @@ from xen.xend.XendError import VmError
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
 
-from xen.xend.server.DevController import DevController, xenbusState
+from xen.xend.server.DevController import DevController
+from xen.xend.server.DevConstants import xenbusState
 
 import xen.lowlevel.xc
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/vscsiif.py
--- a/tools/python/xen/xend/server/vscsiif.py   Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xend/server/vscsiif.py   Tue Nov 04 12:43:19 2008 +0900
@@ -28,7 +28,8 @@ from xen.xend.XendError import VmError
 from xen.xend.XendError import VmError
 from xen.xend.XendLogging import log
 
-from xen.xend.server.DevController import DevController, xenbusState
+from xen.xend.server.DevController import DevController
+from xen.xend.server.DevConstants import xenbusState
 from xen.xend.xenstore.xstransact import xstransact
 
 class VSCSIController(DevController):
@@ -92,8 +93,8 @@ class VSCSIController(DevController):
             back[devpath + '/p-devname'] = pdevname
             vdev = vscsi_config.get('v-dev', '')
             back[devpath + '/v-dev'] = vdev
-            state = vscsi_config.get('state', '')
-            back[devpath + '/state'] = str(xenbusState[state])
+            state = vscsi_config.get('state', xenbusState['Unknown'])
+            back[devpath + '/state'] = str(state)
             devid = vscsi_config.get('devid', '')
             back[devpath + '/devid'] = str(devid)
 
@@ -168,17 +169,17 @@ class VSCSIController(DevController):
         (devid, back, front) = self.getDeviceDetails(config)
         devid = int(devid)
         vscsi_config = config['devs'][0]
-        state = vscsi_config.get('state', '')
+        state = vscsi_config.get('state', xenbusState['Unknown'])
         driver_state = self.readBackend(devid, 'state')
         if str(xenbusState['Connected']) != driver_state:
             raise VmError("Driver status is not connected")
 
         uuid = self.readBackend(devid, 'uuid')
-        if state == 'Initialising':
+        if state == xenbusState['Initialising']:
             back['uuid'] = uuid
             self.writeBackend(devid, back)
 
-        elif state == 'Closing':
+        elif state == xenbusState['Closing']:
             found = False
             devs = self.readBackendList(devid, "vscsi-devs")
             vscsipath = "vscsi-devs/"
@@ -198,7 +199,7 @@ class VSCSIController(DevController):
 
         else:
             raise XendError("Error configuring device invalid "
-                            "state '%s'" % state)
+                            "state '%s'" % xenbusState[state])
 
         self.writeBackend(devid, 'state', str(xenbusState['Reconfiguring']))
         return self.readBackend(devid, 'uuid')
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xm/create.py     Tue Nov 04 12:43:19 2008 +0900
@@ -32,6 +32,7 @@ from xen.xend import osdep
 from xen.xend import osdep
 import xen.xend.XendClient
 from xen.xend.XendBootloader import bootloader
+from xen.xend.server.DevConstants import xenbusState
 from xen.util import blkif
 from xen.util import vscsi_util
 import xen.util.xsm.xsm as security
@@ -707,7 +708,7 @@ def configure_vscsis(config_devs, vals):
             vscsi_util.vscsi_get_hctl_and_devname_by(p_dev, scsi_devices)
 
         if p_hctl == None:
-            raise ValueError("Cannot find device \"%s\"" % p_dev)
+            raise ValueError('Cannot find device "%s"' % p_dev)
 
         for config in config_scsi:
             dev = vscsi_convert_sxp_to_dict(config)
@@ -717,7 +718,7 @@ def configure_vscsis(config_devs, vals):
         v_hctl = v_dev.split(':')
         devid = int(v_hctl[0])
         config_scsi.append(['dev', \
-                        ['state', 'Initialising'], \
+                        ['state', xenbusState['Initialising']], \
                         ['devid', devid], \
                         ['p-dev', p_hctl], \
                         ['p-devname', devname], \
@@ -1035,6 +1036,14 @@ def preprocess_ioports(vals):
         ioports.append(hexd)
     vals.ioports = ioports
         
+def preprocess_irq(vals):
+    if not vals.irq: return
+    irq = []
+    for v in vals.irq:
+        d = repr(v)
+        irq.append(d)
+    vals.irq = irq
+
 def preprocess_vtpm(vals):
     if not vals.vtpm: return
     vtpms = []
@@ -1133,6 +1142,7 @@ def preprocess(vals):
     preprocess_vscsi(vals)
     preprocess_ioports(vals)
     preprocess_ip(vals)
+    preprocess_irq(vals)
     preprocess_nfs(vals)
     preprocess_vtpm(vals)
     preprocess_access_control(vals)
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Tue Nov 04 12:07:22 2008 +0900
+++ b/tools/python/xen/xm/main.py       Tue Nov 04 12:43:19 2008 +0900
@@ -47,6 +47,7 @@ from xen.xend import sxp
 from xen.xend import sxp
 from xen.xend import XendClient
 from xen.xend.XendConstants import *
+from xen.xend.server.DevConstants import xenbusState
 
 from xen.xm.opts import OptionError, Opts, wrap, set_true
 from xen.xm import console
@@ -2515,7 +2516,7 @@ def xm_scsi_attach(args):
     dom = args[0]
     p_scsi = args[1]
     v_hctl = args[2]
-    scsi = parse_scsi_configuration(p_scsi, v_hctl, 'Initialising')
+    scsi = parse_scsi_configuration(p_scsi, v_hctl, xenbusState['Initialising'])
 
     if serverType == SERVER_XEN_API:
 
@@ -2635,7 +2636,7 @@ def xm_scsi_detach(args):
     arg_check(args, 'scsi-detach', 2)
     dom = args[0]
     v_hctl = args[1]
-    scsi = parse_scsi_configuration(None, v_hctl, 'Closing')
+    scsi = parse_scsi_configuration(None, v_hctl, xenbusState['Closing'])
 
     if serverType == SERVER_XEN_API:
 
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/xenpmd/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenpmd/Makefile     Tue Nov 04 12:43:19 2008 +0900
@@ -0,0 +1,20 @@
+XEN_ROOT=../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+CFLAGS  += -Werror
+CFLAGS  += $(CFLAGS_libxenstore)
+LDFLAGS += $(LDFLAGS_libxenstore)
+
+BIN      = xenpmd
+
+.PHONY: all
+all: $(BIN)
+
+.PHONY: install
+install: all
+       $(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
+       $(INSTALL_PROG) $(BIN) $(DESTDIR)$(SBINDIR)
+
+.PHONY: clean
+clean:
+       $(RM) -f $(BIN)
diff -r 10f0e1bb8e5e -r e75cb35c798b tools/xenpmd/xenpmd.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xenpmd/xenpmd.c     Tue Nov 04 12:43:19 2008 +0900
@@ -0,0 +1,520 @@
+/*
+ * xenpmd.c
+ *
+ * xen power management daemon - Facilitates power management 
+ * functionality within xen guests.
+ *
+ * Copyright (c) 2008  Kamala Narasimhan 
+ * Copyright (c) 2008  Citrix Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/* Xen extended power management support provides HVM guest power management
+ * features beyond S3, S4, S5.  For example, it helps expose system-level
+ * battery status and battery meter information, and in the future will be
+ * extended to include more power management support.  This extended power
+ * management support is enabled by setting xen_extended_power_mgmt to 1 or 2
+ * in the HVM config file.  When set to 2, non-passthrough mode is enabled,
+ * which heavily relies on this power management daemon to glean battery
+ * information from dom0 and store it in xenstore, where it is then queried
+ * and used by qemu and passed to the guest when the appropriate battery
+ * ports are read/written to.
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdlib.h>
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <xs.h>
+
+/* #define RUN_STANDALONE */
+#define RUN_IN_SIMULATE_MODE
+
+enum BATTERY_INFO_TYPE {
+    BIF, 
+    BST 
+};
+
+enum BATTERY_PRESENT {
+    NO, 
+    YES 
+};
+
+enum BATTERY_TECHNOLOGY {
+    NON_RECHARGEABLE, 
+    RECHARGEABLE 
+};
+
+struct battery_info {
+    enum BATTERY_PRESENT    present;
+    unsigned long           design_capacity;
+    unsigned long           last_full_capacity;
+    enum BATTERY_TECHNOLOGY battery_technology;
+    unsigned long           design_voltage;
+    unsigned long           design_capacity_warning;
+    unsigned long           design_capacity_low;
+    unsigned long           capacity_granularity_1;
+    unsigned long           capacity_granularity_2;
+    char                    model_number[32];
+    char                    serial_number[32];
+    char                    battery_type[32];
+    char                    oem_info[32];
+};
+
+struct battery_status {
+    enum BATTERY_PRESENT    present;
+    unsigned long           state;
+    unsigned long           present_rate;
+    unsigned long           remaining_capacity;
+    unsigned long           present_voltage;
+};
+
+static struct xs_handle *xs;
+
+#ifdef RUN_IN_SIMULATE_MODE
+    #define BATTERY_DIR_PATH "/tmp/battery"
+    #define BATTERY_INFO_FILE_PATH "/tmp/battery/%s/info" 
+    #define BATTERY_STATE_FILE_PATH "/tmp/battery/%s/state"
+#else
+    #define BATTERY_DIR_PATH "/proc/acpi/battery"
+    #define BATTERY_INFO_FILE_PATH "/proc/acpi/battery/%s/info"
+    #define BATTERY_STATE_FILE_PATH "/proc/acpi/battery/%s/state"
+#endif
+
+FILE *get_next_battery_file(DIR *battery_dir, 
+                            enum BATTERY_INFO_TYPE battery_info_type)
+{
+    FILE *file = 0;
+    struct dirent *dir_entries;
+    char file_name[32];
+    
+    do 
+    {
+        dir_entries = readdir(battery_dir);
+        if ( !dir_entries ) 
+            return 0;
+        if ( strlen(dir_entries->d_name) < 4 )
+            continue;
+        if ( battery_info_type == BIF ) 
+            snprintf(file_name, 32, BATTERY_INFO_FILE_PATH,
+                     dir_entries->d_name);
+        else 
+            snprintf(file_name, 32, BATTERY_STATE_FILE_PATH,
+                     dir_entries->d_name);
+        file = fopen(file_name, "r");
+    } while ( !file );
+
+    return file;
+}
+
+void set_attribute_battery_info(char *attrib_name,
+                                char *attrib_value,
+                                struct battery_info *info)
+{
+    if ( strstr(attrib_name, "present") ) 
+    {
+        if ( strstr(attrib_value, "yes") ) 
+            info->present = YES;
+        return;
+    }
+
+    if ( strstr(attrib_name, "design capacity warning") ) 
+    {
+        info->design_capacity_warning = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "design capacity low") ) 
+    {
+        info->design_capacity_low = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "design capacity") ) 
+    { 
+        info->design_capacity = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "last full capacity") ) 
+    {
+        info->last_full_capacity = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "design voltage") ) 
+    {
+        info->design_voltage = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "capacity granularity 1") ) 
+    {
+        info->capacity_granularity_1 = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "capacity granularity 2") ) 
+    {
+        info->capacity_granularity_2 = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "battery technology") ) 
+    {
+        if ( strncmp(attrib_value, "rechargeable",
+                     strlen("rechargeable")) == 0 ) 
+            info->battery_technology = RECHARGEABLE;
+        else 
+            info->battery_technology = NON_RECHARGEABLE;
+        return;
+    }
+
+    if ( strstr(attrib_name, "model number") ) 
+    {
+        strncpy(info->model_number, attrib_value, 32);
+        return;
+    }
+
+    if ( strstr(attrib_name, "serial number") ) 
+    {
+        strncpy(info->serial_number, attrib_value, 32);
+        return;
+    }
+
+    if ( strstr(attrib_name, "battery type") ) 
+    {
+        strncpy(info->battery_type, attrib_value, 32);
+        return;
+    }
+
+    if ( strstr(attrib_name, "OEM info") ) 
+    {
+        strncpy(info->oem_info, attrib_value, 32);
+        return;
+    }
+
+    return;
+}
+
+void set_attribute_battery_status(char *attrib_name, 
+                                  char *attrib_value,
+                                  struct battery_status *status)
+{
+    if ( strstr(attrib_name, "charging state") ) 
+    {
+        /* Check this, below is half baked */
+        if ( strstr(attrib_value, "charged") ) 
+            status->state = 0;
+        else 
+            status->state = 1;
+        return;
+    }
+
+    if ( strstr(attrib_name, "present rate") ) 
+    {
+        status->present_rate = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "remaining capacity") ) 
+    {
+        status->remaining_capacity = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "present voltage") ) 
+    {
+        status->present_voltage = strtoull(attrib_value, NULL, 10);
+        return;
+    }
+
+    if ( strstr(attrib_name, "present") ) 
+    {
+        if ( strstr(attrib_value, "yes") ) 
+            status->present = YES;
+        return;
+    }
+}
+
+void parse_battery_info_or_status(char *line_info,
+                                  enum BATTERY_INFO_TYPE type,
+                                  void *info_or_status)
+{
+    char attrib_name[128];
+    char attrib_value[64];
+    char *delimiter;
+    unsigned long length;
+
+    length = strlen(line_info);
+    delimiter = (char *) strchr( line_info, ':');
+    if ( (!delimiter) || (delimiter == line_info) ||
+         (delimiter == line_info + length) ) 
+        return;
+
+    strncpy(attrib_name, line_info, delimiter-line_info);
+    while ( *(delimiter+1) == ' ' ) 
+    {
+        delimiter++;
+        if ( delimiter+1 == line_info + length)
+            return;
+    }
+    strncpy(attrib_value, delimiter+1, 
+            (unsigned long)line_info + length -(unsigned long)delimiter); 
+    
+    if ( type == BIF ) 
+        set_attribute_battery_info(attrib_name, attrib_value,
+                                   (struct battery_info *)info_or_status);
+    else 
+        set_attribute_battery_status(attrib_name, attrib_value,
+                                     (struct battery_status *)info_or_status);
+
+    return;
+}
+
+int get_next_battery_info_or_status(DIR *battery_dir,
+                                    enum BATTERY_INFO_TYPE type,
+                                    void *info_or_status)
+{
+    FILE *file;
+    char line_info[256];
+
+    if  ( !info_or_status )
+        return 0;
+
+    memset(line_info, 0, 256);
+    if (type == BIF) 
+        memset(info_or_status, 0, sizeof(struct battery_info));
+    else 
+        memset(info_or_status, 0, sizeof(struct battery_status));
+
+    file = get_next_battery_file(battery_dir, type);
+    if ( !file )
+        return 0;
+
+    while ( fgets(line_info, sizeof(line_info), file) != NULL ) 
+    {
+        parse_battery_info_or_status(line_info, type, info_or_status);
+        memset(line_info, 0, 256);
+    }
+
+    fclose(file);
+    return 1;
+}
+
+#ifdef RUN_STANDALONE
+void print_battery_info(struct battery_info *info)
+{
+    printf("present:                %d\n", info->present);
+    printf("design capacity:        %d\n", info->design_capacity);
+    printf("last full capacity:     %d\n", info->last_full_capacity);
+    printf("battery technology:     %d\n", info->battery_technology);
+    printf("design voltage:         %d\n", info->design_voltage);
+    printf("design capacity warning:%d\n", info->design_capacity_warning);
+    printf("design capacity low:    %d\n", info->design_capacity_low);
+    printf("capacity granularity 1: %d\n", info->capacity_granularity_1);
+    printf("capacity granularity 2: %d\n", info->capacity_granularity_2);
+    printf("model number:           %s\n", info->model_number);
+    printf("serial number:          %s\n", info->serial_number);
+    printf("battery type:           %s\n", info->battery_type);
+    printf("OEM info:               %s\n", info->oem_info);
+}
+#endif /*RUN_STANDALONE*/
+
+void write_ulong_lsb_first(char *temp_val, unsigned long val)
+{
+    snprintf(temp_val, 9, "%02x%02x%02x%02x", (unsigned int)val & 0xff, 
+    (unsigned int)(val & 0xff00) >> 8, (unsigned int)(val & 0xff0000) >> 16, 
+    (unsigned int)(val & 0xff000000) >> 24);
+}
+
+void write_battery_info_to_xenstore(struct battery_info *info)
+{
+    char val[1024], string_info[256];
+
+    xs_mkdir(xs, XBT_NULL, "/pm");
+   
+    memset(val, 0, 1024);
+    memset(string_info, 0, 256);
+    /* write 9 dwords (so 9*4) + length of 4 strings + 4 null terminators */
+    snprintf(val, 3, "%02x", 
+             (unsigned int)(9*4 +
+                            strlen(info->model_number) +
+                            strlen(info->serial_number) +
+                            strlen(info->battery_type) +
+                            strlen(info->oem_info) + 4));
+    write_ulong_lsb_first(val+2, info->present);
+    write_ulong_lsb_first(val+10, info->design_capacity);
+    write_ulong_lsb_first(val+18, info->last_full_capacity);
+    write_ulong_lsb_first(val+26, info->battery_technology);
+    write_ulong_lsb_first(val+34, info->design_voltage);
+    write_ulong_lsb_first(val+42, info->design_capacity_warning);
+    write_ulong_lsb_first(val+50, info->design_capacity_low);
+    write_ulong_lsb_first(val+58, info->capacity_granularity_1);
+    write_ulong_lsb_first(val+66, info->capacity_granularity_2);
+
+    snprintf(string_info, 256, "%02x%s%02x%s%02x%s%02x%s", 
+             (unsigned int)strlen(info->model_number), info->model_number,
+             (unsigned int)strlen(info->serial_number), info->serial_number,
+             (unsigned int)strlen(info->battery_type), info->battery_type,
+             (unsigned int)strlen(info->oem_info), info->oem_info);
+    strncat(val+73, string_info, 1024);
+    xs_write(xs, XBT_NULL, "/pm/bif", 
+             val, 73+8+strlen(info->model_number)+strlen(info->serial_number)+
+             strlen(info->battery_type)+strlen(info->oem_info)+1);
+}
+
+int write_one_time_battery_info(void)
+{
+    DIR *dir;
+    int ret = 0;
+    struct battery_info info;
+    
+    dir = opendir(BATTERY_DIR_PATH);
+    if ( !dir )
+        return 0;
+
+    while ( get_next_battery_info_or_status(dir, BIF, (void *)&info) ) 
+    {
+#ifdef RUN_STANDALONE
+        print_battery_info(&info);
+#endif
+        if ( info.present == YES ) 
+        {
+            write_battery_info_to_xenstore(&info);
+            ret = 1;
+            break; /* rethink this... */
+        }
+    }
+
+    closedir(dir);
+    return ret;
+}
+
+#ifdef RUN_STANDALONE
+void print_battery_status(struct battery_status *status)
+{
+    printf("present:                     %d\n", status->present);
+    printf("Battery state                %d\n", status->state);
+    printf("Battery present rate         %d\n", status->present_rate);
+    printf("Battery remining capacity    %d\n", status->remaining_capacity);
+    printf("Battery present voltage      %d\n", status->present_voltage);
+}
+#endif /*RUN_STANDALONE*/
+
+void write_battery_status_to_xenstore(struct battery_status *status)
+{
+    char val[35];
+
+    xs_mkdir(xs, XBT_NULL, "/pm");
+
+    memset(val, 0, 35);
+    snprintf(val, 3, "%02x", 16);
+    write_ulong_lsb_first(val+2, status->state);
+    write_ulong_lsb_first(val+10, status->present_rate);
+    write_ulong_lsb_first(val+18, status->remaining_capacity);
+    write_ulong_lsb_first(val+26, status->present_voltage);
+
+    xs_write(xs, XBT_NULL, "/pm/bst", val, 35);
+}
+
+int wait_for_and_update_battery_status_request(void)
+{
+    DIR *dir;
+    int ret = 0;
+    unsigned int count;
+    struct battery_status status;
+
+    while ( true )
+    {
+        /* KN:@TODO - It is rather inefficient to not cache the file handle.
+         *  Switch to caching file handle. 
+         */
+        dir = opendir(BATTERY_DIR_PATH);
+        if ( !dir )
+            return 0;
+
+        while ( get_next_battery_info_or_status(dir, BST, (void *)&status) ) 
+        {
+#ifdef RUN_STANDALONE
+            print_battery_status(&status);
+#endif
+            if ( status.present == YES ) 
+            {
+                write_battery_status_to_xenstore(&status);
+                ret = 1;
+                /* rethink this; though I have never seen, there might be
+                 * systems out there with more than one battery device 
+                 * present
+                 */
+                break;
+            }
+        }
+        closedir(dir);
+        xs_watch(xs, "/pm/events", "refreshbatterystatus");
+        xs_read_watch(xs, &count); 
+    }
+
+    return ret;
+}
+
+/* Borrowed daemonize from xenstored - Initially written by Stevens. */
+static void daemonize(void)
+{
+    pid_t pid;
+
+    if ( (pid = fork()) < 0 )
+        exit(1);
+
+    if ( pid != 0 )
+        exit(0);
+
+    setsid();
+
+    if ( (pid = fork()) < 0 )
+        exit(1);
+
+    if ( pid != 0 )
+        exit(0);
+
+    if ( chdir("/") == -1 )
+        exit(1);
+
+    umask(0);
+}
+
+int main(int argc, char *argv[])
+{
+#ifndef RUN_STANDALONE
+    daemonize();
+#endif
+    xs = (struct xs_handle *)xs_daemon_open();
+    if ( xs == NULL ) 
+        return -1;
+
+    if ( write_one_time_battery_info() == 0 ) 
+    {
+        xs_daemon_close(xs);
+        return -1;
+    }
+
+    wait_for_and_update_battery_status_request();
+    xs_daemon_close(xs);
+    return 0;
+}
+
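
write_battery_status_to_xenstore() above serializes the status block into
/pm/bst as a hex string: a two-digit byte count ("10", i.e. 16) followed by
four 32-bit fields, each emitted least-significant byte first by
write_ulong_lsb_first(). A hypothetical consumer-side decoder (a sketch under
those assumptions, not part of the patch):

    import binascii, struct

    def decode_bst(val):
        """Decode the /pm/bst string written by xenpmd."""
        length = int(val[0:2], 16)        # payload size in bytes, 16 here
        fields = [struct.unpack('<I', binascii.unhexlify(val[i:i + 8]))[0]
                  for i in range(2, 2 + length * 2, 8)]
        state, present_rate, remaining_capacity, present_voltage = fields
        return state, present_rate, remaining_capacity, present_voltage

    # write_ulong_lsb_first(..., 0x1234) emits "34120000", which the
    # little-endian '<I' unpack above turns back into 0x1234.
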
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/ia64/xen/cpufreq/cpufreq.c
--- a/xen/arch/ia64/xen/cpufreq/cpufreq.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/ia64/xen/cpufreq/cpufreq.c       Tue Nov 04 12:43:19 2008 +0900
@@ -210,21 +210,6 @@ acpi_cpufreq_cpu_init (struct cpufreq_po
 
        data->acpi_data = &processor_pminfo[cpu]->perf;
 
-       /* capability check */
-       if (data->acpi_data->state_count <= 1) {
-               printk(KERN_WARNING "P-States\n");
-               result = -ENODEV;
-               goto err_unreg;
-       }
-
-       if ((data->acpi_data->control_register.space_id !=
-                               ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-                       (data->acpi_data->status_register.space_id !=
-                        ACPI_ADR_SPACE_FIXED_HARDWARE)) {
-               result = -ENODEV;
-               goto err_unreg;
-       }
-
        data->freq_table = xmalloc_array(struct cpufreq_frequency_table,
                        (data->acpi_data->state_count + 1));
        if (!data->freq_table) {
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/ia64/xen/irq.c
--- a/xen/arch/ia64/xen/irq.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/ia64/xen/irq.c   Tue Nov 04 12:43:19 2008 +0900
@@ -74,7 +74,7 @@ unsigned int __ia64_local_vector_to_irq 
 /*
  * Controller mappings for all interrupt sources:
  */
-irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
+irq_desc_t irq_desc[NR_IRQS] = {
        [0 ... NR_IRQS-1] = {
                .status = IRQ_DISABLED,
                .handler = &no_irq_type,
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/acpi/cpu_idle.c      Tue Nov 04 12:43:19 2008 +0900
@@ -75,13 +75,14 @@ static void print_acpi_power(uint32_t cp
 
     printk("==cpu%d==\n", cpu);
     printk("active state:\t\tC%d\n",
-           power->last_state ? (int)(power->last_state - power->states) : -1);
+           power->last_state ? power->last_state->idx : -1);
     printk("max_cstate:\t\tC%d\n", max_cstate);
     printk("states:\n");
     
     for ( i = 1; i < power->count; i++ )
     {
-        printk((power->last_state == &power->states[i]) ? "   *" : "    ");
+        printk((power->last_state && power->last_state->idx == i) ?
+               "   *" : "    ");
         printk("C%d:\t", i);
         printk("type[C%d] ", power->states[i].type);
         printk("latency[%03d] ", power->states[i].latency);
@@ -139,20 +140,26 @@ static void acpi_processor_ffh_cstate_en
 
 static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
-    if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
-    {
+    int unused;
+
+    switch ( cx->entry_method )
+    {
+    case ACPI_CSTATE_EM_FFH:
         /* Call into architectural FFH based C-state */
         acpi_processor_ffh_cstate_enter(cx);
-    }
-    else
-    {
-        int unused;
+        return;
+    case ACPI_CSTATE_EM_SYSIO:
         /* IO port based C-state */
         inb(cx->address);
         /* Dummy wait op - must do something useless after P_LVL2 read
            because chipsets cannot guarantee that STPCLK# signal
            gets asserted in time to freeze execution properly. */
         unused = inl(pmtmr_ioport);
+        return;
+    case ACPI_CSTATE_EM_HALT:
+        acpi_safe_halt();
+        local_irq_disable();
+        return;
     }
 }
 
@@ -222,7 +229,7 @@ static void acpi_processor_idle(void)
         if ( power->flags.bm_check && acpi_idle_bm_check()
              && cx->type == ACPI_STATE_C3 )
             cx = power->safe_state;
-        if ( cx - &power->states[0] > max_cstate )
+        if ( cx->idx > max_cstate )
             cx = &power->states[max_cstate];
     }
     if ( !cx )
@@ -252,35 +259,11 @@ static void acpi_processor_idle(void)
     switch ( cx->type )
     {
     case ACPI_STATE_C1:
-        /* Trace cpu idle entry */
-        TRACE_1D(TRC_PM_IDLE_ENTRY, 1);
-
-        /*
-         * Invoke C1.
-         * Use the appropriate idle routine, the one that would
-         * be used without acpi C-states.
-         */
-        if ( pm_idle_save )
-            pm_idle_save();
-        else 
-            acpi_safe_halt();
-
-        /* Trace cpu idle exit */
-        TRACE_1D(TRC_PM_IDLE_EXIT, 1);
-
-        /*
-         * TBD: Can't get time duration while in C1, as resumes
-         *      go to an ISR rather than here.  Need to instrument
-         *      base interrupt handler.
-         */
-        sleep_ticks = 0xFFFFFFFF;
-        break;
-
     case ACPI_STATE_C2:
-        if ( local_apic_timer_c2_ok )
+        if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
         {
             /* Trace cpu idle entry */
-            TRACE_1D(TRC_PM_IDLE_ENTRY, 2);
+            TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx);
             /* Get start time (ticks) */
             t1 = inl(pmtmr_ioport);
             /* Invoke C2 */
@@ -288,7 +271,7 @@ static void acpi_processor_idle(void)
             /* Get end time (ticks) */
             t2 = inl(pmtmr_ioport);
             /* Trace cpu idle exit */
-            TRACE_1D(TRC_PM_IDLE_EXIT, 2);
+            TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx);
 
             /* Re-enable interrupts */
             local_irq_enable();
@@ -328,7 +311,7 @@ static void acpi_processor_idle(void)
         }
 
         /* Trace cpu idle entry */
-        TRACE_1D(TRC_PM_IDLE_ENTRY, cx - &power->states[0]);
+        TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx);
         /*
          * Before invoking C3, be aware that TSC/APIC timer may be 
          * stopped by H/W. Without carefully handling of TSC/APIC stop issues,
@@ -349,7 +332,7 @@ static void acpi_processor_idle(void)
         /* recovering TSC */
         cstate_restore_tsc();
         /* Trace cpu idle exit */
-        TRACE_1D(TRC_PM_IDLE_EXIT, cx - &power->states[0]);
+        TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx);
 
         if ( power->flags.bm_check && power->flags.bm_control )
         {
@@ -387,9 +370,15 @@ static void acpi_processor_idle(void)
 
 static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
 {
+    int i;
+
     memset(acpi_power, 0, sizeof(*acpi_power));
 
+    for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ )
+        acpi_power->states[i].idx = i;
+
     acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1;
+    acpi_power->states[ACPI_STATE_C1].entry_method = ACPI_CSTATE_EM_HALT;
 
     acpi_power->states[ACPI_STATE_C0].valid = 1;
     acpi_power->states[ACPI_STATE_C1].valid = 1;
@@ -486,16 +475,13 @@ static int check_cx(struct acpi_processo
         break;
 
     case ACPI_ADR_SPACE_FIXED_HARDWARE:
-        if ( cx->type > ACPI_STATE_C1 )
-        {
-            if ( cx->reg.bit_width != VENDOR_INTEL || 
-                 cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
-                return -EINVAL;
-
-            /* assume all logical cpu has the same support for mwait */
-            if ( acpi_processor_ffh_cstate_probe(cx) )
-                return -EINVAL;
-        }
+        if ( cx->reg.bit_width != VENDOR_INTEL || 
+             cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
+            return -EINVAL;
+
+        /* assume all logical cpu has the same support for mwait */
+        if ( acpi_processor_ffh_cstate_probe(cx) )
+            return -EINVAL;
         break;
 
     default:
@@ -599,7 +585,23 @@ static void set_cx(
     cx->valid    = 1;
     cx->type     = xen_cx->type;
     cx->address  = xen_cx->reg.address;
-    cx->space_id = xen_cx->reg.space_id;
+
+    switch ( xen_cx->reg.space_id )
+    {
+    case ACPI_ADR_SPACE_FIXED_HARDWARE:
+        if ( xen_cx->reg.bit_width == VENDOR_INTEL &&
+             xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT )
+            cx->entry_method = ACPI_CSTATE_EM_FFH;
+        else
+            cx->entry_method = ACPI_CSTATE_EM_HALT;
+        break;
+    case ACPI_ADR_SPACE_SYSTEM_IO:
+        cx->entry_method = ACPI_CSTATE_EM_SYSIO;
+        break;
+    default:
+        cx->entry_method = ACPI_CSTATE_EM_NONE;
+    }
+
     cx->latency  = xen_cx->latency;
     cx->power    = xen_cx->power;
     
@@ -761,8 +763,7 @@ int pmstat_get_cx_stat(uint32_t cpuid, s
         return 0;
     }
 
-    stat->last = (power->last_state) ?
-        (int)(power->last_state - &power->states[0]) : 0;
+    stat->last = power->last_state ? power->last_state->idx : 0;
     stat->nr = power->count;
     stat->idle_time = v->runstate.time[RUNSTATE_running];
     if ( v->is_running )
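
The cpu_idle.c hunks above replace the per-entry space_id test with an
entry_method field that set_cx() classifies once at registration time and
that acpi_idle_do_entry() merely dispatches on. A minimal standalone sketch
of that classify-once/dispatch-later pattern, with illustrative constants
standing in for the ACPI and Xen definitions:

#include <stdint.h>
#include <stdio.h>

enum cstate_entry_method { EM_NONE, EM_HALT, EM_SYSIO, EM_FFH };

struct cstate {
    uint8_t  space_id;     /* ACPI address space of the entry register */
    uint8_t  bit_width;    /* vendor code for FIXED_HARDWARE (hypothetical) */
    uint64_t address;      /* I/O port for SYSIO states */
    enum cstate_entry_method entry_method;
};

/* Hypothetical constants standing in for the ACPI/Xen definitions. */
#define ADR_SPACE_SYSTEM_IO      1
#define ADR_SPACE_FIXED_HARDWARE 0x7f
#define VENDOR_INTEL             1

static void classify_entry_method(struct cstate *cx)
{
    switch (cx->space_id) {
    case ADR_SPACE_FIXED_HARDWARE:
        /* FFH (e.g. MWAIT) if the vendor supports it, else plain HLT. */
        cx->entry_method = (cx->bit_width == VENDOR_INTEL) ? EM_FFH : EM_HALT;
        break;
    case ADR_SPACE_SYSTEM_IO:
        cx->entry_method = EM_SYSIO;
        break;
    default:
        cx->entry_method = EM_NONE;
    }
}

static void idle_do_entry(const struct cstate *cx)
{
    switch (cx->entry_method) {
    case EM_FFH:   puts("enter via MWAIT");                 return;
    case EM_SYSIO: printf("inb(%#llx)\n",
                          (unsigned long long)cx->address); return;
    case EM_HALT:  puts("hlt");                             return;
    case EM_NONE:  break; /* nothing we know how to enter */
    }
}

int main(void)
{
    struct cstate c2 = { .space_id = ADR_SPACE_SYSTEM_IO, .address = 0x415 };
    classify_entry_method(&c2);
    idle_do_entry(&c2);
    return 0;
}
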
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpufreq/cpufreq.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c       Tue Nov 04 12:43:19 2008 +0900
@@ -370,7 +370,7 @@ static int acpi_cpufreq_target(struct cp
     if (!check_freqs(cmd.mask, freqs.new, data))
         return -EAGAIN;
 
-    for_each_cpu_mask(j, cmd.mask)
+    for_each_cpu_mask(j, online_policy_cpus)
         cpufreq_statistic_update(j, perf->state, next_perf_state);
 
     perf->state = next_perf_state;
@@ -447,18 +447,6 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol
     perf = data->acpi_data;
     policy->shared_type = perf->shared_type;
 
-    /* capability check */
-    if (perf->state_count <= 1) {
-        printk("No P-States\n");
-        result = -ENODEV;
-        goto err_unreg;
-    }
-
-    if (perf->control_register.space_id != perf->status_register.space_id) {
-        result = -ENODEV;
-        goto err_unreg;
-    }
-
     switch (perf->control_register.space_id) {
     case ACPI_ADR_SPACE_SYSTEM_IO:
         printk("xen_pminfo: @acpi_cpufreq_cpu_init,"
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpufreq/powernow.c
--- a/xen/arch/x86/acpi/cpufreq/powernow.c      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c      Tue Nov 04 12:43:19 2008 +0900
@@ -229,9 +229,23 @@ err_unreg:
     return result;
 }
 
+static int powernow_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+    struct powernow_cpufreq_data *data = drv_data[policy->cpu];
+
+    if (data) {
+        drv_data[policy->cpu] = NULL;
+        xfree(data->freq_table);
+        xfree(data);
+    }
+
+    return 0;
+}
+
 static struct cpufreq_driver powernow_cpufreq_driver = {
     .target = powernow_cpufreq_target,
     .init   = powernow_cpufreq_cpu_init,
+    .exit   = powernow_cpufreq_cpu_exit
 };
 
 int powernow_cpufreq_init(void)
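
The new powernow_cpufreq_cpu_exit() unpublishes the per-CPU driver data
before freeing it, so nothing can pick up a dangling pointer between the two
steps. A standalone sketch of that teardown ordering (names are illustrative,
not the Xen API):

#include <stdlib.h>

struct freq_table { unsigned int frequency; };
struct drv_data   { struct freq_table *freq_table; };

#define NR_CPUS 8
static struct drv_data *drv_data[NR_CPUS];

static int cpufreq_cpu_exit(unsigned int cpu)
{
    struct drv_data *data = drv_data[cpu];

    if (data) {
        drv_data[cpu] = NULL;       /* unpublish first */
        free(data->freq_table);     /* then release the memory */
        free(data);
    }
    return 0;
}

int main(void)
{
    drv_data[0] = calloc(1, sizeof(*drv_data[0]));
    return cpufreq_cpu_exit(0);
}
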
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpuidle_menu.c
--- a/xen/arch/x86/acpi/cpuidle_menu.c  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/acpi/cpuidle_menu.c  Tue Nov 04 12:43:19 2008 +0900
@@ -59,7 +59,7 @@ static int menu_select(struct acpi_proce
     data->expected_us = (u32) get_sleep_length_ns() / 1000;
 
     /* find the deepest idle state that satisfies our constraints */
-    for ( i = 1; i < power->count; i++ )
+    for ( i = 2; i < power->count; i++ )
     {
         struct acpi_processor_cx *s = &power->states[i];
 
@@ -81,17 +81,7 @@ static void menu_reflect(struct acpi_pro
     unsigned int last_residency; 
     unsigned int measured_us;
 
-    /*
-     * Ugh, this idle state doesn't support residency measurements, so we
-     * are basically lost in the dark.  As a compromise, assume we slept
-     * for one full standard timer tick.  However, be aware that this
-     * could potentially result in a suboptimal state transition.
-     */
-    if ( target->type == ACPI_STATE_C1 )
-        last_residency = USEC_PER_SEC / HZ;
-    else
-        last_residency = power->last_residency;
-
+    last_residency = power->last_residency;
     measured_us = last_residency + data->elapsed_us;
 
     /* if wrapping, set to max uint (-1) */
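
With C1 now entered via a timed HALT (see the cpu_idle.c changes above),
every state supports residency measurement, so menu_reflect() needs no C1
special case and menu_select() can treat C1 as the fallback while scanning
only the deeper states. An illustrative sketch of such a selection loop,
with made-up thresholds and state data:

#include <stdio.h>

struct cx { unsigned latency_us; unsigned break_even_us; };

static int menu_select(const struct cx *states, int count,
                       unsigned expected_sleep_us, unsigned latency_limit_us)
{
    int best = 1;                       /* C1 always works */
    for (int i = 2; i < count; i++) {
        if (states[i].break_even_us > expected_sleep_us)
            break;                      /* won't sleep long enough to pay off */
        if (states[i].latency_us > latency_limit_us)
            break;                      /* wake-up latency budget exceeded */
        best = i;
    }
    return best;
}

int main(void)
{
    struct cx states[] = { {0, 0}, {1, 1}, {20, 80}, {150, 800} };
    printf("selected C%d\n", menu_select(states, 4, 500, 100));
    return 0;
}
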
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/domain.c     Tue Nov 04 12:43:19 2008 +0900
@@ -174,9 +174,10 @@ void free_vcpu_struct(struct vcpu *v)
 
 static int setup_compat_l4(struct vcpu *v)
 {
-    struct page_info *pg = alloc_domheap_page(NULL, 0);
+    struct page_info *pg;
     l4_pgentry_t *l4tab;
 
+    pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v)));
     if ( pg == NULL )
         return -ENOMEM;
 
@@ -1639,31 +1640,22 @@ static int relinquish_memory(
         }
 
         if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
-            put_page_and_type(page);
+            ret = put_page_and_type_preemptible(page, 1);
+        switch ( ret )
+        {
+        case 0:
+            break;
+        case -EAGAIN:
+        case -EINTR:
+            set_bit(_PGT_pinned, &page->u.inuse.type_info);
+            put_page(page);
+            goto out;
+        default:
+            BUG();
+        }
 
         if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
             put_page(page);
-
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-        /*
-         * Forcibly drop reference counts of page tables above top most (which
-         * were skipped to prevent long latencies due to deep recursion - see
-         * the special treatment in free_lX_table()).
-         */
-        y = page->u.inuse.type_info;
-        if ( (type < PGT_root_page_table) &&
-             unlikely(((y + PGT_type_mask) &
-                       (PGT_type_mask|PGT_validated)) == type) )
-        {
-            BUG_ON((y & PGT_count_mask) >=
-                   (page->count_info & PGC_count_mask));
-            while ( y & PGT_count_mask )
-            {
-                put_page_and_type(page);
-                y = page->u.inuse.type_info;
-            }
-        }
-#endif
 
         /*
          * Forcibly invalidate top-most, still valid page tables at this point
@@ -1685,8 +1677,31 @@ static int relinquish_memory(
                         x & ~(PGT_validated|PGT_partial));
             if ( likely(y == x) )
             {
-                if ( free_page_type(page, x, 0) != 0 )
+                /* No need for atomic update of type_info here: no one else updates it. */
+                switch ( ret = free_page_type(page, x, 1) )
+                {
+                case 0:
+                    break;
+                case -EINTR:
+                    page->u.inuse.type_info |= PGT_validated;
+                    if ( x & PGT_partial )
+                        put_page(page);
+                    put_page(page);
+                    ret = -EAGAIN;
+                    goto out;
+                case -EAGAIN:
+                    page->u.inuse.type_info |= PGT_partial;
+                    if ( x & PGT_partial )
+                        put_page(page);
+                    goto out;
+                default:
                     BUG();
+                }
+                if ( x & PGT_partial )
+                {
+                    page->u.inuse.type_info--;
+                    put_page(page);
+                }
                 break;
             }
         }
@@ -1831,11 +1846,6 @@ int domain_relinquish_resources(struct d
         /* fallthrough */
 
     case RELMEM_done:
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-        ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table);
-        if ( ret )
-            return ret;
-#endif
         break;
 
     default:
@@ -1891,6 +1901,54 @@ void domain_cpuid(
 
     *eax = *ebx = *ecx = *edx = 0;
 }
+
+void vcpu_kick(struct vcpu *v)
+{
+    /*
+     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
+     * pending flag. These values may fluctuate (after all, we hold no
+     * locks) but the key insight is that each change will cause
+     * evtchn_upcall_pending to be polled.
+     * 
+     * NB2. We save the running flag across the unblock to avoid a needless
+     * IPI for domains that we IPI'd to unblock.
+     */
+    bool_t running = v->is_running;
+    vcpu_unblock(v);
+    if ( running && (in_irq() || (v != current)) )
+        cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
+}
+
+void vcpu_mark_events_pending(struct vcpu *v)
+{
+    int already_pending = test_and_set_bit(
+        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
+
+    if ( already_pending )
+        return;
+
+    if ( is_hvm_vcpu(v) )
+        hvm_assert_evtchn_irq(v);
+    else
+        vcpu_kick(v);
+}
+
+static void vcpu_kick_softirq(void)
+{
+    /*
+     * Nothing to do here: we merely prevent notifiers from racing with checks
+     * executed on return to guest context with interrupts enabled. See, for
+     * example, xxx_intr_assist() executed on return to HVM guest context.
+     */
+}
+
+static int __init init_vcpu_kick_softirq(void)
+{
+    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
+    return 0;
+}
+__initcall(init_vcpu_kick_softirq);
+
 
 /*
  * Local variables:
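
vcpu_kick_softirq() above is deliberately empty: raising VCPU_KICK_SOFTIRQ
only forces the target CPU back through its exit-to-guest checks, where the
freshly set evtchn_upcall_pending flag is noticed. A minimal standalone
sketch of that "empty softirq as a synchronisation point" idea, with plain-C
stand-ins for the softirq machinery:

#include <stdio.h>

#define NR_SOFTIRQS       8
#define VCPU_KICK_SOFTIRQ 5   /* illustrative slot number */

typedef void (*softirq_handler)(void);
static softirq_handler softirq_handlers[NR_SOFTIRQS];
static unsigned long pending;

static void open_softirq(int nr, softirq_handler fn)
{
    softirq_handlers[nr] = fn;
}

static void vcpu_kick_softirq(void)
{
    /* Intentionally empty: the point is the interruption, not the work. */
}

static void raise_softirq(int nr) { pending |= 1UL << nr; }

static void do_softirq(void)
{
    for (int nr = 0; nr < NR_SOFTIRQS; nr++)
        if ((pending & (1UL << nr)) && softirq_handlers[nr]) {
            pending &= ~(1UL << nr);
            softirq_handlers[nr]();
        }
}

int main(void)
{
    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
    raise_softirq(VCPU_KICK_SOFTIRQ);  /* the "kick" */
    do_softirq();                      /* target CPU re-runs its checks here */
    puts("softirq drained; exit-to-guest checks re-run");
    return 0;
}
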
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/domain_build.c       Tue Nov 04 12:43:19 2008 +0900
@@ -194,6 +194,30 @@ static void __init process_dom0_ioports_
     }
 }
 
+/* We run on dom0's page tables for the final part of the build process. */
+static void dom0_pt_enter(struct vcpu *v)
+{
+    struct desc_ptr gdt_desc = {
+        .limit = LAST_RESERVED_GDT_BYTE,
+        .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY)
+    };
+
+    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+    write_ptbase(v);
+}
+
+/* Return to idle domain's page tables. */
+static void dom0_pt_exit(void)
+{
+    struct desc_ptr gdt_desc = {
+        .limit = LAST_RESERVED_GDT_BYTE,
+        .base = GDT_VIRT_START(current)
+    };
+
+    write_ptbase(current);
+    asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+}
+
 int __init construct_dom0(
     struct domain *d,
     unsigned long _image_start, unsigned long image_len, 
@@ -700,14 +724,12 @@ int __init construct_dom0(
         (void)alloc_vcpu(d, i, i % num_online_cpus());
 
     /* Set up CR3 value for write_ptbase */
-    if ( paging_mode_enabled(v->domain) )
+    if ( paging_mode_enabled(d) )
         paging_update_paging_modes(v);
     else
         update_cr3(v);
 
-    /* Install the new page tables. */
-    local_irq_disable();
-    write_ptbase(v);
+    dom0_pt_enter(v);
 
     /* Copy the OS image and free temporary buffer. */
     elf.dest = (void*)vkern_start;
@@ -804,9 +826,7 @@ int __init construct_dom0(
         xlat_start_info(si, XLAT_start_info_console_dom0);
 #endif
 
-    /* Reinstate the caller's page tables. */
-    write_ptbase(current);
-    local_irq_enable();
+    dom0_pt_exit();
 
 #if defined(__i386__)
     /* Destroy low mappings - they were only for our convenience. */
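
dom0_pt_enter()/dom0_pt_exit() above pair a page-table switch with an lgdt
of the matching GDT. The instruction takes a packed pseudo-descriptor: a
16-bit limit followed by an address-sized base. A sketch of that descriptor
and the asm wrapper, assuming x86-64; load_gdt() requires ring 0, so the
example only builds the descriptor:

#include <stdint.h>

struct desc_ptr {
    uint16_t limit;
    uint64_t base;
} __attribute__((packed));

static inline void load_gdt(const struct desc_ptr *gdt_desc)
{
    /* "m" ties the operand to the in-memory pseudo-descriptor. */
    asm volatile ("lgdt %0" : : "m" (*gdt_desc));
}

int main(void)
{
    struct desc_ptr d = { .limit = 0x7f, .base = 0 };
    /* load_gdt(&d) would fault outside ring 0, so it is not called here. */
    return (sizeof(d) == 10 && d.limit == 0x7f) ? 0 : 1;
}
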
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hpet.c
--- a/xen/arch/x86/hpet.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hpet.c       Tue Nov 04 12:43:19 2008 +0900
@@ -14,8 +14,6 @@
 #include <asm/div64.h>
 #include <asm/hpet.h>
 
-#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
-
 #define MAX_DELTA_NS MILLISECS(10*1000)
 #define MIN_DELTA_NS MICROSECS(20)
 
@@ -146,7 +144,7 @@ static void handle_hpet_broadcast(struct
     s_time_t now, next_event;
     int cpu;
 
-    spin_lock(&ch->lock);
+    spin_lock_irq(&ch->lock);
 
 again:
     ch->next_event = STIME_MAX;
@@ -171,7 +169,7 @@ again:
         if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) )
             goto again;
     }
-    spin_unlock(&ch->lock);
+    spin_unlock_irq(&ch->lock);
 }
 
 void hpet_broadcast_init(void)
@@ -213,6 +211,7 @@ void hpet_broadcast_enter(void)
 {
     struct hpet_event_channel *ch = &hpet_event;
 
+    ASSERT(!local_irq_is_enabled());
     spin_lock(&ch->lock);
 
     disable_APIC_timer();
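
The hpet.c change converts handle_hpet_broadcast() to spin_lock_irq() and
asserts interrupts are already off in hpet_broadcast_enter(): a lock that
can be taken from interrupt context must only ever be acquired with
interrupts disabled, or an interrupt arriving while it is held self-deadlocks.
A standalone sketch of that rule, with stub primitives:

#include <assert.h>
#include <stdbool.h>

static bool irqs_enabled = true;
static bool lock_held;

static void local_irq_disable(void) { irqs_enabled = false; }
static void local_irq_enable(void)  { irqs_enabled = true;  }

static void spin_lock(void)   { assert(!lock_held); lock_held = true; }
static void spin_unlock(void) { lock_held = false; }

/* spin_lock_irq = disable interrupts, then take the lock. */
static void spin_lock_irq(void)   { local_irq_disable(); spin_lock(); }
static void spin_unlock_irq(void) { spin_unlock(); local_irq_enable(); }

static void broadcast_handler(void)      /* may run with interrupts on */
{
    spin_lock_irq();
    /* ... program next event ... */
    spin_unlock_irq();
}

static void broadcast_enter(void)        /* called from the idle path */
{
    assert(!irqs_enabled);               /* mirrors the added ASSERT() */
    spin_lock();
    /* ... disable the local APIC timer ... */
    spin_unlock();
}

int main(void)
{
    broadcast_handler();
    local_irq_disable();
    broadcast_enter();
    local_irq_enable();
    return 0;
}
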
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/emulate.c
--- a/xen/arch/x86/hvm/emulate.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/emulate.c        Tue Nov 04 12:43:19 2008 +0900
@@ -14,10 +14,38 @@
 #include <xen/lib.h>
 #include <xen/sched.h>
 #include <xen/paging.h>
+#include <xen/trace.h>
 #include <asm/event.h>
 #include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
+
+#define HVMTRACE_IO_ASSIST_WRITE 0x200
+static void hvmtrace_io_assist(int is_mmio, ioreq_t *p)
+{
+    unsigned int size, event;
+    unsigned char buffer[12];
+
+    if ( likely(!tb_init_done) )
+        return;
+
+    event = is_mmio ? TRC_HVM_MMIO_ASSIST : TRC_HVM_IO_ASSIST;
+    if ( !p->dir )
+        event |= HVMTRACE_IO_ASSIST_WRITE;
+
+    *(uint64_t *)buffer = p->addr;
+    size = (p->addr != (u32)p->addr) ? 8 : 4;
+    if ( size == 8 )
+        event |= TRC_64_FLAG;
+
+    if ( !p->data_is_ptr )
+    {
+        *(uint32_t *)&buffer[size] = p->data;
+        size += 4;
+    }
+
+    trace_var(event, 0/*!cycles*/, size, buffer);
+}
 
 static int hvmemul_do_io(
     int is_mmio, paddr_t addr, unsigned long *reps, int size,
@@ -110,6 +138,8 @@ static int hvmemul_do_io(
     p->df = df;
     p->data = value;
     p->io_count++;
+
+    hvmtrace_io_assist(is_mmio, p);
 
     if ( is_mmio )
     {
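
hvmtrace_io_assist() above packs a variable-length trace record: four or
eight bytes of address (recording which via TRC_64_FLAG) and, for non-pointer
requests, four bytes of inline data. A standalone sketch of that packing,
with illustrative event constants and memcpy in place of the type-punned
stores:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TRC_64_FLAG     0x100   /* illustrative flag values */
#define EV_IO_ASSIST    0x001
#define EV_ASSIST_WRITE 0x200

static unsigned int pack_io_record(uint64_t addr, int is_write,
                                   int has_data, uint32_t data,
                                   unsigned char buf[12],
                                   unsigned int *event)
{
    unsigned int size;

    *event = EV_IO_ASSIST;
    if (is_write)
        *event |= EV_ASSIST_WRITE;

    memcpy(buf, &addr, 8);
    size = (addr != (uint32_t)addr) ? 8 : 4;  /* shrink if it fits in 32 bits */
    if (size == 8)
        *event |= TRC_64_FLAG;

    if (has_data) {
        memcpy(&buf[size], &data, 4);
        size += 4;
    }
    return size;   /* number of valid payload bytes in buf */
}

int main(void)
{
    unsigned char buf[12];
    unsigned int event;
    unsigned int n = pack_io_record(0x3f8, 1, 1, 0x41, buf, &event);
    printf("event %#x, %u payload bytes\n", event, n);
    return 0;
}
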
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/hpet.c
--- a/xen/arch/x86/hvm/hpet.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/hpet.c   Tue Nov 04 12:43:19 2008 +0900
@@ -76,6 +76,7 @@
         ~0ULL : (tick) * (h)->hpet_to_ns_scale) >> 10))
 
 #define timer_config(h, n)       (h->hpet.timers[n].config)
+#define timer_enabled(h, n)      (timer_config(h, n) & HPET_TN_ENABLE)
 #define timer_is_periodic(h, n)  (timer_config(h, n) & HPET_TN_PERIODIC)
 #define timer_is_32bit(h, n)     (timer_config(h, n) & HPET_TN_32BIT)
 #define hpet_enabled(h)          (h->hpet.config & HPET_CFG_ENABLE)
@@ -88,9 +89,40 @@
     ((timer_config(h, n) & HPET_TN_INT_ROUTE_CAP_MASK) \
         >> HPET_TN_INT_ROUTE_CAP_SHIFT)
 
-#define hpet_time_after(a, b)   ((int32_t)(b) - (int32_t)(a) < 0)
-#define hpet_time_after64(a, b) ((int64_t)(b) - (int64_t)(a) < 0)
-
+static inline uint64_t hpet_read_maincounter(HPETState *h)
+{
+    ASSERT(spin_is_locked(&h->lock));
+
+    if ( hpet_enabled(h) )
+        return guest_time_hpet(h->vcpu) + h->mc_offset;
+    else 
+        return h->hpet.mc64;
+}
+
+static uint64_t hpet_get_comparator(HPETState *h, unsigned int tn)
+{
+    uint64_t comparator;
+    uint64_t elapsed;
+
+    comparator = h->hpet.comparator64[tn];
+    if ( timer_is_periodic(h, tn) )
+    {
+        /* update comparator by number of periods elapsed since last update */
+        uint64_t period = h->hpet.period[tn];
+        if (period)
+        {
+            elapsed = hpet_read_maincounter(h) + period - 1 - comparator;
+            comparator += (elapsed / period) * period;
+            h->hpet.comparator64[tn] = comparator;
+        }
+    }
+    
+    /* truncate if timer is in 32 bit mode */
+    if ( timer_is_32bit(h, tn) )
+        comparator = (uint32_t)comparator;
+    h->hpet.timers[tn].cmp = comparator;
+    return comparator;
+}
 static inline uint64_t hpet_read64(HPETState *h, unsigned long addr)
 {
     addr &= ~7;
@@ -104,7 +136,7 @@ static inline uint64_t hpet_read64(HPETS
     case HPET_STATUS:
         return h->hpet.isr;
     case HPET_COUNTER:
-        return h->hpet.mc64;
+        return hpet_read_maincounter(h);
     case HPET_T0_CFG:
     case HPET_T1_CFG:
     case HPET_T2_CFG:
@@ -112,7 +144,7 @@ static inline uint64_t hpet_read64(HPETS
     case HPET_T0_CMP:
     case HPET_T1_CMP:
     case HPET_T2_CMP:
-        return h->hpet.timers[(addr - HPET_T0_CMP) >> 5].cmp;
+        return hpet_get_comparator(h, (addr - HPET_T0_CMP) >> 5);
     case HPET_T0_ROUTE:
     case HPET_T1_ROUTE:
     case HPET_T2_ROUTE:
@@ -140,16 +172,6 @@ static inline int hpet_check_access_leng
     return 0;
 }
 
-static inline uint64_t hpet_read_maincounter(HPETState *h)
-{
-    ASSERT(spin_is_locked(&h->lock));
-
-    if ( hpet_enabled(h) )
-        return guest_time_hpet(h->vcpu) + h->mc_offset;
-    else 
-        return h->hpet.mc64;
-}
-
 static int hpet_read(
     struct vcpu *v, unsigned long addr, unsigned long length,
     unsigned long *pval)
@@ -169,8 +191,6 @@ static int hpet_read(
     spin_lock(&h->lock);
 
     val = hpet_read64(h, addr);
-    if ( (addr & ~7) == HPET_COUNTER )
-        val = hpet_read_maincounter(h);
 
     result = val;
     if ( length != 8 )
@@ -187,7 +207,10 @@ static void hpet_stop_timer(HPETState *h
 {
     ASSERT(tn < HPET_TIMER_NUM);
     ASSERT(spin_is_locked(&h->lock));
-    stop_timer(&h->timers[tn]);
+    destroy_periodic_time(&h->pt[tn]);
+    /* read the comparator to get it updated so a read while stopped will
+     * return the expected value. */
+    hpet_get_comparator(h, tn);
 }
 
 /* the number of HPET tick that stands for
@@ -197,6 +220,8 @@ static void hpet_set_timer(HPETState *h,
 static void hpet_set_timer(HPETState *h, unsigned int tn)
 {
     uint64_t tn_cmp, cur_tick, diff;
+    unsigned int irq;
+    unsigned int oneshot;
 
     ASSERT(tn < HPET_TIMER_NUM);
     ASSERT(spin_is_locked(&h->lock));
@@ -209,7 +234,10 @@ static void hpet_set_timer(HPETState *h,
         pit_stop_channel0_irq(pit);
     }
 
-    tn_cmp   = h->hpet.timers[tn].cmp;
+    if ( !timer_enabled(h, tn) )
+        return;
+
+    tn_cmp   = hpet_get_comparator(h, tn);
     cur_tick = hpet_read_maincounter(h);
     if ( timer_is_32bit(h, tn) )
     {
@@ -229,7 +257,25 @@ static void hpet_set_timer(HPETState *h,
         diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN))
             ? (uint32_t)diff : 0;
 
-    set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, diff));
+    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
+        /* if LegacyReplacementRoute bit is set, HPET specification requires
+           timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
+           timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
+        irq = (tn == 0) ? 0 : 8;
+    else
+        irq = timer_int_route(h, tn);
+
+    /*
+     * diff is the time from now until the timer should fire; for a periodic
+     * timer we also need the period, which may be different because time may
+     * have elapsed between the time the comparator was written and the timer
+     * being enabled (now).
+     */
+    oneshot = !timer_is_periodic(h, tn);
+    create_periodic_time(h->vcpu, &h->pt[tn],
+                         hpet_tick_to_ns(h, diff),
+                         oneshot ? 0 : hpet_tick_to_ns(h, h->hpet.period[tn]),
+                         irq, NULL, NULL);
 }
 
 static inline uint64_t hpet_fixup_reg(
@@ -248,6 +294,13 @@ static int hpet_write(
     uint64_t old_val, new_val;
     int tn, i;
 
+    /* Accumulate a bit mask of timers whose state is changed by this write. */
+    unsigned long start_timers = 0;
+    unsigned long stop_timers  = 0;
+#define set_stop_timer(n)    (__set_bit((n), &stop_timers))
+#define set_start_timer(n)   (__set_bit((n), &start_timers))
+#define set_restart_timer(n) (set_stop_timer(n),set_start_timer(n))
+
     addr &= HPET_MMAP_SIZE-1;
 
     if ( hpet_check_access_length(addr, length) != 0 )
@@ -256,9 +309,6 @@ static int hpet_write(
     spin_lock(&h->lock);
 
     old_val = hpet_read64(h, addr);
-    if ( (addr & ~7) == HPET_COUNTER )
-        old_val = hpet_read_maincounter(h);
-
     new_val = val;
     if ( length != 8 )
         new_val = hpet_fixup_reg(
@@ -275,22 +325,35 @@ static int hpet_write(
             /* Enable main counter and interrupt generation. */
             h->mc_offset = h->hpet.mc64 - guest_time_hpet(h->vcpu);
             for ( i = 0; i < HPET_TIMER_NUM; i++ )
-                hpet_set_timer(h, i); 
+            {
+                h->hpet.comparator64[i] =
+                            h->hpet.timers[i].config & HPET_TN_32BIT ?
+                                          (uint32_t)h->hpet.timers[i].cmp :
+                                                    h->hpet.timers[i].cmp;
+                if ( timer_enabled(h, i) )
+                    set_start_timer(i);
+            }
         }
         else if ( (old_val & HPET_CFG_ENABLE) && !(new_val & HPET_CFG_ENABLE) )
         {
             /* Halt main counter and disable interrupt generation. */
             h->hpet.mc64 = h->mc_offset + guest_time_hpet(h->vcpu);
             for ( i = 0; i < HPET_TIMER_NUM; i++ )
-                hpet_stop_timer(h, i);
+                if ( timer_enabled(h, i) )
+                    set_stop_timer(i);
         }
         break;
 
     case HPET_COUNTER:
+        h->hpet.mc64 = new_val;
         if ( hpet_enabled(h) )
+        {
             gdprintk(XENLOG_WARNING, 
                      "HPET: writing main counter but it's not halted!\n");
-        h->hpet.mc64 = new_val;
+            for ( i = 0; i < HPET_TIMER_NUM; i++ )
+                if ( timer_enabled(h, i) )
+                    set_restart_timer(i);
+        }
         break;
 
     case HPET_T0_CFG:
@@ -313,7 +376,28 @@ static int hpet_write(
             h->hpet.timers[tn].cmp = (uint32_t)h->hpet.timers[tn].cmp;
             h->hpet.period[tn] = (uint32_t)h->hpet.period[tn];
         }
-
+        if ( hpet_enabled(h) )
+        {
+            if ( new_val & HPET_TN_ENABLE )
+            {
+                if ( (new_val ^ old_val) & HPET_TN_PERIODIC )
+                    /* timer is enabled but switching mode to/from periodic/
+                     * one-shot, stop and restart the vpt timer to get it in
+                     * the right mode. */
+                    set_restart_timer(tn);
+                else if ( (new_val & HPET_TN_32BIT) &&
+                         !(old_val & HPET_TN_32BIT) )
+                    /* switching from 64 bit to 32 bit mode could cause the
+                     * timer's next fire time, or its period, to change. */
+                    set_restart_timer(tn);
+                else if ( !(old_val & HPET_TN_ENABLE) )
+                    /* transition from timer disabled to timer enabled. */
+                    set_start_timer(tn);
+            }
+            else if ( old_val & HPET_TN_ENABLE )
+                /* transition from timer enabled to timer disabled. */
+                set_stop_timer(tn);
+        }
         break;
 
     case HPET_T0_CMP:
@@ -322,24 +406,32 @@ static int hpet_write(
         tn = (addr - HPET_T0_CMP) >> 5;
         if ( timer_is_32bit(h, tn) )
             new_val = (uint32_t)new_val;
-        if ( !timer_is_periodic(h, tn) ||
-             (h->hpet.timers[tn].config & HPET_TN_SETVAL) )
-            h->hpet.timers[tn].cmp = new_val;
-        else
+        h->hpet.timers[tn].cmp = new_val;
+        if ( h->hpet.timers[tn].config & HPET_TN_SETVAL )
+            /*
+             * When SETVAL is one, software is able to "directly set a periodic
+             * timer's accumulator."  That is, set the comparator without
+             * adjusting the period.  Much the same as just setting the
+             * comparator on an enabled one-shot timer.
+             * 
+             * This configuration bit clears when the comparator is written.
+             */
+            h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
+        else if ( timer_is_periodic(h, tn) )
         {
             /*
              * Clamp period to reasonable min/max values:
-             *  - minimum is 900us, same as timers controlled by vpt.c
+             *  - minimum is 100us, same as timers controlled by vpt.c
              *  - maximum is to prevent overflow in time_after() calculations
              */
-            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(900) )
-                new_val = (MICROSECS(900) << 10) / h->hpet_to_ns_scale;
+            if ( hpet_tick_to_ns(h, new_val) < MICROSECS(100) )
+                new_val = (MICROSECS(100) << 10) / h->hpet_to_ns_scale;
             new_val &= (timer_is_32bit(h, tn) ? ~0u : ~0ull) >> 1;
             h->hpet.period[tn] = new_val;
         }
-        h->hpet.timers[tn].config &= ~HPET_TN_SETVAL;
-        if ( hpet_enabled(h) )
-            hpet_set_timer(h, tn);
+        h->hpet.comparator64[tn] = new_val;
+        if ( hpet_enabled(h) && timer_enabled(h, tn) )
+            set_restart_timer(tn);
         break;
 
     case HPET_T0_ROUTE:
@@ -354,6 +446,25 @@ static int hpet_write(
         break;
     }
 
+    /* stop/start timers whose state was changed by this write. */
+    while (stop_timers)
+    {
+        i = find_first_set_bit(stop_timers);
+        __clear_bit(i, &stop_timers);
+        hpet_stop_timer(h, i);
+    }
+
+    while (start_timers)
+    {
+        i = find_first_set_bit(start_timers);
+        __clear_bit(i, &start_timers);
+        hpet_set_timer(h, i);
+    }
+
+#undef set_stop_timer
+#undef set_start_timer
+#undef set_restart_timer
+
     spin_unlock(&h->lock);
 
  out:
@@ -373,86 +484,6 @@ struct hvm_mmio_handler hpet_mmio_handle
     .write_handler = hpet_write
 };
 
-static void hpet_route_interrupt(HPETState *h, unsigned int tn)
-{
-    unsigned int tn_int_route = timer_int_route(h, tn);
-    struct domain *d = h->vcpu->domain;
-
-    ASSERT(spin_is_locked(&h->lock));
-
-    if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) )
-    {
-        /* if LegacyReplacementRoute bit is set, HPET specification requires
-           timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC,
-           timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */
-        int isa_irq = (tn == 0) ? 0 : 8;
-        hvm_isa_irq_deassert(d, isa_irq);
-        hvm_isa_irq_assert(d, isa_irq);
-        return;
-    }
-
-    if ( !(timer_int_route_cap(h, tn) & (1U << tn_int_route)) )
-    {
-        gdprintk(XENLOG_ERR,
-                 "HPET: timer%u: invalid interrupt route config\n", tn);
-        domain_crash(d);
-        return;
-    }
-
-    /* We support only edge-triggered interrupt. */
-    spin_lock(&d->arch.hvm_domain.irq_lock);
-    vioapic_irq_positive_edge(d, tn_int_route);
-    spin_unlock(&d->arch.hvm_domain.irq_lock);
-}
-
-static void hpet_timer_fn(void *opaque)
-{
-    struct HPET_timer_fn_info *htfi = opaque;
-    HPETState *h = htfi->hs;
-    unsigned int tn = htfi->tn;
-
-    spin_lock(&h->lock);
-
-    if ( !hpet_enabled(h) )
-    {
-        spin_unlock(&h->lock);
-        return;
-    }
-
-    if ( timer_config(h, tn) & HPET_TN_ENABLE )
-        hpet_route_interrupt(h, tn);
-
-    if ( timer_is_periodic(h, tn) && (h->hpet.period[tn] != 0) )
-    {
-        uint64_t mc = hpet_read_maincounter(h), period = h->hpet.period[tn];
-        if ( timer_is_32bit(h, tn) )
-        {
-            while ( hpet_time_after(mc, h->hpet.timers[tn].cmp) )
-                h->hpet.timers[tn].cmp = (uint32_t)(
-                    h->hpet.timers[tn].cmp + period);
-        }
-        else
-        {
-            while ( hpet_time_after64(mc, h->hpet.timers[tn].cmp) )
-                h->hpet.timers[tn].cmp += period;
-        }
-        set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, period));
-    }
-
-    spin_unlock(&h->lock);
-}
-
-void hpet_migrate_timers(struct vcpu *v)
-{
-    struct HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet;
-    int i;
-
-    if ( v != h->vcpu )
-        return;
-
-    for ( i = 0; i < HPET_TIMER_NUM; i++ )
-        migrate_timer(&h->timers[i], v->processor);
-}
 
 static int hpet_save(struct domain *d, hvm_domain_context_t *h)
 {
@@ -477,18 +508,20 @@ static int hpet_save(struct domain *d, h
         C(isr);
         C(mc64);
         C(timers[0].config);
-        C(timers[0].cmp);
         C(timers[0].fsb);
         C(timers[1].config);
-        C(timers[1].cmp);
         C(timers[1].fsb);
         C(timers[2].config);
-        C(timers[2].cmp);
         C(timers[2].fsb);
         C(period[0]);
         C(period[1]);
         C(period[2]);
 #undef C
+        /* save the 64 bit comparator in the 64 bit timer[n].cmp field
+         * regardless of whether or not the timer is in 32 bit mode. */
+        rec->timers[0].cmp = hp->hpet.comparator64[0];
+        rec->timers[1].cmp = hp->hpet.comparator64[1];
+        rec->timers[2].cmp = hp->hpet.comparator64[2];
     }
 
     spin_unlock(&hp->lock);
@@ -500,6 +533,7 @@ static int hpet_load(struct domain *d, h
 {
     HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet;
     struct hvm_hw_hpet *rec;
+    uint64_t cmp;
     int i;
 
     spin_lock(&hp->lock);
@@ -515,32 +549,38 @@ static int hpet_load(struct domain *d, h
     h->cur += HVM_SAVE_LENGTH(HPET);
 
 #define C(x) hp->hpet.x = rec->x
-        C(capability);
-        C(config);
-        C(isr);
-        C(mc64);
-        C(timers[0].config);
-        C(timers[0].cmp);
-        C(timers[0].fsb);
-        C(timers[1].config);
-        C(timers[1].cmp);
-        C(timers[1].fsb);
-        C(timers[2].config);
-        C(timers[2].cmp);
-        C(timers[2].fsb);
-        C(period[0]);
-        C(period[1]);
-        C(period[2]);
+    C(capability);
+    C(config);
+    C(isr);
+    C(mc64);
+    /* The following define will generate a compiler error if HPET_TIMER_NUM
+     * changes. This indicates an incompatibility with previous saved state. */
+#define HPET_TIMER_NUM 3
+    for ( i = 0; i < HPET_TIMER_NUM; i++ )
+    {
+        C(timers[i].config);
+        C(timers[i].fsb);
+        C(period[i]);
+        /* restore the hidden 64 bit comparator and truncate the timer's
+         * visible comparator field if in 32 bit mode. */
+        cmp = rec->timers[i].cmp;
+        hp->hpet.comparator64[i] = cmp;
+        if ( timer_is_32bit(hp, i) )
+            cmp = (uint32_t)cmp;
+        hp->hpet.timers[i].cmp = cmp;
+    }
 #undef C
     
     /* Recalculate the offset between the main counter and guest time */
     hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp->vcpu);
-                
-    /* Restart the timers */
-    for ( i = 0; i < HPET_TIMER_NUM; i++ )
-        if ( hpet_enabled(hp) )
-            hpet_set_timer(hp, i);
-
+
+    /* restart all timers */
+
+    if ( hpet_enabled(hp) )
+        for ( i = 0; i < HPET_TIMER_NUM; i++ )
+            if ( timer_enabled(hp, i) )
+                hpet_set_timer(hp, i);
+ 
     spin_unlock(&hp->lock);
 
     return 0;
@@ -575,10 +615,7 @@ void hpet_init(struct vcpu *v)
         h->hpet.timers[i].config = 
             HPET_TN_INT_ROUTE_CAP | HPET_TN_SIZE_CAP | HPET_TN_PERIODIC_CAP;
         h->hpet.timers[i].cmp = ~0ULL;
-        h->timer_fn_info[i].hs = h;
-        h->timer_fn_info[i].tn = i;
-        init_timer(&h->timers[i], hpet_timer_fn, &h->timer_fn_info[i],
-                   v->processor);
+        h->pt[i].source = PTSRC_isa;
     }
 }
 
@@ -587,8 +624,14 @@ void hpet_deinit(struct domain *d)
     int i;
     HPETState *h = &d->arch.hvm_domain.pl_time.vhpet;
 
-    for ( i = 0; i < HPET_TIMER_NUM; i++ )
-        kill_timer(&h->timers[i]);
+    spin_lock(&h->lock);
+
+    if ( hpet_enabled(h) )
+        for ( i = 0; i < HPET_TIMER_NUM; i++ )
+            if ( timer_enabled(h, i) )
+                hpet_stop_timer(h, i);
+
+    spin_unlock(&h->lock);
 }
 
 void hpet_reset(struct domain *d)
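
The rewritten hpet_write() no longer stops or starts vpt timers while
decoding the register write; it only marks affected timers in stop/start
bitmasks and batches the actual calls at the end, so a timer "restarted" by
one write is stopped exactly once and then started exactly once. A standalone
sketch of that deferred-side-effects pattern (plain C bit helpers, not the
Xen ones):

#include <stdio.h>
#include <strings.h>   /* ffs() */

static void hpet_stop_timer(int tn) { printf("stop  timer %d\n", tn); }
static void hpet_set_timer(int tn)  { printf("start timer %d\n", tn); }

static void apply_timer_changes(unsigned int stop_timers,
                                unsigned int start_timers)
{
    while (stop_timers) {
        int i = ffs((int)stop_timers) - 1;
        stop_timers &= ~(1u << i);
        hpet_stop_timer(i);
    }
    while (start_timers) {
        int i = ffs((int)start_timers) - 1;
        start_timers &= ~(1u << i);
        hpet_set_timer(i);
    }
}

int main(void)
{
    unsigned int stop = 0, start = 0;
    stop  |= 1u << 1;   /* "restart timer 1" == mark it in both masks */
    start |= 1u << 1;
    apply_timer_changes(stop, start);
    return 0;
}
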
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/hvm.c    Tue Nov 04 12:43:19 2008 +0900
@@ -163,7 +163,6 @@ void hvm_migrate_timers(struct vcpu *v)
 void hvm_migrate_timers(struct vcpu *v)
 {
     rtc_migrate_timers(v);
-    hpet_migrate_timers(v);
     pt_migrate(v);
 }
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/i8254.c  Tue Nov 04 12:43:19 2008 +0900
@@ -213,13 +213,13 @@ static void pit_load_count(PITState *pit
     case 2:
     case 3:
         /* Periodic timer. */
-        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, 
+        create_periodic_time(v, &pit->pt0, period, period, 0, pit_time_fired, 
                              &pit->count_load_time[channel]);
         break;
     case 1:
     case 4:
         /* One-shot timer. */
-        create_periodic_time(v, &pit->pt0, period, 0, 1, pit_time_fired,
+        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired,
                              &pit->count_load_time[channel]);
         break;
     default:
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/rtc.c
--- a/xen/arch/x86/hvm/rtc.c    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/rtc.c    Tue Nov 04 12:43:19 2008 +0900
@@ -59,8 +59,8 @@ static void rtc_timer_update(RTCState *s
 
         period = 1 << (period_code - 1); /* period in 32 Khz cycles */
         period = DIV_ROUND((period * 1000000000ULL), 32768); /* period in ns */
-        create_periodic_time(v, &s->pt, period, RTC_IRQ,
-                             0, rtc_periodic_cb, s);
+        create_periodic_time(v, &s->pt, period, period, RTC_IRQ,
+                             rtc_periodic_cb, s);
     }
     else
     {
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/svm/entry.S
--- a/xen/arch/x86/hvm/svm/entry.S      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/svm/entry.S      Tue Nov 04 12:43:19 2008 +0900
@@ -57,6 +57,8 @@
 #endif
 
 ENTRY(svm_asm_do_resume)
+        call svm_intr_assist
+
         get_current(bx)
         CLGI
 
@@ -67,7 +69,6 @@ ENTRY(svm_asm_do_resume)
         jnz  .Lsvm_process_softirqs
 
         call svm_asid_handle_vmrun
-        call svm_intr_assist
 
         cmpb $0,addr_of(tb_init_done)
         jnz  .Lsvm_trace
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vlapic.c Tue Nov 04 12:43:19 2008 +0900
@@ -701,8 +701,9 @@ static int vlapic_write(struct vcpu *v, 
                             (uint32_t)val * vlapic->hw.timer_divisor;
 
         vlapic_set_reg(vlapic, APIC_TMICT, val);
-        create_periodic_time(current, &vlapic->pt, period, vlapic->pt.irq,
-                             !vlapic_lvtt_period(vlapic), vlapic_pt_cb,
+        create_periodic_time(current, &vlapic->pt, period, 
+                             vlapic_lvtt_period(vlapic) ? period : 0,
+                             vlapic->pt.irq, vlapic_pt_cb,
                              &vlapic->timer_last_update);
         vlapic->timer_last_update = vlapic->pt.last_plt_gtime;
 
@@ -861,8 +862,9 @@ static void lapic_rearm(struct vlapic *s
     period = ((uint64_t)APIC_BUS_CYCLE_NS *
               (uint32_t)tmict * s->hw.timer_divisor);
     s->pt.irq = vlapic_get_reg(s, APIC_LVTT) & APIC_VECTOR_MASK;
-    create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq,
-                         !vlapic_lvtt_period(s), vlapic_pt_cb,
+    create_periodic_time(vlapic_vcpu(s), &s->pt, period,
+                         vlapic_lvtt_period(s) ? period : 0,
+                         s->pt.irq, vlapic_pt_cb,
                          &s->timer_last_update);
     s->timer_last_update = s->pt.last_plt_gtime;
 }
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vmx/entry.S
--- a/xen/arch/x86/hvm/vmx/entry.S      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/entry.S      Tue Nov 04 12:43:19 2008 +0900
@@ -122,6 +122,8 @@ vmx_asm_vmexit_handler:
 
 .globl vmx_asm_do_vmentry
 vmx_asm_do_vmentry:
+        call vmx_intr_assist
+
         get_current(bx)
         cli
 
@@ -130,8 +132,6 @@ vmx_asm_do_vmentry:
         lea  addr_of(irq_stat),r(dx)
         cmpl $0,(r(dx),r(ax),1)
         jnz  .Lvmx_process_softirqs
-
-        call vmx_intr_assist
 
         testb $0xff,VCPU_vmx_emul(r(bx))
         jnz  .Lvmx_goto_realmode
@@ -179,11 +179,13 @@ vmx_asm_do_vmentry:
 
 /*.Lvmx_resume:*/
         VMRESUME
+        sti
         call vm_resume_fail
         ud2
 
 .Lvmx_launch:
         VMLAUNCH
+        sti
         call vm_launch_fail
         ud2
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Tue Nov 04 12:43:19 2008 +0900
@@ -49,6 +49,7 @@
 #include <asm/hvm/vpt.h>
 #include <public/hvm/save.h>
 #include <asm/hvm/trace.h>
+#include <asm/xenoprof.h>
 
 enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
 
@@ -132,6 +133,7 @@ static void vmx_vcpu_destroy(struct vcpu
 {
     vmx_destroy_vmcs(v);
     vpmu_destroy(v);
+    passive_domain_destroy(v);
 }
 
 #ifdef __x86_64__
@@ -1666,6 +1668,8 @@ static int vmx_msr_read_intercept(struct
     default:
         if ( vpmu_do_rdmsr(regs) )
             goto done;
+        if ( passive_domain_do_rdmsr(regs) )
+            goto done;
         switch ( long_mode_do_msr_read(regs) )
         {
             case HNDL_unhandled:
@@ -1860,6 +1864,8 @@ static int vmx_msr_write_intercept(struc
         goto gp_fault;
     default:
         if ( vpmu_do_wrmsr(regs) )
+            return X86EMUL_OKAY;
+        if ( passive_domain_do_wrmsr(regs) )
             return X86EMUL_OKAY;
 
         if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) ) 
@@ -1964,27 +1970,25 @@ static void ept_handle_violation(unsigne
 {
     unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK;
     struct domain *d = current->domain;
-    unsigned long gfn = gpa >> PAGE_SHIFT;
+    unsigned long gla, gfn = gpa >> PAGE_SHIFT;
     mfn_t mfn;
     p2m_type_t t;
 
-    if ( unlikely(qualification & EPT_GAW_VIOLATION) )
-    {
-        gdprintk(XENLOG_ERR, "EPT violation: guest physical address %"PRIpaddr
-                 " exceeded its width limit.\n", gpa);
-        goto crash;
-    }
-
-    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_RSVD) ||
-         unlikely(gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD) )
-    {
-        gdprintk(XENLOG_ERR, "EPT violation: reserved bit or "
-                 "pdptr load violation.\n");
-        goto crash;
-    }
-
     mfn = gfn_to_mfn(d, gfn, &t);
-    if ( (t != p2m_ram_ro) && p2m_is_ram(t) && paging_mode_log_dirty(d) )
+
+    /* There are two legitimate reasons for taking an EPT violation. 
+     * One is a guest access to MMIO space. */
+    if ( gla_validity == EPT_GLA_VALIDITY_MATCH && p2m_is_mmio(t) )
+    {
+        handle_mmio();
+        return;
+    }
+
+    /* The other is log-dirty mode, writing to a read-only page */
+    if ( paging_mode_log_dirty(d)
+         && (gla_validity == EPT_GLA_VALIDITY_MATCH
+             || gla_validity == EPT_GLA_VALIDITY_GPT_WALK)
+         && p2m_is_ram(t) && (t != p2m_ram_ro) )
     {
         paging_mark_dirty(d, mfn_x(mfn));
         p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw);
@@ -1992,16 +1996,39 @@ static void ept_handle_violation(unsigne
         return;
     }
 
-    /* This can only happen in log-dirty mode, writing back A/D bits. */
-    if ( unlikely(gla_validity == EPT_GLA_VALIDITY_GPT_WALK) )
-        goto crash;
-
-    ASSERT(gla_validity == EPT_GLA_VALIDITY_MATCH);
-    handle_mmio();
-
-    return;
-
- crash:
+    /* Everything else is an error. */
+    gla = __vmread(GUEST_LINEAR_ADDRESS);
+    gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), "
+             "gpa %#"PRIpaddr", mfn %#lx, type %i.\n", 
+             qualification, 
+             (qualification & EPT_READ_VIOLATION) ? 'r' : '-',
+             (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-',
+             (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-',
+             (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-',
+             (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-',
+             (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-',
+             gpa, mfn_x(mfn), t);
+
+    if ( qualification & EPT_GAW_VIOLATION )
+        gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n", 
+                 9 * (unsigned) d->arch.hvm_domain.vmx.ept_control.gaw + 21);
+
+    switch ( gla_validity )
+    {
+    case EPT_GLA_VALIDITY_PDPTR_LOAD:
+        gdprintk(XENLOG_ERR, " --- PDPTR load failed\n"); 
+        break;
+    case EPT_GLA_VALIDITY_GPT_WALK:
+        gdprintk(XENLOG_ERR, " --- guest PT walk to %#lx failed\n", gla);
+        break;
+    case EPT_GLA_VALIDITY_RSVD:
+        gdprintk(XENLOG_ERR, " --- GLA_validity 2 (reserved)\n");
+        break;
+    case EPT_GLA_VALIDITY_MATCH:
+        gdprintk(XENLOG_ERR, " --- guest access to %#lx failed\n", gla);
+        break;
+    }
+
     domain_crash(d);
 }
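
ept_handle_violation() now handles the two legitimate cases first (MMIO
access, log-dirty write) and funnels everything else into one diagnostic
that decodes the qualification word bit by bit before crashing the domain.
A standalone sketch of that rwx/rwx decode style, with illustrative bit
positions rather than the VMX-defined ones:

#include <stdio.h>

#define Q_READ_VIOLATION   (1u << 0)
#define Q_WRITE_VIOLATION  (1u << 1)
#define Q_EXEC_VIOLATION   (1u << 2)
#define Q_EFF_READ         (1u << 3)
#define Q_EFF_WRITE        (1u << 4)
#define Q_EFF_EXEC         (1u << 5)

static void decode(unsigned long q)
{
    /* attempted access / effective permissions, one char per bit */
    printf("EPT violation %#lx (%c%c%c/%c%c%c)\n", q,
           (q & Q_READ_VIOLATION)  ? 'r' : '-',
           (q & Q_WRITE_VIOLATION) ? 'w' : '-',
           (q & Q_EXEC_VIOLATION)  ? 'x' : '-',
           (q & Q_EFF_READ)        ? 'r' : '-',
           (q & Q_EFF_WRITE)       ? 'w' : '-',
           (q & Q_EFF_EXEC)        ? 'x' : '-');
}

int main(void)
{
    decode(Q_WRITE_VIOLATION | Q_EFF_READ | Q_EFF_WRITE);
    return 0;
}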
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vmx/vpmu_core2.c
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c Tue Nov 04 12:43:19 2008 +0900
@@ -35,6 +35,26 @@
 #include <asm/hvm/vmx/vpmu.h>
 #include <asm/hvm/vmx/vpmu_core2.h>
 
+u32 core2_counters_msr[] =   {
+    MSR_CORE_PERF_FIXED_CTR0,
+    MSR_CORE_PERF_FIXED_CTR1,
+    MSR_CORE_PERF_FIXED_CTR2};
+
+/* Core 2 Non-architectural Performance Control MSRs. */
+u32 core2_ctrls_msr[] = {
+    MSR_CORE_PERF_FIXED_CTR_CTRL,
+    MSR_IA32_PEBS_ENABLE,
+    MSR_IA32_DS_AREA};
+
+struct pmumsr core2_counters = {
+    3,
+    core2_counters_msr
+};
+
+struct pmumsr core2_ctrls = {
+    3,
+    core2_ctrls_msr
+};
 static int arch_pmc_cnt;
 
 static int core2_get_pmc_count(void)
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vpt.c
--- a/xen/arch/x86/hvm/vpt.c    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/hvm/vpt.c    Tue Nov 04 12:43:19 2008 +0900
@@ -355,8 +355,8 @@ void pt_migrate(struct vcpu *v)
 }
 
 void create_periodic_time(
-    struct vcpu *v, struct periodic_time *pt, uint64_t period,
-    uint8_t irq, char one_shot, time_cb *cb, void *data)
+    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
+    uint64_t period, uint8_t irq, time_cb *cb, void *data)
 {
     ASSERT(pt->source != 0);
 
@@ -368,13 +368,13 @@ void create_periodic_time(
     pt->do_not_freeze = 0;
     pt->irq_issued = 0;
 
-    /* Periodic timer must be at least 0.9ms. */
-    if ( (period < 900000) && !one_shot )
+    /* Periodic timer must be at least 0.1ms. */
+    if ( (period < 100000) && period )
     {
         if ( !test_and_set_bool(pt->warned_timeout_too_short) )
             gdprintk(XENLOG_WARNING, "HVM_PlatformTime: program too "
                      "small period %"PRIu64"\n", period);
-        period = 900000;
+        period = 100000;
     }
 
     pt->period = period;
@@ -382,15 +382,15 @@ void create_periodic_time(
     pt->last_plt_gtime = hvm_get_guest_time(pt->vcpu);
     pt->irq = irq;
     pt->period_cycles = (u64)period;
-    pt->one_shot = one_shot;
-    pt->scheduled = NOW() + period;
+    pt->one_shot = !period;
+    pt->scheduled = NOW() + delta;
     /*
      * Offset LAPIC ticks from other timer ticks. Otherwise guests which use
      * LAPIC ticks for process accounting can see long sequences of process
      * ticks incorrectly accounted to interrupt processing.
      */
-    if ( pt->source == PTSRC_lapic )
-        pt->scheduled += period >> 1;
+    if ( !pt->one_shot && (pt->source == PTSRC_lapic) )
+        pt->scheduled += delta >> 1;
     pt->cb = cb;
     pt->priv = data;
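
create_periodic_time() above trades its (period, one_shot) pair for
(delta, period): delta is the time until the first expiry, and period == 0
now means one-shot, which is what the i8254, RTC, vlapic and HPET call sites
above were converted to. A sketch of the call-site translation under that
convention (hypothetical wrapper, not the Xen prototype):

#include <stdint.h>
#include <stdio.h>

static void create_periodic_time_new(uint64_t delta, uint64_t period,
                                     unsigned int irq)
{
    int one_shot = (period == 0);
    printf("first fire in %llu ns, %s (irq %u)\n",
           (unsigned long long)delta,
           one_shot ? "one-shot" : "periodic", irq);
}

/* Old-style call: periodic with period P   -> new (P, P, irq)
 * Old-style call: one-shot firing after D  -> new (D, 0, irq) */
int main(void)
{
    create_periodic_time_new(1000000, 1000000, 0);  /* periodic PIT channel */
    create_periodic_time_new(1000000, 0,       0);  /* one-shot PIT channel */
    return 0;
}
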
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/irq.c        Tue Nov 04 12:43:19 2008 +0900
@@ -793,6 +793,10 @@ int map_domain_pirq(
 
     ASSERT(spin_is_locked(&d->event_lock));
 
+    /* XXX Until pcidev and msi locking is fixed. */
+    if ( type == MAP_PIRQ_TYPE_MSI )
+        return -EINVAL;
+
     if ( !IS_PRIV(current->domain) )
         return -EPERM;
 
@@ -840,7 +844,7 @@ int map_domain_pirq(
     d->arch.pirq_vector[pirq] = vector;
     d->arch.vector_pirq[vector] = pirq;
 
-done:
+ done:
     spin_unlock_irqrestore(&desc->lock, flags);
     return ret;
 }
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/mm.c Tue Nov 04 12:43:19 2008 +0900
@@ -566,19 +566,21 @@ static int get_page_and_type_from_pagenr
 static int get_page_and_type_from_pagenr(unsigned long page_nr, 
                                          unsigned long type,
                                          struct domain *d,
+                                         int partial,
                                          int preemptible)
 {
     struct page_info *page = mfn_to_page(page_nr);
     int rc;
 
-    if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
+    if ( likely(partial >= 0) &&
+         unlikely(!get_page_from_pagenr(page_nr, d)) )
         return -EINVAL;
 
     rc = (preemptible ?
           get_page_type_preemptible(page, type) :
           (get_page_type(page, type) ? 0 : -EINVAL));
 
-    if ( rc )
+    if ( unlikely(rc) && partial >= 0 )
         put_page(page);
 
     return rc;
@@ -761,7 +763,7 @@ get_page_from_l2e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
+        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
     if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
         rc = 0;
 
@@ -772,7 +774,7 @@ define_get_linear_pagetable(l3);
 define_get_linear_pagetable(l3);
 static int
 get_page_from_l3e(
-    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
+    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -786,7 +788,7 @@ get_page_from_l3e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
+        l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible);
     if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
         rc = 0;
 
@@ -797,7 +799,7 @@ define_get_linear_pagetable(l4);
 define_get_linear_pagetable(l4);
 static int
 get_page_from_l4e(
-    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
+    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible)
 {
     int rc;
 
@@ -811,7 +813,7 @@ get_page_from_l4e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
+        l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible);
     if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
         rc = 0;
 
@@ -961,23 +963,32 @@ static int put_page_from_l2e(l2_pgentry_
     return 1;
 }
 
+static int __put_page_type(struct page_info *, int preemptible);
 
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-                             int preemptible)
+                             int partial, int preemptible)
 {
     if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
          (l3e_get_pfn(l3e) != pfn) )
+    {
+        if ( unlikely(partial > 0) )
+            return __put_page_type(l3e_get_page(l3e), preemptible);
         return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+    }
     return 1;
 }
 
 #if CONFIG_PAGING_LEVELS >= 4
 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
-                             int preemptible)
+                             int partial, int preemptible)
 {
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
+    {
+        if ( unlikely(partial > 0) )
+            return __put_page_type(l4e_get_page(l4e), preemptible);
         return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+    }
     return 1;
 }
 #endif
@@ -1184,7 +1195,7 @@ static int alloc_l3_table(struct page_in
     unsigned long  pfn = page_to_mfn(page);
     l3_pgentry_t  *pl3e;
     unsigned int   i;
-    int            rc = 0;
+    int            rc = 0, partial = page->partial_pte;
 
 #if CONFIG_PAGING_LEVELS == 3
     /*
@@ -1213,7 +1224,8 @@ static int alloc_l3_table(struct page_in
     if ( is_pv_32on64_domain(d) )
         memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
 
-    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
         if ( is_pv_32bit_domain(d) && (i == 3) )
         {
@@ -1224,16 +1236,17 @@ static int alloc_l3_table(struct page_in
                 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
                                                    PGT_l2_page_table |
                                                    PGT_pae_xen_l2,
-                                                   d, preemptible);
+                                                   d, partial, preemptible);
         }
         else if ( !is_guest_l3_slot(i) ||
-                  (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
+                  (rc = get_page_from_l3e(pl3e[i], pfn, d,
+                                          partial, preemptible)) > 0 )
             continue;
 
         if ( rc == -EAGAIN )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 1;
+            page->partial_pte = partial ?: 1;
         }
         else if ( rc == -EINTR && i )
         {
@@ -1257,7 +1270,7 @@ static int alloc_l3_table(struct page_in
             if ( !is_guest_l3_slot(i) )
                 continue;
             unadjust_guest_l3e(pl3e[i], d);
-            put_page_from_l3e(pl3e[i], pfn, 0);
+            put_page_from_l3e(pl3e[i], pfn, 0, 0);
         }
     }
 
@@ -1272,18 +1285,20 @@ static int alloc_l4_table(struct page_in
     unsigned long  pfn = page_to_mfn(page);
     l4_pgentry_t  *pl4e = page_to_virt(page);
     unsigned int   i;
-    int            rc = 0;
-
-    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
+    int            rc = 0, partial = page->partial_pte;
+
+    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
     {
         if ( !is_guest_l4_slot(d, i) ||
-             (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
+             (rc = get_page_from_l4e(pl4e[i], pfn, d,
+                                     partial, preemptible)) > 0 )
             continue;
 
         if ( rc == -EAGAIN )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 1;
+            page->partial_pte = partial ?: 1;
         }
         else if ( rc == -EINTR )
         {
@@ -1299,7 +1314,7 @@ static int alloc_l4_table(struct page_in
             MEM_LOG("Failure in alloc_l4_table: entry %d", i);
             while ( i-- > 0 )
                 if ( is_guest_l4_slot(d, i) )
-                    put_page_from_l4e(pl4e[i], pfn, 0);
+                    put_page_from_l4e(pl4e[i], pfn, 0, 0);
         }
         if ( rc < 0 )
             return rc;
@@ -1377,24 +1392,20 @@ static int free_l3_table(struct page_inf
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l3_pgentry_t *pl3e;
-    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
-    int rc = 0;
-
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-    if ( d->arch.relmem == RELMEM_l3 )
-        return 0;
-#endif
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
     pl3e = map_domain_page(pfn);
 
     do {
         if ( is_guest_l3_slot(i) )
         {
-            rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
+            rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible);
+            if ( rc < 0 )
+                break;
+            partial = 0;
             if ( rc > 0 )
                 continue;
-            if ( rc )
-                break;
             unadjust_guest_l3e(pl3e[i], d);
         }
     } while ( i-- );
@@ -1404,7 +1415,7 @@ static int free_l3_table(struct page_inf
     if ( rc == -EAGAIN )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = 1;
+        page->partial_pte = partial ?: -1;
     }
     else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
     {
@@ -1421,23 +1432,21 @@ static int free_l4_table(struct page_inf
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l4_pgentry_t *pl4e = page_to_virt(page);
-    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
-    int rc = 0;
-
-#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
-    if ( d->arch.relmem == RELMEM_l4 )
-        return 0;
-#endif
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
 
     do {
         if ( is_guest_l4_slot(d, i) )
-            rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
-    } while ( rc >= 0 && i-- );
+            rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible);
+        if ( rc < 0 )
+            break;
+        partial = 0;
+    } while ( i-- );
 
     if ( rc == -EAGAIN )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = 1;
+        page->partial_pte = partial ?: -1;
     }
     else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
     {
@@ -1713,7 +1722,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
             return rc ? 0 : -EFAULT;
         }
 
-        rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
+        rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible);
         if ( unlikely(rc < 0) )
             return page_unlock(l3pg), rc;
         rc = 0;
@@ -1742,7 +1751,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
     }
 
     page_unlock(l3pg);
-    put_page_from_l3e(ol3e, pfn, 0);
+    put_page_from_l3e(ol3e, pfn, 0, 0);
     return rc;
 }
 
@@ -1791,7 +1800,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
             return rc ? 0 : -EFAULT;
         }
 
-        rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
+        rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible);
         if ( unlikely(rc < 0) )
             return page_unlock(l4pg), rc;
         rc = 0;
@@ -1812,7 +1821,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
     }
 
     page_unlock(l4pg);
-    put_page_from_l4e(ol4e, pfn, 0);
+    put_page_from_l4e(ol4e, pfn, 0, 0);
     return rc;
 }
 
@@ -1847,7 +1856,8 @@ int get_page(struct page_info *page, str
         nx = x + 1;
         d  = nd;
         if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
-             unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
+             /* Keep one spare reference to be acquired by get_page_light(). */
+             unlikely(((nx + 1) & PGC_count_mask) <= 1) || /* Overflow? */
              unlikely(d != _domain) )                /* Wrong owner? */
         {
             if ( !_shadow_mode_refcounts(domain) && !domain->is_dying )
@@ -1867,6 +1877,28 @@ int get_page(struct page_info *page, str
     while ( unlikely(nd != d) || unlikely(y != x) );
 
     return 1;
+}
+
+/*
+ * Special version of get_page() to be used exclusively when
+ * - a page is known to already have a non-zero reference count
+ * - the page does not need its owner to be checked
+ * - it will not be called more than once without dropping the thus
+ *   acquired reference again.
+ * Due to get_page() reserving one reference, this call cannot fail.
+ */
+static void get_page_light(struct page_info *page)
+{
+    u32 x, nx, y = page->count_info;
+
+    do {
+        x  = y;
+        nx = x + 1;
+        BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
+        BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
+        y = cmpxchg(&page->count_info, x, nx);
+    }
+    while ( unlikely(y != x) );
 }
 
 
@@ -1909,6 +1941,7 @@ static int alloc_page_type(struct page_i
     wmb();
     if ( rc == -EAGAIN )
     {
+        get_page_light(page);
         page->u.inuse.type_info |= PGT_partial;
     }
     else if ( rc == -EINTR )
@@ -1973,6 +2006,7 @@ int free_page_type(struct page_info *pag
         page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
         page->partial_pte = 0;
     }
+
     switch ( type & PGT_type_mask )
     {
     case PGT_l1_page_table:
@@ -1998,6 +2032,15 @@ int free_page_type(struct page_info *pag
         BUG();
     }
 
+    return rc;
+}
+
+
+static int __put_final_page_type(
+    struct page_info *page, unsigned long type, int preemptible)
+{
+    int rc = free_page_type(page, type, preemptible);
+
     /* No need for atomic update of type_info here: noone else updates it. */
     if ( rc == 0 )
     {
@@ -2016,8 +2059,8 @@ int free_page_type(struct page_info *pag
     }
     else if ( rc == -EINTR )
     {
-        ASSERT(!(page->u.inuse.type_info &
-                 (PGT_count_mask|PGT_validated|PGT_partial)));
+        ASSERT((page->u.inuse.type_info &
+                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
         if ( !(shadow_mode_enabled(page_get_owner(page)) &&
                (page->count_info & PGC_page_table)) )
             page->tlbflush_timestamp = tlbflush_current_time();
@@ -2028,6 +2071,7 @@ int free_page_type(struct page_info *pag
     {
         BUG_ON(rc != -EAGAIN);
         wmb();
+        get_page_light(page);
         page->u.inuse.type_info |= PGT_partial;
     }
 
@@ -2039,6 +2083,7 @@ static int __put_page_type(struct page_i
                            int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
+    int rc = 0;
 
     for ( ; ; )
     {
@@ -2062,7 +2107,10 @@ static int __put_page_type(struct page_i
                                            x, nx)) != x) )
                     continue;
                 /* We cleared the 'valid bit' so we do the clean up. */
-                return free_page_type(page, x, preemptible);
+                rc = __put_final_page_type(page, x, preemptible);
+                if ( x & PGT_partial )
+                    put_page(page);
+                break;
             }
 
             /*
@@ -2084,7 +2132,7 @@ static int __put_page_type(struct page_i
             return -EINTR;
     }
 
-    return 0;
+    return rc;
 }
 
 
@@ -2092,6 +2140,7 @@ static int __get_page_type(struct page_i
                            int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
+    int rc = 0;
 
     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
 
@@ -2214,10 +2263,13 @@ static int __get_page_type(struct page_i
             page->nr_validated_ptes = 0;
             page->partial_pte = 0;
         }
-        return alloc_page_type(page, type, preemptible);
-    }
-
-    return 0;
+        rc = alloc_page_type(page, type, preemptible);
+    }
+
+    if ( (x & PGT_partial) && !(nx & PGT_partial) )
+        put_page(page);
+
+    return rc;
 }
 
 void put_page_type(struct page_info *page)
@@ -2296,7 +2348,7 @@ int new_guest_cr3(unsigned long mfn)
 #endif
     okay = paging_mode_refcounts(d)
         ? get_page_from_pagenr(mfn, d)
-        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
+        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0);
     if ( unlikely(!okay) )
     {
         MEM_LOG("Error while installing new baseptr %lx", mfn);
@@ -2431,6 +2483,29 @@ static inline cpumask_t vcpumask_to_pcpu
     return pmask;
 }
 
+#ifdef __i386__
+static inline void *fixmap_domain_page(unsigned long mfn)
+{
+    unsigned int cpu = smp_processor_id();
+    void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
+
+    l1e_write(fix_pae_highmem_pl1e - cpu,
+              l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
+    flush_tlb_one_local(ptr);
+    return ptr;
+}
+static inline void fixunmap_domain_page(const void *ptr)
+{
+    unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0;
+
+    l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty());
+    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);
+}
+#else
+#define fixmap_domain_page(mfn) mfn_to_virt(mfn)
+#define fixunmap_domain_page(ptr) ((void)(ptr))
+#endif
+
 int do_mmuext_op(
     XEN_GUEST_HANDLE(mmuext_op_t) uops,
     unsigned int count,
@@ -2517,7 +2592,7 @@ int do_mmuext_op(
             if ( paging_mode_refcounts(FOREIGNDOM) )
                 break;
 
-            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
+            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1);
             okay = !rc;
             if ( unlikely(!okay) )
             {
@@ -2598,7 +2673,7 @@ int do_mmuext_op(
                     okay = get_page_from_pagenr(mfn, d);
                 else
                     okay = !get_page_and_type_from_pagenr(
-                        mfn, PGT_root_page_table, d, 0);
+                        mfn, PGT_root_page_table, d, 0, 0);
                 if ( unlikely(!okay) )
                 {
                     MEM_LOG("Error while installing new mfn %lx", mfn);
@@ -2697,6 +2772,66 @@ int do_mmuext_op(
                 if ( ents != 0 )
                     this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT;
             }
+            break;
+        }
+
+        case MMUEXT_CLEAR_PAGE:
+        {
+            unsigned char *ptr;
+
+            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
+                                                  FOREIGNDOM, 0, 0);
+            if ( unlikely(!okay) )
+            {
+                MEM_LOG("Error while clearing mfn %lx", mfn);
+                break;
+            }
+
+            /* A page is dirtied when it's being cleared. */
+            paging_mark_dirty(d, mfn);
+
+            ptr = fixmap_domain_page(mfn);
+            clear_page(ptr);
+            fixunmap_domain_page(ptr);
+
+            put_page_and_type(page);
+            break;
+        }
+
+        case MMUEXT_COPY_PAGE:
+        {
+            const unsigned char *src;
+            unsigned char *dst;
+            unsigned long src_mfn;
+
+            src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn);
+            okay = get_page_from_pagenr(src_mfn, FOREIGNDOM);
+            if ( unlikely(!okay) )
+            {
+                MEM_LOG("Error while copying from mfn %lx", src_mfn);
+                break;
+            }
+
+            okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page,
+                                                  FOREIGNDOM, 0, 0);
+            if ( unlikely(!okay) )
+            {
+                put_page(mfn_to_page(src_mfn));
+                MEM_LOG("Error while copying to mfn %lx", mfn);
+                break;
+            }
+
+            /* A page is dirtied when it's being copied to. */
+            paging_mark_dirty(d, mfn);
+
+            src = map_domain_page(src_mfn);
+            dst = fixmap_domain_page(mfn);
+            copy_page(dst, src);
+            fixunmap_domain_page(dst);
+            unmap_domain_page(src);
+
+            put_page_and_type(page);
+            put_page(mfn_to_page(src_mfn));
             break;
         }
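
The refcounting interplay above deserves a gloss: get_page() now refuses to hand out the last representable reference precisely so that a subsequent get_page_light() can never fail, which is what lets the PGT_partial paths take an extra reference after the fact. A standalone sketch of the scheme (not Xen code; the count-mask width and the GCC __sync builtins are assumptions for illustration):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PGC_COUNT_MASK 0x7fffffffu  /* assumed count-field width */

    static uint32_t count_info;

    static int get_page_sketch(void)
    {
        uint32_t x, nx, y = count_info;
        do {
            x  = y;
            nx = x + 1;
            /* Keep one spare reference for get_page_light(). */
            if ( ((nx + 1) & PGC_COUNT_MASK) <= 1 )
                return 0;
        } while ( (y = __sync_val_compare_and_swap(&count_info, x, nx)) != x );
        return 1;
    }

    static void get_page_light_sketch(void)
    {
        uint32_t x, nx, y = count_info;
        do {
            x  = y;
            nx = x + 1;
            assert(x & PGC_COUNT_MASK);   /* caller already holds a ref */
            assert(nx & PGC_COUNT_MASK);  /* cannot overflow: one ref is spare */
        } while ( (y = __sync_val_compare_and_swap(&count_info, x, nx)) != x );
    }

    int main(void)
    {
        count_info = 1;                /* page already allocated */
        if ( get_page_sketch() )
            get_page_light_sketch();   /* guaranteed to succeed */
        printf("count_info = %u\n", count_info);  /* prints 3 */
        return 0;
    }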
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/mm/hap/p2m-ept.c     Tue Nov 04 12:43:19 2008 +0900
@@ -157,9 +157,6 @@ ept_set_entry(struct domain *d, unsigned
     {
         if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
         {
-            /* Track the highest gfn for which we have ever had a valid mapping */
-            if ( gfn > d->arch.p2m->max_mapped_pfn )
-                d->arch.p2m->max_mapped_pfn = gfn;
             ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn));
             ept_entry->sp_avail = walk_level ? 1 : 0;
 
@@ -233,6 +230,11 @@ ept_set_entry(struct domain *d, unsigned
 
         unmap_domain_page(split_table);
     }
+
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn_x(mfn))
+         && (gfn + (1UL << order) - 1 > d->arch.p2m->max_mapped_pfn) )
+        d->arch.p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
 
     /* Success */
     rv = 1;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/mm/p2m.c     Tue Nov 04 12:43:19 2008 +0900
@@ -322,7 +322,8 @@ p2m_set_entry(struct domain *d, unsigned
     }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
+    if ( mfn_valid(mfn) 
+         && (gfn + (1UL << page_order) - 1 > d->arch.p2m->max_mapped_pfn) )
         d->arch.p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
 
     if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) )
@@ -956,18 +957,18 @@ guest_physmap_add_entry(struct domain *d
     /* First, remove m->p mappings for existing p->m mappings */
     for ( i = 0; i < (1UL << page_order); i++ )
     {
-        omfn = gfn_to_mfn(d, gfn, &ot);
+        omfn = gfn_to_mfn(d, gfn + i, &ot);
         if ( p2m_is_ram(ot) )
         {
             ASSERT(mfn_valid(omfn));
-            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
+            set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
         }
     }
 
     /* Then, look for m->p mappings for this range and deal with them */
     for ( i = 0; i < (1UL << page_order); i++ )
     {
-        ogfn = mfn_to_gfn(d, _mfn(mfn));
+        ogfn = mfn_to_gfn(d, _mfn(mfn+i));
         if (
 #ifdef __x86_64__
             (ogfn != 0x5555555555555555L)
@@ -975,20 +976,20 @@ guest_physmap_add_entry(struct domain *d
             (ogfn != 0x55555555L)
 #endif
             && (ogfn != INVALID_M2P_ENTRY)
-            && (ogfn != gfn) )
+            && (ogfn != gfn + i) )
         {
             /* This machine frame is already mapped at another physical
              * address */
             P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
-                      mfn, ogfn, gfn);
+                      mfn + i, ogfn, gfn + i);
             omfn = gfn_to_mfn(d, ogfn, &ot);
             if ( p2m_is_ram(ot) )
             {
                 ASSERT(mfn_valid(omfn));
                 P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                           ogfn , mfn_x(omfn));
-                if ( mfn_x(omfn) == mfn )
-                    p2m_remove_page(d, ogfn, mfn, 0);
+                if ( mfn_x(omfn) == (mfn + i) )
+                    p2m_remove_page(d, ogfn, mfn + i, 0);
             }
         }
     }
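
Both p2m hunks above fix order-aware bookkeeping: a mapping of 2^order frames ends at gfn + (1UL << order) - 1, and each of its frames must be visited as its own gfn+i -> mfn+i pair rather than repeating the first one. A standalone arithmetic check (all values made up):

    #include <stdio.h>

    int main(void)
    {
        unsigned long gfn = 0x1000, mfn = 0x8000, max_mapped_pfn = 0x234;
        unsigned int page_order = 9;  /* 512-frame (2MB) superpage */
        unsigned long i, last = gfn + (1UL << page_order) - 1;

        /* Track the end of the range, not its start, as the patch now does. */
        if ( last > max_mapped_pfn )
            max_mapped_pfn = last;

        /* Each frame of the superpage is a distinct gfn+i -> mfn+i pair. */
        for ( i = 0; i < 2; i++ )
            printf("gfn %#lx -> mfn %#lx\n", gfn + i, mfn + i);

        printf("max_mapped_pfn = %#lx\n", max_mapped_pfn);  /* 0x11ff */
        return 0;
    }
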
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/msi.c
--- a/xen/arch/x86/msi.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/msi.c        Tue Nov 04 12:43:19 2008 +0900
@@ -33,8 +33,7 @@ DECLARE_BITMAP(msix_fixmap_pages, MAX_MS
 
 static int msix_fixmap_alloc(void)
 {
-    int i;
-    int rc = -1;
+    int i, rc = -1;
 
     spin_lock(&msix_fixmap_lock);
     for ( i = 0; i < MAX_MSIX_PAGES; i++ )
@@ -52,12 +51,8 @@ static int msix_fixmap_alloc(void)
 
 static void msix_fixmap_free(int idx)
 {
-    if ( idx < FIX_MSIX_IO_RESERV_BASE )
-        return;
-
-    spin_lock(&msix_fixmap_lock);
-    clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
-    spin_unlock(&msix_fixmap_lock);
+    if ( idx >= FIX_MSIX_IO_RESERV_BASE )
+        clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages);
 }
 
 /*
@@ -78,19 +73,19 @@ static void msi_compose_msg(struct pci_d
         msg->address_lo =
             MSI_ADDR_BASE_LO |
             ((INT_DEST_MODE == 0) ?
-                MSI_ADDR_DESTMODE_PHYS:
-                MSI_ADDR_DESTMODE_LOGIC) |
+             MSI_ADDR_DESTMODE_PHYS:
+             MSI_ADDR_DESTMODE_LOGIC) |
             ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                MSI_ADDR_REDIRECTION_CPU:
-                MSI_ADDR_REDIRECTION_LOWPRI) |
+             MSI_ADDR_REDIRECTION_CPU:
+             MSI_ADDR_REDIRECTION_LOWPRI) |
             MSI_ADDR_DEST_ID(dest);
 
         msg->data =
             MSI_DATA_TRIGGER_EDGE |
             MSI_DATA_LEVEL_ASSERT |
             ((INT_DELIVERY_MODE != dest_LowestPrio) ?
-                MSI_DATA_DELIVERY_FIXED:
-                MSI_DATA_DELIVERY_LOWPRI) |
+             MSI_DATA_DELIVERY_FIXED:
+             MSI_DATA_DELIVERY_LOWPRI) |
             MSI_DATA_VECTOR(vector);
     }
 }
@@ -128,7 +123,7 @@ static void read_msi_msg(struct msi_desc
     {
         void __iomem *base;
         base = entry->mask_base +
-           entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+            entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
 
         msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
         msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
@@ -205,9 +200,9 @@ static void write_msi_msg(struct msi_des
             entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
 
         writel(msg->address_lo,
-            base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
+               base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET);
         writel(msg->address_hi,
-            base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
+               base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET);
         writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET);
         break;
     }
@@ -230,7 +225,7 @@ void set_msi_irq_affinity(unsigned int i
     dest = cpu_mask_to_apicid(mask);
 
     if ( !desc )
-       return;
+        return;
 
     ASSERT(spin_is_locked(&irq_desc[irq].lock));
     spin_lock(&desc->dev->lock);
@@ -398,8 +393,8 @@ static void msi_free_vector(int vector)
         unsigned long start;
 
         writel(1, entry->mask_base + entry->msi_attrib.entry_nr
-              * PCI_MSIX_ENTRY_SIZE
-              + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
+               * PCI_MSIX_ENTRY_SIZE
+               + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET);
 
         start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1);
         msix_fixmap_free(virt_to_fix(start));
@@ -460,20 +455,20 @@ static int msi_capability_init(struct pc
     entry->vector = vector;
     if ( is_mask_bit_support(control) )
         entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos,
-                is_64bit_address(control));
+                                                                   is_64bit_address(control));
     entry->dev = dev;
     if ( entry->msi_attrib.maskbit )
     {
         unsigned int maskbits, temp;
         /* All MSIs are unmasked by default, Mask them all */
         maskbits = pci_conf_read32(bus, slot, func,
-                       msi_mask_bits_reg(pos, is_64bit_address(control)));
+                                   msi_mask_bits_reg(pos, is_64bit_address(control)));
         temp = (1 << multi_msi_capable(control));
         temp = ((temp - 1) & ~temp);
         maskbits |= temp;
         pci_conf_write32(bus, slot, func,
-            msi_mask_bits_reg(pos, is_64bit_address(control)),
-            maskbits);
+                         msi_mask_bits_reg(pos, is_64bit_address(control)),
+                         maskbits);
     }
     list_add_tail(&entry->list, &dev->msi_list);
 
@@ -575,14 +570,14 @@ static int __pci_enable_msi(struct msi_i
 
     pdev = pci_lock_pdev(msi->bus, msi->devfn);
     if ( !pdev )
-       return -ENODEV;
+        return -ENODEV;
 
     if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSI) )
     {
-       spin_unlock(&pdev->lock);
+        spin_unlock(&pdev->lock);
         dprintk(XENLOG_WARNING, "vector %d has already mapped to MSI on "
-            "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
-            PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
+                "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
+                PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
         return 0;
     }
 
@@ -601,7 +596,7 @@ static void __pci_disable_msi(int vector
 
     entry = irq_desc[vector].msi_desc;
     if ( !entry )
-       return;
+        return;
     /*
      * Lock here is safe.  msi_desc can not be removed without holding
      * both irq_desc[].lock (which we do) and pdev->lock.
@@ -649,20 +644,20 @@ static int __pci_enable_msix(struct msi_
 
     pdev = pci_lock_pdev(msi->bus, msi->devfn);
     if ( !pdev )
-       return -ENODEV;
+        return -ENODEV;
 
     pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX);
     control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos));
     nr_entries = multi_msix_capable(control);
     if (msi->entry_nr > nr_entries)
     {
-       spin_unlock(&pdev->lock);
+        spin_unlock(&pdev->lock);
         return -EINVAL;
     }
 
     if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSIX) )
     {
-       spin_unlock(&pdev->lock);
+        spin_unlock(&pdev->lock);
         dprintk(XENLOG_WARNING, "vector %d has already mapped to MSIX on "
                 "device %02x:%02x.%01x.\n", msi->vector, msi->bus,
                 PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn));
@@ -684,7 +679,7 @@ static void __pci_disable_msix(int vecto
 
     entry = irq_desc[vector].msi_desc;
     if ( !entry )
-       return;
+        return;
     /*
      * Lock here is safe.  msi_desc can not be removed without holding
      * both irq_desc[].lock (which we do) and pdev->lock.
@@ -712,7 +707,7 @@ int pci_enable_msi(struct msi_info *msi)
     ASSERT(spin_is_locked(&irq_desc[msi->vector].lock));
 
     return  msi->table_base ? __pci_enable_msix(msi) :
-                              __pci_enable_msi(msi);
+        __pci_enable_msi(msi);
 }
 
 void pci_disable_msi(int vector)
@@ -720,7 +715,7 @@ void pci_disable_msi(int vector)
     irq_desc_t *desc = &irq_desc[vector];
     ASSERT(spin_is_locked(&desc->lock));
     if ( !desc->msi_desc )
-       return;
+        return;
 
     if ( desc->msi_desc->msi_attrib.type == PCI_CAP_ID_MSI )
         __pci_disable_msi(vector);
@@ -734,7 +729,7 @@ static void msi_free_vectors(struct pci_
     irq_desc_t *desc;
     unsigned long flags;
 
-retry:
+ retry:
     list_for_each_entry_safe( entry, tmp, &dev->msi_list, list )
     {
         desc = &irq_desc[entry->vector];
@@ -742,7 +737,7 @@ retry:
         local_irq_save(flags);
         if ( !spin_trylock(&desc->lock) )
         {
-             local_irq_restore(flags);
+            local_irq_restore(flags);
             goto retry;
         }
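
The msix_fixmap_free() simplification above works because clear_bit() is atomic, so only the allocation scan still needs msix_fixmap_lock. A user-space model of the same allocator shape (pool size, base index, and pthread locking are illustrative stand-ins):

    #include <pthread.h>
    #include <stdio.h>

    #define MAX_MSIX_PAGES          8    /* illustrative */
    #define FIX_MSIX_IO_RESERV_BASE 100  /* illustrative fixmap base */

    static unsigned long bitmap;  /* one bit per fixmap page */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static int fixmap_alloc(void)
    {
        int i, rc = -1;

        pthread_mutex_lock(&lock);
        for ( i = 0; i < MAX_MSIX_PAGES; i++ )
            if ( !(bitmap & (1UL << i)) )
            {
                bitmap |= 1UL << i;
                rc = FIX_MSIX_IO_RESERV_BASE + i;
                break;
            }
        pthread_mutex_unlock(&lock);
        return rc;
    }

    static void fixmap_free(int idx)
    {
        /* Atomic clear needs no lock, mirroring the patched code. */
        if ( idx >= FIX_MSIX_IO_RESERV_BASE )
            __sync_fetch_and_and(&bitmap,
                                 ~(1UL << (idx - FIX_MSIX_IO_RESERV_BASE)));
    }

    int main(void)
    {
        int a = fixmap_alloc(), b = fixmap_alloc();
        printf("%d %d\n", a, b);          /* 100 101 */
        fixmap_free(a);
        printf("%d\n", fixmap_alloc());   /* 100 again */
        return 0;
    }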
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/oprofile/nmi_int.c
--- a/xen/arch/x86/oprofile/nmi_int.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/oprofile/nmi_int.c   Tue Nov 04 12:43:19 2008 +0900
@@ -36,6 +36,55 @@ static char *cpu_type;
 static char *cpu_type;
 
 extern int is_active(struct domain *d);
+extern int is_passive(struct domain *d);
+
+int passive_domain_do_rdmsr(struct cpu_user_regs *regs)
+{
+       u64 msr_content;
+       int type, index;
+       struct vpmu_struct *vpmu = vcpu_vpmu(current);
+
+       if ( model->is_arch_pmu_msr == NULL )
+               return 0;
+       if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) )
+               return 0;
+       if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
+               if ( ! model->allocated_msr(current) )
+                       return 0;
+
+       model->load_msr(current, type, index, &msr_content);
+       regs->eax = msr_content & 0xFFFFFFFF;
+       regs->edx = msr_content >> 32;
+       return 1;
+}
+
+
+int passive_domain_do_wrmsr(struct cpu_user_regs *regs)
+{
+       u64 msr_content;
+       int type, index;
+       struct vpmu_struct *vpmu = vcpu_vpmu(current);
+
+       if ( model->is_arch_pmu_msr == NULL )
+               return 0;
+       if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) )
+               return 0;
+
+       if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) )
+               if ( ! model->allocated_msr(current) )
+                       return 0;
+
+       msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
+       model->save_msr(current, type, index, msr_content);
+       return 1;
+}
+
+void passive_domain_destroy(struct vcpu *v)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(v);
+       if ( vpmu->flags & PASSIVE_DOMAIN_ALLOCATED )
+               model->free_msr(v);
+}
 
 static int nmi_callback(struct cpu_user_regs *regs, int cpu)
 {
@@ -46,6 +95,8 @@ static int nmi_callback(struct cpu_user_
        if ( ovf && is_active(current->domain) && !xen_mode )
                send_guest_vcpu_virq(current, VIRQ_XENOPROF);
 
+       if ( ovf == 2 ) 
+                test_and_set_bool(current->nmi_pending);
        return 1;
 }
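
passive_domain_do_rdmsr()/passive_domain_do_wrmsr() above split and reassemble 64-bit MSR values through the guest's eax/edx pair. A standalone round-trip check of that packing:

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t msr_content = 0x1122334455667788ULL;

        /* rdmsr path: low half in eax, high half in edx */
        uint32_t eax = msr_content & 0xFFFFFFFF;
        uint32_t edx = msr_content >> 32;

        /* wrmsr path: reassemble as in passive_domain_do_wrmsr() */
        uint64_t back = (uint32_t)eax | ((uint64_t)edx << 32);

        printf("eax=%#" PRIx32 " edx=%#" PRIx32 " roundtrip=%#" PRIx64 "\n",
               eax, edx, back);
        return 0;
    }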
  
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/oprofile/op_model_ppro.c
--- a/xen/arch/x86/oprofile/op_model_ppro.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/oprofile/op_model_ppro.c     Tue Nov 04 12:43:19 2008 +0900
@@ -18,6 +18,8 @@
 #include <xen/sched.h>
 #include <asm/regs.h>
 #include <asm/current.h>
+#include <asm/hvm/vmx/vpmu.h>
+#include <asm/hvm/vmx/vpmu_core2.h>
  
 #include "op_x86_model.h"
 #include "op_counter.h"
@@ -39,9 +41,11 @@
 #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
 #define CTRL_SET_UM(val, m) (val |= (m << 8))
 #define CTRL_SET_EVENT(val, e) (val |= e)
-
+#define IS_ACTIVE(val) (val & (1 << 22) )  
+#define IS_ENABLE(val) (val & (1 << 20) )
 static unsigned long reset_value[NUM_COUNTERS];
 int ppro_has_global_ctrl = 0;
+extern int is_passive(struct domain *d);
  
 static void ppro_fill_in_addresses(struct op_msrs * const msrs)
 {
@@ -103,6 +107,7 @@ static int ppro_check_ctrs(unsigned int 
        int ovf = 0;
        unsigned long eip = regs->eip;
        int mode = xenoprofile_get_mode(current, regs);
+       struct arch_msr_pair *msrs_content = vcpu_vpmu(current)->context;
 
        for (i = 0 ; i < NUM_COUNTERS; ++i) {
                if (!reset_value[i])
@@ -111,7 +116,18 @@ static int ppro_check_ctrs(unsigned int 
                if (CTR_OVERFLOWED(low)) {
                        xenoprof_log_event(current, regs, eip, mode, i);
                        CTR_WRITE(reset_value[i], msrs, i);
-                       ovf = 1;
+                       if ( is_passive(current->domain) && (mode != 2) && 
+                               (vcpu_vpmu(current)->flags & PASSIVE_DOMAIN_ALLOCATED) )
+                       {
+                               if ( IS_ACTIVE(msrs_content[i].control) )
+                               {
+                                       msrs_content[i].counter = (low | (u64)high << 32);
+                                       if ( IS_ENABLE(msrs_content[i].control) )
+                                               ovf = 2;
+                               }
+                       }
+                       if ( !ovf )
+                               ovf = 1;
                }
        }
 
@@ -159,6 +175,82 @@ static void ppro_stop(struct op_msrs con
         wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 }
 
+static int ppro_is_arch_pmu_msr(u64 msr_index, int *type, int *index)
+{
+       if ( (msr_index >= MSR_IA32_PERFCTR0) &&
+            (msr_index < (MSR_IA32_PERFCTR0 + NUM_COUNTERS)) )
+       {
+               *type = MSR_TYPE_ARCH_COUNTER;
+               *index = msr_index - MSR_IA32_PERFCTR0;
+               return 1;
+       }
+       if ( (msr_index >= MSR_P6_EVNTSEL0) &&
+            (msr_index < (MSR_P6_EVNTSEL0 + NUM_CONTROLS)) )
+       {
+               *type = MSR_TYPE_ARCH_CTRL;
+               *index = msr_index - MSR_P6_EVNTSEL0;
+               return 1;
+       }
+
+       return 0;
+}
+
+static int ppro_allocate_msr(struct vcpu *v)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(v);
+       struct arch_msr_pair *msr_content;
+       
+       msr_content = xmalloc_bytes( sizeof(struct arch_msr_pair) * NUM_COUNTERS );
+       if ( !msr_content )
+               goto out;
+       memset(msr_content, 0, sizeof(struct arch_msr_pair) * NUM_COUNTERS);
+       vpmu->context = (void *)msr_content;
+       vpmu->flags = 0;
+       vpmu->flags |= PASSIVE_DOMAIN_ALLOCATED;
+       return 1;
+out:
+        gdprintk(XENLOG_WARNING, "Insufficient memory for oprofile; oprofile is "
+                 "unavailable on domain %d vcpu %d.\n",
+                 v->domain->domain_id, v->vcpu_id);
+        return 0;      
+}
+
+static void ppro_free_msr(struct vcpu *v)
+{
+       struct vpmu_struct *vpmu = vcpu_vpmu(v);
+
+       xfree(vpmu->context);
+       vpmu->flags &= ~PASSIVE_DOMAIN_ALLOCATED;
+}
+
+static void ppro_load_msr(struct vcpu *v, int type, int index, u64 *msr_content)
+{
+       struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
+       switch ( type )
+       {
+       case MSR_TYPE_ARCH_COUNTER:
+               *msr_content = msrs[index].counter;
+               break;
+       case MSR_TYPE_ARCH_CTRL:
+               *msr_content = msrs[index].control;
+               break;
+       }       
+}
+
+static void ppro_save_msr(struct vcpu *v, int type, int index, u64 msr_content)
+{
+       struct arch_msr_pair *msrs = vcpu_vpmu(v)->context;
+       
+       switch ( type )
+       {
+       case MSR_TYPE_ARCH_COUNTER:
+               msrs[index].counter = msr_content;
+               break;
+       case MSR_TYPE_ARCH_CTRL:
+               msrs[index].control = msr_content;
+               break;
+       }       
+}
 
 struct op_x86_model_spec const op_ppro_spec = {
        .num_counters = NUM_COUNTERS,
@@ -167,5 +259,10 @@ struct op_x86_model_spec const op_ppro_s
        .setup_ctrs = &ppro_setup_ctrs,
        .check_ctrs = &ppro_check_ctrs,
        .start = &ppro_start,
-       .stop = &ppro_stop
+       .stop = &ppro_stop,
+       .is_arch_pmu_msr = &ppro_is_arch_pmu_msr,
+       .allocated_msr = &ppro_allocate_msr,
+       .free_msr = &ppro_free_msr,
+       .load_msr = &ppro_load_msr,
+       .save_msr = &ppro_save_msr
 };
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/oprofile/op_x86_model.h
--- a/xen/arch/x86/oprofile/op_x86_model.h      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/oprofile/op_x86_model.h      Tue Nov 04 12:43:19 2008 +0900
@@ -41,6 +41,11 @@ struct op_x86_model_spec {
                          struct cpu_user_regs * const regs);
        void (*start)(struct op_msrs const * const msrs);
        void (*stop)(struct op_msrs const * const msrs);
+       int (*is_arch_pmu_msr)(u64 msr_index, int *type, int *index);
+       int (*allocated_msr)(struct vcpu *v);
+       void (*free_msr)(struct vcpu *v);
+       void (*load_msr)(struct vcpu * const v, int type, int index, u64 *msr_content);
+       void (*save_msr)(struct vcpu * const v, int type, int index, u64 msr_content);
 };
 
 extern struct op_x86_model_spec const op_ppro_spec;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/setup.c      Tue Nov 04 12:43:19 2008 +0900
@@ -969,6 +969,7 @@ void __init __start_xen(unsigned long mb
     serial_init_postirq();
 
     BUG_ON(!local_irq_is_enabled());
+    spin_debug_enable();
 
     for_each_present_cpu ( i )
     {
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/smpboot.c    Tue Nov 04 12:43:19 2008 +0900
@@ -101,7 +101,7 @@ static int __devinitdata tsc_sync_disabl
 static int __devinitdata tsc_sync_disabled;
 
 /* Per CPU bogomips and other parameters */
-struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+struct cpuinfo_x86 cpu_data[NR_CPUS];
 EXPORT_SYMBOL(cpu_data);
 
 u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
@@ -112,7 +112,7 @@ static void map_cpu_to_logical_apicid(vo
 /* State of each CPU. */
 DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
-static void *stack_base[NR_CPUS] __cacheline_aligned;
+static void *stack_base[NR_CPUS];
 static DEFINE_SPINLOCK(cpu_add_remove_lock);
 
 /*
@@ -805,14 +805,6 @@ static inline int alloc_cpu_id(void)
        return cpu;
 }
 
-static struct vcpu *prepare_idle_vcpu(unsigned int cpu)
-{
-       if (idle_vcpu[cpu])
-               return idle_vcpu[cpu];
-
-       return alloc_idle_vcpu(cpu);
-}
-
 static void *prepare_idle_stack(unsigned int cpu)
 {
        if (!stack_base[cpu])
@@ -849,7 +841,7 @@ static int __devinit do_boot_cpu(int api
 
        booting_cpu = cpu;
 
-       v = prepare_idle_vcpu(cpu);
+       v = alloc_idle_vcpu(cpu);
        BUG_ON(v == NULL);
 
        /* start_eip had better be page-aligned! */
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/time.c       Tue Nov 04 12:43:19 2008 +0900
@@ -1063,8 +1063,6 @@ void init_percpu_time(void)
 /* Late init function (after all CPUs are booted). */
 int __init init_xen_time(void)
 {
-    local_irq_disable();
-
     /* check if TSC is invariant during deep C state
        this is a new feature introduced by Nehalem*/
     if ( cpuid_edx(0x80000007) & (1u<<8) )
@@ -1078,8 +1076,6 @@ int __init init_xen_time(void)
     init_platform_timer();
 
     do_settime(get_cmos_time(), 0, NOW());
-
-    local_irq_enable();
 
     return 0;
 }
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/traps.c      Tue Nov 04 12:43:19 2008 +0900
@@ -1030,7 +1030,7 @@ static int handle_gdt_ldt_mapping_fault(
 #endif
 
 static int __spurious_page_fault(
-    unsigned long addr, struct cpu_user_regs *regs)
+    unsigned long addr, unsigned int error_code)
 {
     unsigned long mfn, cr3 = read_cr3();
 #if CONFIG_PAGING_LEVELS >= 4
@@ -1052,17 +1052,17 @@ static int __spurious_page_fault(
         return 0;
 
     /* Reserved bit violations are never spurious faults. */
-    if ( regs->error_code & PFEC_reserved_bit )
+    if ( error_code & PFEC_reserved_bit )
         return 0;
 
     required_flags  = _PAGE_PRESENT;
-    if ( regs->error_code & PFEC_write_access )
+    if ( error_code & PFEC_write_access )
         required_flags |= _PAGE_RW;
-    if ( regs->error_code & PFEC_user_mode )
+    if ( error_code & PFEC_user_mode )
         required_flags |= _PAGE_USER;
 
     disallowed_flags = 0;
-    if ( regs->error_code & PFEC_insn_fetch )
+    if ( error_code & PFEC_insn_fetch )
         disallowed_flags |= _PAGE_NX;
 
     mfn = cr3 >> PAGE_SHIFT;
@@ -1120,7 +1120,7 @@ static int __spurious_page_fault(
     dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u "
             "at addr %lx, e/c %04x\n",
             current->domain->domain_id, current->vcpu_id,
-            addr, regs->error_code);
+            addr, error_code);
 #if CONFIG_PAGING_LEVELS >= 4
     dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e));
 #endif
@@ -1129,14 +1129,11 @@ static int __spurious_page_fault(
 #endif
     dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e));
     dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e));
-#ifndef NDEBUG
-    show_registers(regs);
-#endif
     return 1;
 }
 
 static int spurious_page_fault(
-    unsigned long addr, struct cpu_user_regs *regs)
+    unsigned long addr, unsigned int error_code)
 {
     unsigned long flags;
     int           is_spurious;
@@ -1146,7 +1143,7 @@ static int spurious_page_fault(
      * page tables from becoming invalid under our feet during the walk.
      */
     local_irq_save(flags);
-    is_spurious = __spurious_page_fault(addr, regs);
+    is_spurious = __spurious_page_fault(addr, error_code);
     local_irq_restore(flags);
 
     return is_spurious;
@@ -1208,8 +1205,12 @@ asmlinkage void do_page_fault(struct cpu
 asmlinkage void do_page_fault(struct cpu_user_regs *regs)
 {
     unsigned long addr, fixup;
+    unsigned int error_code;
 
     addr = read_cr2();
+
+    /* fixup_page_fault() might change regs->error_code, so cache it here. */
+    error_code = regs->error_code;
 
     DEBUGGER_trap_entry(TRAP_page_fault, regs);
 
@@ -1220,7 +1221,7 @@ asmlinkage void do_page_fault(struct cpu
 
     if ( unlikely(!guest_mode(regs)) )
     {
-        if ( spurious_page_fault(addr, regs) )
+        if ( spurious_page_fault(addr, error_code) )
             return;
 
         if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
@@ -1239,11 +1240,11 @@ asmlinkage void do_page_fault(struct cpu
         panic("FATAL PAGE FAULT\n"
               "[error_code=%04x]\n"
               "Faulting linear address: %p\n",
-              regs->error_code, _p(addr));
+              error_code, _p(addr));
     }
 
     if ( unlikely(current->domain->arch.suppress_spurious_page_faults
-                  && spurious_page_fault(addr, regs)) )
+                  && spurious_page_fault(addr, error_code)) )
         return;
 
     propagate_page_fault(addr, regs->error_code);
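
Caching error_code also keeps the spurious-fault test above stable while it derives, from the error code, which PTE flags the faulting access required and which it disallowed. A standalone rendering of that derivation (the PFEC_* and _PAGE_* constants restate the x86 values for illustration):

    #include <stdio.h>

    #define PFEC_write_access (1u << 1)
    #define PFEC_user_mode    (1u << 2)
    #define PFEC_insn_fetch   (1u << 4)

    #define _PAGE_PRESENT 0x001ull
    #define _PAGE_RW      0x002ull
    #define _PAGE_USER    0x004ull
    #define _PAGE_NX      (1ull << 63)

    int main(void)
    {
        unsigned int error_code = PFEC_write_access | PFEC_user_mode;
        unsigned long long required = _PAGE_PRESENT, disallowed = 0;

        if ( error_code & PFEC_write_access )
            required |= _PAGE_RW;
        if ( error_code & PFEC_user_mode )
            required |= _PAGE_USER;
        if ( error_code & PFEC_insn_fetch )
            disallowed |= _PAGE_NX;

        /* A fault is spurious only if the live PTE has all required
         * flags set and none of the disallowed ones. */
        printf("required=%#llx disallowed=%#llx\n", required, disallowed);
        return 0;
    }
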
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/x86_32/domain_page.c
--- a/xen/arch/x86/x86_32/domain_page.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/x86_32/domain_page.c Tue Nov 04 12:43:19 2008 +0900
@@ -43,7 +43,7 @@ void *map_domain_page(unsigned long mfn)
 void *map_domain_page(unsigned long mfn)
 {
     unsigned long va;
-    unsigned int idx, i;
+    unsigned int idx, i, flags;
     struct vcpu *v;
     struct mapcache_domain *dcache;
     struct mapcache_vcpu *vcache;
@@ -69,7 +69,7 @@ void *map_domain_page(unsigned long mfn)
         goto out;
     }
 
-    spin_lock(&dcache->lock);
+    spin_lock_irqsave(&dcache->lock, flags);
 
     /* Has some other CPU caused a wrap? We must flush if so. */
     if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
@@ -105,7 +105,7 @@ void *map_domain_page(unsigned long mfn)
     set_bit(idx, dcache->inuse);
     dcache->cursor = idx + 1;
 
-    spin_unlock(&dcache->lock);
+    spin_unlock_irqrestore(&dcache->lock, flags);
 
     l1e_write(&dcache->l1tab[idx], l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
 
@@ -114,7 +114,7 @@ void *map_domain_page(unsigned long mfn)
     return (void *)va;
 }
 
-void unmap_domain_page(void *va)
+void unmap_domain_page(const void *va)
 {
     unsigned int idx;
     struct vcpu *v;
@@ -241,7 +241,7 @@ void *map_domain_page_global(unsigned lo
     return (void *)va;
 }
 
-void unmap_domain_page_global(void *va)
+void unmap_domain_page_global(const void *va)
 {
     unsigned long __va = (unsigned long)va;
     l2_pgentry_t *pl2e;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/x86_64/compat/mm.c
--- a/xen/arch/x86/x86_64/compat/mm.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/x86_64/compat/mm.c   Tue Nov 04 12:43:19 2008 +0900
@@ -231,6 +231,8 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
             case MMUEXT_PIN_L4_TABLE:
             case MMUEXT_UNPIN_TABLE:
             case MMUEXT_NEW_BASEPTR:
+            case MMUEXT_CLEAR_PAGE:
+            case MMUEXT_COPY_PAGE:
                 arg1 = XLAT_mmuext_op_arg1_mfn;
                 break;
             default:
@@ -257,6 +259,9 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm
             case MMUEXT_TLB_FLUSH_MULTI:
             case MMUEXT_INVLPG_MULTI:
                 arg2 = XLAT_mmuext_op_arg2_vcpumask;
+                break;
+            case MMUEXT_COPY_PAGE:
+                arg2 = XLAT_mmuext_op_arg2_src_mfn;
                 break;
             default:
                 arg2 = -1;
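
The translations above expose the two new mmuext ops to 32-on-64 guests; arg1.mfn carries the (destination) frame and arg2.src_mfn the copy source. A hedged sketch of guest-side use — clear_frame()/copy_frame() and the header path are illustrative, only struct mmuext_op and the field names come from the public interface:

    /* Illustrative guest-side helpers (not part of this changeset). */
    #include <xen/xen.h>   /* assumed public-header location */

    static int clear_frame(unsigned long mfn)
    {
        struct mmuext_op op = {
            .cmd      = MMUEXT_CLEAR_PAGE,
            .arg1.mfn = mfn,   /* target; grabbed as PGT_writable_page */
        };
        return HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
    }

    static int copy_frame(unsigned long dst_mfn, unsigned long src_mfn)
    {
        struct mmuext_op op = {
            .cmd          = MMUEXT_COPY_PAGE,
            .arg1.mfn     = dst_mfn,  /* destination, writable */
            .arg2.src_mfn = src_mfn,  /* source, plain reference only */
        };
        return HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
    }
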
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/x86_64/cpufreq.c
--- a/xen/arch/x86/x86_64/cpufreq.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/arch/x86/x86_64/cpufreq.c     Tue Nov 04 12:43:19 2008 +0900
@@ -56,34 +56,13 @@ compat_set_px_pminfo(uint32_t cpu, struc
        return -EFAULT;
 
 #define XLAT_processor_performance_HNDL_states(_d_, _s_) do { \
-    xen_processor_px_t *xen_states = NULL; \
-\
-    if ( likely((_s_)->state_count > 0) ) \
-    { \
-        XEN_GUEST_HANDLE(compat_processor_px_t) states; \
-        compat_processor_px_t state; \
-        int i; \
-\
-        xen_states = xlat_malloc_array(xlat_page_current, \
-                               xen_processor_px_t, (_s_)->state_count); \
-        if ( unlikely(xen_states == NULL) ) \
-            return -EFAULT; \
-\
-        if ( unlikely(!compat_handle_okay((_s_)->states, \
-                                (_s_)->state_count)) ) \
-            return -EFAULT; \
-        guest_from_compat_handle(states, (_s_)->states); \
-\
-        for ( i = 0; i < _s_->state_count; i++ ) \
-        { \
-           if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) ) \
-               return -EFAULT; \
-           XLAT_processor_px(&xen_states[i], &state); \
-        } \
-    } \
-\
-    set_xen_guest_handle((_d_)->states, xen_states); \
+    XEN_GUEST_HANDLE(compat_processor_px_t) states; \
+    if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->state_count)) ) \
+        return -EFAULT; \
+    guest_from_compat_handle(states, (_s_)->states); \
+    (_d_)->states = guest_handle_cast(states, xen_processor_px_t); \
 } while (0)
+
     XLAT_processor_performance(xen_perf, perf);
 #undef XLAT_processor_performance_HNDL_states
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/event_channel.c
--- a/xen/common/event_channel.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/event_channel.c        Tue Nov 04 12:43:19 2008 +0900
@@ -386,7 +386,7 @@ static long __evtchn_close(struct domain
             if ( v->virq_to_evtchn[chn1->u.virq] != port1 )
                 continue;
             v->virq_to_evtchn[chn1->u.virq] = 0;
-            spin_barrier(&v->virq_lock);
+            spin_barrier_irq(&v->virq_lock);
         }
         break;
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/kernel.c
--- a/xen/common/kernel.c       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/kernel.c       Tue Nov 04 12:43:19 2008 +0900
@@ -221,7 +221,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL
                 fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
 #ifdef CONFIG_X86
             if ( !is_hvm_vcpu(current) )
-                fi.submap |= 1U << XENFEAT_mmu_pt_update_preserve_ad;
+                fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) |
+                             (1U << XENFEAT_highmem_assist);
 #endif
             break;
         default:
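
A guest is expected to probe this submap bit before relying on MMUEXT_CLEAR_PAGE/MMUEXT_COPY_PAGE. A minimal sketch against the existing XENVER_get_features interface (control flow is illustrative):

    struct xen_feature_info fi = {
        .submap_idx = XENFEAT_highmem_assist / 32,
    };

    if ( (HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0) &&
         (fi.submap & (1U << (XENFEAT_highmem_assist % 32))) )
    {
        /* MMUEXT_CLEAR_PAGE / MMUEXT_COPY_PAGE may be used. */
    }
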
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/keyhandler.c
--- a/xen/common/keyhandler.c   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/keyhandler.c   Tue Nov 04 12:43:19 2008 +0900
@@ -183,9 +183,9 @@ static void dump_domains(unsigned char k
     {
         printk("General information for domain %u:\n", d->domain_id);
         cpuset_print(tmpstr, sizeof(tmpstr), d->domain_dirty_cpumask);
-        printk("    refcnt=%d nr_pages=%d xenheap_pages=%d "
+        printk("    refcnt=%d dying=%d nr_pages=%d xenheap_pages=%d "
                "dirty_cpus=%s\n",
-               atomic_read(&d->refcnt),
+               atomic_read(&d->refcnt), d->is_dying,
                d->tot_pages, d->xenheap_pages, tmpstr);
         printk("    handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-"
                "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n",
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/spinlock.c
--- a/xen/common/spinlock.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/spinlock.c     Tue Nov 04 12:43:19 2008 +0900
@@ -1,15 +1,56 @@
 #include <xen/config.h>
+#include <xen/irq.h>
 #include <xen/smp.h>
 #include <xen/spinlock.h>
 
+#ifndef NDEBUG
+
+static atomic_t spin_debug __read_mostly = ATOMIC_INIT(0);
+
+static void check_lock(struct lock_debug *debug)
+{
+    int irq_safe = !local_irq_is_enabled();
+
+    if ( unlikely(atomic_read(&spin_debug) <= 0) )
+        return;
+
+    /* A few places take liberties with this. */
+    /* BUG_ON(in_irq() && !irq_safe); */
+
+    if ( unlikely(debug->irq_safe != irq_safe) )
+    {
+        int seen = cmpxchg(&debug->irq_safe, -1, irq_safe);
+        BUG_ON(seen == !irq_safe);
+    }
+}
+
+void spin_debug_enable(void)
+{
+    atomic_inc(&spin_debug);
+}
+
+void spin_debug_disable(void)
+{
+    atomic_dec(&spin_debug);
+}
+
+#else /* defined(NDEBUG) */
+
+#define check_lock(l) ((void)0)
+
+#endif
+
 void _spin_lock(spinlock_t *lock)
 {
+    check_lock(&lock->debug);
     _raw_spin_lock(&lock->raw);
 }
 
 void _spin_lock_irq(spinlock_t *lock)
 {
-    local_irq_disable();
+    ASSERT(local_irq_is_enabled());
+    local_irq_disable();
+    check_lock(&lock->debug);
     _raw_spin_lock(&lock->raw);
 }
 
@@ -17,6 +58,7 @@ unsigned long _spin_lock_irqsave(spinloc
 {
     unsigned long flags;
     local_irq_save(flags);
+    check_lock(&lock->debug);
     _raw_spin_lock(&lock->raw);
     return flags;
 }
@@ -40,26 +82,39 @@ void _spin_unlock_irqrestore(spinlock_t 
 
 int _spin_is_locked(spinlock_t *lock)
 {
+    check_lock(&lock->debug);
     return _raw_spin_is_locked(&lock->raw);
 }
 
 int _spin_trylock(spinlock_t *lock)
 {
+    check_lock(&lock->debug);
     return _raw_spin_trylock(&lock->raw);
 }
 
 void _spin_barrier(spinlock_t *lock)
 {
+    check_lock(&lock->debug);
     do { mb(); } while ( _raw_spin_is_locked(&lock->raw) );
     mb();
 }
 
+void _spin_barrier_irq(spinlock_t *lock)
+{
+    unsigned long flags;
+    local_irq_save(flags);
+    _spin_barrier(lock);
+    local_irq_restore(flags);
+}
+
 void _spin_lock_recursive(spinlock_t *lock)
 {
     int cpu = smp_processor_id();
 
     /* Don't allow overflow of recurse_cpu field. */
     BUILD_BUG_ON(NR_CPUS > 0xfffu);
+
+    check_lock(&lock->debug);
 
     if ( likely(lock->recurse_cpu != cpu) )
     {
@@ -83,12 +138,15 @@ void _spin_unlock_recursive(spinlock_t *
 
 void _read_lock(rwlock_t *lock)
 {
+    check_lock(&lock->debug);
     _raw_read_lock(&lock->raw);
 }
 
 void _read_lock_irq(rwlock_t *lock)
 {
-    local_irq_disable();
+    ASSERT(local_irq_is_enabled());
+    local_irq_disable();
+    check_lock(&lock->debug);
     _raw_read_lock(&lock->raw);
 }
 
@@ -96,6 +154,7 @@ unsigned long _read_lock_irqsave(rwlock_
 {
     unsigned long flags;
     local_irq_save(flags);
+    check_lock(&lock->debug);
     _raw_read_lock(&lock->raw);
     return flags;
 }
@@ -119,12 +178,15 @@ void _read_unlock_irqrestore(rwlock_t *l
 
 void _write_lock(rwlock_t *lock)
 {
+    check_lock(&lock->debug);
     _raw_write_lock(&lock->raw);
 }
 
 void _write_lock_irq(rwlock_t *lock)
 {
-    local_irq_disable();
+    ASSERT(local_irq_is_enabled());
+    local_irq_disable();
+    check_lock(&lock->debug);
     _raw_write_lock(&lock->raw);
 }
 
@@ -132,6 +194,7 @@ unsigned long _write_lock_irqsave(rwlock
 {
     unsigned long flags;
     local_irq_save(flags);
+    check_lock(&lock->debug);
     _raw_write_lock(&lock->raw);
     return flags;
 }
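
check_lock() above latches, per lock, whether its first acquisition ran with IRQs disabled, and BUG()s on any later acquisition in the opposite state, since mixing the two lets an interrupt handler spin on a lock its own CPU already holds. A single-threaded model of the latch (the cmpxchg is replaced by plain assignment for clarity):

    #include <assert.h>
    #include <stdio.h>

    static int latch_irq_safe = -1;  /* -1: no acquisition observed yet */

    static void check_lock_sketch(int irq_safe)
    {
        if ( latch_irq_safe == -1 )
            latch_irq_safe = irq_safe;       /* cmpxchg(&debug->irq_safe, -1, ...) */
        assert(latch_irq_safe == irq_safe);  /* BUG_ON(seen == !irq_safe) */
    }

    int main(void)
    {
        check_lock_sketch(1);  /* first taker had IRQs disabled */
        check_lock_sketch(1);  /* consistent: passes */
        printf("consistent acquisitions pass the check\n");
        /* check_lock_sketch(0); would assert: same lock, IRQs enabled */
        return 0;
    }
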
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/timer.c
--- a/xen/common/timer.c        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/timer.c        Tue Nov 04 12:43:19 2008 +0900
@@ -25,10 +25,12 @@
  * We pull handlers off the timer list this far in future,
  * rather than reprogramming the time hardware.
  */
-#define TIMER_SLOP (50*1000) /* ns */
+static unsigned int timer_slop __read_mostly = 50000; /* 50 us */
+integer_param("timer_slop", timer_slop);
 
 struct timers {
     spinlock_t     lock;
+    bool_t         overflow;
     struct timer **heap;
     struct timer  *list;
     struct timer  *running;
@@ -200,6 +202,7 @@ static int add_entry(struct timers *time
         return rc;
 
     /* Fall back to adding to the slower linked list. */
+    timers->overflow = 1;
     t->status = TIMER_STATUS_in_list;
     return add_to_list(&timers->list, t);
 }
@@ -258,6 +261,7 @@ void set_timer(struct timer *timer, s_ti
         __stop_timer(timer);
 
     timer->expires = expires;
+    timer->expires_end = expires + timer_slop;
 
     if ( likely(timer->status != TIMER_STATUS_killed) )
         __add_timer(timer);
@@ -344,19 +348,30 @@ void kill_timer(struct timer *timer)
 }
 
 
+static void execute_timer(struct timers *ts, struct timer *t)
+{
+    void (*fn)(void *) = t->function;
+    void *data = t->data;
+
+    ts->running = t;
+    spin_unlock_irq(&ts->lock);
+    (*fn)(data);
+    spin_lock_irq(&ts->lock);
+    ts->running = NULL;
+}
+
+
 static void timer_softirq_action(void)
 {
     struct timer  *t, **heap, *next;
     struct timers *ts;
-    s_time_t       now, deadline;
-    void         (*fn)(void *);
-    void          *data;
+    s_time_t       now;
 
     ts = &this_cpu(timers);
     heap = ts->heap;
 
-    /* If we are using overflow linked list, try to allocate a larger heap. */
-    if ( unlikely(ts->list != NULL) )
+    /* If we overflowed the heap, try to allocate a larger heap. */
+    if ( unlikely(ts->overflow) )
     {
         /* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */
         int old_limit = GET_HEAP_LIMIT(heap);
@@ -377,7 +392,26 @@ static void timer_softirq_action(void)
 
     spin_lock_irq(&ts->lock);
 
-    /* Try to move timers from overflow linked list to more efficient heap. */
+    now = NOW();
+
+    /* Execute ready heap timers. */
+    while ( (GET_HEAP_SIZE(heap) != 0) &&
+            ((t = heap[1])->expires_end < now) )
+    {
+        remove_from_heap(heap, t);
+        t->status = TIMER_STATUS_inactive;
+        execute_timer(ts, t);
+    }
+
+    /* Execute ready list timers. */
+    while ( ((t = ts->list) != NULL) && (t->expires_end < now) )
+    {
+        ts->list = t->list_next;
+        t->status = TIMER_STATUS_inactive;
+        execute_timer(ts, t);
+    }
+
+    /* Try to move timers from linked list to more efficient heap. */
     next = ts->list;
     ts->list = NULL;
     while ( unlikely((t = next) != NULL) )
@@ -387,51 +421,44 @@ static void timer_softirq_action(void)
         add_entry(ts, t);
     }
 
-    now = NOW();
-
-    while ( (GET_HEAP_SIZE(heap) != 0) &&
-            ((t = heap[1])->expires < (now + TIMER_SLOP)) )
-    {
-        remove_entry(ts, t);
-
-        ts->running = t;
-
-        fn   = t->function;
-        data = t->data;
-
-        spin_unlock_irq(&ts->lock);
-        (*fn)(data);
-        spin_lock_irq(&ts->lock);
-    }
-
-    deadline = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0;
-
-    while ( unlikely((t = ts->list) != NULL) )
-    {
-        if ( t->expires >= (now + TIMER_SLOP) )
+    ts->overflow = (ts->list != NULL);
+    if ( unlikely(ts->overflow) )
+    {
+        /* Find earliest deadline at head of list or top of heap. */
+        this_cpu(timer_deadline) = ts->list->expires;
+        if ( (GET_HEAP_SIZE(heap) != 0) &&
+             ((t = heap[1])->expires < this_cpu(timer_deadline)) )
+            this_cpu(timer_deadline) = t->expires;
+    }
+    else
+    {
+        /*
+         * Find the earliest deadline that encompasses the largest number of
+         * timers on the heap. To do this we take timers from the heap while
+         * their valid deadline ranges continue to intersect.
+         */
+        s_time_t start = 0, end = STIME_MAX;
+        struct timer **list_tail = &ts->list;
+
+        while ( (GET_HEAP_SIZE(heap) != 0) &&
+                ((t = heap[1])->expires <= end) )
         {
-            if ( (deadline == 0) || (deadline > t->expires) )
-                deadline = t->expires;
-            break;
+            remove_entry(ts, t);
+
+            t->status = TIMER_STATUS_in_list;
+            t->list_next = NULL;
+            *list_tail = t;
+            list_tail = &t->list_next;
+
+            start = t->expires;
+            if ( end > t->expires_end )
+                end = t->expires_end;
         }
 
-        ts->list = t->list_next;
-        t->status = TIMER_STATUS_inactive;
-
-        ts->running = t;
-
-        fn   = t->function;
-        data = t->data;
-
-        spin_unlock_irq(&ts->lock);
-        (*fn)(data);
-        spin_lock_irq(&ts->lock);
-    }
-
-    ts->running = NULL;
-
-    this_cpu(timer_deadline) = deadline;
-    if ( !reprogram_timer(deadline) )
+        this_cpu(timer_deadline) = start;
+    }
+
+    if ( !reprogram_timer(this_cpu(timer_deadline)) )
         raise_softirq(TIMER_SOFTIRQ);
 
     spin_unlock_irq(&ts->lock);
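
The rewritten softirq handler treats each timer as a window [expires, expires_end] and keeps popping the heap while the windows still intersect, so one hardware reprogram serves the whole batch. A standalone model of that intersection walk (times are made up):

    #include <stdio.h>

    struct win { long expires, expires_end; };

    int main(void)
    {
        /* Sorted by expires, as the heap pops them. */
        struct win t[] = { {100, 150}, {120, 140}, {135, 200}, {180, 260} };
        long start = 0, end = 1000000000L;
        unsigned int i, n = sizeof(t) / sizeof(t[0]);

        for ( i = 0; i < n && t[i].expires <= end; i++ )
        {
            start = t[i].expires;        /* latest start seen so far */
            if ( end > t[i].expires_end )
                end = t[i].expires_end;  /* earliest end seen so far */
        }

        /* Timers 0..2 share [135,140]; one interrupt at 'start' fires all. */
        printf("program hardware for t=%ld, batching %u timers\n", start, i);
        return 0;
    }
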
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/xenoprof.c
--- a/xen/common/xenoprof.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/common/xenoprof.c     Tue Nov 04 12:43:19 2008 +0900
@@ -85,7 +85,7 @@ int is_active(struct domain *d)
     return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_ACTIVE));
 }
 
-static int is_passive(struct domain *d)
+int is_passive(struct domain *d)
 {
     struct xenoprof *x = d->xenoprof;
     return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_PASSIVE));
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/xmalloc.c
--- a/xen/common/xmalloc.c      Tue Nov 04 12:07:22 2008 +0900
+++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
@@ -1,286 +0,0 @@
-/******************************************************************************
- * Simple allocator for Xen.  If larger than a page, simply use the
- * page-order allocator.
- *
- * Copyright (C) 2005 Rusty Russell IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-/*
- * TODO (Keir, 17/2/05):
- *  1. Use space in page_info to avoid xmalloc_hdr in allocated blocks.
- *  2. page_info points into free list to make xfree() O(1) complexity.
- *  3. Perhaps make this a sub-page buddy allocator? xmalloc() == O(1).
- *     (Disadvantage is potentially greater internal fragmentation).
- */
-
-#include <xen/config.h>
-#include <xen/mm.h>
-#include <xen/spinlock.h>
-#include <xen/timer.h>
-#include <xen/cache.h>
-#include <xen/prefetch.h>
-#include <xen/irq.h>
-#include <xen/smp.h>
-
-/*
- * XMALLOC_DEBUG:
- *  1. Free data blocks are filled with poison bytes.
- *  2. In-use data blocks have guard bytes at the start and end.
- */
-#ifndef NDEBUG
-#define XMALLOC_DEBUG 1
-#endif
-
-static LIST_HEAD(freelist);
-static DEFINE_SPINLOCK(freelist_lock);
-
-struct xmalloc_hdr
-{
-    /* Size is total including this header. */
-    size_t size;
-    struct list_head freelist;
-} __cacheline_aligned;
-
-static void add_to_freelist(struct xmalloc_hdr *hdr)
-{
-#if XMALLOC_DEBUG
-    memset(hdr + 1, 0xa5, hdr->size - sizeof(*hdr));
-#endif
-    list_add(&hdr->freelist, &freelist);
-}
-
-static void del_from_freelist(struct xmalloc_hdr *hdr)
-{
-#if XMALLOC_DEBUG
-    size_t i;
-    unsigned char *data = (unsigned char *)(hdr + 1);
-    for ( i = 0; i < (hdr->size - sizeof(*hdr)); i++ )
-        BUG_ON(data[i] != 0xa5);
-    BUG_ON((hdr->size <= 0) || (hdr->size >= PAGE_SIZE));
-#endif
-    list_del(&hdr->freelist);
-}
-
-static void *data_from_header(struct xmalloc_hdr *hdr)
-{
-#if XMALLOC_DEBUG
-    /* Data block contain SMP_CACHE_BYTES of guard canary. */
-    unsigned char *data = (unsigned char *)(hdr + 1);
-    memset(data, 0x5a, SMP_CACHE_BYTES);
-    memset(data + hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES,
-           0x5a, SMP_CACHE_BYTES);
-    return data + SMP_CACHE_BYTES;
-#else
-    return hdr + 1;
-#endif
-}
-
-static struct xmalloc_hdr *header_from_data(void *p)
-{
-#if XMALLOC_DEBUG
-    unsigned char *data = (unsigned char *)p - SMP_CACHE_BYTES;
-    struct xmalloc_hdr *hdr = (struct xmalloc_hdr *)data - 1;
-    size_t i;
-
-    /* Check header guard canary. */
-    for ( i = 0; i < SMP_CACHE_BYTES; i++ )
-        BUG_ON(data[i] != 0x5a);
-
-    /* Check footer guard canary. */
-    data += hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES;
-    for ( i = 0; i < SMP_CACHE_BYTES; i++ )
-        BUG_ON(data[i] != 0x5a);
-
-    return hdr;
-#else
-    return (struct xmalloc_hdr *)p - 1;
-#endif
-}
-
-static void maybe_split(struct xmalloc_hdr *hdr, size_t size, size_t block)
-{
-    struct xmalloc_hdr *extra;
-    size_t leftover = block - size;
-
-    /* If enough is left to make a block, put it on free list. */
-    if ( leftover >= (2 * sizeof(struct xmalloc_hdr)) )
-    {
-        extra = (struct xmalloc_hdr *)((unsigned long)hdr + size);
-        extra->size = leftover;
-        add_to_freelist(extra);
-    }
-    else
-    {
-        size = block;
-    }
-
-    hdr->size = size;
-    /* Debugging aid. */
-    hdr->freelist.next = hdr->freelist.prev = NULL;
-}
-
-static void *xmalloc_new_page(size_t size)
-{
-    struct xmalloc_hdr *hdr;
-
-    hdr = alloc_xenheap_page();
-    if ( hdr == NULL )
-        return NULL;
-
-    spin_lock(&freelist_lock);
-    maybe_split(hdr, size, PAGE_SIZE);
-    spin_unlock(&freelist_lock);
-
-    return data_from_header(hdr);
-}
-
-/* Big object?  Just use the page allocator. */
-static void *xmalloc_whole_pages(size_t size)
-{
-    struct xmalloc_hdr *hdr;
-    unsigned int pageorder = get_order_from_bytes(size);
-
-    hdr = alloc_xenheap_pages(pageorder);
-    if ( hdr == NULL )
-        return NULL;
-
-    hdr->size = (1 << (pageorder + PAGE_SHIFT));
-    /* Debugging aid. */
-    hdr->freelist.next = hdr->freelist.prev = NULL;
-
-    return data_from_header(hdr);
-}
-
-/* Return size, increased to alignment with align. */
-static inline size_t align_up(size_t size, size_t align)
-{
-    return (size + align - 1) & ~(align - 1);
-}
-
-void *_xmalloc(size_t size, size_t align)
-{
-    struct xmalloc_hdr *i;
-
-    ASSERT(!in_irq());
-
-    /* We currently always return cacheline aligned. */
-    BUG_ON(align > SMP_CACHE_BYTES);
-
-#if XMALLOC_DEBUG
-    /* Add room for canaries at start and end of data block. */
-    size += 2 * SMP_CACHE_BYTES;
-#endif
-
-    /* Add room for header, pad to align next header. */
-    size += sizeof(struct xmalloc_hdr);
-    size = align_up(size, __alignof__(struct xmalloc_hdr));
-
-    /* For big allocs, give them whole pages. */
-    if ( size >= PAGE_SIZE )
-        return xmalloc_whole_pages(size);
-
-    /* Search free list. */
-    spin_lock(&freelist_lock);
-    list_for_each_entry( i, &freelist, freelist )
-    {
-        if ( i->size < size )
-            continue;
-        del_from_freelist(i);
-        maybe_split(i, size, i->size);
-        spin_unlock(&freelist_lock);
-        return data_from_header(i);
-    }
-    spin_unlock(&freelist_lock);
-
-    /* Alloc a new page and return from that. */
-    return xmalloc_new_page(size);
-}
-
-void xfree(void *p)
-{
-    struct xmalloc_hdr *i, *tmp, *hdr;
-
-    ASSERT(!in_irq());
-
-    if ( p == NULL )
-        return;
-
-    hdr = header_from_data(p);
-
-    /* We know hdr will be on same page. */
-    BUG_ON(((long)p & PAGE_MASK) != ((long)hdr & PAGE_MASK));
-
-    /* Not previously freed. */
-    BUG_ON(hdr->freelist.next || hdr->freelist.prev);
-
-    /* Big allocs free directly. */
-    if ( hdr->size >= PAGE_SIZE )
-    {
-        free_xenheap_pages(hdr, get_order_from_bytes(hdr->size));
-        return;
-    }
-
-    /* Merge with other free block, or put in list. */
-    spin_lock(&freelist_lock);
-    list_for_each_entry_safe( i, tmp, &freelist, freelist )
-    {
-        unsigned long _i   = (unsigned long)i;
-        unsigned long _hdr = (unsigned long)hdr;
-
-        /* Do not merge across page boundaries. */
-        if ( ((_i ^ _hdr) & PAGE_MASK) != 0 )
-            continue;
-
-        /* We follow this block?  Swallow it. */
-        if ( (_i + i->size) == _hdr )
-        {
-            del_from_freelist(i);
-            i->size += hdr->size;
-            hdr = i;
-        }
-
-        /* We precede this block? Swallow it. */
-        if ( (_hdr + hdr->size) == _i )
-        {
-            del_from_freelist(i);
-            hdr->size += i->size;
-        }
-    }
-
-    /* Did we merge an entire page? */
-    if ( hdr->size == PAGE_SIZE )
-    {
-        BUG_ON((((unsigned long)hdr) & (PAGE_SIZE-1)) != 0);
-        free_xenheap_pages(hdr, 0);
-    }
-    else
-    {
-        add_to_freelist(hdr);
-    }
-
-    spin_unlock(&freelist_lock);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
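
The sizing arithmetic of the allocator removed above is compact enough to see in isolation. Below is a minimal stand-alone sketch, with a simplified header and a fixed PAGE_SIZE standing in for the hypervisor's definitions; it mirrors how _xmalloc() padded a request for its header, rounded up to the header's alignment, and diverted large requests to whole pages:

    #include <stdio.h>
    #include <stddef.h>

    #define PAGE_SIZE 4096   /* stand-in; not the hypervisor's definition */

    /* Simplified stand-in for struct xmalloc_hdr (size + list linkage). */
    struct hdr_sketch {
        size_t size;
        void *next, *prev;
    };

    /* Same rounding as the removed align_up(): align must be a power of two. */
    static size_t align_up(size_t size, size_t align)
    {
        return (size + align - 1) & ~(align - 1);
    }

    int main(void)
    {
        size_t request = 100;
        size_t size = request + sizeof(struct hdr_sketch);

        size = align_up(size, __alignof__(struct hdr_sketch));
        printf("request %zu -> block %zu, served from %s\n", request, size,
               size >= PAGE_SIZE ? "whole pages" : "the freelist");
        return 0;
    }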
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/drivers/char/serial.c
--- a/xen/drivers/char/serial.c Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/drivers/char/serial.c Tue Nov 04 12:43:19 2008 +0900
@@ -74,7 +74,7 @@ void serial_tx_interrupt(struct serial_p
     while ( !spin_trylock(&port->tx_lock) )
     {
         if ( !port->driver->tx_empty(port) )
-            return;
+            goto out;
         cpu_relax();
     }
 
@@ -89,7 +89,10 @@ void serial_tx_interrupt(struct serial_p
         }
     }
 
-    spin_unlock_irqrestore(&port->tx_lock, flags);
+    spin_unlock(&port->tx_lock);
+
+ out:
+    local_irq_restore(flags);
 }
 
 static void __serial_putc(struct serial_port *port, char c)
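
The serial fix above is the classic every-exit-must-unwind pattern: the early return inside the trylock loop skipped restoring the IRQ flags that the function saves before the loop (outside this hunk). A single-threaded sketch of the corrected shape, with trivial stand-ins for the lock and IRQ primitives:

    #include <stdio.h>

    static int irqs_enabled = 1;
    static int tx_lock;                      /* 0 = free, 1 = held */

    static unsigned long local_irq_save(void)
    {
        unsigned long flags = irqs_enabled;
        irqs_enabled = 0;
        return flags;
    }
    static void local_irq_restore(unsigned long flags) { irqs_enabled = (int)flags; }
    static int spin_trylock(int *l) { return *l ? 0 : (*l = 1); }
    static void spin_unlock(int *l) { *l = 0; }
    static int tx_empty(void) { return 1; }  /* stand-in for the driver hook */

    static void serial_tx_interrupt_sketch(void)
    {
        unsigned long flags = local_irq_save();

        while (!spin_trylock(&tx_lock)) {
            if (!tx_empty())
                goto out;       /* early exit must still restore IRQ state */
            /* cpu_relax() in the real code */
        }
        /* ... drain the transmit FIFO here ... */
        spin_unlock(&tx_lock);
     out:
        local_irq_restore(flags);
    }

    int main(void)
    {
        serial_tx_interrupt_sketch();
        printf("irqs_enabled = %d\n", irqs_enabled);   /* expect 1 */
        return 0;
    }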
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/drivers/cpufreq/cpufreq.c
--- a/xen/drivers/cpufreq/cpufreq.c     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/drivers/cpufreq/cpufreq.c     Tue Nov 04 12:43:19 2008 +0900
@@ -31,6 +31,7 @@
 #include <xen/errno.h>
 #include <xen/delay.h>
 #include <xen/cpumask.h>
+#include <xen/list.h>
 #include <xen/sched.h>
 #include <xen/timer.h>
 #include <xen/xmalloc.h>
@@ -44,8 +45,12 @@
 #include <acpi/acpi.h>
 #include <acpi/cpufreq/cpufreq.h>
 
-/* TODO: change to link list later as domain number may be sparse */
-static cpumask_t cpufreq_dom_map[NR_CPUS];
+struct cpufreq_dom {
+    unsigned int       dom;
+    cpumask_t          map;
+    struct list_head   node;
+};
+static LIST_HEAD(cpufreq_dom_list_head);
 
 int cpufreq_limit_change(unsigned int cpu)
 {
@@ -72,48 +77,80 @@ int cpufreq_add_cpu(unsigned int cpu)
 {
     int ret = 0;
     unsigned int firstcpu;
-    unsigned int dom;
+    unsigned int dom, domexist = 0;
     unsigned int j;
+    struct list_head *pos;
+    struct cpufreq_dom *cpufreq_dom = NULL;
     struct cpufreq_policy new_policy;
     struct cpufreq_policy *policy;
     struct processor_performance *perf = &processor_pminfo[cpu]->perf;
 
     /* to protect the case when Px was not controlled by xen */
-    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
+    if (!processor_pminfo[cpu]      ||
+        !(perf->init & XEN_PX_INIT) ||
+        !cpu_online(cpu))
+        return -EINVAL;
+
+    if (cpufreq_cpu_policy[cpu])
         return 0;
-
-    if (!cpu_online(cpu) || cpufreq_cpu_policy[cpu])
-        return -EINVAL;
 
     ret = cpufreq_statistic_init(cpu);
     if (ret)
         return ret;
 
     dom = perf->domain_info.domain;
-    if (cpus_weight(cpufreq_dom_map[dom])) {
+
+    list_for_each(pos, &cpufreq_dom_list_head) {
+        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
+        if (dom == cpufreq_dom->dom) {
+            domexist = 1;
+            break;
+        }
+    }
+
+    if (domexist) {
         /* share policy with the first cpu since on same boat */
-        firstcpu = first_cpu(cpufreq_dom_map[dom]);
+        firstcpu = first_cpu(cpufreq_dom->map);
         policy = cpufreq_cpu_policy[firstcpu];
 
         cpufreq_cpu_policy[cpu] = policy;
-        cpu_set(cpu, cpufreq_dom_map[dom]);
+        cpu_set(cpu, cpufreq_dom->map);
         cpu_set(cpu, policy->cpus);
+
+        /* domain coordination sanity check */
+        if ((perf->domain_info.coord_type !=
+             processor_pminfo[firstcpu]->perf.domain_info.coord_type) ||
+            (perf->domain_info.num_processors !=
+             processor_pminfo[firstcpu]->perf.domain_info.num_processors)) {
+            ret = -EINVAL;
+            goto err2;
+        }
 
         printk(KERN_EMERG"adding CPU %u\n", cpu);
     } else {
+        cpufreq_dom = xmalloc(struct cpufreq_dom);
+        if (!cpufreq_dom) {
+            cpufreq_statistic_exit(cpu);
+            return -ENOMEM;
+        }
+        memset(cpufreq_dom, 0, sizeof(struct cpufreq_dom));
+        cpufreq_dom->dom = dom;
+        cpu_set(cpu, cpufreq_dom->map);
+        list_add(&cpufreq_dom->node, &cpufreq_dom_list_head);
+
         /* for the first cpu, setup policy and do init work */
         policy = xmalloc(struct cpufreq_policy);
         if (!policy) {
+            list_del(&cpufreq_dom->node);
+            xfree(cpufreq_dom);
             cpufreq_statistic_exit(cpu);
             return -ENOMEM;
         }
         memset(policy, 0, sizeof(struct cpufreq_policy));
-
+        policy->cpu = cpu;
+        cpu_set(cpu, policy->cpus);
         cpufreq_cpu_policy[cpu] = policy;
-        cpu_set(cpu, cpufreq_dom_map[dom]);
-        cpu_set(cpu, policy->cpus);
-
-        policy->cpu = cpu;
+
         ret = cpufreq_driver->init(policy);
         if (ret)
             goto err1;
@@ -124,7 +161,7 @@ int cpufreq_add_cpu(unsigned int cpu)
      * After get full cpumap of the coordination domain,
      * we can safely start gov here.
      */
-    if (cpus_weight(cpufreq_dom_map[dom]) ==
+    if (cpus_weight(cpufreq_dom->map) ==
         perf->domain_info.num_processors) {
         memcpy(&new_policy, policy, sizeof(struct cpufreq_policy));
         policy->governor = NULL;
@@ -138,51 +175,68 @@ err2:
 err2:
     cpufreq_driver->exit(policy);
 err1:
-    for_each_cpu_mask(j, cpufreq_dom_map[dom]) {
+    for_each_cpu_mask(j, cpufreq_dom->map) {
         cpufreq_cpu_policy[j] = NULL;
         cpufreq_statistic_exit(j);
     }
 
-    cpus_clear(cpufreq_dom_map[dom]);
+    list_del(&cpufreq_dom->node);
+    xfree(cpufreq_dom);
     xfree(policy);
     return ret;
 }
 
 int cpufreq_del_cpu(unsigned int cpu)
 {
-    unsigned int dom;
+    unsigned int dom, domexist = 0;
+    struct list_head *pos;
+    struct cpufreq_dom *cpufreq_dom = NULL;
     struct cpufreq_policy *policy;
     struct processor_performance *perf = &processor_pminfo[cpu]->perf;
 
     /* to protect the case when Px was not controlled by xen */
-    if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT))
+    if (!processor_pminfo[cpu]      ||
+        !(perf->init & XEN_PX_INIT) ||
+        !cpu_online(cpu))
+        return -EINVAL;
+
+    if (!cpufreq_cpu_policy[cpu])
         return 0;
-
-    if (!cpu_online(cpu) || !cpufreq_cpu_policy[cpu])
-        return -EINVAL;
 
     dom = perf->domain_info.domain;
     policy = cpufreq_cpu_policy[cpu];
 
-    printk(KERN_EMERG"deleting CPU %u\n", cpu);
+    list_for_each(pos, &cpufreq_dom_list_head) {
+        cpufreq_dom = list_entry(pos, struct cpufreq_dom, node);
+        if (dom == cpufreq_dom->dom) {
+            domexist = 1;
+            break;
+        }
+    }
+
+    if (!domexist)
+        return -EINVAL;
 
     /* for the first cpu of the domain, stop gov */
-    if (cpus_weight(cpufreq_dom_map[dom]) ==
+    if (cpus_weight(cpufreq_dom->map) ==
         perf->domain_info.num_processors)
         __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
 
     cpufreq_cpu_policy[cpu] = NULL;
     cpu_clear(cpu, policy->cpus);
-    cpu_clear(cpu, cpufreq_dom_map[dom]);
+    cpu_clear(cpu, cpufreq_dom->map);
     cpufreq_statistic_exit(cpu);
 
     /* for the last cpu of the domain, clean room */
     /* It's safe here to free freq_table, drv_data and policy */
-    if (!cpus_weight(cpufreq_dom_map[dom])) {
+    if (!cpus_weight(cpufreq_dom->map)) {
         cpufreq_driver->exit(policy);
+        list_del(&cpufreq_dom->node);
+        xfree(cpufreq_dom);
         xfree(policy);
     }
 
+    printk(KERN_EMERG"deleting CPU %u\n", cpu);
     return 0;
 }
 
@@ -258,6 +312,24 @@ int set_px_pminfo(uint32_t acpi_id, stru
 
     if ( dom0_px_info->flags & XEN_PX_PCT )
     {
+        /* space_id check */
+        if (dom0_px_info->control_register.space_id != 
+            dom0_px_info->status_register.space_id)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+
+#ifdef CONFIG_IA64
+        /* for IA64, currently it only supports FFH */
+        if (dom0_px_info->control_register.space_id !=
+            ACPI_ADR_SPACE_FIXED_HARDWARE)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+#endif
+
         memcpy ((void *)&pxpt->control_register,
                 (void *)&dom0_px_info->control_register,
                 sizeof(struct xen_pct_register));
@@ -267,8 +339,16 @@ int set_px_pminfo(uint32_t acpi_id, stru
         print_PCT(&pxpt->control_register);
         print_PCT(&pxpt->status_register);
     }
+
     if ( dom0_px_info->flags & XEN_PX_PSS ) 
     {
+        /* capability check */
+        if (dom0_px_info->state_count <= 1)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+
         if ( !(pxpt->states = xmalloc_array(struct xen_processor_px,
                         dom0_px_info->state_count)) )
         {
@@ -280,14 +360,28 @@ int set_px_pminfo(uint32_t acpi_id, stru
         pxpt->state_count = dom0_px_info->state_count;
         print_PSS(pxpt->states,pxpt->state_count);
     }
+
     if ( dom0_px_info->flags & XEN_PX_PSD )
     {
+#ifdef CONFIG_X86
+        /* for X86, check domain coordination */
+        /* for IA64, _PSD is optional for current IA64 cpufreq algorithm */
+        if (dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ALL &&
+            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ANY &&
+            dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_HW)
+        {
+            ret = -EINVAL;
+            goto out;
+        }
+#endif
+
         pxpt->shared_type = dom0_px_info->shared_type;
         memcpy ((void *)&pxpt->domain_info,
                 (void *)&dom0_px_info->domain_info,
                 sizeof(struct xen_psd_package));
         print_PSD(&pxpt->domain_info);
     }
+
     if ( dom0_px_info->flags & XEN_PX_PPC )
     {
         pxpt->platform_limit = dom0_px_info->platform_limit;
@@ -295,7 +389,6 @@ int set_px_pminfo(uint32_t acpi_id, stru
 
         if ( pxpt->init == XEN_PX_INIT )
         {
-
             ret = cpufreq_limit_change(cpuid); 
             goto out;
         }
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/config.h      Tue Nov 04 12:43:19 2008 +0900
@@ -40,14 +40,6 @@
 
 #define CONFIG_HOTPLUG 1
 #define CONFIG_HOTPLUG_CPU 1
-
-/*
- * Avoid deep recursion when tearing down pagetables during domain destruction,
- * causing dom0 to become unresponsive and Xen to miss time-critical softirq
- * deadlines. This will ultimately be replaced by built-in preemptibility of
- * get_page_type().
- */
-#define DOMAIN_DESTRUCT_AVOID_RECURSION 1
 
 #define HZ 100
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/event.h
--- a/xen/include/asm-x86/event.h       Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/event.h       Tue Nov 04 12:43:19 2008 +0900
@@ -11,36 +11,8 @@
 
 #include <xen/shared.h>
 
-static inline void vcpu_kick(struct vcpu *v)
-{
-    /*
-     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
-     * pending flag. These values may fluctuate (after all, we hold no
-     * locks) but the key insight is that each change will cause
-     * evtchn_upcall_pending to be polled.
-     * 
-     * NB2. We save the running flag across the unblock to avoid a needless
-     * IPI for domains that we IPI'd to unblock.
-     */
-    int running = v->is_running;
-    vcpu_unblock(v);
-    if ( running )
-        smp_send_event_check_cpu(v->processor);
-}
-
-static inline void vcpu_mark_events_pending(struct vcpu *v)
-{
-    int already_pending = test_and_set_bit(
-        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
-
-    if ( already_pending )
-        return;
-
-    if ( is_hvm_vcpu(v) )
-        hvm_assert_evtchn_irq(v);
-    else
-        vcpu_kick(v);
-}
+void vcpu_kick(struct vcpu *v);
+void vcpu_mark_events_pending(struct vcpu *v);
 
 int hvm_local_events_need_delivery(struct vcpu *v);
 static inline int local_events_need_delivery(void)
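
vcpu_kick() and vcpu_mark_events_pending() move out of line here, presumably so their new home can use the VCPU_KICK_SOFTIRQ added in the softirq.h hunk further below. The mark-then-kick logic they implement is easy to model: the first setter of the pending bit delivers the notification, later setters see the bit already set and do nothing. A sketch with a non-atomic stand-in for test_and_set_bit():

    #include <stdio.h>

    static unsigned long pending_word;

    /* Non-atomic stand-in: returns the bit's previous value, then sets it. */
    static int test_and_set_bit_sketch(int bit, unsigned long *word)
    {
        unsigned long mask = 1UL << bit;
        int old = !!(*word & mask);

        *word |= mask;
        return old;
    }

    static void kick(void) { printf("kick vcpu\n"); }

    static void mark_events_pending_sketch(void)
    {
        if (test_and_set_bit_sketch(0, &pending_word))
            return;     /* already pending: a kick is already on its way */
        kick();         /* first setter delivers the notification */
    }

    int main(void)
    {
        mark_events_pending_sketch();    /* prints "kick vcpu" */
        mark_events_pending_sketch();    /* suppressed */
        return 0;
    }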
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/fixmap.h
--- a/xen/include/asm-x86/fixmap.h      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/fixmap.h      Tue Nov 04 12:43:19 2008 +0900
@@ -29,6 +29,7 @@
  * from the end of virtual memory backwards.
  */
 enum fixed_addresses {
+    FIX_RESERVED, /* Index 0 is reserved since fix_to_virt(0) > FIXADDR_TOP. */
 #ifdef __i386__
     FIX_PAE_HIGHMEM_0,
     FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1,
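
The new FIX_RESERVED entry burns fixmap slot 0 so that no user of the enum is handed the address computed for index 0, which the comment notes lies above FIXADDR_TOP. The arithmetic is the usual downward-growing fixmap scheme; the sketch below uses an assumed shape for fix_to_virt() and an arbitrary FIXADDR_TOP, purely to illustrate why slot 0 is out of range:

    #include <stdio.h>

    #define PAGE_SHIFT  12
    #define FIXADDR_TOP 0xffbff000UL    /* arbitrary value for illustration */

    /* Assumed macro shape: slots grow downwards from the top of the area. */
    #define fix_to_virt(idx) (FIXADDR_TOP - ((unsigned long)(idx) << PAGE_SHIFT))

    int main(void)
    {
        printf("fix_to_virt(0) = %#lx (at/above FIXADDR_TOP: unusable)\n",
               fix_to_virt(0));
        printf("fix_to_virt(1) = %#lx (first real slot)\n", fix_to_virt(1));
        return 0;
    }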
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/hvm/vmx/vpmu.h
--- a/xen/include/asm-x86/hvm/vmx/vpmu.h        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/hvm/vmx/vpmu.h        Tue Nov 04 12:43:19 2008 +0900
@@ -67,7 +67,7 @@ struct vpmu_struct {
 #define VPMU_CONTEXT_ALLOCATED              0x1
 #define VPMU_CONTEXT_LOADED                 0x2
 #define VPMU_RUNNING                        0x4
-
+#define PASSIVE_DOMAIN_ALLOCATED           0x8
 int vpmu_do_wrmsr(struct cpu_user_regs *regs);
 int vpmu_do_rdmsr(struct cpu_user_regs *regs);
 int vpmu_do_interrupt(struct cpu_user_regs *regs);
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/hvm/vmx/vpmu_core2.h
--- a/xen/include/asm-x86/hvm/vmx/vpmu_core2.h  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/hvm/vmx/vpmu_core2.h  Tue Nov 04 12:43:19 2008 +0900
@@ -23,28 +23,6 @@
 #ifndef __ASM_X86_HVM_VPMU_CORE_H_
 #define __ASM_X86_HVM_VPMU_CORE_H_
 
-/* Core 2 Non-architectural Performance Counter MSRs. */
-u32 core2_counters_msr[] =   {
-    MSR_CORE_PERF_FIXED_CTR0,
-    MSR_CORE_PERF_FIXED_CTR1,
-    MSR_CORE_PERF_FIXED_CTR2};
-
-/* Core 2 Non-architectural Performance Control MSRs. */
-u32 core2_ctrls_msr[] = {
-    MSR_CORE_PERF_FIXED_CTR_CTRL,
-    MSR_IA32_PEBS_ENABLE,
-    MSR_IA32_DS_AREA};
-
-struct pmumsr core2_counters = {
-    3,
-    core2_counters_msr
-};
-
-struct pmumsr core2_ctrls = {
-    3,
-    core2_ctrls_msr
-};
-
 struct arch_msr_pair {
     u64 counter;
     u64 control;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/hvm/vpt.h
--- a/xen/include/asm-x86/hvm/vpt.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/hvm/vpt.h     Tue Nov 04 12:43:19 2008 +0900
@@ -32,41 +32,6 @@
 #include <asm/hvm/irq.h>
 #include <public/hvm/save.h>
 
-struct HPETState;
-struct HPET_timer_fn_info {
-    struct HPETState *hs;
-    unsigned int tn;
-};
-
-struct hpet_registers {
-    /* Memory-mapped, software visible registers */
-    uint64_t capability;        /* capabilities */
-    uint64_t config;            /* configuration */
-    uint64_t isr;               /* interrupt status reg */
-    uint64_t mc64;              /* main counter */
-    struct {                    /* timers */
-        uint64_t config;        /* configuration/cap */
-        uint64_t cmp;           /* comparator */
-        uint64_t fsb;           /* FSB route, not supported now */
-    } timers[HPET_TIMER_NUM];
-
-    /* Hidden register state */
-    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
-};
-
-typedef struct HPETState {
-    struct hpet_registers hpet;
-    struct vcpu *vcpu;
-    uint64_t stime_freq;
-    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
-    uint64_t hpet_to_ns_limit; /* max hpet ticks convertible to ns      */
-    uint64_t mc_offset;
-    struct timer timers[HPET_TIMER_NUM];
-    struct HPET_timer_fn_info timer_fn_info[HPET_TIMER_NUM]; 
-    spinlock_t lock;
-} HPETState;
-
-
 /*
  * Abstract layer of periodic time, one short time.
  */
@@ -107,6 +72,34 @@ typedef struct PITState {
     struct periodic_time pt0;
     spinlock_t lock;
 } PITState;
+
+struct hpet_registers {
+    /* Memory-mapped, software visible registers */
+    uint64_t capability;        /* capabilities */
+    uint64_t config;            /* configuration */
+    uint64_t isr;               /* interrupt status reg */
+    uint64_t mc64;              /* main counter */
+    struct {                    /* timers */
+        uint64_t config;        /* configuration/cap */
+        uint64_t cmp;           /* comparator */
+        uint64_t fsb;           /* FSB route, not supported now */
+    } timers[HPET_TIMER_NUM];
+
+    /* Hidden register state */
+    uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */
+    uint64_t comparator64[HPET_TIMER_NUM]; /* 64 bit running comparator */
+};
+
+typedef struct HPETState {
+    struct hpet_registers hpet;
+    struct vcpu *vcpu;
+    uint64_t stime_freq;
+    uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */
+    uint64_t hpet_to_ns_limit; /* max hpet ticks convertible to ns      */
+    uint64_t mc_offset;
+    struct periodic_time pt[HPET_TIMER_NUM];
+    spinlock_t lock;
+} HPETState;
 
 typedef struct RTCState {
     /* Hardware state */
@@ -160,13 +153,13 @@ void pt_migrate(struct vcpu *v);
  * The given periodic timer structure must be initialised with zero bytes,
  * except for the 'source' field which must be initialised with the
  * correct PTSRC_ value. The initialised timer structure can then be passed
- * to {create,destroy}_periodic_time() and number of times and in any order.
+ * to {create,destroy}_periodic_time() any number of times and in any order.
  * Note that, for a given periodic timer, invocations of these functions MUST
  * be serialised.
  */
 void create_periodic_time(
-    struct vcpu *v, struct periodic_time *pt, uint64_t period,
-    uint8_t irq, char one_shot, time_cb *cb, void *data);
+    struct vcpu *v, struct periodic_time *pt, uint64_t delta,
+    uint64_t period, uint8_t irq, time_cb *cb, void *data);
 void destroy_periodic_time(struct periodic_time *pt);
 
 int pv_pit_handler(int port, int data, int write);
@@ -185,7 +178,6 @@ void pmtimer_deinit(struct domain *d);
 void pmtimer_deinit(struct domain *d);
 void pmtimer_reset(struct domain *d);
 
-void hpet_migrate_timers(struct vcpu *v);
 void hpet_init(struct vcpu *v);
 void hpet_deinit(struct domain *d);
 void hpet_reset(struct domain *d);
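
Per the prototype change above, create_periodic_time() now takes the first expiry (delta) separately from the recurrence interval (period), and the one_shot flag is gone; a one-shot timer is presumably requested as period == 0. A small model of the resulting semantics, with stand-in types rather than Xen's periodic_time machinery:

    #include <stdio.h>
    #include <stdint.h>

    /* Print the first few firings for a (delta, period) pair. */
    static void show_firings(uint64_t now, uint64_t delta, uint64_t period)
    {
        uint64_t expiry = now + delta;     /* first firing: delta from now */
        int i;

        for (i = 0; i < 3; i++) {
            printf("  fires at %llu\n", (unsigned long long)expiry);
            if (period == 0)               /* assumed one-shot encoding */
                break;
            expiry += period;              /* then every 'period' */
        }
    }

    int main(void)
    {
        printf("delta=50 period=200:\n");
        show_firings(1000, 50, 200);       /* 1050, 1250, 1450 */
        printf("delta=50 period=0 (one-shot):\n");
        show_firings(1000, 50, 0);         /* 1050 only */
        return 0;
    }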
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/mm.h  Tue Nov 04 12:43:19 2008 +0900
@@ -61,12 +61,36 @@ struct page_info
         /*
          * When PGT_partial is true then this field is valid and indicates
          * that PTEs in the range [0, @nr_validated_ptes) have been validated.
-         * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been
-         * partially validated.
+         * An extra page reference must be acquired (or not dropped) whenever
+         * PGT_partial gets set, and it must be dropped when the flag gets
+         * cleared. This is so that a get() leaving a page in partially
+         * validated state (where the caller would drop the reference acquired
+         * due to the getting of the type [apparently] failing [-EAGAIN])
+         * would not accidentally result in a page left with zero general
+         * reference count, but non-zero type reference count (possible when
+         * the partial get() is followed immediately by domain destruction).
+         * Likewise, the ownership of the single type reference for partially
+         * (in-)validated pages is tied to this flag, i.e. the instance
+         * setting the flag must not drop that reference, whereas the instance
+         * clearing it will have to.
+         *
+         * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
+         * been partially validated. This implies that the general reference
+         * to the page (acquired from get_page_from_lNe()) would be dropped
+         * (again due to the apparent failure) and hence must be re-acquired
+         * when resuming the validation, but must not be dropped when picking
+         * up the page for invalidation.
+         *
+         * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
+         * been partially invalidated. This is basically the opposite case of
+         * above, i.e. the general reference to the page was not dropped in
+         * put_page_from_lNe() (due to the apparent failure), and hence it
+         * must be dropped when the put operation is resumed (and completes),
+         * but it must not be acquired if picking up the page for validation.
          */
         struct {
             u16 nr_validated_ptes;
-            bool_t partial_pte;
+            s8 partial_pte;
         };
 
         /*
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/page.h        Tue Nov 04 12:43:19 2008 +0900
@@ -314,6 +314,9 @@ unsigned long clone_idle_pagetable(struc
 #define __PAGE_HYPERVISOR_NOCACHE \
     (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED)
 
+#define GRANT_PTE_FLAGS \
+    (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX | _PAGE_GNTTAB)
+
 #ifndef __ASSEMBLY__
 
 static inline int get_order_from_bytes(paddr_t size)
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/softirq.h
--- a/xen/include/asm-x86/softirq.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/softirq.h     Tue Nov 04 12:43:19 2008 +0900
@@ -3,7 +3,8 @@
 
 #define NMI_MCE_SOFTIRQ        (NR_COMMON_SOFTIRQS + 0)
 #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1)
+#define VCPU_KICK_SOFTIRQ      (NR_COMMON_SOFTIRQS + 2)
 
-#define NR_ARCH_SOFTIRQS       2
+#define NR_ARCH_SOFTIRQS       3
 
 #endif /* __ASM_SOFTIRQ_H__ */
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/x86_32/page.h
--- a/xen/include/asm-x86/x86_32/page.h Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/x86_32/page.h Tue Nov 04 12:43:19 2008 +0900
@@ -105,9 +105,6 @@ extern unsigned int PAGE_HYPERVISOR_NOCA
 #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF))
 #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 32) | ((x) & 0xFFF))
 
-#define GRANT_PTE_FLAGS \
-    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB)
-
 /*
  * Disallow unused flag bits plus PAT/PSE, PCD, PWT and GLOBAL.
  * Permit the NX bit if the hardware supports it.
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/x86_64/page.h
--- a/xen/include/asm-x86/x86_64/page.h Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/x86_64/page.h Tue Nov 04 12:43:19 2008 +0900
@@ -119,13 +119,10 @@ typedef l4_pgentry_t root_pgentry_t;
 #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK)
 #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK)
 
-#define COMPAT_L3_DISALLOW_MASK 0xFFFFF1FEU
+#define COMPAT_L3_DISALLOW_MASK 0xFFFFF198U
 
 #define PAGE_HYPERVISOR         (__PAGE_HYPERVISOR         | _PAGE_GLOBAL)
 #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL)
-
-#define GRANT_PTE_FLAGS \
-    (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB|_PAGE_USER)
 
 #define USER_MAPPINGS_ARE_GLOBAL
 #ifdef USER_MAPPINGS_ARE_GLOBAL
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/xenoprof.h
--- a/xen/include/asm-x86/xenoprof.h    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/asm-x86/xenoprof.h    Tue Nov 04 12:43:19 2008 +0900
@@ -64,6 +64,9 @@ void xenoprof_backtrace(
                  "xenoprof/x86 with autotranslated mode enabled"    \
                  "isn't supported yet\n");                          \
     } while (0)
+int passive_domain_do_rdmsr(struct cpu_user_regs *regs);
+int passive_domain_do_wrmsr(struct cpu_user_regs *regs);
+void passive_domain_destroy(struct vcpu *v);
 
 #endif /* __ASM_X86_XENOPROF_H__ */
 
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/public/features.h
--- a/xen/include/public/features.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/public/features.h     Tue Nov 04 12:43:19 2008 +0900
@@ -59,6 +59,9 @@
 /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */
 #define XENFEAT_mmu_pt_update_preserve_ad  5
 
+/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */
+#define XENFEAT_highmem_assist             6
+
 #define XENFEAT_NR_SUBMAPS 1
 
 #endif /* __XEN_PUBLIC_FEATURES_H__ */
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/public/trace.h
--- a/xen/include/public/trace.h        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/public/trace.h        Tue Nov 04 12:43:19 2008 +0900
@@ -142,7 +142,9 @@
 #define TRC_HVM_INVLPG64        (TRC_HVM_HANDLER + TRC_64_FLAG + 0x14)
 #define TRC_HVM_MCE             (TRC_HVM_HANDLER + 0x15)
 #define TRC_HVM_IO_ASSIST       (TRC_HVM_HANDLER + 0x16)
+#define TRC_HVM_IO_ASSIST64     (TRC_HVM_HANDLER + TRC_64_FLAG + 0x16)
 #define TRC_HVM_MMIO_ASSIST     (TRC_HVM_HANDLER + 0x17)
+#define TRC_HVM_MMIO_ASSIST64   (TRC_HVM_HANDLER + TRC_64_FLAG + 0x17)
 #define TRC_HVM_CLTS            (TRC_HVM_HANDLER + 0x18)
 #define TRC_HVM_LMSW            (TRC_HVM_HANDLER + 0x19)
 #define TRC_HVM_LMSW64          (TRC_HVM_HANDLER + TRC_64_FLAG + 0x19)
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/public/xen.h
--- a/xen/include/public/xen.h  Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/public/xen.h  Tue Nov 04 12:43:19 2008 +0900
@@ -231,6 +231,13 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
  * cmd: MMUEXT_SET_LDT
  * linear_addr: Linear address of LDT base (NB. must be page-aligned).
  * nr_ents: Number of entries in LDT.
+ *
+ * cmd: MMUEXT_CLEAR_PAGE
+ * mfn: Machine frame number to be cleared.
+ *
+ * cmd: MMUEXT_COPY_PAGE
+ * mfn: Machine frame number of the destination page.
+ * src_mfn: Machine frame number of the source page.
  */
 #define MMUEXT_PIN_L1_TABLE      0
 #define MMUEXT_PIN_L2_TABLE      1
@@ -247,12 +254,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #define MMUEXT_FLUSH_CACHE      12
 #define MMUEXT_SET_LDT          13
 #define MMUEXT_NEW_USER_BASEPTR 15
+#define MMUEXT_CLEAR_PAGE       16
+#define MMUEXT_COPY_PAGE        17
 
 #ifndef __ASSEMBLY__
 struct mmuext_op {
     unsigned int cmd;
     union {
-        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
+        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
+         * CLEAR_PAGE, COPY_PAGE */
         xen_pfn_t     mfn;
         /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
         unsigned long linear_addr;
@@ -266,6 +276,8 @@ struct mmuext_op {
 #else
         void *vcpumask;
 #endif
+        /* COPY_PAGE */
+        xen_pfn_t src_mfn;
     } arg2;
 };
 typedef struct mmuext_op mmuext_op_t;
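
For the two new MMUEXT ops, arg1.mfn names the page to clear (or the copy destination) and arg2.src_mfn the copy source, as documented in the comment block above. A sketch of how a guest might fill the structure; the struct is re-declared locally with just the members used here, and the actual hypercall (HYPERVISOR_mmuext_op() in a real PV guest) is left as a comment since its plumbing is OS-specific:

    #include <stdio.h>

    typedef unsigned long xen_pfn_t;

    #define MMUEXT_CLEAR_PAGE 16
    #define MMUEXT_COPY_PAGE  17

    /* Local re-declaration of the fields of struct mmuext_op used here. */
    struct mmuext_op_sketch {
        unsigned int cmd;
        union { xen_pfn_t mfn; unsigned long linear_addr; } arg1;
        union { unsigned int nr_ents; xen_pfn_t src_mfn; } arg2;
    };

    int main(void)
    {
        struct mmuext_op_sketch op = { 0 };

        op.cmd = MMUEXT_COPY_PAGE;
        op.arg1.mfn = 0x1234;        /* destination frame */
        op.arg2.src_mfn = 0x5678;    /* source frame */

        /* A real guest would now issue:
         *   HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
         */
        printf("cmd=%u dst=%#lx src=%#lx\n",
               op.cmd, op.arg1.mfn, op.arg2.src_mfn);
        return 0;
    }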
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/cpuidle.h
--- a/xen/include/xen/cpuidle.h Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/cpuidle.h Tue Nov 04 12:43:19 2008 +0900
@@ -30,12 +30,18 @@
 #define ACPI_PROCESSOR_MAX_POWER        8
 #define CPUIDLE_NAME_LEN                16
 
+#define ACPI_CSTATE_EM_NONE     0
+#define ACPI_CSTATE_EM_SYSIO    1
+#define ACPI_CSTATE_EM_FFH      2
+#define ACPI_CSTATE_EM_HALT     3
+
 struct acpi_processor_cx
 {
+    u8 idx;
     u8 valid;
     u8 type;
     u32 address;
-    u8 space_id;
+    u8 entry_method; /* ACPI_CSTATE_EM_xxx */
     u32 latency;
     u32 latency_ticks;
     u32 power;
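
Replacing the raw ACPI space_id with a pre-classified entry_method lets the idle loop dispatch directly on how a C-state is entered. A stand-in dispatcher (not Xen's cpu_idle code; the entry actions are only printed):

    #include <stdio.h>

    #define ACPI_CSTATE_EM_NONE  0
    #define ACPI_CSTATE_EM_SYSIO 1
    #define ACPI_CSTATE_EM_FFH   2
    #define ACPI_CSTATE_EM_HALT  3

    static void enter_cstate_sketch(int entry_method, unsigned int address)
    {
        switch (entry_method) {
        case ACPI_CSTATE_EM_SYSIO:
            printf("inb(%#x)\n", address);  /* I/O port read enters the state */
            break;
        case ACPI_CSTATE_EM_FFH:
            printf("mwait(...)\n");         /* fixed-function hardware */
            break;
        case ACPI_CSTATE_EM_HALT:
            printf("hlt\n");
            break;
        default:
            printf("no entry method\n");
            break;
        }
    }

    int main(void)
    {
        enter_cstate_sketch(ACPI_CSTATE_EM_SYSIO, 0x414);
        return 0;
    }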
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/domain_page.h
--- a/xen/include/xen/domain_page.h     Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/domain_page.h     Tue Nov 04 12:43:19 2008 +0900
@@ -24,7 +24,7 @@ void *map_domain_page(unsigned long mfn)
  * Pass a VA within a page previously mapped in the context of the
  * currently-executing VCPU via a call to map_domain_page().
  */
-void unmap_domain_page(void *va);
+void unmap_domain_page(const void *va);
 
 /*
  * Similar to the above calls, except the mapping is accessible in all
@@ -32,7 +32,7 @@ void unmap_domain_page(void *va);
  * mappings can also be unmapped from any context.
  */
 void *map_domain_page_global(unsigned long mfn);
-void unmap_domain_page_global(void *va);
+void unmap_domain_page_global(const void *va);
 
 #define DMCACHE_ENTRY_VALID 1U
 #define DMCACHE_ENTRY_HELD  2U
@@ -75,7 +75,7 @@ map_domain_page_with_cache(unsigned long
 }
 
 static inline void
-unmap_domain_page_with_cache(void *va, struct domain_mmap_cache *cache)
+unmap_domain_page_with_cache(const void *va, struct domain_mmap_cache *cache)
 {
     ASSERT(cache != NULL);
     cache->flags &= ~DMCACHE_ENTRY_HELD;
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/spinlock.h
--- a/xen/include/xen/spinlock.h        Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/spinlock.h        Tue Nov 04 12:43:19 2008 +0900
@@ -5,21 +5,38 @@
 #include <asm/system.h>
 #include <asm/spinlock.h>
 
+#ifndef NDEBUG
+struct lock_debug {
+    int irq_safe; /* +1: IRQ-safe; 0: not IRQ-safe; -1: don't know yet */
+};
+#define _LOCK_DEBUG { -1 }
+void spin_debug_enable(void);
+void spin_debug_disable(void);
+#else
+struct lock_debug { };
+#define _LOCK_DEBUG { }
+#define spin_debug_enable() ((void)0)
+#define spin_debug_disable() ((void)0)
+#endif
+
 typedef struct {
     raw_spinlock_t raw;
     u16 recurse_cpu:12;
     u16 recurse_cnt:4;
+    struct lock_debug debug;
 } spinlock_t;
 
-#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0 }
+
+#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG }
 #define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED
 #define spin_lock_init(l) (*(l) = (spinlock_t)SPIN_LOCK_UNLOCKED)
 
 typedef struct {
     raw_rwlock_t raw;
+    struct lock_debug debug;
 } rwlock_t;
 
-#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED }
+#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED, _LOCK_DEBUG }
 #define DEFINE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED
 #define rwlock_init(l) (*(l) = (rwlock_t)RW_LOCK_UNLOCKED)
 
@@ -34,6 +51,7 @@ int _spin_is_locked(spinlock_t *lock);
 int _spin_is_locked(spinlock_t *lock);
 int _spin_trylock(spinlock_t *lock);
 void _spin_barrier(spinlock_t *lock);
+void _spin_barrier_irq(spinlock_t *lock);
 
 void _spin_lock_recursive(spinlock_t *lock);
 void _spin_unlock_recursive(spinlock_t *lock);
@@ -67,6 +85,7 @@ void _write_unlock_irqrestore(rwlock_t *
 
 /* Ensure a lock is quiescent between two critical operations. */
 #define spin_barrier(l)               _spin_barrier(l)
+#define spin_barrier_irq(l)           _spin_barrier_irq(l)
 
 /*
  * spin_[un]lock_recursive(): Use these forms when the lock can (safely!) be
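
The new lock_debug field gives every lock a tri-state record of how it is used with respect to interrupts: unknown until first acquisition, then pinned to IRQ-safe or not. Mixing the two is the classic self-deadlock recipe (an IRQ handler spinning on a lock its own CPU already holds). A model of the check this field enables; the real consistency check in Xen's spinlock code may differ in detail:

    #include <assert.h>
    #include <stdio.h>

    struct lock_debug_sketch {
        int irq_safe;    /* +1: IRQ-safe; 0: not IRQ-safe; -1: don't know yet */
    };

    static void check_lock(struct lock_debug_sketch *d, int irqs_disabled)
    {
        if (d->irq_safe == -1)
            d->irq_safe = irqs_disabled;          /* first use decides */
        assert(d->irq_safe == irqs_disabled);     /* mixed use = deadlock risk */
    }

    int main(void)
    {
        struct lock_debug_sketch d = { -1 };

        check_lock(&d, 1);    /* first acquisition: taken with IRQs off */
        check_lock(&d, 1);    /* consistent: passes */
        printf("lock consistently IRQ-safe\n");
        return 0;
    }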
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/time.h
--- a/xen/include/xen/time.h    Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/time.h    Tue Nov 04 12:43:19 2008 +0900
@@ -52,6 +52,7 @@ struct tm gmtime(unsigned long t);
 #define SECONDS(_s)     ((s_time_t)((_s)  * 1000000000ULL))
 #define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
 #define MICROSECS(_us)  ((s_time_t)((_us) * 1000ULL))
+#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1))
 
 extern void update_vcpu_system_time(struct vcpu *v);
 extern void update_domain_wallclock_time(struct domain *d);
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/timer.h
--- a/xen/include/xen/timer.h   Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xen/timer.h   Tue Nov 04 12:43:19 2008 +0900
@@ -15,12 +15,13 @@ struct timer {
 struct timer {
     /* System time expiry value (nanoseconds since boot). */
     s_time_t expires;
+    s_time_t expires_end;
 
     /* Position in active-timer data structure. */
     union {
         /* Timer-heap offset. */
         unsigned int heap_offset;
-        /* Overflow linked list. */
+        /* Linked list. */
         struct timer *list_next;
     };
 
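
The new expires_end field turns a timer's deadline into a window [expires, expires_end] within which firing is acceptable, which is what lets nearby timers share one wakeup. The coalescing idea in miniature; the policy shown (fire at the start of the overlap) is an assumption, not lifted from Xen's timer.c:

    #include <stdio.h>
    #include <stdint.h>

    struct timer_sketch {
        int64_t expires;        /* earliest acceptable firing time */
        int64_t expires_end;    /* latest acceptable firing time */
    };

    /* Serve both timers with one wakeup if their windows overlap. */
    static int coalesce(const struct timer_sketch *a,
                        const struct timer_sketch *b, int64_t *when)
    {
        int64_t lo = a->expires > b->expires ? a->expires : b->expires;
        int64_t hi = a->expires_end < b->expires_end ? a->expires_end
                                                     : b->expires_end;

        if (lo > hi)
            return 0;           /* disjoint windows: separate wakeups */
        *when = lo;
        return 1;
    }

    int main(void)
    {
        struct timer_sketch a = { 100, 150 }, b = { 130, 200 };
        int64_t when;

        if (coalesce(&a, &b, &when))
            printf("one wakeup at %lld serves both\n", (long long)when);
        return 0;
    }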
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xlat.lst
--- a/xen/include/xlat.lst      Tue Nov 04 12:07:22 2008 +0900
+++ b/xen/include/xlat.lst      Tue Nov 04 12:43:19 2008 +0900
@@ -56,6 +56,6 @@
 !      processor_flags                 platform.h
 !      processor_power                 platform.h
 !      pct_register                    platform.h
-!      processor_px                    platform.h
+?      processor_px                    platform.h
 !      psd_package                     platform.h
 !      processor_performance           platform.h

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
