[Xen-changelog] [xen-unstable] merge with xen-unstable.hg
# HG changeset patch
# User Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
# Date 1225770199 -32400
# Node ID e75cb35c798beabee0b0ed4025ef82a39c702279
# Parent 10f0e1bb8e5e9a28e1ebe3fbb9291fb8114ef4bc
# Parent 43a079fd50fdab01cd2be443bfef011b3b0495ae
merge with xen-unstable.hg
---
 xen/common/xmalloc.c                          |  286 --------
 .hgignore                                     |    1
 extras/mini-os/include/sched.h                |    3
 extras/mini-os/include/wait.h                 |   10
 extras/mini-os/minios.mk                      |    3
 tools/Makefile                                |    1
 tools/blktap/drivers/block-qcow.c             |   24 -
 tools/firmware/hvmloader/acpi/static_tables.c |    2
 tools/firmware/rombios/rombios.c              |    4
 tools/flask/policy/policy/modules/xen/xen.te  |    3
 tools/python/xen/util/diagnose.py             |    4
 tools/python/xen/xend/XendConfig.py           |   17
 tools/python/xen/xend/XendDomainInfo.py       |   73 ++-
 tools/python/xen/xend/server/DevConstants.py  |   45 ++
 tools/python/xen/xend/server/DevController.py |   31 -
 tools/python/xen/xend/server/iopif.py         |   20 -
 tools/python/xen/xend/server/irqif.py         |   19
 tools/python/xen/xend/server/pciif.py         |    3
 tools/python/xen/xend/server/vscsiif.py       |   15
 tools/python/xen/xm/create.py                 |   14
 tools/python/xen/xm/main.py                   |    5
 tools/xenpmd/Makefile                         |   20 +
 tools/xenpmd/xenpmd.c                         |  520 ++++++++++++++++++++++++++
 xen/arch/ia64/xen/cpufreq/cpufreq.c           |   15
 xen/arch/ia64/xen/irq.c                       |    2
 xen/arch/x86/acpi/cpu_idle.c                  |  103 ++---
 xen/arch/x86/acpi/cpufreq/cpufreq.c           |   14
 xen/arch/x86/acpi/cpufreq/powernow.c          |   14
 xen/arch/x86/acpi/cpuidle_menu.c              |   14
 xen/arch/x86/domain.c                         |  116 ++++-
 xen/arch/x86/domain_build.c                   |   34 +
 xen/arch/x86/hpet.c                           |    7
 xen/arch/x86/hvm/emulate.c                    |   30 +
 xen/arch/x86/hvm/hpet.c                       |  339 +++++++++-------
 xen/arch/x86/hvm/hvm.c                        |    1
 xen/arch/x86/hvm/i8254.c                      |    4
 xen/arch/x86/hvm/rtc.c                        |    4
 xen/arch/x86/hvm/svm/entry.S                  |    3
 xen/arch/x86/hvm/vlapic.c                     |   10
 xen/arch/x86/hvm/vmx/entry.S                  |    6
 xen/arch/x86/hvm/vmx/vmx.c                    |   81 ++--
 xen/arch/x86/hvm/vmx/vpmu_core2.c             |   20 +
 xen/arch/x86/hvm/vpt.c                        |   18
 xen/arch/x86/irq.c                            |    6
 xen/arch/x86/mm.c                             |  251 +++++++++---
 xen/arch/x86/mm/hap/p2m-ept.c                 |    8
 xen/arch/x86/mm/p2m.c                         |   17
 xen/arch/x86/msi.c                            |   69 +--
 xen/arch/x86/oprofile/nmi_int.c               |   51 ++
 xen/arch/x86/oprofile/op_model_ppro.c         |  103 +++++
 xen/arch/x86/oprofile/op_x86_model.h          |    5
 xen/arch/x86/setup.c                          |    1
 xen/arch/x86/smpboot.c                        |   14
 xen/arch/x86/time.c                           |    4
 xen/arch/x86/traps.c                          |   29 -
 xen/arch/x86/x86_32/domain_page.c             |   10
 xen/arch/x86/x86_64/compat/mm.c               |    5
 xen/arch/x86/x86_64/cpufreq.c                 |   33 -
 xen/common/event_channel.c                    |    2
 xen/common/kernel.c                           |    3
 xen/common/keyhandler.c                       |    4
 xen/common/spinlock.c                         |   69 +++
 xen/common/timer.c                            |  125 +++---
 xen/common/xenoprof.c                         |    2
 xen/drivers/char/serial.c                     |    7
 xen/drivers/cpufreq/cpufreq.c                 |  149 ++++++-
 xen/include/asm-x86/config.h                  |    8
 xen/include/asm-x86/event.h                   |   32 -
 xen/include/asm-x86/fixmap.h                  |    1
 xen/include/asm-x86/hvm/vmx/vpmu.h            |    2
 xen/include/asm-x86/hvm/vmx/vpmu_core2.h      |   22 -
 xen/include/asm-x86/hvm/vpt.h                 |   70 +--
 xen/include/asm-x86/mm.h                      |   30 +
 xen/include/asm-x86/page.h                    |    3
 xen/include/asm-x86/softirq.h                 |    3
 xen/include/asm-x86/x86_32/page.h             |    3
 xen/include/asm-x86/x86_64/page.h             |    5
 xen/include/asm-x86/xenoprof.h                |    3
 xen/include/public/features.h                 |    3
 xen/include/public/trace.h                    |    2
 xen/include/public/xen.h                      |   14
 xen/include/xen/cpuidle.h                     |    8
 xen/include/xen/domain_page.h                 |    6
 xen/include/xen/spinlock.h                    |   23 +
 xen/include/xen/time.h                        |    1
 xen/include/xen/timer.h                       |    3
 xen/include/xlat.lst                          |    2
 87 files changed, 2085 insertions(+), 1084 deletions(-)

diff -r 10f0e1bb8e5e -r e75cb35c798b .hgignore
--- a/.hgignore Tue Nov 04 12:07:22 2008 +0900
+++ b/.hgignore Tue Nov 04 12:43:19 2008 +0900
@@ -211,6 +211,7 @@
^tools/xenfb/vncfb$ ^tools/xenmon/xentrace_setmask$ ^tools/xenmon/xenbaked$ +^tools/xenpmd/xenpmd$ ^tools/xenstat/xentop/xentop$ ^tools/xenstore/testsuite/tmp/.*$ ^tools/xenstore/xen$ diff -r 10f0e1bb8e5e -r e75cb35c798b extras/mini-os/include/sched.h --- a/extras/mini-os/include/sched.h Tue Nov 04 12:07:22 2008 +0900 +++ b/extras/mini-os/include/sched.h Tue Nov 04 12:43:19 2008 +0900 @@ -48,8 +48,9 @@ void exit_thread(void) __attribute__((no void exit_thread(void) __attribute__((noreturn)); void schedule(void); +#ifdef __INSIDE_MINIOS__ #define current get_current() - +#endif void wake(struct thread *thread); void block(struct thread *thread); diff -r 10f0e1bb8e5e -r e75cb35c798b extras/mini-os/include/wait.h --- a/extras/mini-os/include/wait.h Tue Nov 04 12:07:22 2008 +0900 +++ b/extras/mini-os/include/wait.h Tue Nov 04 12:43:19 2008 +0900 @@ -7,7 +7,7 @@ #define DEFINE_WAIT(name) \ struct wait_queue name = { \ - .thread = current, \ + .thread = get_current(), \ .thread_list = MINIOS_LIST_HEAD_INIT((name).thread_list), \ } @@ -53,7 +53,7 @@ static inline void wake_up(struct wait_q unsigned long flags; \ local_irq_save(flags); \ add_wait_queue(&wq, &w); \ - block(current); \ + block(get_current()); \ local_irq_restore(flags); \ } while (0) @@ -74,8 +74,8 @@ static inline void wake_up(struct wait_q /* protect the list */ \ local_irq_save(flags); \ add_wait_queue(&wq, &__wait); \ - current->wakeup_time = deadline; \ - clear_runnable(current); \ + get_current()->wakeup_time = deadline; \ + clear_runnable(get_current()); \ local_irq_restore(flags); \ if((condition) || (deadline && NOW() >= deadline)) \ break; \ @@ -83,7 +83,7 @@ static inline void wake_up(struct wait_q } \ local_irq_save(flags); \ /* need to wake up */ \ - wake(current); \ + wake(get_current()); \ remove_wait_queue(&__wait); \ local_irq_restore(flags); \ } while(0) diff -r 10f0e1bb8e5e -r e75cb35c798b extras/mini-os/minios.mk --- a/extras/mini-os/minios.mk Tue Nov 04 12:07:22 2008 +0900 +++ b/extras/mini-os/minios.mk Tue Nov 04 12:43:19 2008 +0900 @@ -25,6 +25,9 @@ else else DEF_CFLAGS += -O3 endif + +# Make the headers define our internal stuff +DEF_CFLAGS += -D__INSIDE_MINIOS__ # Build the CFLAGS and ASFLAGS for compiling and assembling. # DEF_... flags are the common mini-os flags, diff -r 10f0e1bb8e5e -r e75cb35c798b tools/Makefile --- a/tools/Makefile Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/Makefile Tue Nov 04 12:43:19 2008 +0900 @@ -24,6 +24,7 @@ SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen SUBDIRS-y += fs-back SUBDIRS-$(CONFIG_IOEMU) += ioemu-dir +SUBDIRS-y += xenpmd # These don't cross-compile ifeq ($(XEN_COMPILE_ARCH),$(XEN_TARGET_ARCH)) diff -r 10f0e1bb8e5e -r e75cb35c798b tools/blktap/drivers/block-qcow.c --- a/tools/blktap/drivers/block-qcow.c Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/blktap/drivers/block-qcow.c Tue Nov 04 12:43:19 2008 +0900 @@ -722,11 +722,11 @@ static inline void init_fds(struct disk_ /* Open the disk file and initialize qcow state. 
*/ static int tdqcow_open (struct disk_driver *dd, const char *name, td_flag_t flags) { - int fd, len, i, shift, ret, size, l1_table_size, o_flags; + int fd, len, i, shift, ret, size, l1_table_size, o_flags, l1_table_block; int max_aio_reqs; struct td_state *bs = dd->td_state; struct tdqcow_state *s = (struct tdqcow_state *)dd->private; - char *buf; + char *buf, *buf2; QCowHeader *header; QCowHeader_ext *exthdr; uint32_t cksum; @@ -734,8 +734,8 @@ static int tdqcow_open (struct disk_driv DPRINTF("QCOW: Opening %s\n",name); - /* Since we don't handle O_DIRECT correctly, don't use it */ - o_flags = O_LARGEFILE | ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR); + o_flags = O_DIRECT | O_LARGEFILE | + ((flags == TD_RDONLY) ? O_RDONLY : O_RDWR); fd = open(name, o_flags); if (fd < 0) { DPRINTF("Unable to open %s (%d)\n",name,0 - errno); @@ -819,9 +819,14 @@ static int tdqcow_open (struct disk_driv (int) (s->l1_size * sizeof(uint64_t)), l1_table_size); - lseek(fd, s->l1_table_offset, SEEK_SET); - if (read(fd, s->l1_table, l1_table_size) != l1_table_size) + lseek(fd, 0, SEEK_SET); + l1_table_block = l1_table_size + s->l1_table_offset; + l1_table_block = l1_table_block + 512 - (l1_table_block % 512); + ret = posix_memalign((void **)&buf2, 4096, l1_table_block); + if (ret != 0) goto fail; + if (read(fd, buf2, l1_table_block) != l1_table_block) goto fail; + memcpy(s->l1_table, buf2 + s->l1_table_offset, l1_table_size); for(i = 0; i < s->l1_size; i++) { be64_to_cpus(&s->l1_table[i]); @@ -871,8 +876,9 @@ static int tdqcow_open (struct disk_driv DPRINTF("qcow: Converting image to big endian L1 table\n"); - lseek(fd, s->l1_table_offset, SEEK_SET); - if (write(fd, s->l1_table, l1_table_size) != l1_table_size) { + memcpy(buf2 + s->l1_table_offset, s->l1_table, l1_table_size); + lseek(fd, 0, SEEK_SET); + if (write(fd, buf2, l1_table_block) != l1_table_block) { DPRINTF("qcow: Failed to write new L1 table\n"); goto fail; } @@ -917,7 +923,7 @@ static int tdqcow_open (struct disk_driv init_fds(dd); if (!final_cluster) - s->fd_end = s->l1_table_offset + l1_table_size; + s->fd_end = l1_table_block; else { s->fd_end = lseek(fd, 0, SEEK_END); if (s->fd_end == (off_t)-1) diff -r 10f0e1bb8e5e -r e75cb35c798b tools/firmware/hvmloader/acpi/static_tables.c --- a/tools/firmware/hvmloader/acpi/static_tables.c Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/firmware/hvmloader/acpi/static_tables.c Tue Nov 04 12:43:19 2008 +0900 @@ -67,7 +67,7 @@ struct acpi_20_fadt Fadt = { .p_lvl2_lat = 0x0fff, /* >100, means we do not support C2 state */ .p_lvl3_lat = 0x0fff, /* >1000, means we do not support C3 state */ - .iapc_boot_arch = ACPI_LEGACY_DEVICES | ACPI_8042, + .iapc_boot_arch = ACPI_8042, .flags = (ACPI_PROC_C1 | ACPI_SLP_BUTTON | ACPI_WBINVD | ACPI_PWR_BUTTON | ACPI_FIX_RTC | ACPI_TMR_VAL_EXT), diff -r 10f0e1bb8e5e -r e75cb35c798b tools/firmware/rombios/rombios.c --- a/tools/firmware/rombios/rombios.c Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/firmware/rombios/rombios.c Tue Nov 04 12:43:19 2008 +0900 @@ -7216,7 +7216,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n outb(0x03f5, head); outb(0x03f5, sector); outb(0x03f5, 2); // 512 byte sector size - outb(0x03f5, 0); // last sector number possible on track + outb(0x03f5, sector + num_sectors - 1); // last sector to read on track outb(0x03f5, 0); // Gap length outb(0x03f5, 0xff); // Gap length @@ -7364,7 +7364,7 @@ BX_INFO("floppy: drive>1 || head>1 ...\n outb(0x03f5, head); outb(0x03f5, sector); outb(0x03f5, 2); // 512 byte sector size - outb(0x03f5, 0); // last sector number possible on 
track + outb(0x03f5, sector + num_sectors - 1); // last sector to write on track outb(0x03f5, 0); // Gap length outb(0x03f5, 0xff); // Gap length diff -r 10f0e1bb8e5e -r e75cb35c798b tools/flask/policy/policy/modules/xen/xen.te --- a/tools/flask/policy/policy/modules/xen/xen.te Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/flask/policy/policy/modules/xen/xen.te Tue Nov 04 12:43:19 2008 +0900 @@ -74,7 +74,7 @@ allow dom0_t pirq_t:event {vector}; allow dom0_t pirq_t:event {vector}; allow dom0_t xen_t:mmu {memorymap}; -allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust}; +allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust updatemp}; allow dom0_t dom0_t:grant {query setup}; allow dom0_t dom0_t:domain {scheduler getdomaininfo getvcpuinfo getvcpuaffinity}; @@ -112,6 +112,7 @@ allow domU_t evchnU-0_t:event {send}; allow dom0_t dom0_t:event {send}; allow dom0_t domU_t:grant {copy}; +allow domU_t domU_t:grant {copy}; manage_domain(dom0_t, domU_t) diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/util/diagnose.py --- a/tools/python/xen/util/diagnose.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/util/diagnose.py Tue Nov 04 12:43:19 2008 +0900 @@ -23,7 +23,7 @@ from xen.xend.XendClient import server from xen.xend.XendClient import server from xen.xend.XendError import XendError from xen.xend.xenstore.xstransact import xstransact -from xen.xend.server import DevController +from xen.xend.server import DevConstants import xen.xend.XendProtocol @@ -169,7 +169,7 @@ def diagnose_hotplugging(): def stateString(state): - return state and DevController.xenbusState[int(state)] or '<None>' + return state and DevConstants.xenbusState[int(state)] or '<None>' def main(argv = None): diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/XendConfig.py --- a/tools/python/xen/xend/XendConfig.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/xend/XendConfig.py Tue Nov 04 12:43:19 2008 +0900 @@ -1602,21 +1602,21 @@ class XendConfig(dict): # [vscsi, # [dev, # [devid, 0], [p-devname, sdb], [p-dev, 1:0:0:1], - # [v-dev, 0:0:0:0], [state, Initialising] + # [v-dev, 0:0:0:0], [state, 1] # ], # [dev, # [devid, 0], [p-devname, sdc], [p-dev, 1:0:0:2], - # [v-dev, 0:0:0:1], [satet, Initialising] + # [v-dev, 0:0:0:1], [satet, 1] # ] # ], # [vscsi, # [dev, # [devid, 1], [p-devname, sdg], [p-dev, 2:0:0:0], - # [v-dev, 1:0:0:0], [state, Initialising] + # [v-dev, 1:0:0:0], [state, 1] # ], # [dev, # [devid, 1], [p-devname, sdh], [p-dev, 2:0:0:1], - # [v-dev, 1:0:0:1], [satet, Initialising] + # [v-dev, 1:0:0:1], [satet, 1] # ] # ] # ] @@ -1632,18 +1632,19 @@ class XendConfig(dict): # [vscsi, # [dev, # [devid, 0], [p-devname, sdd], [p-dev, 1:0:0:3], - # [v-dev, 0:0:0:2], [state, Initialising] + # [v-dev, 0:0:0:2], [state, 1] # ] # ] # ] # - # state 'Initialising' indicates that the device is being attached, - # while state 'Closing' indicates that the device is being detached. + # state xenbusState['Initialising'] indicates that the device is + # being attached, while state xenbusState['Closing'] indicates + # that the device is being detached. 
# # The Dict looks like this: # # { devs: [ {devid: 0, p-devname: sdd, p-dev: 1:0:0:3, - # v-dev: 0:0:0:2, state: Initialising} ] } + # v-dev: 0:0:0:2, state: 1} ] } dev_config = {} diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/xend/XendDomainInfo.py Tue Nov 04 12:43:19 2008 +0900 @@ -52,6 +52,7 @@ from xen.xend.xenstore.xswatch import xs from xen.xend.xenstore.xswatch import xswatch from xen.xend.XendConstants import * from xen.xend.XendAPIConstants import * +from xen.xend.server.DevConstants import xenbusState from xen.xend.XendVMMetrics import XendVMMetrics @@ -797,7 +798,7 @@ class XendDomainInfo: existing_dev_info = self._getDeviceInfo_vscsi(req_devid, dev['v-dev']) state = dev['state'] - if state == 'Initialising': + if state == xenbusState['Initialising']: # new create # If request devid does not exist, create and exit. if existing_dev_info is None: @@ -806,25 +807,48 @@ class XendDomainInfo: elif existing_dev_info == "exists": raise XendError("The virtual device %s is already defined" % dev['v-dev']) - elif state == 'Closing': + elif state == xenbusState['Closing']: if existing_dev_info is None: raise XendError("Cannot detach vscsi device does not exist") - # use DevController.reconfigureDevice to change device config - dev_control = self.getDeviceController(dev_class) - dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config) - dev_control.waitForDevice_reconfigure(req_devid) - num_devs = dev_control.cleanupDevice(req_devid) - - # update XendConfig with new device info - if dev_uuid: - new_dev_sxp = dev_control.configuration(req_devid) + if self.domid is not None: + # use DevController.reconfigureDevice to change device config + dev_control = self.getDeviceController(dev_class) + dev_uuid = dev_control.reconfigureDevice(req_devid, dev_config) + dev_control.waitForDevice_reconfigure(req_devid) + num_devs = dev_control.cleanupDevice(req_devid) + + # update XendConfig with new device info + if dev_uuid: + new_dev_sxp = dev_control.configuration(req_devid) + self.info.device_update(dev_uuid, new_dev_sxp) + + # If there is no device left, destroy vscsi and remove config. + if num_devs == 0: + self.destroyDevice('vscsi', req_devid) + del self.info['devices'][dev_uuid] + + else: + cur_dev_sxp = self._getDeviceInfo_vscsi(req_devid, None) + new_dev_sxp = ['vscsi'] + for cur_dev in sxp.children(cur_dev_sxp, 'dev'): + if state == xenbusState['Closing']: + cur_dev_vdev = sxp.child_value(cur_dev, 'v-dev') + if cur_dev_vdev == dev['v-dev']: + continue + new_dev_sxp.append(cur_dev) + + if state == xenbusState['Initialising']: + new_dev_sxp.append(sxp.child0(dev_sxp, 'dev')) + + dev_uuid = sxp.child_value(cur_dev_sxp, 'uuid') self.info.device_update(dev_uuid, new_dev_sxp) - # If there is no device left, destroy vscsi and remove config. - if num_devs == 0: - self.destroyDevice('vscsi', req_devid) - del self.info['devices'][dev_uuid] + # If there is only 'vscsi' in new_dev_sxp, remove the config. 
+ if len(sxp.children(new_dev_sxp, 'dev')) == 0: + del self.info['devices'][dev_uuid] + + xen.xend.XendDomain.instance().managed_config_save(self) return True @@ -986,7 +1010,17 @@ class XendDomainInfo: sxprs = [] dev_num = 0 for dev_type, dev_info in self.info.all_devices_sxpr(): - if dev_type == deviceClass: + if dev_type != deviceClass: + continue + + if deviceClass == 'vscsi': + vscsi_devs = ['devs', []] + for vscsi_dev in sxp.children(dev_info, 'dev'): + vscsi_dev.append(['frontstate', None]) + vscsi_devs[1].append(vscsi_dev) + dev_num = int(sxp.child_value(vscsi_dev, 'devid')) + sxprs.append([dev_num, [vscsi_devs]]) + else: sxprs.append([dev_num, dev_info]) dev_num += 1 return sxprs @@ -2380,11 +2414,10 @@ class XendDomainInfo: time.sleep(2) for paths in plist: if paths.find('backend') != -1: - from xen.xend.server import DevController # Modify online status /before/ updating state (latter is watched by # drivers, so this ordering avoids a race). xstransact.Write(paths, 'online', "0") - xstransact.Write(paths, 'state', str(DevController.xenbusState['Closing'])) + xstransact.Write(paths, 'state', str(xenbusState['Closing'])) # force xstransact.Remove(paths) @@ -3439,7 +3472,7 @@ class XendDomainInfo: ['p-devname', pscsi.get_dev_name()], ['p-dev', pscsi.get_physical_HCTL()], ['v-dev', xenapi_dscsi.get('virtual_HCTL')], - ['state', 'Initialising'], + ['state', xenbusState['Initialising']], ['uuid', dscsi_uuid] ] ] @@ -3558,7 +3591,7 @@ class XendDomainInfo: if target_dev is None: raise XendError('Failed to destroy device') - target_dev.append(['state', 'Closing']) + target_dev.append(['state', xenbusState['Closing']]) target_vscsi_sxp = ['vscsi', target_dev] if self._stateGet() != XEN_API_VM_POWER_STATE_RUNNING: diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/DevConstants.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/python/xen/xend/server/DevConstants.py Tue Nov 04 12:43:19 2008 +0900 @@ -0,0 +1,45 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +# Copyright (C) 2005 XenSource Ltd +#============================================================================ + +DEVICE_CREATE_TIMEOUT = 100 +DEVICE_DESTROY_TIMEOUT = 100 +HOTPLUG_STATUS_NODE = "hotplug-status" +HOTPLUG_ERROR_NODE = "hotplug-error" +HOTPLUG_STATUS_ERROR = "error" +HOTPLUG_STATUS_BUSY = "busy" + +Connected = 1 +Error = 2 +Missing = 3 +Timeout = 4 +Busy = 5 +Disconnected = 6 + +xenbusState = { + 'Unknown' : 0, + 'Initialising' : 1, + 'InitWait' : 2, + 'Initialised' : 3, + 'Connected' : 4, + 'Closing' : 5, + 'Closed' : 6, + 'Reconfiguring' : 7, + 'Reconfigured' : 8, + } +xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys()))) + diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/DevController.py --- a/tools/python/xen/xend/server/DevController.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/xend/server/DevController.py Tue Nov 04 12:43:19 2008 +0900 @@ -23,41 +23,14 @@ from xen.xend.XendError import VmError from xen.xend.XendError import VmError from xen.xend.XendLogging import log import xen.xend.XendConfig +from xen.xend.server.DevConstants import * from xen.xend.xenstore.xstransact import xstransact, complete from xen.xend.xenstore.xswatch import xswatch import os -DEVICE_CREATE_TIMEOUT = 100 -DEVICE_DESTROY_TIMEOUT = 100 -HOTPLUG_STATUS_NODE = "hotplug-status" -HOTPLUG_ERROR_NODE = "hotplug-error" -HOTPLUG_STATUS_ERROR = "error" -HOTPLUG_STATUS_BUSY = "busy" - -Connected = 1 -Error = 2 -Missing = 3 -Timeout = 4 -Busy = 5 -Disconnected = 6 - -xenbusState = { - 'Unknown' : 0, - 'Initialising' : 1, - 'InitWait' : 2, - 'Initialised' : 3, - 'Connected' : 4, - 'Closing' : 5, - 'Closed' : 6, - 'Reconfiguring': 7, - 'Reconfigured' : 8, - } - xoptions = XendOptions.instance() - -xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys()))) class DevController: @@ -569,7 +542,7 @@ class DevController: xswatch(statusPath, hotplugStatusCallback, ev, result) ev.wait(DEVICE_CREATE_TIMEOUT) err = xstransact.Read(statusPath, HOTPLUG_ERROR_NODE) - if result['status'] != 'Connected': + if result['status'] != Connected: return (result['status'], err) backpath = self.readVm(devid, "backend") diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/iopif.py --- a/tools/python/xen/xend/server/iopif.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/xend/server/iopif.py Tue Nov 04 12:43:19 2008 +0900 @@ -45,8 +45,21 @@ def parse_ioport(val): class IOPortsController(DevController): + valid_cfg = ['to', 'from', 'uuid'] + def __init__(self, vm): DevController.__init__(self, vm) + + def getDeviceConfiguration(self, devid, transaction = None): + result = DevController.getDeviceConfiguration(self, devid, transaction) + if transaction is None: + devinfo = self.readBackend(devid, *self.valid_cfg) + else: + devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg) + config = dict(zip(self.valid_cfg, devinfo)) + config = dict([(key, val) for key, val in config.items() + if val != None]) + return config def getDeviceDetails(self, config): """@see DevController.getDeviceDetails""" @@ -81,4 +94,9 @@ class IOPortsController(DevController): 'ioports: Failed to configure legacy i/o 
range: %s - %s' % (io_from, io_to)) - return (None, {}, {}) + back = dict([(k, config[k]) for k in self.valid_cfg if k in config]) + return (self.allocateDeviceID(), back, {}) + + def waitForDevice(self, devid): + # don't wait for hotplug + return diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/irqif.py --- a/tools/python/xen/xend/server/irqif.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/xend/server/irqif.py Tue Nov 04 12:43:19 2008 +0900 @@ -39,6 +39,18 @@ class IRQController(DevController): def __init__(self, vm): DevController.__init__(self, vm) + valid_cfg = ['irq', 'uuid'] + + def getDeviceConfiguration(self, devid, transaction = None): + result = DevController.getDeviceConfiguration(self, devid, transaction) + if transaction is None: + devinfo = self.readBackend(devid, *self.valid_cfg) + else: + devinfo = self.readBackendTxn(transaction, devid, *self.valid_cfg) + config = dict(zip(self.valid_cfg, devinfo)) + config = dict([(key, val) for key, val in config.items() + if val != None]) + return config def getDeviceDetails(self, config): """@see DevController.getDeviceDetails""" @@ -75,4 +87,9 @@ class IRQController(DevController): if rc < 0: raise VmError( 'irq: Failed to map irq %x' % (pirq)) - return (None, {}, {}) + back = dict([(k, config[k]) for k in self.valid_cfg if k in config]) + return (self.allocateDeviceID(), back, {}) + + def waitForDevice(self, devid): + # don't wait for hotplug + return diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/pciif.py --- a/tools/python/xen/xend/server/pciif.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/xend/server/pciif.py Tue Nov 04 12:43:19 2008 +0900 @@ -25,7 +25,8 @@ from xen.xend.XendError import VmError from xen.xend.XendError import VmError from xen.xend.XendLogging import log -from xen.xend.server.DevController import DevController, xenbusState +from xen.xend.server.DevController import DevController +from xen.xend.server.DevConstants import xenbusState import xen.lowlevel.xc diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xend/server/vscsiif.py --- a/tools/python/xen/xend/server/vscsiif.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/xend/server/vscsiif.py Tue Nov 04 12:43:19 2008 +0900 @@ -28,7 +28,8 @@ from xen.xend.XendError import VmError from xen.xend.XendError import VmError from xen.xend.XendLogging import log -from xen.xend.server.DevController import DevController, xenbusState +from xen.xend.server.DevController import DevController +from xen.xend.server.DevConstants import xenbusState from xen.xend.xenstore.xstransact import xstransact class VSCSIController(DevController): @@ -92,8 +93,8 @@ class VSCSIController(DevController): back[devpath + '/p-devname'] = pdevname vdev = vscsi_config.get('v-dev', '') back[devpath + '/v-dev'] = vdev - state = vscsi_config.get('state', '') - back[devpath + '/state'] = str(xenbusState[state]) + state = vscsi_config.get('state', xenbusState['Unknown']) + back[devpath + '/state'] = str(state) devid = vscsi_config.get('devid', '') back[devpath + '/devid'] = str(devid) @@ -168,17 +169,17 @@ class VSCSIController(DevController): (devid, back, front) = self.getDeviceDetails(config) devid = int(devid) vscsi_config = config['devs'][0] - state = vscsi_config.get('state', '') + state = vscsi_config.get('state', xenbusState['Unknown']) driver_state = self.readBackend(devid, 'state') if str(xenbusState['Connected']) != driver_state: raise VmError("Driver status is not connected") uuid = self.readBackend(devid, 'uuid') - if state == 
'Initialising': + if state == xenbusState['Initialising']: back['uuid'] = uuid self.writeBackend(devid, back) - elif state == 'Closing': + elif state == xenbusState['Closing']: found = False devs = self.readBackendList(devid, "vscsi-devs") vscsipath = "vscsi-devs/" @@ -198,7 +199,7 @@ class VSCSIController(DevController): else: raise XendError("Error configuring device invalid " - "state '%s'" % state) + "state '%s'" % xenbusState[state]) self.writeBackend(devid, 'state', str(xenbusState['Reconfiguring'])) return self.readBackend(devid, 'uuid') diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/xm/create.py Tue Nov 04 12:43:19 2008 +0900 @@ -32,6 +32,7 @@ from xen.xend import osdep from xen.xend import osdep import xen.xend.XendClient from xen.xend.XendBootloader import bootloader +from xen.xend.server.DevConstants import xenbusState from xen.util import blkif from xen.util import vscsi_util import xen.util.xsm.xsm as security @@ -707,7 +708,7 @@ def configure_vscsis(config_devs, vals): vscsi_util.vscsi_get_hctl_and_devname_by(p_dev, scsi_devices) if p_hctl == None: - raise ValueError("Cannot find device \"%s\"" % p_dev) + raise ValueError('Cannot find device "%s"' % p_dev) for config in config_scsi: dev = vscsi_convert_sxp_to_dict(config) @@ -717,7 +718,7 @@ def configure_vscsis(config_devs, vals): v_hctl = v_dev.split(':') devid = int(v_hctl[0]) config_scsi.append(['dev', \ - ['state', 'Initialising'], \ + ['state', xenbusState['Initialising']], \ ['devid', devid], \ ['p-dev', p_hctl], \ ['p-devname', devname], \ @@ -1035,6 +1036,14 @@ def preprocess_ioports(vals): ioports.append(hexd) vals.ioports = ioports +def preprocess_irq(vals): + if not vals.irq: return + irq = [] + for v in vals.irq: + d = repr(v) + irq.append(d) + vals.irq = irq + def preprocess_vtpm(vals): if not vals.vtpm: return vtpms = [] @@ -1133,6 +1142,7 @@ def preprocess(vals): preprocess_vscsi(vals) preprocess_ioports(vals) preprocess_ip(vals) + preprocess_irq(vals) preprocess_nfs(vals) preprocess_vtpm(vals) preprocess_access_control(vals) diff -r 10f0e1bb8e5e -r e75cb35c798b tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py Tue Nov 04 12:07:22 2008 +0900 +++ b/tools/python/xen/xm/main.py Tue Nov 04 12:43:19 2008 +0900 @@ -47,6 +47,7 @@ from xen.xend import sxp from xen.xend import sxp from xen.xend import XendClient from xen.xend.XendConstants import * +from xen.xend.server.DevConstants import xenbusState from xen.xm.opts import OptionError, Opts, wrap, set_true from xen.xm import console @@ -2515,7 +2516,7 @@ def xm_scsi_attach(args): dom = args[0] p_scsi = args[1] v_hctl = args[2] - scsi = parse_scsi_configuration(p_scsi, v_hctl, 'Initialising') + scsi = parse_scsi_configuration(p_scsi, v_hctl, xenbusState['Initialising']) if serverType == SERVER_XEN_API: @@ -2635,7 +2636,7 @@ def xm_scsi_detach(args): arg_check(args, 'scsi-detach', 2) dom = args[0] v_hctl = args[1] - scsi = parse_scsi_configuration(None, v_hctl, 'Closing') + scsi = parse_scsi_configuration(None, v_hctl, xenbusState['Closing']) if serverType == SERVER_XEN_API: diff -r 10f0e1bb8e5e -r e75cb35c798b tools/xenpmd/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xenpmd/Makefile Tue Nov 04 12:43:19 2008 +0900 @@ -0,0 +1,20 @@ +XEN_ROOT=../.. 
+include $(XEN_ROOT)/tools/Rules.mk + +CFLAGS += -Werror +CFLAGS += $(CFLAGS_libxenstore) +LDFLAGS += $(LDFLAGS_libxenstore) + +BIN = xenpmd + +.PHONY: all +all: $(BIN) + +.PHONY: install +install: all + $(INSTALL_DIR) $(DESTDIR)$(SBINDIR) + $(INSTALL_PROG) $(BIN) $(DESTDIR)$(SBINDIR) + +.PHONY: clean +clean: + $(RM) -f $(BIN) diff -r 10f0e1bb8e5e -r e75cb35c798b tools/xenpmd/xenpmd.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xenpmd/xenpmd.c Tue Nov 04 12:43:19 2008 +0900 @@ -0,0 +1,520 @@ +/* + * xenpmd.c + * + * xen power management daemon - Facilitates power management + * functionality within xen guests. + * + * Copyright (c) 2008 Kamala Narasimhan + * Copyright (c) 2008 Citrix Systems, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Xen extended power management support provides HVM guest power management + * features beyond S3, S4, S5. For example, it helps expose system level + * battery status and battery meter information and in future will be extended + * to include more power management support. This extended power management + * support is enabled by setting xen_extended_power_mgmt to 1 or 2 in the HVM + * config file. When set to 2, non-pass through mode is enabled which heavily + * relies on this power management daemon to glean battery information from + * dom0 and store it xenstore which would then be queries and used by qemu and + * passed to the guest when appropriate battery ports are read/written to. 
+ */ + +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <stdlib.h> +#include <dirent.h> +#include <unistd.h> +#include <sys/stat.h> +#include <xs.h> + +/* #define RUN_STANDALONE */ +#define RUN_IN_SIMULATE_MODE + +enum BATTERY_INFO_TYPE { + BIF, + BST +}; + +enum BATTERY_PRESENT { + NO, + YES +}; + +enum BATTERY_TECHNOLOGY { + NON_RECHARGEABLE, + RECHARGEABLE +}; + +struct battery_info { + enum BATTERY_PRESENT present; + unsigned long design_capacity; + unsigned long last_full_capacity; + enum BATTERY_TECHNOLOGY battery_technology; + unsigned long design_voltage; + unsigned long design_capacity_warning; + unsigned long design_capacity_low; + unsigned long capacity_granularity_1; + unsigned long capacity_granularity_2; + char model_number[32]; + char serial_number[32]; + char battery_type[32]; + char oem_info[32]; +}; + +struct battery_status { + enum BATTERY_PRESENT present; + unsigned long state; + unsigned long present_rate; + unsigned long remaining_capacity; + unsigned long present_voltage; +}; + +static struct xs_handle *xs; + +#ifdef RUN_IN_SIMULATE_MODE + #define BATTERY_DIR_PATH "/tmp/battery" + #define BATTERY_INFO_FILE_PATH "/tmp/battery/%s/info" + #define BATTERY_STATE_FILE_PATH "/tmp/battery/%s/state" +#else + #define BATTERY_DIR_PATH "/proc/acpi/battery" + #define BATTERY_INFO_FILE_PATH "/proc/acpi/battery/%s/info" + #define BATTERY_STATE_FILE_PATH "/proc/acpi/battery/%s/state" +#endif + +FILE *get_next_battery_file(DIR *battery_dir, + enum BATTERY_INFO_TYPE battery_info_type) +{ + FILE *file = 0; + struct dirent *dir_entries; + char file_name[32]; + + do + { + dir_entries = readdir(battery_dir); + if ( !dir_entries ) + return 0; + if ( strlen(dir_entries->d_name) < 4 ) + continue; + if ( battery_info_type == BIF ) + snprintf(file_name, 32, BATTERY_INFO_FILE_PATH, + dir_entries->d_name); + else + snprintf(file_name, 32, BATTERY_STATE_FILE_PATH, + dir_entries->d_name); + file = fopen(file_name, "r"); + } while ( !file ); + + return file; +} + +void set_attribute_battery_info(char *attrib_name, + char *attrib_value, + struct battery_info *info) +{ + if ( strstr(attrib_name, "present") ) + { + if ( strstr(attrib_value, "yes") ) + info->present = YES; + return; + } + + if ( strstr(attrib_name, "design capacity warning") ) + { + info->design_capacity_warning = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "design capacity low") ) + { + info->design_capacity_low = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "design capacity") ) + { + info->design_capacity = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "last full capacity") ) + { + info->last_full_capacity = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "design voltage") ) + { + info->design_voltage = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "capacity granularity 1") ) + { + info->capacity_granularity_1 = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "capacity granularity 2") ) + { + info->capacity_granularity_2 = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "battery technology") ) + { + if ( strncmp(attrib_value, "rechargeable", + strlen("rechargeable")) == 0 ) + info->battery_technology = RECHARGEABLE; + else + info->battery_technology = NON_RECHARGEABLE; + return; + } + + if ( strstr(attrib_name, "model number") ) + { + strncpy(info->model_number, attrib_value, 32); + return; + } + + if 
( strstr(attrib_name, "serial number") ) + { + strncpy(info->serial_number, attrib_value, 32); + return; + } + + if ( strstr(attrib_name, "battery type") ) + { + strncpy(info->battery_type, attrib_value, 32); + return; + } + + if ( strstr(attrib_name, "OEM info") ) + { + strncpy(info->oem_info, attrib_value, 32); + return; + } + + return; +} + +void set_attribute_battery_status(char *attrib_name, + char *attrib_value, + struct battery_status *status) +{ + if ( strstr(attrib_name, "charging state") ) + { + /* Check this, below is half baked */ + if ( strstr(attrib_value, "charged") ) + status->state = 0; + else + status->state = 1; + return; + } + + if ( strstr(attrib_name, "present rate") ) + { + status->present_rate = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "remaining capacity") ) + { + status->remaining_capacity = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "present voltage") ) + { + status->present_voltage = strtoull(attrib_value, NULL, 10); + return; + } + + if ( strstr(attrib_name, "present") ) + { + if ( strstr(attrib_value, "yes") ) + status->present = YES; + return; + } +} + +void parse_battery_info_or_status(char *line_info, + enum BATTERY_INFO_TYPE type, + void *info_or_status) +{ + char attrib_name[128]; + char attrib_value[64]; + char *delimiter; + unsigned long length; + + length = strlen(line_info); + delimiter = (char *) strchr( line_info, ':'); + if ( (!delimiter) || (delimiter == line_info) || + (delimiter == line_info + length) ) + return; + + strncpy(attrib_name, line_info, delimiter-line_info); + while ( *(delimiter+1) == ' ' ) + { + delimiter++; + if ( delimiter+1 == line_info + length) + return; + } + strncpy(attrib_value, delimiter+1, + (unsigned long)line_info + length -(unsigned long)delimiter); + + if ( type == BIF ) + set_attribute_battery_info(attrib_name, attrib_value, + (struct battery_info *)info_or_status); + else + set_attribute_battery_status(attrib_name, attrib_value, + (struct battery_status *)info_or_status); + + return; +} + +int get_next_battery_info_or_status(DIR *battery_dir, + enum BATTERY_INFO_TYPE type, + void *info_or_status) +{ + FILE *file; + char line_info[256]; + + if ( !info_or_status ) + return 0; + + memset(line_info, 0, 256); + if (type == BIF) + memset(info_or_status, 0, sizeof(struct battery_info)); + else + memset(info_or_status, 0, sizeof(struct battery_status)); + + file = get_next_battery_file(battery_dir, type); + if ( !file ) + return 0; + + while ( fgets(line_info, 1024, file) != NULL ) + { + parse_battery_info_or_status(line_info, type, info_or_status); + memset(line_info, 0, 256); + } + + fclose(file); + return 1; +} + +#ifdef RUN_STANDALONE +void print_battery_info(struct battery_info *info) +{ + printf("present: %d\n", info->present); + printf("design capacity: %d\n", info->design_capacity); + printf("last full capacity: %d\n", info->last_full_capacity); + printf("battery technology: %d\n", info->battery_technology); + printf("design voltage: %d\n", info->design_voltage); + printf("design capacity warning:%d\n", info->design_capacity_warning); + printf("design capacity low: %d\n", info->design_capacity_low); + printf("capacity granularity 1: %d\n", info->capacity_granularity_1); + printf("capacity granularity 2: %d\n", info->capacity_granularity_2); + printf("model number: %s\n", info->model_number); + printf("serial number: %s\n", info->serial_number); + printf("battery type: %s\n", info->battery_type); + printf("OEM info: %s\n", info->oem_info); +} 
+#endif /*RUN_STANDALONE*/ + +void write_ulong_lsb_first(char *temp_val, unsigned long val) +{ + snprintf(temp_val, 9, "%02x%02x%02x%02x", (unsigned int)val & 0xff, + (unsigned int)(val & 0xff00) >> 8, (unsigned int)(val & 0xff0000) >> 16, + (unsigned int)(val & 0xff000000) >> 24); +} + +void write_battery_info_to_xenstore(struct battery_info *info) +{ + char val[1024], string_info[256]; + + xs_mkdir(xs, XBT_NULL, "/pm"); + + memset(val, 0, 1024); + memset(string_info, 0, 256); + /* write 9 dwords (so 9*4) + length of 4 strings + 4 null terminators */ + snprintf(val, 3, "%02x", + (unsigned int)(9*4 + + strlen(info->model_number) + + strlen(info->serial_number) + + strlen(info->battery_type) + + strlen(info->oem_info) + 4)); + write_ulong_lsb_first(val+2, info->present); + write_ulong_lsb_first(val+10, info->design_capacity); + write_ulong_lsb_first(val+18, info->last_full_capacity); + write_ulong_lsb_first(val+26, info->battery_technology); + write_ulong_lsb_first(val+34, info->design_voltage); + write_ulong_lsb_first(val+42, info->design_capacity_warning); + write_ulong_lsb_first(val+50, info->design_capacity_low); + write_ulong_lsb_first(val+58, info->capacity_granularity_1); + write_ulong_lsb_first(val+66, info->capacity_granularity_2); + + snprintf(string_info, 256, "%02x%s%02x%s%02x%s%02x%s", + (unsigned int)strlen(info->model_number), info->model_number, + (unsigned int)strlen(info->serial_number), info->serial_number, + (unsigned int)strlen(info->battery_type), info->battery_type, + (unsigned int)strlen(info->oem_info), info->oem_info); + strncat(val+73, string_info, 1024); + xs_write(xs, XBT_NULL, "/pm/bif", + val, 73+8+strlen(info->model_number)+strlen(info->serial_number)+ + strlen(info->battery_type)+strlen(info->oem_info)+1); +} + +int write_one_time_battery_info(void) +{ + DIR *dir; + int ret = 0; + struct battery_info info; + + dir = opendir(BATTERY_DIR_PATH); + if ( !dir ) + return 0; + + while ( get_next_battery_info_or_status(dir, BIF, (void *)&info) ) + { +#ifdef RUN_STANDALONE + print_battery_info(&info); +#endif + if ( info.present == YES ) + { + write_battery_info_to_xenstore(&info); + ret = 1; + break; /* rethink this... */ + } + } + + closedir(dir); + return ret; +} + +#ifdef RUN_STANDALONE +void print_battery_status(struct battery_status *status) +{ + printf("present: %d\n", status->present); + printf("Battery state %d\n", status->state); + printf("Battery present rate %d\n", status->present_rate); + printf("Battery remining capacity %d\n", status->remaining_capacity); + printf("Battery present voltage %d\n", status->present_voltage); +} +#endif /*RUN_STANDALONE*/ + +void write_battery_status_to_xenstore(struct battery_status *status) +{ + char val[35]; + + xs_mkdir(xs, XBT_NULL, "/pm"); + + memset(val, 0, 35); + snprintf(val, 3, "%02x", 16); + write_ulong_lsb_first(val+2, status->state); + write_ulong_lsb_first(val+10, status->present_rate); + write_ulong_lsb_first(val+18, status->remaining_capacity); + write_ulong_lsb_first(val+26, status->present_voltage); + + xs_write(xs, XBT_NULL, "/pm/bst", val, 35); +} + +int wait_for_and_update_battery_status_request(void) +{ + DIR *dir; + int ret = 0; + unsigned int count; + struct battery_status status; + + while ( true ) + { + /* KN:@TODO - It is rather inefficient to not cache the file handle. + * Switch to caching file handle. 
+ */ + dir = opendir(BATTERY_DIR_PATH); + if ( !dir ) + return 0; + + while ( get_next_battery_info_or_status(dir, BST, (void *)&status) ) + { +#ifdef RUN_STANDALONE + print_battery_status(&status); +#endif + if ( status.present == YES ) + { + write_battery_status_to_xenstore(&status); + ret = 1; + /* rethink this; though I have never seen, there might be + * systems out there with more than one battery device + * present + */ + break; + } + } + closedir(dir); + xs_watch(xs, "/pm/events", "refreshbatterystatus"); + xs_read_watch(xs, &count); + } + + return ret; +} + +/* Borrowed daemonize from xenstored - Initially written by Stevens. */ +static void daemonize(void) +{ + pid_t pid; + + if ( (pid = fork()) < 0 ) + exit(1); + + if ( pid != 0 ) + exit(0); + + setsid(); + + if ( (pid = fork()) < 0 ) + exit(1); + + if ( pid != 0 ) + exit(0); + + if ( chdir("/") == -1 ) + exit(1); + + umask(0); +} + +int main(int argc, char *argv[]) +{ +#ifndef RUN_STANDALONE + daemonize(); +#endif + xs = (struct xs_handle *)xs_daemon_open(); + if ( xs == NULL ) + return -1; + + if ( write_one_time_battery_info() == 0 ) + { + xs_daemon_close(xs); + return -1; + } + + wait_for_and_update_battery_status_request(); + xs_daemon_close(xs); + return 0; +} + diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/ia64/xen/cpufreq/cpufreq.c --- a/xen/arch/ia64/xen/cpufreq/cpufreq.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/ia64/xen/cpufreq/cpufreq.c Tue Nov 04 12:43:19 2008 +0900 @@ -210,21 +210,6 @@ acpi_cpufreq_cpu_init (struct cpufreq_po data->acpi_data = &processor_pminfo[cpu]->perf; - /* capability check */ - if (data->acpi_data->state_count <= 1) { - printk(KERN_WARNING "P-States\n"); - result = -ENODEV; - goto err_unreg; - } - - if ((data->acpi_data->control_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) || - (data->acpi_data->status_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE)) { - result = -ENODEV; - goto err_unreg; - } - data->freq_table = xmalloc_array(struct cpufreq_frequency_table, (data->acpi_data->state_count + 1)); if (!data->freq_table) { diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/ia64/xen/irq.c --- a/xen/arch/ia64/xen/irq.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/ia64/xen/irq.c Tue Nov 04 12:43:19 2008 +0900 @@ -74,7 +74,7 @@ unsigned int __ia64_local_vector_to_irq /* * Controller mappings for all interrupt sources: */ -irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { +irq_desc_t irq_desc[NR_IRQS] = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, .handler = &no_irq_type, diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpu_idle.c --- a/xen/arch/x86/acpi/cpu_idle.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/acpi/cpu_idle.c Tue Nov 04 12:43:19 2008 +0900 @@ -75,13 +75,14 @@ static void print_acpi_power(uint32_t cp printk("==cpu%d==\n", cpu); printk("active state:\t\tC%d\n", - power->last_state ? (int)(power->last_state - power->states) : -1); + power->last_state ? power->last_state->idx : -1); printk("max_cstate:\t\tC%d\n", max_cstate); printk("states:\n"); for ( i = 1; i < power->count; i++ ) { - printk((power->last_state == &power->states[i]) ? " *" : " "); + printk((power->last_state && power->last_state->idx == i) ? 
+ " *" : " "); printk("C%d:\t", i); printk("type[C%d] ", power->states[i].type); printk("latency[%03d] ", power->states[i].latency); @@ -139,20 +140,26 @@ static void acpi_processor_ffh_cstate_en static void acpi_idle_do_entry(struct acpi_processor_cx *cx) { - if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE ) - { + int unused; + + switch ( cx->entry_method ) + { + case ACPI_CSTATE_EM_FFH: /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); - } - else - { - int unused; + return; + case ACPI_CSTATE_EM_SYSIO: /* IO port based C-state */ inb(cx->address); /* Dummy wait op - must do something useless after P_LVL2 read because chipsets cannot guarantee that STPCLK# signal gets asserted in time to freeze execution properly. */ unused = inl(pmtmr_ioport); + return; + case ACPI_CSTATE_EM_HALT: + acpi_safe_halt(); + local_irq_disable(); + return; } } @@ -222,7 +229,7 @@ static void acpi_processor_idle(void) if ( power->flags.bm_check && acpi_idle_bm_check() && cx->type == ACPI_STATE_C3 ) cx = power->safe_state; - if ( cx - &power->states[0] > max_cstate ) + if ( cx->idx > max_cstate ) cx = &power->states[max_cstate]; } if ( !cx ) @@ -252,35 +259,11 @@ static void acpi_processor_idle(void) switch ( cx->type ) { case ACPI_STATE_C1: - /* Trace cpu idle entry */ - TRACE_1D(TRC_PM_IDLE_ENTRY, 1); - - /* - * Invoke C1. - * Use the appropriate idle routine, the one that would - * be used without acpi C-states. - */ - if ( pm_idle_save ) - pm_idle_save(); - else - acpi_safe_halt(); - - /* Trace cpu idle exit */ - TRACE_1D(TRC_PM_IDLE_EXIT, 1); - - /* - * TBD: Can't get time duration while in C1, as resumes - * go to an ISR rather than here. Need to instrument - * base interrupt handler. - */ - sleep_ticks = 0xFFFFFFFF; - break; - case ACPI_STATE_C2: - if ( local_apic_timer_c2_ok ) + if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok ) { /* Trace cpu idle entry */ - TRACE_1D(TRC_PM_IDLE_ENTRY, 2); + TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx); /* Get start time (ticks) */ t1 = inl(pmtmr_ioport); /* Invoke C2 */ @@ -288,7 +271,7 @@ static void acpi_processor_idle(void) /* Get end time (ticks) */ t2 = inl(pmtmr_ioport); /* Trace cpu idle exit */ - TRACE_1D(TRC_PM_IDLE_EXIT, 2); + TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx); /* Re-enable interrupts */ local_irq_enable(); @@ -328,7 +311,7 @@ static void acpi_processor_idle(void) } /* Trace cpu idle entry */ - TRACE_1D(TRC_PM_IDLE_ENTRY, cx - &power->states[0]); + TRACE_1D(TRC_PM_IDLE_ENTRY, cx->idx); /* * Before invoking C3, be aware that TSC/APIC timer may be * stopped by H/W. 
Without carefully handling of TSC/APIC stop issues, @@ -349,7 +332,7 @@ static void acpi_processor_idle(void) /* recovering TSC */ cstate_restore_tsc(); /* Trace cpu idle exit */ - TRACE_1D(TRC_PM_IDLE_EXIT, cx - &power->states[0]); + TRACE_1D(TRC_PM_IDLE_EXIT, cx->idx); if ( power->flags.bm_check && power->flags.bm_control ) { @@ -387,9 +370,15 @@ static void acpi_processor_idle(void) static int init_cx_pminfo(struct acpi_processor_power *acpi_power) { + int i; + memset(acpi_power, 0, sizeof(*acpi_power)); + for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ ) + acpi_power->states[i].idx = i; + acpi_power->states[ACPI_STATE_C1].type = ACPI_STATE_C1; + acpi_power->states[ACPI_STATE_C1].entry_method = ACPI_CSTATE_EM_HALT; acpi_power->states[ACPI_STATE_C0].valid = 1; acpi_power->states[ACPI_STATE_C1].valid = 1; @@ -486,16 +475,13 @@ static int check_cx(struct acpi_processo break; case ACPI_ADR_SPACE_FIXED_HARDWARE: - if ( cx->type > ACPI_STATE_C1 ) - { - if ( cx->reg.bit_width != VENDOR_INTEL || - cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT ) - return -EINVAL; - - /* assume all logical cpu has the same support for mwait */ - if ( acpi_processor_ffh_cstate_probe(cx) ) - return -EINVAL; - } + if ( cx->reg.bit_width != VENDOR_INTEL || + cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT ) + return -EINVAL; + + /* assume all logical cpu has the same support for mwait */ + if ( acpi_processor_ffh_cstate_probe(cx) ) + return -EINVAL; break; default: @@ -599,7 +585,23 @@ static void set_cx( cx->valid = 1; cx->type = xen_cx->type; cx->address = xen_cx->reg.address; - cx->space_id = xen_cx->reg.space_id; + + switch ( xen_cx->reg.space_id ) + { + case ACPI_ADR_SPACE_FIXED_HARDWARE: + if ( xen_cx->reg.bit_width == VENDOR_INTEL && + xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT ) + cx->entry_method = ACPI_CSTATE_EM_FFH; + else + cx->entry_method = ACPI_CSTATE_EM_HALT; + break; + case ACPI_ADR_SPACE_SYSTEM_IO: + cx->entry_method = ACPI_CSTATE_EM_SYSIO; + break; + default: + cx->entry_method = ACPI_CSTATE_EM_NONE; + } + cx->latency = xen_cx->latency; cx->power = xen_cx->power; @@ -761,8 +763,7 @@ int pmstat_get_cx_stat(uint32_t cpuid, s return 0; } - stat->last = (power->last_state) ? - (int)(power->last_state - &power->states[0]) : 0; + stat->last = power->last_state ? 
power->last_state->idx : 0; stat->nr = power->count; stat->idle_time = v->runstate.time[RUNSTATE_running]; if ( v->is_running ) diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpufreq/cpufreq.c --- a/xen/arch/x86/acpi/cpufreq/cpufreq.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c Tue Nov 04 12:43:19 2008 +0900 @@ -370,7 +370,7 @@ static int acpi_cpufreq_target(struct cp if (!check_freqs(cmd.mask, freqs.new, data)) return -EAGAIN; - for_each_cpu_mask(j, cmd.mask) + for_each_cpu_mask(j, online_policy_cpus) cpufreq_statistic_update(j, perf->state, next_perf_state); perf->state = next_perf_state; @@ -447,18 +447,6 @@ acpi_cpufreq_cpu_init(struct cpufreq_pol perf = data->acpi_data; policy->shared_type = perf->shared_type; - /* capability check */ - if (perf->state_count <= 1) { - printk("No P-States\n"); - result = -ENODEV; - goto err_unreg; - } - - if (perf->control_register.space_id != perf->status_register.space_id) { - result = -ENODEV; - goto err_unreg; - } - switch (perf->control_register.space_id) { case ACPI_ADR_SPACE_SYSTEM_IO: printk("xen_pminfo: @acpi_cpufreq_cpu_init," diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpufreq/powernow.c --- a/xen/arch/x86/acpi/cpufreq/powernow.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/acpi/cpufreq/powernow.c Tue Nov 04 12:43:19 2008 +0900 @@ -229,9 +229,23 @@ err_unreg: return result; } +static int powernow_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + struct powernow_cpufreq_data *data = drv_data[policy->cpu]; + + if (data) { + drv_data[policy->cpu] = NULL; + xfree(data->freq_table); + xfree(data); + } + + return 0; +} + static struct cpufreq_driver powernow_cpufreq_driver = { .target = powernow_cpufreq_target, .init = powernow_cpufreq_cpu_init, + .exit = powernow_cpufreq_cpu_exit }; int powernow_cpufreq_init(void) diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/acpi/cpuidle_menu.c --- a/xen/arch/x86/acpi/cpuidle_menu.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/acpi/cpuidle_menu.c Tue Nov 04 12:43:19 2008 +0900 @@ -59,7 +59,7 @@ static int menu_select(struct acpi_proce data->expected_us = (u32) get_sleep_length_ns() / 1000; /* find the deepest idle state that satisfies our constraints */ - for ( i = 1; i < power->count; i++ ) + for ( i = 2; i < power->count; i++ ) { struct acpi_processor_cx *s = &power->states[i]; @@ -81,17 +81,7 @@ static void menu_reflect(struct acpi_pro unsigned int last_residency; unsigned int measured_us; - /* - * Ugh, this idle state doesn't support residency measurements, so we - * are basically lost in the dark. As a compromise, assume we slept - * for one full standard timer tick. However, be aware that this - * could potentially result in a suboptimal state transition. 
- */ - if ( target->type == ACPI_STATE_C1 ) - last_residency = USEC_PER_SEC / HZ; - else - last_residency = power->last_residency; - + last_residency = power->last_residency; measured_us = last_residency + data->elapsed_us; /* if wrapping, set to max uint (-1) */ diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/domain.c Tue Nov 04 12:43:19 2008 +0900 @@ -174,9 +174,10 @@ void free_vcpu_struct(struct vcpu *v) static int setup_compat_l4(struct vcpu *v) { - struct page_info *pg = alloc_domheap_page(NULL, 0); + struct page_info *pg; l4_pgentry_t *l4tab; + pg = alloc_domheap_page(NULL, MEMF_node(vcpu_to_node(v))); if ( pg == NULL ) return -ENOMEM; @@ -1639,31 +1640,22 @@ static int relinquish_memory( } if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) ) - put_page_and_type(page); + ret = put_page_and_type_preemptible(page, 1); + switch ( ret ) + { + case 0: + break; + case -EAGAIN: + case -EINTR: + set_bit(_PGT_pinned, &page->u.inuse.type_info); + put_page(page); + goto out; + default: + BUG(); + } if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); - -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION - /* - * Forcibly drop reference counts of page tables above top most (which - * were skipped to prevent long latencies due to deep recursion - see - * the special treatment in free_lX_table()). - */ - y = page->u.inuse.type_info; - if ( (type < PGT_root_page_table) && - unlikely(((y + PGT_type_mask) & - (PGT_type_mask|PGT_validated)) == type) ) - { - BUG_ON((y & PGT_count_mask) >= - (page->count_info & PGC_count_mask)); - while ( y & PGT_count_mask ) - { - put_page_and_type(page); - y = page->u.inuse.type_info; - } - } -#endif /* * Forcibly invalidate top-most, still valid page tables at this point @@ -1685,8 +1677,31 @@ static int relinquish_memory( x & ~(PGT_validated|PGT_partial)); if ( likely(y == x) ) { - if ( free_page_type(page, x, 0) != 0 ) + /* No need for atomic update of type_info here: noone else updates it. */ + switch ( ret = free_page_type(page, x, 1) ) + { + case 0: + break; + case -EINTR: + page->u.inuse.type_info |= PGT_validated; + if ( x & PGT_partial ) + put_page(page); + put_page(page); + ret = -EAGAIN; + goto out; + case -EAGAIN: + page->u.inuse.type_info |= PGT_partial; + if ( x & PGT_partial ) + put_page(page); + goto out; + default: BUG(); + } + if ( x & PGT_partial ) + { + page->u.inuse.type_info--; + put_page(page); + } break; } } @@ -1831,11 +1846,6 @@ int domain_relinquish_resources(struct d /* fallthrough */ case RELMEM_done: -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION - ret = relinquish_memory(d, &d->page_list, PGT_l1_page_table); - if ( ret ) - return ret; -#endif break; default: @@ -1891,6 +1901,54 @@ void domain_cpuid( *eax = *ebx = *ecx = *edx = 0; } + +void vcpu_kick(struct vcpu *v) +{ + /* + * NB1. 'pause_flags' and 'processor' must be checked /after/ update of + * pending flag. These values may fluctuate (after all, we hold no + * locks) but the key insight is that each change will cause + * evtchn_upcall_pending to be polled. + * + * NB2. We save the running flag across the unblock to avoid a needless + * IPI for domains that we IPI'd to unblock. 
+ */ + bool_t running = v->is_running; + vcpu_unblock(v); + if ( running && (in_irq() || (v != current)) ) + cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ); +} + +void vcpu_mark_events_pending(struct vcpu *v) +{ + int already_pending = test_and_set_bit( + 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending)); + + if ( already_pending ) + return; + + if ( is_hvm_vcpu(v) ) + hvm_assert_evtchn_irq(v); + else + vcpu_kick(v); +} + +static void vcpu_kick_softirq(void) +{ + /* + * Nothing to do here: we merely prevent notifiers from racing with checks + * executed on return to guest context with interrupts enabled. See, for + * example, xxx_intr_assist() executed on return to HVM guest context. + */ +} + +static int __init init_vcpu_kick_softirq(void) +{ + open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq); + return 0; +} +__initcall(init_vcpu_kick_softirq); + /* * Local variables: diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/domain_build.c Tue Nov 04 12:43:19 2008 +0900 @@ -194,6 +194,30 @@ static void __init process_dom0_ioports_ } } +/* We run on dom0's page tables for the final part of the build process. */ +static void dom0_pt_enter(struct vcpu *v) +{ + struct desc_ptr gdt_desc = { + .limit = LAST_RESERVED_GDT_BYTE, + .base = (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY) + }; + + asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); + write_ptbase(v); +} + +/* Return to idle domain's page tables. */ +static void dom0_pt_exit(void) +{ + struct desc_ptr gdt_desc = { + .limit = LAST_RESERVED_GDT_BYTE, + .base = GDT_VIRT_START(current) + }; + + write_ptbase(current); + asm volatile ( "lgdt %0" : : "m" (gdt_desc) ); +} + int __init construct_dom0( struct domain *d, unsigned long _image_start, unsigned long image_len, @@ -700,14 +724,12 @@ int __init construct_dom0( (void)alloc_vcpu(d, i, i % num_online_cpus()); /* Set up CR3 value for write_ptbase */ - if ( paging_mode_enabled(v->domain) ) + if ( paging_mode_enabled(d) ) paging_update_paging_modes(v); else update_cr3(v); - /* Install the new page tables. */ - local_irq_disable(); - write_ptbase(v); + dom0_pt_enter(v); /* Copy the OS image and free temporary buffer. */ elf.dest = (void*)vkern_start; @@ -804,9 +826,7 @@ int __init construct_dom0( xlat_start_info(si, XLAT_start_info_console_dom0); #endif - /* Reinstate the caller's page tables. */ - write_ptbase(current); - local_irq_enable(); + dom0_pt_exit(); #if defined(__i386__) /* Destroy low mappings - they were only for our convenience. 
*/ diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hpet.c --- a/xen/arch/x86/hpet.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hpet.c Tue Nov 04 12:43:19 2008 +0900 @@ -14,8 +14,6 @@ #include <asm/div64.h> #include <asm/hpet.h> -#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1)) - #define MAX_DELTA_NS MILLISECS(10*1000) #define MIN_DELTA_NS MICROSECS(20) @@ -146,7 +144,7 @@ static void handle_hpet_broadcast(struct s_time_t now, next_event; int cpu; - spin_lock(&ch->lock); + spin_lock_irq(&ch->lock); again: ch->next_event = STIME_MAX; @@ -171,7 +169,7 @@ again: if ( reprogram_hpet_evt_channel(ch, next_event, now, 0) ) goto again; } - spin_unlock(&ch->lock); + spin_unlock_irq(&ch->lock); } void hpet_broadcast_init(void) @@ -213,6 +211,7 @@ void hpet_broadcast_enter(void) { struct hpet_event_channel *ch = &hpet_event; + ASSERT(!local_irq_is_enabled()); spin_lock(&ch->lock); disable_APIC_timer(); diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/emulate.c --- a/xen/arch/x86/hvm/emulate.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/emulate.c Tue Nov 04 12:43:19 2008 +0900 @@ -14,10 +14,38 @@ #include <xen/lib.h> #include <xen/sched.h> #include <xen/paging.h> +#include <xen/trace.h> #include <asm/event.h> #include <asm/hvm/emulate.h> #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> + +#define HVMTRACE_IO_ASSIST_WRITE 0x200 +static void hvmtrace_io_assist(int is_mmio, ioreq_t *p) +{ + unsigned int size, event; + unsigned char buffer[12]; + + if ( likely(!tb_init_done) ) + return; + + event = is_mmio ? TRC_HVM_MMIO_ASSIST : TRC_HVM_IO_ASSIST; + if ( !p->dir ) + event |= HVMTRACE_IO_ASSIST_WRITE; + + *(uint64_t *)buffer = p->addr; + size = (p->addr != (u32)p->addr) ? 8 : 4; + if ( size == 8 ) + event |= TRC_64_FLAG; + + if ( !p->data_is_ptr ) + { + *(uint32_t *)&buffer[size] = p->data; + size += 4; + } + + trace_var(event, 0/*!cycles*/, size, buffer); +} static int hvmemul_do_io( int is_mmio, paddr_t addr, unsigned long *reps, int size, @@ -110,6 +138,8 @@ static int hvmemul_do_io( p->df = df; p->data = value; p->io_count++; + + hvmtrace_io_assist(is_mmio, p); if ( is_mmio ) { diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/hpet.c --- a/xen/arch/x86/hvm/hpet.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/hpet.c Tue Nov 04 12:43:19 2008 +0900 @@ -76,6 +76,7 @@ ~0ULL : (tick) * (h)->hpet_to_ns_scale) >> 10)) #define timer_config(h, n) (h->hpet.timers[n].config) +#define timer_enabled(h, n) (timer_config(h, n) & HPET_TN_ENABLE) #define timer_is_periodic(h, n) (timer_config(h, n) & HPET_TN_PERIODIC) #define timer_is_32bit(h, n) (timer_config(h, n) & HPET_TN_32BIT) #define hpet_enabled(h) (h->hpet.config & HPET_CFG_ENABLE) @@ -88,9 +89,40 @@ ((timer_config(h, n) & HPET_TN_INT_ROUTE_CAP_MASK) \ >> HPET_TN_INT_ROUTE_CAP_SHIFT) -#define hpet_time_after(a, b) ((int32_t)(b) - (int32_t)(a) < 0) -#define hpet_time_after64(a, b) ((int64_t)(b) - (int64_t)(a) < 0) - +static inline uint64_t hpet_read_maincounter(HPETState *h) +{ + ASSERT(spin_is_locked(&h->lock)); + + if ( hpet_enabled(h) ) + return guest_time_hpet(h->vcpu) + h->mc_offset; + else + return h->hpet.mc64; +} + +static uint64_t hpet_get_comparator(HPETState *h, unsigned int tn) +{ + uint64_t comparator; + uint64_t elapsed; + + comparator = h->hpet.comparator64[tn]; + if ( timer_is_periodic(h, tn) ) + { + /* update comparator by number of periods elapsed since last update */ + uint64_t period = h->hpet.period[tn]; + if (period) + { + elapsed = hpet_read_maincounter(h) + period - 1 - comparator; + comparator += 
(elapsed / period) * period; + h->hpet.comparator64[tn] = comparator; + } + } + + /* truncate if timer is in 32 bit mode */ + if ( timer_is_32bit(h, tn) ) + comparator = (uint32_t)comparator; + h->hpet.timers[tn].cmp = comparator; + return comparator; +} static inline uint64_t hpet_read64(HPETState *h, unsigned long addr) { addr &= ~7; @@ -104,7 +136,7 @@ static inline uint64_t hpet_read64(HPETS case HPET_STATUS: return h->hpet.isr; case HPET_COUNTER: - return h->hpet.mc64; + return hpet_read_maincounter(h); case HPET_T0_CFG: case HPET_T1_CFG: case HPET_T2_CFG: @@ -112,7 +144,7 @@ static inline uint64_t hpet_read64(HPETS case HPET_T0_CMP: case HPET_T1_CMP: case HPET_T2_CMP: - return h->hpet.timers[(addr - HPET_T0_CMP) >> 5].cmp; + return hpet_get_comparator(h, (addr - HPET_T0_CMP) >> 5); case HPET_T0_ROUTE: case HPET_T1_ROUTE: case HPET_T2_ROUTE: @@ -140,16 +172,6 @@ static inline int hpet_check_access_leng return 0; } -static inline uint64_t hpet_read_maincounter(HPETState *h) -{ - ASSERT(spin_is_locked(&h->lock)); - - if ( hpet_enabled(h) ) - return guest_time_hpet(h->vcpu) + h->mc_offset; - else - return h->hpet.mc64; -} - static int hpet_read( struct vcpu *v, unsigned long addr, unsigned long length, unsigned long *pval) @@ -169,8 +191,6 @@ static int hpet_read( spin_lock(&h->lock); val = hpet_read64(h, addr); - if ( (addr & ~7) == HPET_COUNTER ) - val = hpet_read_maincounter(h); result = val; if ( length != 8 ) @@ -187,7 +207,10 @@ static void hpet_stop_timer(HPETState *h { ASSERT(tn < HPET_TIMER_NUM); ASSERT(spin_is_locked(&h->lock)); - stop_timer(&h->timers[tn]); + destroy_periodic_time(&h->pt[tn]); + /* read the comparator to get it updated so a read while stopped will + * return the expected value. */ + hpet_get_comparator(h, tn); } /* the number of HPET tick that stands for @@ -197,6 +220,8 @@ static void hpet_set_timer(HPETState *h, static void hpet_set_timer(HPETState *h, unsigned int tn) { uint64_t tn_cmp, cur_tick, diff; + unsigned int irq; + unsigned int oneshot; ASSERT(tn < HPET_TIMER_NUM); ASSERT(spin_is_locked(&h->lock)); @@ -209,7 +234,10 @@ static void hpet_set_timer(HPETState *h, pit_stop_channel0_irq(pit); } - tn_cmp = h->hpet.timers[tn].cmp; + if ( !timer_enabled(h, tn) ) + return; + + tn_cmp = hpet_get_comparator(h, tn); cur_tick = hpet_read_maincounter(h); if ( timer_is_32bit(h, tn) ) { @@ -229,7 +257,25 @@ static void hpet_set_timer(HPETState *h, diff = (timer_is_32bit(h, tn) && (-diff > HPET_TINY_TIME_SPAN)) ? (uint32_t)diff : 0; - set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, diff)); + if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) ) + /* if LegacyReplacementRoute bit is set, HPET specification requires + timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC, + timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */ + irq = (tn == 0) ? 0 : 8; + else + irq = timer_int_route(h, tn); + + /* + * diff is the time from now when the timer should fire, for a periodic + * timer we also need the period which may be different because time may + * have elapsed between the time the comparator was written and the timer + * being enabled (now). + */ + oneshot = !timer_is_periodic(h, tn); + create_periodic_time(h->vcpu, &h->pt[tn], + hpet_tick_to_ns(h, diff), + oneshot ? 0 : hpet_tick_to_ns(h, h->hpet.period[tn]), + irq, NULL, NULL); } static inline uint64_t hpet_fixup_reg( @@ -248,6 +294,13 @@ static int hpet_write( uint64_t old_val, new_val; int tn, i; + /* Acculumate a bit mask of timers whos state is changed by this write. 
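The periodic catch-up at the start of this hunk advances the comparator by a whole number of periods, rounding the elapsed main-counter distance up so the result lands at or just past the current counter value. A worked example with small assumed values (not taken from the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t period = 100, comparator = 250, mc = 1000;
        uint64_t elapsed = mc + period - 1 - comparator;      /* 849 */
        comparator += (elapsed / period) * period;            /* 250 + 800 */
        assert(comparator >= mc && comparator - mc < period); /* 1050 */
        return 0;
    }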
*/ + unsigned long start_timers = 0; + unsigned long stop_timers = 0; +#define set_stop_timer(n) (__set_bit((n), &stop_timers)) +#define set_start_timer(n) (__set_bit((n), &start_timers)) +#define set_restart_timer(n) (set_stop_timer(n),set_start_timer(n)) + addr &= HPET_MMAP_SIZE-1; if ( hpet_check_access_length(addr, length) != 0 ) @@ -256,9 +309,6 @@ static int hpet_write( spin_lock(&h->lock); old_val = hpet_read64(h, addr); - if ( (addr & ~7) == HPET_COUNTER ) - old_val = hpet_read_maincounter(h); - new_val = val; if ( length != 8 ) new_val = hpet_fixup_reg( @@ -275,22 +325,35 @@ static int hpet_write( /* Enable main counter and interrupt generation. */ h->mc_offset = h->hpet.mc64 - guest_time_hpet(h->vcpu); for ( i = 0; i < HPET_TIMER_NUM; i++ ) - hpet_set_timer(h, i); + { + h->hpet.comparator64[i] = + h->hpet.timers[i].config & HPET_TN_32BIT ? + (uint32_t)h->hpet.timers[i].cmp : + h->hpet.timers[i].cmp; + if ( timer_enabled(h, i) ) + set_start_timer(i); + } } else if ( (old_val & HPET_CFG_ENABLE) && !(new_val & HPET_CFG_ENABLE) ) { /* Halt main counter and disable interrupt generation. */ h->hpet.mc64 = h->mc_offset + guest_time_hpet(h->vcpu); for ( i = 0; i < HPET_TIMER_NUM; i++ ) - hpet_stop_timer(h, i); + if ( timer_enabled(h, i) ) + set_stop_timer(i); } break; case HPET_COUNTER: + h->hpet.mc64 = new_val; if ( hpet_enabled(h) ) + { gdprintk(XENLOG_WARNING, "HPET: writing main counter but it's not halted!\n"); - h->hpet.mc64 = new_val; + for ( i = 0; i < HPET_TIMER_NUM; i++ ) + if ( timer_enabled(h, i) ) + set_restart_timer(i); + } break; case HPET_T0_CFG: @@ -313,7 +376,28 @@ static int hpet_write( h->hpet.timers[tn].cmp = (uint32_t)h->hpet.timers[tn].cmp; h->hpet.period[tn] = (uint32_t)h->hpet.period[tn]; } - + if ( hpet_enabled(h) ) + { + if ( new_val & HPET_TN_ENABLE ) + { + if ( (new_val ^ old_val) & HPET_TN_PERIODIC ) + /* timer is enabled but switching mode to/from periodic/ + * one-shot, stop and restart the vpt timer to get it in + * the right mode. */ + set_restart_timer(tn); + else if ( (new_val & HPET_TN_32BIT) && + !(old_val & HPET_TN_32BIT) ) + /* switching from 64 bit to 32 bit mode could cause timer + * next fire time, or period, to change. */ + set_restart_timer(tn); + else if ( !(old_val & HPET_TN_ENABLE) ) + /* transition from timer disabled to timer enabled. */ + set_start_timer(tn); + } + else if ( old_val & HPET_TN_ENABLE ) + /* transition from timer enabled to timer disabled. */ + set_stop_timer(tn); + } break; case HPET_T0_CMP: @@ -322,24 +406,32 @@ static int hpet_write( tn = (addr - HPET_T0_CMP) >> 5; if ( timer_is_32bit(h, tn) ) new_val = (uint32_t)new_val; - if ( !timer_is_periodic(h, tn) || - (h->hpet.timers[tn].config & HPET_TN_SETVAL) ) - h->hpet.timers[tn].cmp = new_val; - else + h->hpet.timers[tn].cmp = new_val; + if ( h->hpet.timers[tn].config & HPET_TN_SETVAL ) + /* + * When SETVAL is one, software is able to "directly set a periodic + * timer's accumulator." That is, set the comparator without + * adjusting the period. Much the same as just setting the + * comparator on an enabled one-shot timer. + * + * This configuration bit clears when the comparator is written. 
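The start_timers/stop_timers masks declared at the top of hpet_write() batch the side effects of a register write: bits are set while the new state is worked out, and the actual stop/start calls run once, near the end of the function, in mask order. A rough standalone model of that drain loop (GCC builtins instead of Xen's find_first_set_bit()):

    /* acts on each set bit exactly once, lowest first */
    static void apply_timer_changes(unsigned long stop, unsigned long start,
                                    void (*stop_fn)(unsigned int),
                                    void (*start_fn)(unsigned int))
    {
        while ( stop )
        {
            unsigned int i = __builtin_ctzl(stop);
            stop &= stop - 1;      /* clear lowest set bit */
            stop_fn(i);
        }
        while ( start )
        {
            unsigned int i = __builtin_ctzl(start);
            start &= start - 1;
            start_fn(i);
        }
    }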
+ */ + h->hpet.timers[tn].config &= ~HPET_TN_SETVAL; + else if ( timer_is_periodic(h, tn) ) { /* * Clamp period to reasonable min/max values: - * - minimum is 900us, same as timers controlled by vpt.c + * - minimum is 100us, same as timers controlled by vpt.c * - maximum is to prevent overflow in time_after() calculations */ - if ( hpet_tick_to_ns(h, new_val) < MICROSECS(900) ) - new_val = (MICROSECS(900) << 10) / h->hpet_to_ns_scale; + if ( hpet_tick_to_ns(h, new_val) < MICROSECS(100) ) + new_val = (MICROSECS(100) << 10) / h->hpet_to_ns_scale; new_val &= (timer_is_32bit(h, tn) ? ~0u : ~0ull) >> 1; h->hpet.period[tn] = new_val; } - h->hpet.timers[tn].config &= ~HPET_TN_SETVAL; - if ( hpet_enabled(h) ) - hpet_set_timer(h, tn); + h->hpet.comparator64[tn] = new_val; + if ( hpet_enabled(h) && timer_enabled(h, tn) ) + set_restart_timer(tn); break; case HPET_T0_ROUTE: @@ -354,6 +446,25 @@ static int hpet_write( break; } + /* stop/start timers whos state was changed by this write. */ + while (stop_timers) + { + i = find_first_set_bit(stop_timers); + __clear_bit(i, &stop_timers); + hpet_stop_timer(h, i); + } + + while (start_timers) + { + i = find_first_set_bit(start_timers); + __clear_bit(i, &start_timers); + hpet_set_timer(h, i); + } + +#undef set_stop_timer +#undef set_start_timer +#undef set_restart_timer + spin_unlock(&h->lock); out: @@ -373,86 +484,6 @@ struct hvm_mmio_handler hpet_mmio_handle .write_handler = hpet_write }; -static void hpet_route_interrupt(HPETState *h, unsigned int tn) -{ - unsigned int tn_int_route = timer_int_route(h, tn); - struct domain *d = h->vcpu->domain; - - ASSERT(spin_is_locked(&h->lock)); - - if ( (tn <= 1) && (h->hpet.config & HPET_CFG_LEGACY) ) - { - /* if LegacyReplacementRoute bit is set, HPET specification requires - timer0 be routed to IRQ0 in NON-APIC or IRQ2 in the I/O APIC, - timer1 be routed to IRQ8 in NON-APIC or IRQ8 in the I/O APIC. */ - int isa_irq = (tn == 0) ? 0 : 8; - hvm_isa_irq_deassert(d, isa_irq); - hvm_isa_irq_assert(d, isa_irq); - return; - } - - if ( !(timer_int_route_cap(h, tn) & (1U << tn_int_route)) ) - { - gdprintk(XENLOG_ERR, - "HPET: timer%u: invalid interrupt route config\n", tn); - domain_crash(d); - return; - } - - /* We support only edge-triggered interrupt. 
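The 100us clamp just above inverts the fixed-point conversion defined earlier in this file, ns = (ticks * hpet_to_ns_scale) >> 10, hence min_ticks = (min_ns << 10) / scale. A worked example with an assumed scale value (roughly a 14.318 MHz HPET; the real value comes from the device model):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t scale = 71518;     /* ~69.84 ns/tick * 1024, assumed */
        uint64_t min_ns = 100000;   /* the 100us floor */
        uint64_t min_ticks = (min_ns << 10) / scale;
        printf("%llu\n", (unsigned long long)min_ticks);   /* ~1431 */
        return 0;
    }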
*/ - spin_lock(&d->arch.hvm_domain.irq_lock); - vioapic_irq_positive_edge(d, tn_int_route); - spin_unlock(&d->arch.hvm_domain.irq_lock); -} - -static void hpet_timer_fn(void *opaque) -{ - struct HPET_timer_fn_info *htfi = opaque; - HPETState *h = htfi->hs; - unsigned int tn = htfi->tn; - - spin_lock(&h->lock); - - if ( !hpet_enabled(h) ) - { - spin_unlock(&h->lock); - return; - } - - if ( timer_config(h, tn) & HPET_TN_ENABLE ) - hpet_route_interrupt(h, tn); - - if ( timer_is_periodic(h, tn) && (h->hpet.period[tn] != 0) ) - { - uint64_t mc = hpet_read_maincounter(h), period = h->hpet.period[tn]; - if ( timer_is_32bit(h, tn) ) - { - while ( hpet_time_after(mc, h->hpet.timers[tn].cmp) ) - h->hpet.timers[tn].cmp = (uint32_t)( - h->hpet.timers[tn].cmp + period); - } - else - { - while ( hpet_time_after64(mc, h->hpet.timers[tn].cmp) ) - h->hpet.timers[tn].cmp += period; - } - set_timer(&h->timers[tn], NOW() + hpet_tick_to_ns(h, period)); - } - - spin_unlock(&h->lock); -} - -void hpet_migrate_timers(struct vcpu *v) -{ - struct HPETState *h = &v->domain->arch.hvm_domain.pl_time.vhpet; - int i; - - if ( v != h->vcpu ) - return; - - for ( i = 0; i < HPET_TIMER_NUM; i++ ) - migrate_timer(&h->timers[i], v->processor); -} static int hpet_save(struct domain *d, hvm_domain_context_t *h) { @@ -477,18 +508,20 @@ static int hpet_save(struct domain *d, h C(isr); C(mc64); C(timers[0].config); - C(timers[0].cmp); C(timers[0].fsb); C(timers[1].config); - C(timers[1].cmp); C(timers[1].fsb); C(timers[2].config); - C(timers[2].cmp); C(timers[2].fsb); C(period[0]); C(period[1]); C(period[2]); #undef C + /* save the 64 bit comparator in the 64 bit timer[n].cmp field + * regardless of whether or not the timer is in 32 bit mode. */ + rec->timers[0].cmp = hp->hpet.comparator64[0]; + rec->timers[1].cmp = hp->hpet.comparator64[1]; + rec->timers[2].cmp = hp->hpet.comparator64[2]; } spin_unlock(&hp->lock); @@ -500,6 +533,7 @@ static int hpet_load(struct domain *d, h { HPETState *hp = &d->arch.hvm_domain.pl_time.vhpet; struct hvm_hw_hpet *rec; + uint64_t cmp; int i; spin_lock(&hp->lock); @@ -515,32 +549,38 @@ static int hpet_load(struct domain *d, h h->cur += HVM_SAVE_LENGTH(HPET); #define C(x) hp->hpet.x = rec->x - C(capability); - C(config); - C(isr); - C(mc64); - C(timers[0].config); - C(timers[0].cmp); - C(timers[0].fsb); - C(timers[1].config); - C(timers[1].cmp); - C(timers[1].fsb); - C(timers[2].config); - C(timers[2].cmp); - C(timers[2].fsb); - C(period[0]); - C(period[1]); - C(period[2]); + C(capability); + C(config); + C(isr); + C(mc64); + /* The following define will generate a compiler error if HPET_TIMER_NUM + * changes. This indicates an incompatability with previous saved state. */ +#define HPET_TIMER_NUM 3 + for ( i = 0; i < HPET_TIMER_NUM; i++ ) + { + C(timers[i].config); + C(timers[i].fsb); + C(period[i]); + /* restore the hidden 64 bit comparator and truncate the timer's + * visible comparator field if in 32 bit mode. 
*/ + cmp = rec->timers[i].cmp; + hp->hpet.comparator64[i] = cmp; + if ( timer_is_32bit(hp, i) ) + cmp = (uint32_t)cmp; + hp->hpet.timers[i].cmp = cmp; + } #undef C /* Recalculate the offset between the main counter and guest time */ hp->mc_offset = hp->hpet.mc64 - guest_time_hpet(hp->vcpu); - - /* Restart the timers */ - for ( i = 0; i < HPET_TIMER_NUM; i++ ) - if ( hpet_enabled(hp) ) - hpet_set_timer(hp, i); - + + /* restart all timers */ + + if ( hpet_enabled(hp) ) + for ( i = 0; i < HPET_TIMER_NUM; i++ ) + if ( timer_enabled(hp, i) ) + hpet_set_timer(hp, i); + spin_unlock(&hp->lock); return 0; @@ -575,10 +615,7 @@ void hpet_init(struct vcpu *v) h->hpet.timers[i].config = HPET_TN_INT_ROUTE_CAP | HPET_TN_SIZE_CAP | HPET_TN_PERIODIC_CAP; h->hpet.timers[i].cmp = ~0ULL; - h->timer_fn_info[i].hs = h; - h->timer_fn_info[i].tn = i; - init_timer(&h->timers[i], hpet_timer_fn, &h->timer_fn_info[i], - v->processor); + h->pt[i].source = PTSRC_isa; } } @@ -587,8 +624,14 @@ void hpet_deinit(struct domain *d) int i; HPETState *h = &d->arch.hvm_domain.pl_time.vhpet; - for ( i = 0; i < HPET_TIMER_NUM; i++ ) - kill_timer(&h->timers[i]); + spin_lock(&h->lock); + + if ( hpet_enabled(h) ) + for ( i = 0; i < HPET_TIMER_NUM; i++ ) + if ( timer_enabled(h, i) ) + hpet_stop_timer(h, i); + + spin_unlock(&h->lock); } void hpet_reset(struct domain *d) diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/hvm.c Tue Nov 04 12:43:19 2008 +0900 @@ -163,7 +163,6 @@ void hvm_migrate_timers(struct vcpu *v) void hvm_migrate_timers(struct vcpu *v) { rtc_migrate_timers(v); - hpet_migrate_timers(v); pt_migrate(v); } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/i8254.c --- a/xen/arch/x86/hvm/i8254.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/i8254.c Tue Nov 04 12:43:19 2008 +0900 @@ -213,13 +213,13 @@ static void pit_load_count(PITState *pit case 2: case 3: /* Periodic timer. */ - create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, + create_periodic_time(v, &pit->pt0, period, period, 0, pit_time_fired, &pit->count_load_time[channel]); break; case 1: case 4: /* One-shot timer. 
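The timer-device hunks here and below all track one interface change: create_periodic_time() lost its one_shot flag and now takes an initial delta plus a period, with period == 0 selecting one-shot mode. The two call shapes, with argument names as used in the surrounding code:

    /* periodic: first expiry after 'delta' ns, then every 'period' ns */
    create_periodic_time(v, &pt, delta, period, irq, cb, data);

    /* one-shot: fires once, 'delta' ns from now */
    create_periodic_time(v, &pt, delta, 0, irq, cb, data);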
*/ - create_periodic_time(v, &pit->pt0, period, 0, 1, pit_time_fired, + create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, &pit->count_load_time[channel]); break; default: diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/rtc.c --- a/xen/arch/x86/hvm/rtc.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/rtc.c Tue Nov 04 12:43:19 2008 +0900 @@ -59,8 +59,8 @@ static void rtc_timer_update(RTCState *s period = 1 << (period_code - 1); /* period in 32 Khz cycles */ period = DIV_ROUND((period * 1000000000ULL), 32768); /* period in ns */ - create_periodic_time(v, &s->pt, period, RTC_IRQ, - 0, rtc_periodic_cb, s); + create_periodic_time(v, &s->pt, period, period, RTC_IRQ, + rtc_periodic_cb, s); } else { diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/svm/entry.S --- a/xen/arch/x86/hvm/svm/entry.S Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/svm/entry.S Tue Nov 04 12:43:19 2008 +0900 @@ -57,6 +57,8 @@ #endif ENTRY(svm_asm_do_resume) + call svm_intr_assist + get_current(bx) CLGI @@ -67,7 +69,6 @@ ENTRY(svm_asm_do_resume) jnz .Lsvm_process_softirqs call svm_asid_handle_vmrun - call svm_intr_assist cmpb $0,addr_of(tb_init_done) jnz .Lsvm_trace diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vlapic.c --- a/xen/arch/x86/hvm/vlapic.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/vlapic.c Tue Nov 04 12:43:19 2008 +0900 @@ -701,8 +701,9 @@ static int vlapic_write(struct vcpu *v, (uint32_t)val * vlapic->hw.timer_divisor; vlapic_set_reg(vlapic, APIC_TMICT, val); - create_periodic_time(current, &vlapic->pt, period, vlapic->pt.irq, - !vlapic_lvtt_period(vlapic), vlapic_pt_cb, + create_periodic_time(current, &vlapic->pt, period, + vlapic_lvtt_period(vlapic) ? period : 0, + vlapic->pt.irq, vlapic_pt_cb, &vlapic->timer_last_update); vlapic->timer_last_update = vlapic->pt.last_plt_gtime; @@ -861,8 +862,9 @@ static void lapic_rearm(struct vlapic *s period = ((uint64_t)APIC_BUS_CYCLE_NS * (uint32_t)tmict * s->hw.timer_divisor); s->pt.irq = vlapic_get_reg(s, APIC_LVTT) & APIC_VECTOR_MASK; - create_periodic_time(vlapic_vcpu(s), &s->pt, period, s->pt.irq, - !vlapic_lvtt_period(s), vlapic_pt_cb, + create_periodic_time(vlapic_vcpu(s), &s->pt, period, + vlapic_lvtt_period(s) ? 
period : 0, + s->pt.irq, vlapic_pt_cb, &s->timer_last_update); s->timer_last_update = s->pt.last_plt_gtime; } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vmx/entry.S --- a/xen/arch/x86/hvm/vmx/entry.S Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/vmx/entry.S Tue Nov 04 12:43:19 2008 +0900 @@ -122,6 +122,8 @@ vmx_asm_vmexit_handler: .globl vmx_asm_do_vmentry vmx_asm_do_vmentry: + call vmx_intr_assist + get_current(bx) cli @@ -130,8 +132,6 @@ vmx_asm_do_vmentry: lea addr_of(irq_stat),r(dx) cmpl $0,(r(dx),r(ax),1) jnz .Lvmx_process_softirqs - - call vmx_intr_assist testb $0xff,VCPU_vmx_emul(r(bx)) jnz .Lvmx_goto_realmode @@ -179,11 +179,13 @@ vmx_asm_do_vmentry: /*.Lvmx_resume:*/ VMRESUME + sti call vm_resume_fail ud2 .Lvmx_launch: VMLAUNCH + sti call vm_launch_fail ud2 diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vmx/vmx.c --- a/xen/arch/x86/hvm/vmx/vmx.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/vmx/vmx.c Tue Nov 04 12:43:19 2008 +0900 @@ -49,6 +49,7 @@ #include <asm/hvm/vpt.h> #include <public/hvm/save.h> #include <asm/hvm/trace.h> +#include <asm/xenoprof.h> enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised }; @@ -132,6 +133,7 @@ static void vmx_vcpu_destroy(struct vcpu { vmx_destroy_vmcs(v); vpmu_destroy(v); + passive_domain_destroy(v); } #ifdef __x86_64__ @@ -1666,6 +1668,8 @@ static int vmx_msr_read_intercept(struct default: if ( vpmu_do_rdmsr(regs) ) goto done; + if ( passive_domain_do_rdmsr(regs) ) + goto done; switch ( long_mode_do_msr_read(regs) ) { case HNDL_unhandled: @@ -1860,6 +1864,8 @@ static int vmx_msr_write_intercept(struc goto gp_fault; default: if ( vpmu_do_wrmsr(regs) ) + return X86EMUL_OKAY; + if ( passive_domain_do_wrmsr(regs) ) return X86EMUL_OKAY; if ( wrmsr_viridian_regs(ecx, regs->eax, regs->edx) ) @@ -1964,27 +1970,25 @@ static void ept_handle_violation(unsigne { unsigned long gla_validity = qualification & EPT_GLA_VALIDITY_MASK; struct domain *d = current->domain; - unsigned long gfn = gpa >> PAGE_SHIFT; + unsigned long gla, gfn = gpa >> PAGE_SHIFT; mfn_t mfn; p2m_type_t t; - if ( unlikely(qualification & EPT_GAW_VIOLATION) ) - { - gdprintk(XENLOG_ERR, "EPT violation: guest physical address %"PRIpaddr - " exceeded its width limit.\n", gpa); - goto crash; - } - - if ( unlikely(gla_validity == EPT_GLA_VALIDITY_RSVD) || - unlikely(gla_validity == EPT_GLA_VALIDITY_PDPTR_LOAD) ) - { - gdprintk(XENLOG_ERR, "EPT violation: reserved bit or " - "pdptr load violation.\n"); - goto crash; - } - mfn = gfn_to_mfn(d, gfn, &t); - if ( (t != p2m_ram_ro) && p2m_is_ram(t) && paging_mode_log_dirty(d) ) + + /* There are two legitimate reasons for taking an EPT violation. + * One is a guest access to MMIO space. */ + if ( gla_validity == EPT_GLA_VALIDITY_MATCH && p2m_is_mmio(t) ) + { + handle_mmio(); + return; + } + + /* The other is log-dirty mode, writing to a read-only page */ + if ( paging_mode_log_dirty(d) + && (gla_validity == EPT_GLA_VALIDITY_MATCH + || gla_validity == EPT_GLA_VALIDITY_GPT_WALK) + && p2m_is_ram(t) && (t != p2m_ram_ro) ) { paging_mark_dirty(d, mfn_x(mfn)); p2m_change_type(d, gfn, p2m_ram_logdirty, p2m_ram_rw); @@ -1992,16 +1996,39 @@ static void ept_handle_violation(unsigne return; } - /* This can only happen in log-dirty mode, writing back A/D bits. */ - if ( unlikely(gla_validity == EPT_GLA_VALIDITY_GPT_WALK) ) - goto crash; - - ASSERT(gla_validity == EPT_GLA_VALIDITY_MATCH); - handle_mmio(); - - return; - - crash: + /* Everything else is an error. 
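The rewritten EPT error path below prints the exit qualification as two r/w/x triples, the attempted access and the effective permissions. A standalone sketch of that decode; the bit positions here are placeholders, not the architectural values:

    #include <stdio.h>

    #define Q_READ  (1u << 0)   /* placeholder for EPT_READ_VIOLATION */
    #define Q_WRITE (1u << 1)   /* placeholder for EPT_WRITE_VIOLATION */
    #define Q_EXEC  (1u << 2)   /* placeholder for EPT_EXEC_VIOLATION */

    static void decode_access(unsigned long q)
    {
        printf("%c%c%c\n",
               (q & Q_READ)  ? 'r' : '-',
               (q & Q_WRITE) ? 'w' : '-',
               (q & Q_EXEC)  ? 'x' : '-');
    }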
*/ + gla = __vmread(GUEST_LINEAR_ADDRESS); + gdprintk(XENLOG_ERR, "EPT violation %#lx (%c%c%c/%c%c%c), " + "gpa %#"PRIpaddr", mfn %#lx, type %i.\n", + qualification, + (qualification & EPT_READ_VIOLATION) ? 'r' : '-', + (qualification & EPT_WRITE_VIOLATION) ? 'w' : '-', + (qualification & EPT_EXEC_VIOLATION) ? 'x' : '-', + (qualification & EPT_EFFECTIVE_READ) ? 'r' : '-', + (qualification & EPT_EFFECTIVE_WRITE) ? 'w' : '-', + (qualification & EPT_EFFECTIVE_EXEC) ? 'x' : '-', + gpa, mfn_x(mfn), t); + + if ( qualification & EPT_GAW_VIOLATION ) + gdprintk(XENLOG_ERR, " --- GPA too wide (max %u bits)\n", + 9 * (unsigned) d->arch.hvm_domain.vmx.ept_control.gaw + 21); + + switch ( gla_validity ) + { + case EPT_GLA_VALIDITY_PDPTR_LOAD: + gdprintk(XENLOG_ERR, " --- PDPTR load failed\n"); + break; + case EPT_GLA_VALIDITY_GPT_WALK: + gdprintk(XENLOG_ERR, " --- guest PT walk to %#lx failed\n", gla); + break; + case EPT_GLA_VALIDITY_RSVD: + gdprintk(XENLOG_ERR, " --- GLA_validity 2 (reserved)\n"); + break; + case EPT_GLA_VALIDITY_MATCH: + gdprintk(XENLOG_ERR, " --- guest access to %#lx failed\n", gla); + break; + } + domain_crash(d); } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vmx/vpmu_core2.c --- a/xen/arch/x86/hvm/vmx/vpmu_core2.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c Tue Nov 04 12:43:19 2008 +0900 @@ -35,6 +35,26 @@ #include <asm/hvm/vmx/vpmu.h> #include <asm/hvm/vmx/vpmu_core2.h> +u32 core2_counters_msr[] = { + MSR_CORE_PERF_FIXED_CTR0, + MSR_CORE_PERF_FIXED_CTR1, + MSR_CORE_PERF_FIXED_CTR2}; + +/* Core 2 Non-architectual Performance Control MSRs. */ +u32 core2_ctrls_msr[] = { + MSR_CORE_PERF_FIXED_CTR_CTRL, + MSR_IA32_PEBS_ENABLE, + MSR_IA32_DS_AREA}; + +struct pmumsr core2_counters = { + 3, + core2_counters_msr +}; + +struct pmumsr core2_ctrls = { + 3, + core2_ctrls_msr +}; static int arch_pmc_cnt; static int core2_get_pmc_count(void) diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/hvm/vpt.c --- a/xen/arch/x86/hvm/vpt.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/hvm/vpt.c Tue Nov 04 12:43:19 2008 +0900 @@ -355,8 +355,8 @@ void pt_migrate(struct vcpu *v) } void create_periodic_time( - struct vcpu *v, struct periodic_time *pt, uint64_t period, - uint8_t irq, char one_shot, time_cb *cb, void *data) + struct vcpu *v, struct periodic_time *pt, uint64_t delta, + uint64_t period, uint8_t irq, time_cb *cb, void *data) { ASSERT(pt->source != 0); @@ -368,13 +368,13 @@ void create_periodic_time( pt->do_not_freeze = 0; pt->irq_issued = 0; - /* Periodic timer must be at least 0.9ms. */ - if ( (period < 900000) && !one_shot ) + /* Periodic timer must be at least 0.1ms. */ + if ( (period < 100000) && period ) { if ( !test_and_set_bool(pt->warned_timeout_too_short) ) gdprintk(XENLOG_WARNING, "HVM_PlatformTime: program too " "small period %"PRIu64"\n", period); - period = 900000; + period = 100000; } pt->period = period; @@ -382,15 +382,15 @@ void create_periodic_time( pt->last_plt_gtime = hvm_get_guest_time(pt->vcpu); pt->irq = irq; pt->period_cycles = (u64)period; - pt->one_shot = one_shot; - pt->scheduled = NOW() + period; + pt->one_shot = !period; + pt->scheduled = NOW() + delta; /* * Offset LAPIC ticks from other timer ticks. Otherwise guests which use * LAPIC ticks for process accounting can see long sequences of process * ticks incorrectly accounted to interrupt processing. 
*/ - if ( pt->source == PTSRC_lapic ) - pt->scheduled += period >> 1; + if ( !pt->one_shot && (pt->source == PTSRC_lapic) ) + pt->scheduled += delta >> 1; pt->cb = cb; pt->priv = data; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/irq.c --- a/xen/arch/x86/irq.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/irq.c Tue Nov 04 12:43:19 2008 +0900 @@ -793,6 +793,10 @@ int map_domain_pirq( ASSERT(spin_is_locked(&d->event_lock)); + /* XXX Until pcidev and msi locking is fixed. */ + if ( type == MAP_PIRQ_TYPE_MSI ) + return -EINVAL; + if ( !IS_PRIV(current->domain) ) return -EPERM; @@ -840,7 +844,7 @@ int map_domain_pirq( d->arch.pirq_vector[pirq] = vector; d->arch.vector_pirq[vector] = pirq; -done: + done: spin_unlock_irqrestore(&desc->lock, flags); return ret; } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/mm.c Tue Nov 04 12:43:19 2008 +0900 @@ -566,19 +566,21 @@ static int get_page_and_type_from_pagenr static int get_page_and_type_from_pagenr(unsigned long page_nr, unsigned long type, struct domain *d, + int partial, int preemptible) { struct page_info *page = mfn_to_page(page_nr); int rc; - if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) + if ( likely(partial >= 0) && + unlikely(!get_page_from_pagenr(page_nr, d)) ) return -EINVAL; rc = (preemptible ? get_page_type_preemptible(page, type) : (get_page_type(page, type) ? 0 : -EINVAL)); - if ( rc ) + if ( unlikely(rc) && partial >= 0 ) put_page(page); return rc; @@ -761,7 +763,7 @@ get_page_from_l2e( } rc = get_page_and_type_from_pagenr( - l2e_get_pfn(l2e), PGT_l1_page_table, d, 0); + l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0); if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) rc = 0; @@ -772,7 +774,7 @@ define_get_linear_pagetable(l3); define_get_linear_pagetable(l3); static int get_page_from_l3e( - l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible) + l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial, int preemptible) { int rc; @@ -786,7 +788,7 @@ get_page_from_l3e( } rc = get_page_and_type_from_pagenr( - l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible); + l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, preemptible); if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) ) rc = 0; @@ -797,7 +799,7 @@ define_get_linear_pagetable(l4); define_get_linear_pagetable(l4); static int get_page_from_l4e( - l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible) + l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial, int preemptible) { int rc; @@ -811,7 +813,7 @@ get_page_from_l4e( } rc = get_page_and_type_from_pagenr( - l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible); + l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, preemptible); if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) ) rc = 0; @@ -961,23 +963,32 @@ static int put_page_from_l2e(l2_pgentry_ return 1; } +static int __put_page_type(struct page_info *, int preemptible); static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, - int preemptible) + int partial, int preemptible) { if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && (l3e_get_pfn(l3e) != pfn) ) + { + if ( unlikely(partial > 0) ) + return __put_page_type(l3e_get_page(l3e), preemptible); return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible); + } return 1; } #if CONFIG_PAGING_LEVELS >= 4 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, - int preemptible) + int partial, int 
preemptible) { if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && (l4e_get_pfn(l4e) != pfn) ) + { + if ( unlikely(partial > 0) ) + return __put_page_type(l4e_get_page(l4e), preemptible); return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible); + } return 1; } #endif @@ -1184,7 +1195,7 @@ static int alloc_l3_table(struct page_in unsigned long pfn = page_to_mfn(page); l3_pgentry_t *pl3e; unsigned int i; - int rc = 0; + int rc = 0, partial = page->partial_pte; #if CONFIG_PAGING_LEVELS == 3 /* @@ -1213,7 +1224,8 @@ static int alloc_l3_table(struct page_in if ( is_pv_32on64_domain(d) ) memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); - for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ ) + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; + i++, partial = 0 ) { if ( is_pv_32bit_domain(d) && (i == 3) ) { @@ -1224,16 +1236,17 @@ static int alloc_l3_table(struct page_in rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]), PGT_l2_page_table | PGT_pae_xen_l2, - d, preemptible); + d, partial, preemptible); } else if ( !is_guest_l3_slot(i) || - (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 ) + (rc = get_page_from_l3e(pl3e[i], pfn, d, + partial, preemptible)) > 0 ) continue; if ( rc == -EAGAIN ) { page->nr_validated_ptes = i; - page->partial_pte = 1; + page->partial_pte = partial ?: 1; } else if ( rc == -EINTR && i ) { @@ -1257,7 +1270,7 @@ static int alloc_l3_table(struct page_in if ( !is_guest_l3_slot(i) ) continue; unadjust_guest_l3e(pl3e[i], d); - put_page_from_l3e(pl3e[i], pfn, 0); + put_page_from_l3e(pl3e[i], pfn, 0, 0); } } @@ -1272,18 +1285,20 @@ static int alloc_l4_table(struct page_in unsigned long pfn = page_to_mfn(page); l4_pgentry_t *pl4e = page_to_virt(page); unsigned int i; - int rc = 0; - - for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ ) + int rc = 0, partial = page->partial_pte; + + for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; + i++, partial = 0 ) { if ( !is_guest_l4_slot(d, i) || - (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 ) + (rc = get_page_from_l4e(pl4e[i], pfn, d, + partial, preemptible)) > 0 ) continue; if ( rc == -EAGAIN ) { page->nr_validated_ptes = i; - page->partial_pte = 1; + page->partial_pte = partial ?: 1; } else if ( rc == -EINTR ) { @@ -1299,7 +1314,7 @@ static int alloc_l4_table(struct page_in MEM_LOG("Failure in alloc_l4_table: entry %d", i); while ( i-- > 0 ) if ( is_guest_l4_slot(d, i) ) - put_page_from_l4e(pl4e[i], pfn, 0); + put_page_from_l4e(pl4e[i], pfn, 0, 0); } if ( rc < 0 ) return rc; @@ -1377,24 +1392,20 @@ static int free_l3_table(struct page_inf struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l3_pgentry_t *pl3e; - unsigned int i = page->nr_validated_ptes - !page->partial_pte; - int rc = 0; - -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION - if ( d->arch.relmem == RELMEM_l3 ) - return 0; -#endif + int rc = 0, partial = page->partial_pte; + unsigned int i = page->nr_validated_ptes - !partial; pl3e = map_domain_page(pfn); do { if ( is_guest_l3_slot(i) ) { - rc = put_page_from_l3e(pl3e[i], pfn, preemptible); + rc = put_page_from_l3e(pl3e[i], pfn, partial, preemptible); + if ( rc < 0 ) + break; + partial = 0; if ( rc > 0 ) continue; - if ( rc ) - break; unadjust_guest_l3e(pl3e[i], d); } } while ( i-- ); @@ -1404,7 +1415,7 @@ static int free_l3_table(struct page_inf if ( rc == -EAGAIN ) { page->nr_validated_ptes = i; - page->partial_pte = 1; + page->partial_pte = partial ?: -1; } else if ( rc == -EINTR && i < 
L3_PAGETABLE_ENTRIES - 1 ) { @@ -1421,23 +1432,21 @@ static int free_l4_table(struct page_inf struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l4_pgentry_t *pl4e = page_to_virt(page); - unsigned int i = page->nr_validated_ptes - !page->partial_pte; - int rc = 0; - -#ifdef DOMAIN_DESTRUCT_AVOID_RECURSION - if ( d->arch.relmem == RELMEM_l4 ) - return 0; -#endif + int rc = 0, partial = page->partial_pte; + unsigned int i = page->nr_validated_ptes - !partial; do { if ( is_guest_l4_slot(d, i) ) - rc = put_page_from_l4e(pl4e[i], pfn, preemptible); - } while ( rc >= 0 && i-- ); + rc = put_page_from_l4e(pl4e[i], pfn, partial, preemptible); + if ( rc < 0 ) + break; + partial = 0; + } while ( i-- ); if ( rc == -EAGAIN ) { page->nr_validated_ptes = i; - page->partial_pte = 1; + page->partial_pte = partial ?: -1; } else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) { @@ -1713,7 +1722,7 @@ static int mod_l3_entry(l3_pgentry_t *pl return rc ? 0 : -EFAULT; } - rc = get_page_from_l3e(nl3e, pfn, d, preemptible); + rc = get_page_from_l3e(nl3e, pfn, d, 0, preemptible); if ( unlikely(rc < 0) ) return page_unlock(l3pg), rc; rc = 0; @@ -1742,7 +1751,7 @@ static int mod_l3_entry(l3_pgentry_t *pl } page_unlock(l3pg); - put_page_from_l3e(ol3e, pfn, 0); + put_page_from_l3e(ol3e, pfn, 0, 0); return rc; } @@ -1791,7 +1800,7 @@ static int mod_l4_entry(l4_pgentry_t *pl return rc ? 0 : -EFAULT; } - rc = get_page_from_l4e(nl4e, pfn, d, preemptible); + rc = get_page_from_l4e(nl4e, pfn, d, 0, preemptible); if ( unlikely(rc < 0) ) return page_unlock(l4pg), rc; rc = 0; @@ -1812,7 +1821,7 @@ static int mod_l4_entry(l4_pgentry_t *pl } page_unlock(l4pg); - put_page_from_l4e(ol4e, pfn, 0); + put_page_from_l4e(ol4e, pfn, 0, 0); return rc; } @@ -1847,7 +1856,8 @@ int get_page(struct page_info *page, str nx = x + 1; d = nd; if ( unlikely((x & PGC_count_mask) == 0) || /* Not allocated? */ - unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */ + /* Keep one spare reference to be acquired by get_page_light(). */ + unlikely(((nx + 1) & PGC_count_mask) <= 1) || /* Overflow? */ unlikely(d != _domain) ) /* Wrong owner? */ { if ( !_shadow_mode_refcounts(domain) && !domain->is_dying ) @@ -1867,6 +1877,28 @@ int get_page(struct page_info *page, str while ( unlikely(nd != d) || unlikely(y != x) ); return 1; +} + +/* + * Special version of get_page() to be used exclusively when + * - a page is known to already have a non-zero reference count + * - the page does not need its owner to be checked + * - it will not be called more than once without dropping the thus + * acquired reference again. + * Due to get_page() reserving one reference, this call cannot fail. + */ +static void get_page_light(struct page_info *page) +{ + u32 x, nx, y = page->count_info; + + do { + x = y; + nx = x + 1; + BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */ + BUG_ON(!(nx & PGC_count_mask)); /* Overflow? 
*/ + y = cmpxchg(&page->count_info, x, nx); + } + while ( unlikely(y != x) ); } @@ -1909,6 +1941,7 @@ static int alloc_page_type(struct page_i wmb(); if ( rc == -EAGAIN ) { + get_page_light(page); page->u.inuse.type_info |= PGT_partial; } else if ( rc == -EINTR ) @@ -1973,6 +2006,7 @@ int free_page_type(struct page_info *pag page->nr_validated_ptes = 1U << PAGETABLE_ORDER; page->partial_pte = 0; } + switch ( type & PGT_type_mask ) { case PGT_l1_page_table: @@ -1998,6 +2032,15 @@ int free_page_type(struct page_info *pag BUG(); } + return rc; +} + + +static int __put_final_page_type( + struct page_info *page, unsigned long type, int preemptible) +{ + int rc = free_page_type(page, type, preemptible); + /* No need for atomic update of type_info here: noone else updates it. */ if ( rc == 0 ) { @@ -2016,8 +2059,8 @@ int free_page_type(struct page_info *pag } else if ( rc == -EINTR ) { - ASSERT(!(page->u.inuse.type_info & - (PGT_count_mask|PGT_validated|PGT_partial))); + ASSERT((page->u.inuse.type_info & + (PGT_count_mask|PGT_validated|PGT_partial)) == 1); if ( !(shadow_mode_enabled(page_get_owner(page)) && (page->count_info & PGC_page_table)) ) page->tlbflush_timestamp = tlbflush_current_time(); @@ -2028,6 +2071,7 @@ int free_page_type(struct page_info *pag { BUG_ON(rc != -EAGAIN); wmb(); + get_page_light(page); page->u.inuse.type_info |= PGT_partial; } @@ -2039,6 +2083,7 @@ static int __put_page_type(struct page_i int preemptible) { unsigned long nx, x, y = page->u.inuse.type_info; + int rc = 0; for ( ; ; ) { @@ -2062,7 +2107,10 @@ static int __put_page_type(struct page_i x, nx)) != x) ) continue; /* We cleared the 'valid bit' so we do the clean up. */ - return free_page_type(page, x, preemptible); + rc = __put_final_page_type(page, x, preemptible); + if ( x & PGT_partial ) + put_page(page); + break; } /* @@ -2084,7 +2132,7 @@ static int __put_page_type(struct page_i return -EINTR; } - return 0; + return rc; } @@ -2092,6 +2140,7 @@ static int __get_page_type(struct page_i int preemptible) { unsigned long nx, x, y = page->u.inuse.type_info; + int rc = 0; ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); @@ -2214,10 +2263,13 @@ static int __get_page_type(struct page_i page->nr_validated_ptes = 0; page->partial_pte = 0; } - return alloc_page_type(page, type, preemptible); - } - - return 0; + rc = alloc_page_type(page, type, preemptible); + } + + if ( (x & PGT_partial) && !(nx & PGT_partial) ) + put_page(page); + + return rc; } void put_page_type(struct page_info *page) @@ -2296,7 +2348,7 @@ int new_guest_cr3(unsigned long mfn) #endif okay = paging_mode_refcounts(d) ? 
get_page_from_pagenr(mfn, d) - : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0); + : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 0); if ( unlikely(!okay) ) { MEM_LOG("Error while installing new baseptr %lx", mfn); @@ -2431,6 +2483,29 @@ static inline cpumask_t vcpumask_to_pcpu return pmask; } +#ifdef __i386__ +static inline void *fixmap_domain_page(unsigned long mfn) +{ + unsigned int cpu = smp_processor_id(); + void *ptr = (void *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu); + + l1e_write(fix_pae_highmem_pl1e - cpu, + l1e_from_pfn(mfn, __PAGE_HYPERVISOR)); + flush_tlb_one_local(ptr); + return ptr; +} +static inline void fixunmap_domain_page(const void *ptr) +{ + unsigned int cpu = virt_to_fix((unsigned long)ptr) - FIX_PAE_HIGHMEM_0; + + l1e_write(fix_pae_highmem_pl1e - cpu, l1e_empty()); + this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time); +} +#else +#define fixmap_domain_page(mfn) mfn_to_virt(mfn) +#define fixunmap_domain_page(ptr) ((void)(ptr)) +#endif + int do_mmuext_op( XEN_GUEST_HANDLE(mmuext_op_t) uops, unsigned int count, @@ -2517,7 +2592,7 @@ int do_mmuext_op( if ( paging_mode_refcounts(FOREIGNDOM) ) break; - rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1); + rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 0, 1); okay = !rc; if ( unlikely(!okay) ) { @@ -2598,7 +2673,7 @@ int do_mmuext_op( okay = get_page_from_pagenr(mfn, d); else okay = !get_page_and_type_from_pagenr( - mfn, PGT_root_page_table, d, 0); + mfn, PGT_root_page_table, d, 0, 0); if ( unlikely(!okay) ) { MEM_LOG("Error while installing new mfn %lx", mfn); @@ -2697,6 +2772,66 @@ int do_mmuext_op( if ( ents != 0 ) this_cpu(percpu_mm_info).deferred_ops |= DOP_RELOAD_LDT; } + break; + } + + case MMUEXT_CLEAR_PAGE: + { + unsigned char *ptr; + + okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page, + FOREIGNDOM, 0, 0); + if ( unlikely(!okay) ) + { + MEM_LOG("Error while clearing mfn %lx", mfn); + break; + } + + /* A page is dirtied when it's being cleared. */ + paging_mark_dirty(d, mfn); + + ptr = fixmap_domain_page(mfn); + clear_page(ptr); + fixunmap_domain_page(ptr); + + put_page_and_type(page); + break; + } + + case MMUEXT_COPY_PAGE: + { + const unsigned char *src; + unsigned char *dst; + unsigned long src_mfn; + + src_mfn = gmfn_to_mfn(FOREIGNDOM, op.arg2.src_mfn); + okay = get_page_from_pagenr(src_mfn, FOREIGNDOM); + if ( unlikely(!okay) ) + { + MEM_LOG("Error while copying from mfn %lx", src_mfn); + break; + } + + okay = !get_page_and_type_from_pagenr(mfn, PGT_writable_page, + FOREIGNDOM, 0, 0); + if ( unlikely(!okay) ) + { + put_page(mfn_to_page(src_mfn)); + MEM_LOG("Error while copying to mfn %lx", mfn); + break; + } + + /* A page is dirtied when it's being copied to. 
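MMUEXT_CLEAR_PAGE and MMUEXT_COPY_PAGE give PV guests page clear/copy without mapping the frames themselves. A guest-side sketch of how the ops would be issued, assuming the guest's hypercall wrappers and Xen public headers; dst_mfn/src_mfn are taken to be valid writable/readable frames:

    struct mmuext_op op;

    op.cmd = MMUEXT_CLEAR_PAGE;
    op.arg1.mfn = dst_mfn;
    HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);

    op.cmd = MMUEXT_COPY_PAGE;
    op.arg1.mfn = dst_mfn;
    op.arg2.src_mfn = src_mfn;
    HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);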
*/ + paging_mark_dirty(d, mfn); + + src = map_domain_page(src_mfn); + dst = fixmap_domain_page(mfn); + copy_page(dst, src); + fixunmap_domain_page(dst); + unmap_domain_page(src); + + put_page_and_type(page); + put_page(mfn_to_page(src_mfn)); break; } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/mm/hap/p2m-ept.c --- a/xen/arch/x86/mm/hap/p2m-ept.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/mm/hap/p2m-ept.c Tue Nov 04 12:43:19 2008 +0900 @@ -157,9 +157,6 @@ ept_set_entry(struct domain *d, unsigned { if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) ) { - /* Track the highest gfn for which we have ever had a valid mapping */ - if ( gfn > d->arch.p2m->max_mapped_pfn ) - d->arch.p2m->max_mapped_pfn = gfn; ept_entry->emt = epte_get_entry_emt(d, gfn, mfn_x(mfn)); ept_entry->sp_avail = walk_level ? 1 : 0; @@ -233,6 +230,11 @@ ept_set_entry(struct domain *d, unsigned unmap_domain_page(split_table); } + + /* Track the highest gfn for which we have ever had a valid mapping */ + if ( mfn_valid(mfn_x(mfn)) + && (gfn + (1UL << order) - 1 > d->arch.p2m->max_mapped_pfn) ) + d->arch.p2m->max_mapped_pfn = gfn + (1UL << order) - 1; /* Success */ rv = 1; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/mm/p2m.c Tue Nov 04 12:43:19 2008 +0900 @@ -322,7 +322,8 @@ p2m_set_entry(struct domain *d, unsigned } /* Track the highest gfn for which we have ever had a valid mapping */ - if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) ) + if ( mfn_valid(mfn) + && (gfn + (1UL << page_order) - 1 > d->arch.p2m->max_mapped_pfn) ) d->arch.p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1; if ( iommu_enabled && (is_hvm_domain(d) || need_iommu(d)) ) @@ -956,18 +957,18 @@ guest_physmap_add_entry(struct domain *d /* First, remove m->p mappings for existing p->m mappings */ for ( i = 0; i < (1UL << page_order); i++ ) { - omfn = gfn_to_mfn(d, gfn, &ot); + omfn = gfn_to_mfn(d, gfn + i, &ot); if ( p2m_is_ram(ot) ) { ASSERT(mfn_valid(omfn)); - set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY); + set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); } } /* Then, look for m->p mappings for this range and deal with them */ for ( i = 0; i < (1UL << page_order); i++ ) { - ogfn = mfn_to_gfn(d, _mfn(mfn)); + ogfn = mfn_to_gfn(d, _mfn(mfn+i)); if ( #ifdef __x86_64__ (ogfn != 0x5555555555555555L) @@ -975,20 +976,20 @@ guest_physmap_add_entry(struct domain *d (ogfn != 0x55555555L) #endif && (ogfn != INVALID_M2P_ENTRY) - && (ogfn != gfn) ) + && (ogfn != gfn + i) ) { /* This machine frame is already mapped at another physical * address */ P2M_DEBUG("aliased! 
mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n", - mfn, ogfn, gfn); + mfn + i, ogfn, gfn + i); omfn = gfn_to_mfn(d, ogfn, &ot); if ( p2m_is_ram(ot) ) { ASSERT(mfn_valid(omfn)); P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n", ogfn , mfn_x(omfn)); - if ( mfn_x(omfn) == mfn ) - p2m_remove_page(d, ogfn, mfn, 0); + if ( mfn_x(omfn) == (mfn + i) ) + p2m_remove_page(d, ogfn, mfn + i, 0); } } } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/msi.c --- a/xen/arch/x86/msi.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/msi.c Tue Nov 04 12:43:19 2008 +0900 @@ -33,8 +33,7 @@ DECLARE_BITMAP(msix_fixmap_pages, MAX_MS static int msix_fixmap_alloc(void) { - int i; - int rc = -1; + int i, rc = -1; spin_lock(&msix_fixmap_lock); for ( i = 0; i < MAX_MSIX_PAGES; i++ ) @@ -52,12 +51,8 @@ static int msix_fixmap_alloc(void) static void msix_fixmap_free(int idx) { - if ( idx < FIX_MSIX_IO_RESERV_BASE ) - return; - - spin_lock(&msix_fixmap_lock); - clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages); - spin_unlock(&msix_fixmap_lock); + if ( idx >= FIX_MSIX_IO_RESERV_BASE ) + clear_bit(idx - FIX_MSIX_IO_RESERV_BASE, &msix_fixmap_pages); } /* @@ -78,19 +73,19 @@ static void msi_compose_msg(struct pci_d msg->address_lo = MSI_ADDR_BASE_LO | ((INT_DEST_MODE == 0) ? - MSI_ADDR_DESTMODE_PHYS: - MSI_ADDR_DESTMODE_LOGIC) | + MSI_ADDR_DESTMODE_PHYS: + MSI_ADDR_DESTMODE_LOGIC) | ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | MSI_ADDR_DEST_ID(dest); msg->data = MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | MSI_DATA_VECTOR(vector); } } @@ -128,7 +123,7 @@ static void read_msi_msg(struct msi_desc { void __iomem *base; base = entry->mask_base + - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); @@ -205,9 +200,9 @@ static void write_msi_msg(struct msi_des entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; writel(msg->address_lo, - base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); writel(msg->address_hi, - base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); break; } @@ -230,7 +225,7 @@ void set_msi_irq_affinity(unsigned int i dest = cpu_mask_to_apicid(mask); if ( !desc ) - return; + return; ASSERT(spin_is_locked(&irq_desc[irq].lock)); spin_lock(&desc->dev->lock); @@ -398,8 +393,8 @@ static void msi_free_vector(int vector) unsigned long start; writel(1, entry->mask_base + entry->msi_attrib.entry_nr - * PCI_MSIX_ENTRY_SIZE - + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); start = (unsigned long)entry->mask_base & ~(PAGE_SIZE - 1); msix_fixmap_free(virt_to_fix(start)); @@ -460,20 +455,20 @@ static int msi_capability_init(struct pc entry->vector = vector; if ( is_mask_bit_support(control) ) entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos, - is_64bit_address(control)); + is_64bit_address(control)); entry->dev = dev; if ( entry->msi_attrib.maskbit ) { unsigned int maskbits, temp; /* All MSIs are unmasked by default, Mask them all */ maskbits = pci_conf_read32(bus, slot, func, - 
msi_mask_bits_reg(pos, is_64bit_address(control))); + msi_mask_bits_reg(pos, is_64bit_address(control))); temp = (1 << multi_msi_capable(control)); temp = ((temp - 1) & ~temp); maskbits |= temp; pci_conf_write32(bus, slot, func, - msi_mask_bits_reg(pos, is_64bit_address(control)), - maskbits); + msi_mask_bits_reg(pos, is_64bit_address(control)), + maskbits); } list_add_tail(&entry->list, &dev->msi_list); @@ -575,14 +570,14 @@ static int __pci_enable_msi(struct msi_i pdev = pci_lock_pdev(msi->bus, msi->devfn); if ( !pdev ) - return -ENODEV; + return -ENODEV; if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSI) ) { - spin_unlock(&pdev->lock); + spin_unlock(&pdev->lock); dprintk(XENLOG_WARNING, "vector %d has already mapped to MSI on " - "device %02x:%02x.%01x.\n", msi->vector, msi->bus, - PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); + "device %02x:%02x.%01x.\n", msi->vector, msi->bus, + PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); return 0; } @@ -601,7 +596,7 @@ static void __pci_disable_msi(int vector entry = irq_desc[vector].msi_desc; if ( !entry ) - return; + return; /* * Lock here is safe. msi_desc can not be removed without holding * both irq_desc[].lock (which we do) and pdev->lock. @@ -649,20 +644,20 @@ static int __pci_enable_msix(struct msi_ pdev = pci_lock_pdev(msi->bus, msi->devfn); if ( !pdev ) - return -ENODEV; + return -ENODEV; pos = pci_find_cap_offset(msi->bus, slot, func, PCI_CAP_ID_MSIX); control = pci_conf_read16(msi->bus, slot, func, msi_control_reg(pos)); nr_entries = multi_msix_capable(control); if (msi->entry_nr > nr_entries) { - spin_unlock(&pdev->lock); + spin_unlock(&pdev->lock); return -EINVAL; } if ( find_msi_entry(pdev, msi->vector, PCI_CAP_ID_MSIX) ) { - spin_unlock(&pdev->lock); + spin_unlock(&pdev->lock); dprintk(XENLOG_WARNING, "vector %d has already mapped to MSIX on " "device %02x:%02x.%01x.\n", msi->vector, msi->bus, PCI_SLOT(msi->devfn), PCI_FUNC(msi->devfn)); @@ -684,7 +679,7 @@ static void __pci_disable_msix(int vecto entry = irq_desc[vector].msi_desc; if ( !entry ) - return; + return; /* * Lock here is safe. msi_desc can not be removed without holding * both irq_desc[].lock (which we do) and pdev->lock. @@ -712,7 +707,7 @@ int pci_enable_msi(struct msi_info *msi) ASSERT(spin_is_locked(&irq_desc[msi->vector].lock)); return msi->table_base ? 
__pci_enable_msix(msi) : - __pci_enable_msi(msi); + __pci_enable_msi(msi); } void pci_disable_msi(int vector) @@ -720,7 +715,7 @@ void pci_disable_msi(int vector) irq_desc_t *desc = &irq_desc[vector]; ASSERT(spin_is_locked(&desc->lock)); if ( !desc->msi_desc ) - return; + return; if ( desc->msi_desc->msi_attrib.type == PCI_CAP_ID_MSI ) __pci_disable_msi(vector); @@ -734,7 +729,7 @@ static void msi_free_vectors(struct pci_ irq_desc_t *desc; unsigned long flags; -retry: + retry: list_for_each_entry_safe( entry, tmp, &dev->msi_list, list ) { desc = &irq_desc[entry->vector]; @@ -742,7 +737,7 @@ retry: local_irq_save(flags); if ( !spin_trylock(&desc->lock) ) { - local_irq_restore(flags); + local_irq_restore(flags); goto retry; } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/oprofile/nmi_int.c --- a/xen/arch/x86/oprofile/nmi_int.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/oprofile/nmi_int.c Tue Nov 04 12:43:19 2008 +0900 @@ -36,6 +36,55 @@ static char *cpu_type; static char *cpu_type; extern int is_active(struct domain *d); +extern int is_passive(struct domain *d); + +int passive_domain_do_rdmsr(struct cpu_user_regs *regs) +{ + u64 msr_content; + int type, index; + struct vpmu_struct *vpmu = vcpu_vpmu(current); + + if ( model->is_arch_pmu_msr == NULL ) + return 0; + if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) ) + return 0; + if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) ) + if ( ! model->allocated_msr(current) ) + return 0; + + model->load_msr(current, type, index, &msr_content); + regs->eax = msr_content & 0xFFFFFFFF; + regs->edx = msr_content >> 32; + return 1; +} + + +int passive_domain_do_wrmsr(struct cpu_user_regs *regs) +{ + u64 msr_content; + int type, index; + struct vpmu_struct *vpmu = vcpu_vpmu(current); + + if ( model->is_arch_pmu_msr == NULL ) + return 0; + if ( !model->is_arch_pmu_msr((u64)regs->ecx, &type, &index) ) + return 0; + + if ( !(vpmu->flags & PASSIVE_DOMAIN_ALLOCATED) ) + if ( ! 
model->allocated_msr(current) ) + return 0; + + msr_content = (u32)regs->eax | ((u64)regs->edx << 32); + model->save_msr(current, type, index, msr_content); + return 1; +} + +void passive_domain_destroy(struct vcpu *v) +{ + struct vpmu_struct *vpmu = vcpu_vpmu(v); + if ( vpmu->flags & PASSIVE_DOMAIN_ALLOCATED ) + model->free_msr(v); +} static int nmi_callback(struct cpu_user_regs *regs, int cpu) { @@ -46,6 +95,8 @@ static int nmi_callback(struct cpu_user_ if ( ovf && is_active(current->domain) && !xen_mode ) send_guest_vcpu_virq(current, VIRQ_XENOPROF); + if ( ovf == 2 ) + test_and_set_bool(current->nmi_pending); return 1; } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/oprofile/op_model_ppro.c --- a/xen/arch/x86/oprofile/op_model_ppro.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/oprofile/op_model_ppro.c Tue Nov 04 12:43:19 2008 +0900 @@ -18,6 +18,8 @@ #include <xen/sched.h> #include <asm/regs.h> #include <asm/current.h> +#include <asm/hvm/vmx/vpmu.h> +#include <asm/hvm/vmx/vpmu_core2.h> #include "op_x86_model.h" #include "op_counter.h" @@ -39,9 +41,11 @@ #define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17)) #define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT(val, e) (val |= e) - +#define IS_ACTIVE(val) (val & (1 << 22) ) +#define IS_ENABLE(val) (val & (1 << 20) ) static unsigned long reset_value[NUM_COUNTERS]; int ppro_has_global_ctrl = 0; +extern int is_passive(struct domain *d); static void ppro_fill_in_addresses(struct op_msrs * const msrs) { @@ -103,6 +107,7 @@ static int ppro_check_ctrs(unsigned int int ovf = 0; unsigned long eip = regs->eip; int mode = xenoprofile_get_mode(current, regs); + struct arch_msr_pair *msrs_content = vcpu_vpmu(current)->context; for (i = 0 ; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) @@ -111,7 +116,18 @@ static int ppro_check_ctrs(unsigned int if (CTR_OVERFLOWED(low)) { xenoprof_log_event(current, regs, eip, mode, i); CTR_WRITE(reset_value[i], msrs, i); - ovf = 1; + if ( is_passive(current->domain) && (mode != 2) && + (vcpu_vpmu(current)->flags & PASSIVE_DOMAIN_ALLOCATED) ) + { + if ( IS_ACTIVE(msrs_content[i].control) ) + { + msrs_content[i].counter = (low | (u64)high << 32); + if ( IS_ENABLE(msrs_content[i].control) ) + ovf = 2; + } + } + if ( !ovf ) + ovf = 1; } } @@ -159,6 +175,82 @@ static void ppro_stop(struct op_msrs con wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); } +static int ppro_is_arch_pmu_msr(u64 msr_index, int *type, int *index) +{ + if ( (msr_index >= MSR_IA32_PERFCTR0) && + (msr_index < (MSR_IA32_PERFCTR0 + NUM_COUNTERS)) ) + { + *type = MSR_TYPE_ARCH_COUNTER; + *index = msr_index - MSR_IA32_PERFCTR0; + return 1; + } + if ( (msr_index >= MSR_P6_EVNTSEL0) && + (msr_index < (MSR_P6_EVNTSEL0 + NUM_CONTROLS)) ) + { + *type = MSR_TYPE_ARCH_CTRL; + *index = msr_index - MSR_P6_EVNTSEL0; + return 1; + } + + return 0; +} + +static int ppro_allocate_msr(struct vcpu *v) +{ + struct vpmu_struct *vpmu = vcpu_vpmu(v); + struct arch_msr_pair *msr_content; + + msr_content = xmalloc_bytes( sizeof(struct arch_msr_pair) * NUM_COUNTERS ); + if ( !msr_content ) + goto out; + memset(msr_content, 0, sizeof(struct arch_msr_pair) * NUM_COUNTERS); + vpmu->context = (void *)msr_content; + vpmu->flags = 0; + vpmu->flags |= PASSIVE_DOMAIN_ALLOCATED; + return 1; +out: + gdprintk(XENLOG_WARNING, "Insufficient memory for oprofile, oprofile is " + "unavailable on domain %d vcpu %d.\n", + v->vcpu_id, v->domain->domain_id); + return 0; +} + +static void ppro_free_msr(struct vcpu *v) +{ + struct vpmu_struct *vpmu = vcpu_vpmu(v); + + 
xfree(vpmu->context); + vpmu->flags &= ~PASSIVE_DOMAIN_ALLOCATED; +} + +static void ppro_load_msr(struct vcpu *v, int type, int index, u64 *msr_content) +{ + struct arch_msr_pair *msrs = vcpu_vpmu(v)->context; + switch ( type ) + { + case MSR_TYPE_ARCH_COUNTER: + *msr_content = msrs[index].counter; + break; + case MSR_TYPE_ARCH_CTRL: + *msr_content = msrs[index].control; + break; + } +} + +static void ppro_save_msr(struct vcpu *v, int type, int index, u64 msr_content) +{ + struct arch_msr_pair *msrs = vcpu_vpmu(v)->context; + + switch ( type ) + { + case MSR_TYPE_ARCH_COUNTER: + msrs[index].counter = msr_content; + break; + case MSR_TYPE_ARCH_CTRL: + msrs[index].control = msr_content; + break; + } +} struct op_x86_model_spec const op_ppro_spec = { .num_counters = NUM_COUNTERS, @@ -167,5 +259,10 @@ struct op_x86_model_spec const op_ppro_s .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, .start = &ppro_start, - .stop = &ppro_stop + .stop = &ppro_stop, + .is_arch_pmu_msr = &ppro_is_arch_pmu_msr, + .allocated_msr = &ppro_allocate_msr, + .free_msr = &ppro_free_msr, + .load_msr = &ppro_load_msr, + .save_msr = &ppro_save_msr }; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/oprofile/op_x86_model.h --- a/xen/arch/x86/oprofile/op_x86_model.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/oprofile/op_x86_model.h Tue Nov 04 12:43:19 2008 +0900 @@ -41,6 +41,11 @@ struct op_x86_model_spec { struct cpu_user_regs * const regs); void (*start)(struct op_msrs const * const msrs); void (*stop)(struct op_msrs const * const msrs); + int (*is_arch_pmu_msr)(u64 msr_index, int *type, int *index); + int (*allocated_msr)(struct vcpu *v); + void (*free_msr)(struct vcpu *v); + void (*load_msr)(struct vcpu * const v, int type, int index, u64 *msr_content); + void (*save_msr)(struct vcpu * const v, int type, int index, u64 msr_content); }; extern struct op_x86_model_spec const op_ppro_spec; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/setup.c Tue Nov 04 12:43:19 2008 +0900 @@ -969,6 +969,7 @@ void __init __start_xen(unsigned long mb serial_init_postirq(); BUG_ON(!local_irq_is_enabled()); + spin_debug_enable(); for_each_present_cpu ( i ) { diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/smpboot.c --- a/xen/arch/x86/smpboot.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/smpboot.c Tue Nov 04 12:43:19 2008 +0900 @@ -101,7 +101,7 @@ static int __devinitdata tsc_sync_disabl static int __devinitdata tsc_sync_disabled; /* Per CPU bogomips and other parameters */ -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +struct cpuinfo_x86 cpu_data[NR_CPUS]; EXPORT_SYMBOL(cpu_data); u32 x86_cpu_to_apicid[NR_CPUS] __read_mostly = @@ -112,7 +112,7 @@ static void map_cpu_to_logical_apicid(vo /* State of each CPU. */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; -static void *stack_base[NR_CPUS] __cacheline_aligned; +static void *stack_base[NR_CPUS]; static DEFINE_SPINLOCK(cpu_add_remove_lock); /* @@ -805,14 +805,6 @@ static inline int alloc_cpu_id(void) return cpu; } -static struct vcpu *prepare_idle_vcpu(unsigned int cpu) -{ - if (idle_vcpu[cpu]) - return idle_vcpu[cpu]; - - return alloc_idle_vcpu(cpu); -} - static void *prepare_idle_stack(unsigned int cpu) { if (!stack_base[cpu]) @@ -849,7 +841,7 @@ static int __devinit do_boot_cpu(int api booting_cpu = cpu; - v = prepare_idle_vcpu(cpu); + v = alloc_idle_vcpu(cpu); BUG_ON(v == NULL); /* start_eip had better be page-aligned! 
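The oprofile changes hang five new hooks off op_x86_model_spec so nmi_int.c can stay model-agnostic: classify the MSR, lazily allocate per-vcpu shadow state, then load or save the shadowed value. A simplified standalone model of that dispatch (stand-in types, not the Xen structures):

    #include <stdint.h>

    struct pmu_model {
        int  (*is_pmu_msr)(uint64_t msr, int *type, int *index);
        int  (*alloc)(void *vcpu);
        void (*load)(void *vcpu, int type, int index, uint64_t *val);
    };

    /* returns 1 if the MSR read was handled, 0 to fall through */
    static int passive_rdmsr(const struct pmu_model *m, void *vcpu,
                             int allocated, uint64_t msr, uint64_t *val)
    {
        int type, index;
        if ( !m->is_pmu_msr || !m->is_pmu_msr(msr, &type, &index) )
            return 0;
        if ( !allocated && !m->alloc(vcpu) )
            return 0;
        m->load(vcpu, type, index, val);
        return 1;
    }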
*/ diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/time.c --- a/xen/arch/x86/time.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/time.c Tue Nov 04 12:43:19 2008 +0900 @@ -1063,8 +1063,6 @@ void init_percpu_time(void) /* Late init function (after all CPUs are booted). */ int __init init_xen_time(void) { - local_irq_disable(); - /* check if TSC is invariant during deep C state this is a new feature introduced by Nehalem*/ if ( cpuid_edx(0x80000007) & (1u<<8) ) @@ -1078,8 +1076,6 @@ int __init init_xen_time(void) init_platform_timer(); do_settime(get_cmos_time(), 0, NOW()); - - local_irq_enable(); return 0; } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/traps.c Tue Nov 04 12:43:19 2008 +0900 @@ -1030,7 +1030,7 @@ static int handle_gdt_ldt_mapping_fault( #endif static int __spurious_page_fault( - unsigned long addr, struct cpu_user_regs *regs) + unsigned long addr, unsigned int error_code) { unsigned long mfn, cr3 = read_cr3(); #if CONFIG_PAGING_LEVELS >= 4 @@ -1052,17 +1052,17 @@ static int __spurious_page_fault( return 0; /* Reserved bit violations are never spurious faults. */ - if ( regs->error_code & PFEC_reserved_bit ) + if ( error_code & PFEC_reserved_bit ) return 0; required_flags = _PAGE_PRESENT; - if ( regs->error_code & PFEC_write_access ) + if ( error_code & PFEC_write_access ) required_flags |= _PAGE_RW; - if ( regs->error_code & PFEC_user_mode ) + if ( error_code & PFEC_user_mode ) required_flags |= _PAGE_USER; disallowed_flags = 0; - if ( regs->error_code & PFEC_insn_fetch ) + if ( error_code & PFEC_insn_fetch ) disallowed_flags |= _PAGE_NX; mfn = cr3 >> PAGE_SHIFT; @@ -1120,7 +1120,7 @@ static int __spurious_page_fault( dprintk(XENLOG_WARNING, "Spurious fault in domain %u:%u " "at addr %lx, e/c %04x\n", current->domain->domain_id, current->vcpu_id, - addr, regs->error_code); + addr, error_code); #if CONFIG_PAGING_LEVELS >= 4 dprintk(XENLOG_WARNING, " l4e = %"PRIpte"\n", l4e_get_intpte(l4e)); #endif @@ -1129,14 +1129,11 @@ static int __spurious_page_fault( #endif dprintk(XENLOG_WARNING, " l2e = %"PRIpte"\n", l2e_get_intpte(l2e)); dprintk(XENLOG_WARNING, " l1e = %"PRIpte"\n", l1e_get_intpte(l1e)); -#ifndef NDEBUG - show_registers(regs); -#endif return 1; } static int spurious_page_fault( - unsigned long addr, struct cpu_user_regs *regs) + unsigned long addr, unsigned int error_code) { unsigned long flags; int is_spurious; @@ -1146,7 +1143,7 @@ static int spurious_page_fault( * page tables from becoming invalid under our feet during the walk. */ local_irq_save(flags); - is_spurious = __spurious_page_fault(addr, regs); + is_spurious = __spurious_page_fault(addr, error_code); local_irq_restore(flags); return is_spurious; @@ -1208,8 +1205,12 @@ asmlinkage void do_page_fault(struct cpu asmlinkage void do_page_fault(struct cpu_user_regs *regs) { unsigned long addr, fixup; + unsigned int error_code; addr = read_cr2(); + + /* fixup_page_fault() might change regs->error_code, so cache it here. 
*/ + error_code = regs->error_code; DEBUGGER_trap_entry(TRAP_page_fault, regs); @@ -1220,7 +1221,7 @@ asmlinkage void do_page_fault(struct cpu if ( unlikely(!guest_mode(regs)) ) { - if ( spurious_page_fault(addr, regs) ) + if ( spurious_page_fault(addr, error_code) ) return; if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) @@ -1239,11 +1240,11 @@ asmlinkage void do_page_fault(struct cpu panic("FATAL PAGE FAULT\n" "[error_code=%04x]\n" "Faulting linear address: %p\n", - regs->error_code, _p(addr)); + error_code, _p(addr)); } if ( unlikely(current->domain->arch.suppress_spurious_page_faults - && spurious_page_fault(addr, regs)) ) + && spurious_page_fault(addr, error_code)) ) return; propagate_page_fault(addr, regs->error_code); diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/x86_32/domain_page.c --- a/xen/arch/x86/x86_32/domain_page.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/x86_32/domain_page.c Tue Nov 04 12:43:19 2008 +0900 @@ -43,7 +43,7 @@ void *map_domain_page(unsigned long mfn) void *map_domain_page(unsigned long mfn) { unsigned long va; - unsigned int idx, i; + unsigned int idx, i, flags; struct vcpu *v; struct mapcache_domain *dcache; struct mapcache_vcpu *vcache; @@ -69,7 +69,7 @@ void *map_domain_page(unsigned long mfn) goto out; } - spin_lock(&dcache->lock); + spin_lock_irqsave(&dcache->lock, flags); /* Has some other CPU caused a wrap? We must flush if so. */ if ( unlikely(dcache->epoch != vcache->shadow_epoch) ) @@ -105,7 +105,7 @@ void *map_domain_page(unsigned long mfn) set_bit(idx, dcache->inuse); dcache->cursor = idx + 1; - spin_unlock(&dcache->lock); + spin_unlock_irqrestore(&dcache->lock, flags); l1e_write(&dcache->l1tab[idx], l1e_from_pfn(mfn, __PAGE_HYPERVISOR)); @@ -114,7 +114,7 @@ void *map_domain_page(unsigned long mfn) return (void *)va; } -void unmap_domain_page(void *va) +void unmap_domain_page(const void *va) { unsigned int idx; struct vcpu *v; @@ -241,7 +241,7 @@ void *map_domain_page_global(unsigned lo return (void *)va; } -void unmap_domain_page_global(void *va) +void unmap_domain_page_global(const void *va) { unsigned long __va = (unsigned long)va; l2_pgentry_t *pl2e; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/x86_64/compat/mm.c --- a/xen/arch/x86/x86_64/compat/mm.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/x86_64/compat/mm.c Tue Nov 04 12:43:19 2008 +0900 @@ -231,6 +231,8 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm case MMUEXT_PIN_L4_TABLE: case MMUEXT_UNPIN_TABLE: case MMUEXT_NEW_BASEPTR: + case MMUEXT_CLEAR_PAGE: + case MMUEXT_COPY_PAGE: arg1 = XLAT_mmuext_op_arg1_mfn; break; default: @@ -257,6 +259,9 @@ int compat_mmuext_op(XEN_GUEST_HANDLE(mm case MMUEXT_TLB_FLUSH_MULTI: case MMUEXT_INVLPG_MULTI: arg2 = XLAT_mmuext_op_arg2_vcpumask; + break; + case MMUEXT_COPY_PAGE: + arg2 = XLAT_mmuext_op_arg2_src_mfn; break; default: arg2 = -1; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/arch/x86/x86_64/cpufreq.c --- a/xen/arch/x86/x86_64/cpufreq.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/arch/x86/x86_64/cpufreq.c Tue Nov 04 12:43:19 2008 +0900 @@ -56,34 +56,13 @@ compat_set_px_pminfo(uint32_t cpu, struc return -EFAULT; #define XLAT_processor_performance_HNDL_states(_d_, _s_) do { \ - xen_processor_px_t *xen_states = NULL; \ -\ - if ( likely((_s_)->state_count > 0) ) \ - { \ - XEN_GUEST_HANDLE(compat_processor_px_t) states; \ - compat_processor_px_t state; \ - int i; \ -\ - xen_states = xlat_malloc_array(xlat_page_current, \ - xen_processor_px_t, (_s_)->state_count); \ - if ( unlikely(xen_states == NULL) ) \ - return -EFAULT; \ -\ - 
if ( unlikely(!compat_handle_okay((_s_)->states, \ - (_s_)->state_count)) ) \ - return -EFAULT; \ - guest_from_compat_handle(states, (_s_)->states); \ -\ - for ( i = 0; i < _s_->state_count; i++ ) \ - { \ - if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) ) \ - return -EFAULT; \ - XLAT_processor_px(&xen_states[i], &state); \ - } \ - } \ -\ - set_xen_guest_handle((_d_)->states, xen_states); \ + XEN_GUEST_HANDLE(compat_processor_px_t) states; \ + if ( unlikely(!compat_handle_okay((_s_)->states, (_s_)->state_count)) ) \ + return -EFAULT; \ + guest_from_compat_handle(states, (_s_)->states); \ + (_d_)->states = guest_handle_cast(states, xen_processor_px_t); \ } while (0) + XLAT_processor_performance(xen_perf, perf); #undef XLAT_processor_performance_HNDL_states diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/event_channel.c --- a/xen/common/event_channel.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/common/event_channel.c Tue Nov 04 12:43:19 2008 +0900 @@ -386,7 +386,7 @@ static long __evtchn_close(struct domain if ( v->virq_to_evtchn[chn1->u.virq] != port1 ) continue; v->virq_to_evtchn[chn1->u.virq] = 0; - spin_barrier(&v->virq_lock); + spin_barrier_irq(&v->virq_lock); } break; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/kernel.c --- a/xen/common/kernel.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/common/kernel.c Tue Nov 04 12:43:19 2008 +0900 @@ -221,7 +221,8 @@ DO(xen_version)(int cmd, XEN_GUEST_HANDL fi.submap |= 1U << XENFEAT_supervisor_mode_kernel; #ifdef CONFIG_X86 if ( !is_hvm_vcpu(current) ) - fi.submap |= 1U << XENFEAT_mmu_pt_update_preserve_ad; + fi.submap |= (1U << XENFEAT_mmu_pt_update_preserve_ad) | + (1U << XENFEAT_highmem_assist); #endif break; default: diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/keyhandler.c --- a/xen/common/keyhandler.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/common/keyhandler.c Tue Nov 04 12:43:19 2008 +0900 @@ -183,9 +183,9 @@ static void dump_domains(unsigned char k { printk("General information for domain %u:\n", d->domain_id); cpuset_print(tmpstr, sizeof(tmpstr), d->domain_dirty_cpumask); - printk(" refcnt=%d nr_pages=%d xenheap_pages=%d " + printk(" refcnt=%d dying=%d nr_pages=%d xenheap_pages=%d " "dirty_cpus=%s\n", - atomic_read(&d->refcnt), + atomic_read(&d->refcnt), d->is_dying, d->tot_pages, d->xenheap_pages, tmpstr); printk(" handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-" "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n", diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/spinlock.c --- a/xen/common/spinlock.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/common/spinlock.c Tue Nov 04 12:43:19 2008 +0900 @@ -1,15 +1,56 @@ #include <xen/config.h> +#include <xen/irq.h> #include <xen/smp.h> #include <xen/spinlock.h> +#ifndef NDEBUG + +static atomic_t spin_debug __read_mostly = ATOMIC_INIT(0); + +static void check_lock(struct lock_debug *debug) +{ + int irq_safe = !local_irq_is_enabled(); + + if ( unlikely(atomic_read(&spin_debug) <= 0) ) + return; + + /* A few places take liberties with this. 
*/ + /* BUG_ON(in_irq() && !irq_safe); */ + + if ( unlikely(debug->irq_safe != irq_safe) ) + { + int seen = cmpxchg(&debug->irq_safe, -1, irq_safe); + BUG_ON(seen == !irq_safe); + } +} + +void spin_debug_enable(void) +{ + atomic_inc(&spin_debug); +} + +void spin_debug_disable(void) +{ + atomic_dec(&spin_debug); +} + +#else /* defined(NDEBUG) */ + +#define check_lock(l) ((void)0) + +#endif + void _spin_lock(spinlock_t *lock) { + check_lock(&lock->debug); _raw_spin_lock(&lock->raw); } void _spin_lock_irq(spinlock_t *lock) { - local_irq_disable(); + ASSERT(local_irq_is_enabled()); + local_irq_disable(); + check_lock(&lock->debug); _raw_spin_lock(&lock->raw); } @@ -17,6 +58,7 @@ unsigned long _spin_lock_irqsave(spinloc { unsigned long flags; local_irq_save(flags); + check_lock(&lock->debug); _raw_spin_lock(&lock->raw); return flags; } @@ -40,26 +82,39 @@ void _spin_unlock_irqrestore(spinlock_t int _spin_is_locked(spinlock_t *lock) { + check_lock(&lock->debug); return _raw_spin_is_locked(&lock->raw); } int _spin_trylock(spinlock_t *lock) { + check_lock(&lock->debug); return _raw_spin_trylock(&lock->raw); } void _spin_barrier(spinlock_t *lock) { + check_lock(&lock->debug); do { mb(); } while ( _raw_spin_is_locked(&lock->raw) ); mb(); } +void _spin_barrier_irq(spinlock_t *lock) +{ + unsigned long flags; + local_irq_save(flags); + _spin_barrier(lock); + local_irq_restore(flags); +} + void _spin_lock_recursive(spinlock_t *lock) { int cpu = smp_processor_id(); /* Don't allow overflow of recurse_cpu field. */ BUILD_BUG_ON(NR_CPUS > 0xfffu); + + check_lock(&lock->debug); if ( likely(lock->recurse_cpu != cpu) ) { @@ -83,12 +138,15 @@ void _spin_unlock_recursive(spinlock_t * void _read_lock(rwlock_t *lock) { + check_lock(&lock->debug); _raw_read_lock(&lock->raw); } void _read_lock_irq(rwlock_t *lock) { - local_irq_disable(); + ASSERT(local_irq_is_enabled()); + local_irq_disable(); + check_lock(&lock->debug); _raw_read_lock(&lock->raw); } @@ -96,6 +154,7 @@ unsigned long _read_lock_irqsave(rwlock_ { unsigned long flags; local_irq_save(flags); + check_lock(&lock->debug); _raw_read_lock(&lock->raw); return flags; } @@ -119,12 +178,15 @@ void _read_unlock_irqrestore(rwlock_t *l void _write_lock(rwlock_t *lock) { + check_lock(&lock->debug); _raw_write_lock(&lock->raw); } void _write_lock_irq(rwlock_t *lock) { - local_irq_disable(); + ASSERT(local_irq_is_enabled()); + local_irq_disable(); + check_lock(&lock->debug); _raw_write_lock(&lock->raw); } @@ -132,6 +194,7 @@ unsigned long _write_lock_irqsave(rwlock { unsigned long flags; local_irq_save(flags); + check_lock(&lock->debug); _raw_write_lock(&lock->raw); return flags; } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/timer.c --- a/xen/common/timer.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/common/timer.c Tue Nov 04 12:43:19 2008 +0900 @@ -25,10 +25,12 @@ * We pull handlers off the timer list this far in future, * rather than reprogramming the time hardware. */ -#define TIMER_SLOP (50*1000) /* ns */ +static unsigned int timer_slop __read_mostly = 50000; /* 50 us */ +integer_param("timer_slop", timer_slop); struct timers { spinlock_t lock; + bool_t overflow; struct timer **heap; struct timer *list; struct timer *running; @@ -200,6 +202,7 @@ static int add_entry(struct timers *time return rc; /* Fall back to adding to the slower linked list. 
*/ + timers->overflow = 1; t->status = TIMER_STATUS_in_list; return add_to_list(&timers->list, t); } @@ -258,6 +261,7 @@ void set_timer(struct timer *timer, s_ti __stop_timer(timer); timer->expires = expires; + timer->expires_end = expires + timer_slop; if ( likely(timer->status != TIMER_STATUS_killed) ) __add_timer(timer); @@ -344,19 +348,30 @@ void kill_timer(struct timer *timer) } +static void execute_timer(struct timers *ts, struct timer *t) +{ + void (*fn)(void *) = t->function; + void *data = t->data; + + ts->running = t; + spin_unlock_irq(&ts->lock); + (*fn)(data); + spin_lock_irq(&ts->lock); + ts->running = NULL; +} + + static void timer_softirq_action(void) { struct timer *t, **heap, *next; struct timers *ts; - s_time_t now, deadline; - void (*fn)(void *); - void *data; + s_time_t now; ts = &this_cpu(timers); heap = ts->heap; - /* If we are using overflow linked list, try to allocate a larger heap. */ - if ( unlikely(ts->list != NULL) ) + /* If we overflowed the heap, try to allocate a larger heap. */ + if ( unlikely(ts->overflow) ) { /* old_limit == (2^n)-1; new_limit == (2^(n+4))-1 */ int old_limit = GET_HEAP_LIMIT(heap); @@ -377,7 +392,26 @@ static void timer_softirq_action(void) spin_lock_irq(&ts->lock); - /* Try to move timers from overflow linked list to more efficient heap. */ + now = NOW(); + + /* Execute ready heap timers. */ + while ( (GET_HEAP_SIZE(heap) != 0) && + ((t = heap[1])->expires_end < now) ) + { + remove_from_heap(heap, t); + t->status = TIMER_STATUS_inactive; + execute_timer(ts, t); + } + + /* Execute ready list timers. */ + while ( ((t = ts->list) != NULL) && (t->expires_end < now) ) + { + ts->list = t->list_next; + t->status = TIMER_STATUS_inactive; + execute_timer(ts, t); + } + + /* Try to move timers from linked list to more efficient heap. */ next = ts->list; ts->list = NULL; while ( unlikely((t = next) != NULL) ) @@ -387,51 +421,44 @@ static void timer_softirq_action(void) add_entry(ts, t); } - now = NOW(); - - while ( (GET_HEAP_SIZE(heap) != 0) && - ((t = heap[1])->expires < (now + TIMER_SLOP)) ) - { - remove_entry(ts, t); - - ts->running = t; - - fn = t->function; - data = t->data; - - spin_unlock_irq(&ts->lock); - (*fn)(data); - spin_lock_irq(&ts->lock); - } - - deadline = GET_HEAP_SIZE(heap) ? heap[1]->expires : 0; - - while ( unlikely((t = ts->list) != NULL) ) - { - if ( t->expires >= (now + TIMER_SLOP) ) + ts->overflow = (ts->list != NULL); + if ( unlikely(ts->overflow) ) + { + /* Find earliest deadline at head of list or top of heap. */ + this_cpu(timer_deadline) = ts->list->expires; + if ( (GET_HEAP_SIZE(heap) != 0) && + ((t = heap[1])->expires < this_cpu(timer_deadline)) ) + this_cpu(timer_deadline) = t->expires; + } + else + { + /* + * Find the earliest deadline that encompasses largest number of timers + * on the heap. To do this we take timers from the heap while their + * valid deadline ranges continue to intersect. 
+ */ + s_time_t start = 0, end = STIME_MAX; + struct timer **list_tail = &ts->list; + + while ( (GET_HEAP_SIZE(heap) != 0) && + ((t = heap[1])->expires <= end) ) { - if ( (deadline == 0) || (deadline > t->expires) ) - deadline = t->expires; - break; + remove_entry(ts, t); + + t->status = TIMER_STATUS_in_list; + t->list_next = NULL; + *list_tail = t; + list_tail = &t->list_next; + + start = t->expires; + if ( end > t->expires_end ) + end = t->expires_end; } - ts->list = t->list_next; - t->status = TIMER_STATUS_inactive; - - ts->running = t; - - fn = t->function; - data = t->data; - - spin_unlock_irq(&ts->lock); - (*fn)(data); - spin_lock_irq(&ts->lock); - } - - ts->running = NULL; - - this_cpu(timer_deadline) = deadline; - if ( !reprogram_timer(deadline) ) + this_cpu(timer_deadline) = start; + } + + if ( !reprogram_timer(this_cpu(timer_deadline)) ) raise_softirq(TIMER_SOFTIRQ); spin_unlock_irq(&ts->lock); diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/xenoprof.c --- a/xen/common/xenoprof.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/common/xenoprof.c Tue Nov 04 12:43:19 2008 +0900 @@ -85,7 +85,7 @@ int is_active(struct domain *d) return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_ACTIVE)); } -static int is_passive(struct domain *d) +int is_passive(struct domain *d) { struct xenoprof *x = d->xenoprof; return ((x != NULL) && (x->domain_type == XENOPROF_DOMAIN_PASSIVE)); diff -r 10f0e1bb8e5e -r e75cb35c798b xen/common/xmalloc.c --- a/xen/common/xmalloc.c Tue Nov 04 12:07:22 2008 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,286 +0,0 @@ -/****************************************************************************** - * Simple allocator for Xen. If larger than a page, simply use the - * page-order allocator. - * - * Copyright (C) 2005 Rusty Russell IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -/* - * TODO (Keir, 17/2/05): - * 1. Use space in page_info to avoid xmalloc_hdr in allocated blocks. - * 2. page_info points into free list to make xfree() O(1) complexity. - * 3. Perhaps make this a sub-page buddy allocator? xmalloc() == O(1). - * (Disadvantage is potentially greater internal fragmentation). - */ - -#include <xen/config.h> -#include <xen/mm.h> -#include <xen/spinlock.h> -#include <xen/timer.h> -#include <xen/cache.h> -#include <xen/prefetch.h> -#include <xen/irq.h> -#include <xen/smp.h> - -/* - * XMALLOC_DEBUG: - * 1. Free data blocks are filled with poison bytes. - * 2. In-use data blocks have guard bytes at the start and end. - */ -#ifndef NDEBUG -#define XMALLOC_DEBUG 1 -#endif - -static LIST_HEAD(freelist); -static DEFINE_SPINLOCK(freelist_lock); - -struct xmalloc_hdr -{ - /* Size is total including this header. 
*/ - size_t size; - struct list_head freelist; -} __cacheline_aligned; - -static void add_to_freelist(struct xmalloc_hdr *hdr) -{ -#if XMALLOC_DEBUG - memset(hdr + 1, 0xa5, hdr->size - sizeof(*hdr)); -#endif - list_add(&hdr->freelist, &freelist); -} - -static void del_from_freelist(struct xmalloc_hdr *hdr) -{ -#if XMALLOC_DEBUG - size_t i; - unsigned char *data = (unsigned char *)(hdr + 1); - for ( i = 0; i < (hdr->size - sizeof(*hdr)); i++ ) - BUG_ON(data[i] != 0xa5); - BUG_ON((hdr->size <= 0) || (hdr->size >= PAGE_SIZE)); -#endif - list_del(&hdr->freelist); -} - -static void *data_from_header(struct xmalloc_hdr *hdr) -{ -#if XMALLOC_DEBUG - /* Data block contain SMP_CACHE_BYTES of guard canary. */ - unsigned char *data = (unsigned char *)(hdr + 1); - memset(data, 0x5a, SMP_CACHE_BYTES); - memset(data + hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES, - 0x5a, SMP_CACHE_BYTES); - return data + SMP_CACHE_BYTES; -#else - return hdr + 1; -#endif -} - -static struct xmalloc_hdr *header_from_data(void *p) -{ -#if XMALLOC_DEBUG - unsigned char *data = (unsigned char *)p - SMP_CACHE_BYTES; - struct xmalloc_hdr *hdr = (struct xmalloc_hdr *)data - 1; - size_t i; - - /* Check header guard canary. */ - for ( i = 0; i < SMP_CACHE_BYTES; i++ ) - BUG_ON(data[i] != 0x5a); - - /* Check footer guard canary. */ - data += hdr->size - sizeof(*hdr) - SMP_CACHE_BYTES; - for ( i = 0; i < SMP_CACHE_BYTES; i++ ) - BUG_ON(data[i] != 0x5a); - - return hdr; -#else - return (struct xmalloc_hdr *)p - 1; -#endif -} - -static void maybe_split(struct xmalloc_hdr *hdr, size_t size, size_t block) -{ - struct xmalloc_hdr *extra; - size_t leftover = block - size; - - /* If enough is left to make a block, put it on free list. */ - if ( leftover >= (2 * sizeof(struct xmalloc_hdr)) ) - { - extra = (struct xmalloc_hdr *)((unsigned long)hdr + size); - extra->size = leftover; - add_to_freelist(extra); - } - else - { - size = block; - } - - hdr->size = size; - /* Debugging aid. */ - hdr->freelist.next = hdr->freelist.prev = NULL; -} - -static void *xmalloc_new_page(size_t size) -{ - struct xmalloc_hdr *hdr; - - hdr = alloc_xenheap_page(); - if ( hdr == NULL ) - return NULL; - - spin_lock(&freelist_lock); - maybe_split(hdr, size, PAGE_SIZE); - spin_unlock(&freelist_lock); - - return data_from_header(hdr); -} - -/* Big object? Just use the page allocator. */ -static void *xmalloc_whole_pages(size_t size) -{ - struct xmalloc_hdr *hdr; - unsigned int pageorder = get_order_from_bytes(size); - - hdr = alloc_xenheap_pages(pageorder); - if ( hdr == NULL ) - return NULL; - - hdr->size = (1 << (pageorder + PAGE_SHIFT)); - /* Debugging aid. */ - hdr->freelist.next = hdr->freelist.prev = NULL; - - return data_from_header(hdr); -} - -/* Return size, increased to alignment with align. */ -static inline size_t align_up(size_t size, size_t align) -{ - return (size + align - 1) & ~(align - 1); -} - -void *_xmalloc(size_t size, size_t align) -{ - struct xmalloc_hdr *i; - - ASSERT(!in_irq()); - - /* We currently always return cacheline aligned. */ - BUG_ON(align > SMP_CACHE_BYTES); - -#if XMALLOC_DEBUG - /* Add room for canaries at start and end of data block. */ - size += 2 * SMP_CACHE_BYTES; -#endif - - /* Add room for header, pad to align next header. */ - size += sizeof(struct xmalloc_hdr); - size = align_up(size, __alignof__(struct xmalloc_hdr)); - - /* For big allocs, give them whole pages. */ - if ( size >= PAGE_SIZE ) - return xmalloc_whole_pages(size); - - /* Search free list. 
*/ - spin_lock(&freelist_lock); - list_for_each_entry( i, &freelist, freelist ) - { - if ( i->size < size ) - continue; - del_from_freelist(i); - maybe_split(i, size, i->size); - spin_unlock(&freelist_lock); - return data_from_header(i); - } - spin_unlock(&freelist_lock); - - /* Alloc a new page and return from that. */ - return xmalloc_new_page(size); -} - -void xfree(void *p) -{ - struct xmalloc_hdr *i, *tmp, *hdr; - - ASSERT(!in_irq()); - - if ( p == NULL ) - return; - - hdr = header_from_data(p); - - /* We know hdr will be on same page. */ - BUG_ON(((long)p & PAGE_MASK) != ((long)hdr & PAGE_MASK)); - - /* Not previously freed. */ - BUG_ON(hdr->freelist.next || hdr->freelist.prev); - - /* Big allocs free directly. */ - if ( hdr->size >= PAGE_SIZE ) - { - free_xenheap_pages(hdr, get_order_from_bytes(hdr->size)); - return; - } - - /* Merge with other free block, or put in list. */ - spin_lock(&freelist_lock); - list_for_each_entry_safe( i, tmp, &freelist, freelist ) - { - unsigned long _i = (unsigned long)i; - unsigned long _hdr = (unsigned long)hdr; - - /* Do not merge across page boundaries. */ - if ( ((_i ^ _hdr) & PAGE_MASK) != 0 ) - continue; - - /* We follow this block? Swallow it. */ - if ( (_i + i->size) == _hdr ) - { - del_from_freelist(i); - i->size += hdr->size; - hdr = i; - } - - /* We precede this block? Swallow it. */ - if ( (_hdr + hdr->size) == _i ) - { - del_from_freelist(i); - hdr->size += i->size; - } - } - - /* Did we merge an entire page? */ - if ( hdr->size == PAGE_SIZE ) - { - BUG_ON((((unsigned long)hdr) & (PAGE_SIZE-1)) != 0); - free_xenheap_pages(hdr, 0); - } - else - { - add_to_freelist(hdr); - } - - spin_unlock(&freelist_lock); -} - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff -r 10f0e1bb8e5e -r e75cb35c798b xen/drivers/char/serial.c --- a/xen/drivers/char/serial.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/drivers/char/serial.c Tue Nov 04 12:43:19 2008 +0900 @@ -74,7 +74,7 @@ void serial_tx_interrupt(struct serial_p while ( !spin_trylock(&port->tx_lock) ) { if ( !port->driver->tx_empty(port) ) - return; + goto out; cpu_relax(); } @@ -89,7 +89,10 @@ void serial_tx_interrupt(struct serial_p } } - spin_unlock_irqrestore(&port->tx_lock, flags); + spin_unlock(&port->tx_lock); + + out: + local_irq_restore(flags); } static void __serial_putc(struct serial_port *port, char c) diff -r 10f0e1bb8e5e -r e75cb35c798b xen/drivers/cpufreq/cpufreq.c --- a/xen/drivers/cpufreq/cpufreq.c Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/drivers/cpufreq/cpufreq.c Tue Nov 04 12:43:19 2008 +0900 @@ -31,6 +31,7 @@ #include <xen/errno.h> #include <xen/delay.h> #include <xen/cpumask.h> +#include <xen/list.h> #include <xen/sched.h> #include <xen/timer.h> #include <xen/xmalloc.h> @@ -44,8 +45,12 @@ #include <acpi/acpi.h> #include <acpi/cpufreq/cpufreq.h> -/* TODO: change to link list later as domain number may be sparse */ -static cpumask_t cpufreq_dom_map[NR_CPUS]; +struct cpufreq_dom { + unsigned int dom; + cpumask_t map; + struct list_head node; +}; +static LIST_HEAD(cpufreq_dom_list_head); int cpufreq_limit_change(unsigned int cpu) { @@ -72,48 +77,80 @@ int cpufreq_add_cpu(unsigned int cpu) { int ret = 0; unsigned int firstcpu; - unsigned int dom; + unsigned int dom, domexist = 0; unsigned int j; + struct list_head *pos; + struct cpufreq_dom *cpufreq_dom = NULL; struct cpufreq_policy new_policy; struct cpufreq_policy *policy; struct processor_performance *perf = &processor_pminfo[cpu]->perf; /* 
to protect the case when Px was not controlled by xen */ - if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT)) + if (!processor_pminfo[cpu] || + !(perf->init & XEN_PX_INIT) || + !cpu_online(cpu)) + return -EINVAL; + + if (cpufreq_cpu_policy[cpu]) return 0; - - if (!cpu_online(cpu) || cpufreq_cpu_policy[cpu]) - return -EINVAL; ret = cpufreq_statistic_init(cpu); if (ret) return ret; dom = perf->domain_info.domain; - if (cpus_weight(cpufreq_dom_map[dom])) { + + list_for_each(pos, &cpufreq_dom_list_head) { + cpufreq_dom = list_entry(pos, struct cpufreq_dom, node); + if (dom == cpufreq_dom->dom) { + domexist = 1; + break; + } + } + + if (domexist) { /* share policy with the first cpu since on same boat */ - firstcpu = first_cpu(cpufreq_dom_map[dom]); + firstcpu = first_cpu(cpufreq_dom->map); policy = cpufreq_cpu_policy[firstcpu]; cpufreq_cpu_policy[cpu] = policy; - cpu_set(cpu, cpufreq_dom_map[dom]); + cpu_set(cpu, cpufreq_dom->map); cpu_set(cpu, policy->cpus); + + /* domain coordination sanity check */ + if ((perf->domain_info.coord_type != + processor_pminfo[firstcpu]->perf.domain_info.coord_type) || + (perf->domain_info.num_processors != + processor_pminfo[firstcpu]->perf.domain_info.num_processors)) { + ret = -EINVAL; + goto err2; + } printk(KERN_EMERG"adding CPU %u\n", cpu); } else { + cpufreq_dom = xmalloc(struct cpufreq_dom); + if (!cpufreq_dom) { + cpufreq_statistic_exit(cpu); + return -ENOMEM; + } + memset(cpufreq_dom, 0, sizeof(struct cpufreq_dom)); + cpufreq_dom->dom = dom; + cpu_set(cpu, cpufreq_dom->map); + list_add(&cpufreq_dom->node, &cpufreq_dom_list_head); + /* for the first cpu, setup policy and do init work */ policy = xmalloc(struct cpufreq_policy); if (!policy) { + list_del(&cpufreq_dom->node); + xfree(cpufreq_dom); cpufreq_statistic_exit(cpu); return -ENOMEM; } memset(policy, 0, sizeof(struct cpufreq_policy)); - + policy->cpu = cpu; + cpu_set(cpu, policy->cpus); cpufreq_cpu_policy[cpu] = policy; - cpu_set(cpu, cpufreq_dom_map[dom]); - cpu_set(cpu, policy->cpus); - - policy->cpu = cpu; + ret = cpufreq_driver->init(policy); if (ret) goto err1; @@ -124,7 +161,7 @@ int cpufreq_add_cpu(unsigned int cpu) * After get full cpumap of the coordination domain, * we can safely start gov here. 
*/ - if (cpus_weight(cpufreq_dom_map[dom]) == + if (cpus_weight(cpufreq_dom->map) == perf->domain_info.num_processors) { memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); policy->governor = NULL; @@ -138,51 +175,68 @@ err2: err2: cpufreq_driver->exit(policy); err1: - for_each_cpu_mask(j, cpufreq_dom_map[dom]) { + for_each_cpu_mask(j, cpufreq_dom->map) { cpufreq_cpu_policy[j] = NULL; cpufreq_statistic_exit(j); } - cpus_clear(cpufreq_dom_map[dom]); + list_del(&cpufreq_dom->node); + xfree(cpufreq_dom); xfree(policy); return ret; } int cpufreq_del_cpu(unsigned int cpu) { - unsigned int dom; + unsigned int dom, domexist = 0; + struct list_head *pos; + struct cpufreq_dom *cpufreq_dom = NULL; struct cpufreq_policy *policy; struct processor_performance *perf = &processor_pminfo[cpu]->perf; /* to protect the case when Px was not controlled by xen */ - if (!processor_pminfo[cpu] || !(perf->init & XEN_PX_INIT)) + if (!processor_pminfo[cpu] || + !(perf->init & XEN_PX_INIT) || + !cpu_online(cpu)) + return -EINVAL; + + if (!cpufreq_cpu_policy[cpu]) return 0; - - if (!cpu_online(cpu) || !cpufreq_cpu_policy[cpu]) - return -EINVAL; dom = perf->domain_info.domain; policy = cpufreq_cpu_policy[cpu]; - printk(KERN_EMERG"deleting CPU %u\n", cpu); + list_for_each(pos, &cpufreq_dom_list_head) { + cpufreq_dom = list_entry(pos, struct cpufreq_dom, node); + if (dom == cpufreq_dom->dom) { + domexist = 1; + break; + } + } + + if (!domexist) + return -EINVAL; /* for the first cpu of the domain, stop gov */ - if (cpus_weight(cpufreq_dom_map[dom]) == + if (cpus_weight(cpufreq_dom->map) == perf->domain_info.num_processors) __cpufreq_governor(policy, CPUFREQ_GOV_STOP); cpufreq_cpu_policy[cpu] = NULL; cpu_clear(cpu, policy->cpus); - cpu_clear(cpu, cpufreq_dom_map[dom]); + cpu_clear(cpu, cpufreq_dom->map); cpufreq_statistic_exit(cpu); /* for the last cpu of the domain, clean room */ /* It's safe here to free freq_table, drv_data and policy */ - if (!cpus_weight(cpufreq_dom_map[dom])) { + if (!cpus_weight(cpufreq_dom->map)) { cpufreq_driver->exit(policy); + list_del(&cpufreq_dom->node); + xfree(cpufreq_dom); xfree(policy); } + printk(KERN_EMERG"deleting CPU %u\n", cpu); return 0; } @@ -258,6 +312,24 @@ int set_px_pminfo(uint32_t acpi_id, stru if ( dom0_px_info->flags & XEN_PX_PCT ) { + /* space_id check */ + if (dom0_px_info->control_register.space_id != + dom0_px_info->status_register.space_id) + { + ret = -EINVAL; + goto out; + } + +#ifdef CONFIG_IA64 + /* for IA64, currently it only supports FFH */ + if (dom0_px_info->control_register.space_id != + ACPI_ADR_SPACE_FIXED_HARDWARE) + { + ret = -EINVAL; + goto out; + } +#endif + memcpy ((void *)&pxpt->control_register, (void *)&dom0_px_info->control_register, sizeof(struct xen_pct_register)); @@ -267,8 +339,16 @@ int set_px_pminfo(uint32_t acpi_id, stru print_PCT(&pxpt->control_register); print_PCT(&pxpt->status_register); } + if ( dom0_px_info->flags & XEN_PX_PSS ) { + /* capability check */ + if (dom0_px_info->state_count <= 1) + { + ret = -EINVAL; + goto out; + } + if ( !(pxpt->states = xmalloc_array(struct xen_processor_px, dom0_px_info->state_count)) ) { @@ -280,14 +360,28 @@ int set_px_pminfo(uint32_t acpi_id, stru pxpt->state_count = dom0_px_info->state_count; print_PSS(pxpt->states,pxpt->state_count); } + if ( dom0_px_info->flags & XEN_PX_PSD ) { +#ifdef CONFIG_X86 + /* for X86, check domain coordination */ + /* for IA64, _PSD is optional for current IA64 cpufreq algorithm */ + if (dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ALL && + 
dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_ANY && + dom0_px_info->shared_type != CPUFREQ_SHARED_TYPE_HW) + { + ret = -EINVAL; + goto out; + } +#endif + pxpt->shared_type = dom0_px_info->shared_type; memcpy ((void *)&pxpt->domain_info, (void *)&dom0_px_info->domain_info, sizeof(struct xen_psd_package)); print_PSD(&pxpt->domain_info); } + if ( dom0_px_info->flags & XEN_PX_PPC ) { pxpt->platform_limit = dom0_px_info->platform_limit; @@ -295,7 +389,6 @@ int set_px_pminfo(uint32_t acpi_id, stru if ( pxpt->init == XEN_PX_INIT ) { - ret = cpufreq_limit_change(cpuid); goto out; } diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/config.h --- a/xen/include/asm-x86/config.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/config.h Tue Nov 04 12:43:19 2008 +0900 @@ -40,14 +40,6 @@ #define CONFIG_HOTPLUG 1 #define CONFIG_HOTPLUG_CPU 1 - -/* - * Avoid deep recursion when tearing down pagetables during domain destruction, - * causing dom0 to become unresponsive and Xen to miss time-critical softirq - * deadlines. This will ultimately be replaced by built-in preemptibility of - * get_page_type(). - */ -#define DOMAIN_DESTRUCT_AVOID_RECURSION 1 #define HZ 100 diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/event.h --- a/xen/include/asm-x86/event.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/event.h Tue Nov 04 12:43:19 2008 +0900 @@ -11,36 +11,8 @@ #include <xen/shared.h> -static inline void vcpu_kick(struct vcpu *v) -{ - /* - * NB1. 'pause_flags' and 'processor' must be checked /after/ update of - * pending flag. These values may fluctuate (after all, we hold no - * locks) but the key insight is that each change will cause - * evtchn_upcall_pending to be polled. - * - * NB2. We save the running flag across the unblock to avoid a needless - * IPI for domains that we IPI'd to unblock. - */ - int running = v->is_running; - vcpu_unblock(v); - if ( running ) - smp_send_event_check_cpu(v->processor); -} - -static inline void vcpu_mark_events_pending(struct vcpu *v) -{ - int already_pending = test_and_set_bit( - 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending)); - - if ( already_pending ) - return; - - if ( is_hvm_vcpu(v) ) - hvm_assert_evtchn_irq(v); - else - vcpu_kick(v); -} +void vcpu_kick(struct vcpu *v); +void vcpu_mark_events_pending(struct vcpu *v); int hvm_local_events_need_delivery(struct vcpu *v); static inline int local_events_need_delivery(void) diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/fixmap.h --- a/xen/include/asm-x86/fixmap.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/fixmap.h Tue Nov 04 12:43:19 2008 +0900 @@ -29,6 +29,7 @@ * from the end of virtual memory backwards. */ enum fixed_addresses { + FIX_RESERVED, /* Index 0 is reserved since fix_to_virt(0) > FIXADDR_TOP. 
*/ #ifdef __i386__ FIX_PAE_HIGHMEM_0, FIX_PAE_HIGHMEM_END = FIX_PAE_HIGHMEM_0 + NR_CPUS-1, diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/hvm/vmx/vpmu.h --- a/xen/include/asm-x86/hvm/vmx/vpmu.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/hvm/vmx/vpmu.h Tue Nov 04 12:43:19 2008 +0900 @@ -67,7 +67,7 @@ struct vpmu_struct { #define VPMU_CONTEXT_ALLOCATED 0x1 #define VPMU_CONTEXT_LOADED 0x2 #define VPMU_RUNNING 0x4 - +#define PASSIVE_DOMAIN_ALLOCATED 0x8 int vpmu_do_wrmsr(struct cpu_user_regs *regs); int vpmu_do_rdmsr(struct cpu_user_regs *regs); int vpmu_do_interrupt(struct cpu_user_regs *regs); diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/hvm/vmx/vpmu_core2.h --- a/xen/include/asm-x86/hvm/vmx/vpmu_core2.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/hvm/vmx/vpmu_core2.h Tue Nov 04 12:43:19 2008 +0900 @@ -23,28 +23,6 @@ #ifndef __ASM_X86_HVM_VPMU_CORE_H_ #define __ASM_X86_HVM_VPMU_CORE_H_ -/* Core 2 Non-architectual Performance Counter MSRs. */ -u32 core2_counters_msr[] = { - MSR_CORE_PERF_FIXED_CTR0, - MSR_CORE_PERF_FIXED_CTR1, - MSR_CORE_PERF_FIXED_CTR2}; - -/* Core 2 Non-architectual Performance Control MSRs. */ -u32 core2_ctrls_msr[] = { - MSR_CORE_PERF_FIXED_CTR_CTRL, - MSR_IA32_PEBS_ENABLE, - MSR_IA32_DS_AREA}; - -struct pmumsr core2_counters = { - 3, - core2_counters_msr -}; - -struct pmumsr core2_ctrls = { - 3, - core2_ctrls_msr -}; - struct arch_msr_pair { u64 counter; u64 control; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/hvm/vpt.h --- a/xen/include/asm-x86/hvm/vpt.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/hvm/vpt.h Tue Nov 04 12:43:19 2008 +0900 @@ -32,41 +32,6 @@ #include <asm/hvm/irq.h> #include <public/hvm/save.h> -struct HPETState; -struct HPET_timer_fn_info { - struct HPETState *hs; - unsigned int tn; -}; - -struct hpet_registers { - /* Memory-mapped, software visible registers */ - uint64_t capability; /* capabilities */ - uint64_t config; /* configuration */ - uint64_t isr; /* interrupt status reg */ - uint64_t mc64; /* main counter */ - struct { /* timers */ - uint64_t config; /* configuration/cap */ - uint64_t cmp; /* comparator */ - uint64_t fsb; /* FSB route, not supported now */ - } timers[HPET_TIMER_NUM]; - - /* Hidden register state */ - uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */ -}; - -typedef struct HPETState { - struct hpet_registers hpet; - struct vcpu *vcpu; - uint64_t stime_freq; - uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */ - uint64_t hpet_to_ns_limit; /* max hpet ticks convertable to ns */ - uint64_t mc_offset; - struct timer timers[HPET_TIMER_NUM]; - struct HPET_timer_fn_info timer_fn_info[HPET_TIMER_NUM]; - spinlock_t lock; -} HPETState; - - /* * Abstract layer of periodic time, one short time. 
*/ @@ -107,6 +72,34 @@ typedef struct PITState { struct periodic_time pt0; spinlock_t lock; } PITState; + +struct hpet_registers { + /* Memory-mapped, software visible registers */ + uint64_t capability; /* capabilities */ + uint64_t config; /* configuration */ + uint64_t isr; /* interrupt status reg */ + uint64_t mc64; /* main counter */ + struct { /* timers */ + uint64_t config; /* configuration/cap */ + uint64_t cmp; /* comparator */ + uint64_t fsb; /* FSB route, not supported now */ + } timers[HPET_TIMER_NUM]; + + /* Hidden register state */ + uint64_t period[HPET_TIMER_NUM]; /* Last value written to comparator */ + uint64_t comparator64[HPET_TIMER_NUM]; /* 64 bit running comparator */ +}; + +typedef struct HPETState { + struct hpet_registers hpet; + struct vcpu *vcpu; + uint64_t stime_freq; + uint64_t hpet_to_ns_scale; /* hpet ticks to ns (multiplied by 2^10) */ + uint64_t hpet_to_ns_limit; /* max hpet ticks convertable to ns */ + uint64_t mc_offset; + struct periodic_time pt[HPET_TIMER_NUM]; + spinlock_t lock; +} HPETState; typedef struct RTCState { /* Hardware state */ @@ -160,13 +153,13 @@ void pt_migrate(struct vcpu *v); * The given periodic timer structure must be initialised with zero bytes, * except for the 'source' field which must be initialised with the * correct PTSRC_ value. The initialised timer structure can then be passed - * to {create,destroy}_periodic_time() and number of times and in any order. + * to {create,destroy}_periodic_time() any number of times and in any order. * Note that, for a given periodic timer, invocations of these functions MUST * be serialised. */ void create_periodic_time( - struct vcpu *v, struct periodic_time *pt, uint64_t period, - uint8_t irq, char one_shot, time_cb *cb, void *data); + struct vcpu *v, struct periodic_time *pt, uint64_t delta, + uint64_t period, uint8_t irq, time_cb *cb, void *data); void destroy_periodic_time(struct periodic_time *pt); int pv_pit_handler(int port, int data, int write); @@ -185,7 +178,6 @@ void pmtimer_deinit(struct domain *d); void pmtimer_deinit(struct domain *d); void pmtimer_reset(struct domain *d); -void hpet_migrate_timers(struct vcpu *v); void hpet_init(struct vcpu *v); void hpet_deinit(struct domain *d); void hpet_reset(struct domain *d); diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/mm.h Tue Nov 04 12:43:19 2008 +0900 @@ -61,12 +61,36 @@ struct page_info /* * When PGT_partial is true then this field is valid and indicates * that PTEs in the range [0, @nr_validated_ptes) have been validated. - * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been - * partially validated. + * An extra page reference must be acquired (or not dropped) whenever + * PGT_partial gets set, and it must be dropped when the flag gets + * cleared. This is so that a get() leaving a page in partially + * validated state (where the caller would drop the reference acquired + * due to the getting of the type [apparently] failing [-EAGAIN]) + * would not accidentally result in a page left with zero general + * reference count, but non-zero type reference count (possible when + * the partial get() is followed immediately by domain destruction). + * Likewise, the ownership of the single type reference for partially + * (in-)validated pages is tied to this flag, i.e. the instance + * setting the flag must not drop that reference, whereas the instance + * clearing it will have to. 
+ * + * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has + * been partially validated. This implies that the general reference + * to the page (acquired from get_page_from_lNe()) would be dropped + * (again due to the apparent failure) and hence must be re-acquired + * when resuming the validation, but must not be dropped when picking + * up the page for invalidation. + * + * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has + * been partially invalidated. This is basically the opposite case of + * above, i.e. the general reference to the page was not dropped in + * put_page_from_lNe() (due to the apparent failure), and hence it + * must be dropped when the put operation is resumed (and completes), + * but it must not be acquired if picking up the page for validation. */ struct { u16 nr_validated_ptes; - bool_t partial_pte; + s8 partial_pte; }; /* diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/page.h --- a/xen/include/asm-x86/page.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/page.h Tue Nov 04 12:43:19 2008 +0900 @@ -314,6 +314,9 @@ unsigned long clone_idle_pagetable(struc #define __PAGE_HYPERVISOR_NOCACHE \ (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED) +#define GRANT_PTE_FLAGS \ + (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_NX | _PAGE_GNTTAB) + #ifndef __ASSEMBLY__ static inline int get_order_from_bytes(paddr_t size) diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/softirq.h --- a/xen/include/asm-x86/softirq.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/softirq.h Tue Nov 04 12:43:19 2008 +0900 @@ -3,7 +3,8 @@ #define NMI_MCE_SOFTIRQ (NR_COMMON_SOFTIRQS + 0) #define TIME_CALIBRATE_SOFTIRQ (NR_COMMON_SOFTIRQS + 1) +#define VCPU_KICK_SOFTIRQ (NR_COMMON_SOFTIRQS + 2) -#define NR_ARCH_SOFTIRQS 2 +#define NR_ARCH_SOFTIRQS 3 #endif /* __ASM_SOFTIRQ_H__ */ diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/x86_32/page.h --- a/xen/include/asm-x86/x86_32/page.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/x86_32/page.h Tue Nov 04 12:43:19 2008 +0900 @@ -105,9 +105,6 @@ extern unsigned int PAGE_HYPERVISOR_NOCA #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF)) #define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 32) | ((x) & 0xFFF)) -#define GRANT_PTE_FLAGS \ - (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB) - /* * Disallow unused flag bits plus PAT/PSE, PCD, PWT and GLOBAL. * Permit the NX bit if the hardware supports it. 
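The PGT_partial comment in the xen/include/asm-x86/mm.h hunk above is dense: the rule it states is that whichever code path sets the flag keeps one general page reference alive on the flag's behalf, and whichever path clears the flag inherits that reference and must eventually drop it. The standalone toy model below sketches that hand-off; every name in it (toy_page, leave_partial, resume_from_partial, the plain integer counters) is a hypothetical stand-in, not Xen's real page API or its atomic count_info.

    #include <assert.h>
    #include <stdbool.h>

    /* Toy model of the PGT_partial ownership rule quoted above. */
    struct toy_page {
        unsigned int count_info;   /* general references */
        unsigned int type_count;   /* type references */
        bool pgt_partial;          /* (in-)validation was interrupted */
    };

    static void check_invariant(const struct toy_page *pg)
    {
        /* A page with a type reference, or left in partial state, must
         * never lose its last general reference. */
        if (pg->type_count > 0 || pg->pgt_partial)
            assert(pg->count_info > 0);
    }

    /* Validation preempted: instead of dropping the reference it took,
     * the validator donates it to the flag. */
    static void leave_partial(struct toy_page *pg)
    {
        pg->pgt_partial = true;
        check_invariant(pg);
    }

    /* Resuming instance inherits the flag's reference and, once the
     * type work is finished or rolled back, must drop it itself. */
    static void resume_from_partial(struct toy_page *pg)
    {
        assert(pg->pgt_partial);
        pg->pgt_partial = false;
        pg->type_count = 0;        /* validation completed/undone */
        pg->count_info--;          /* drop the inherited reference */
        check_invariant(pg);
    }

    int main(void)
    {
        struct toy_page pg = { .count_info = 1 };  /* owner's reference */

        pg.count_info++;      /* validator takes its own reference */
        pg.type_count++;      /* type validation in progress */
        leave_partial(&pg);   /* preempted: reference now owned by flag */

        pg.count_info--;      /* owner's reference goes away (teardown) */
        check_invariant(&pg); /* flag's reference keeps count_info at 1 */

        resume_from_partial(&pg);
        assert(pg.count_info == 0 && pg.type_count == 0);
        return 0;
    }

check_invariant() states the property the real comment is defending: a page that still carries a type reference, or is marked partially (in-)validated, must never reach a zero general reference count; the donated reference is what keeps this true when a partial get() is followed immediately by domain destruction.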
diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/x86_64/page.h --- a/xen/include/asm-x86/x86_64/page.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/x86_64/page.h Tue Nov 04 12:43:19 2008 +0900 @@ -119,13 +119,10 @@ typedef l4_pgentry_t root_pgentry_t; #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK) #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK) -#define COMPAT_L3_DISALLOW_MASK 0xFFFFF1FEU +#define COMPAT_L3_DISALLOW_MASK 0xFFFFF198U #define PAGE_HYPERVISOR (__PAGE_HYPERVISOR | _PAGE_GLOBAL) #define PAGE_HYPERVISOR_NOCACHE (__PAGE_HYPERVISOR_NOCACHE | _PAGE_GLOBAL) - -#define GRANT_PTE_FLAGS \ - (_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_GNTTAB|_PAGE_USER) #define USER_MAPPINGS_ARE_GLOBAL #ifdef USER_MAPPINGS_ARE_GLOBAL diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/asm-x86/xenoprof.h --- a/xen/include/asm-x86/xenoprof.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/asm-x86/xenoprof.h Tue Nov 04 12:43:19 2008 +0900 @@ -64,6 +64,9 @@ void xenoprof_backtrace( "xenoprof/x86 with autotranslated mode enabled" \ "isn't supported yet\n"); \ } while (0) +int passive_domain_do_rdmsr(struct cpu_user_regs *regs); +int passive_domain_do_wrmsr(struct cpu_user_regs *regs); +void passive_domain_destroy(struct vcpu *v); #endif /* __ASM_X86_XENOPROF_H__ */ diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/public/features.h --- a/xen/include/public/features.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/public/features.h Tue Nov 04 12:43:19 2008 +0900 @@ -59,6 +59,9 @@ /* x86: Does this Xen host support the MMU_PT_UPDATE_PRESERVE_AD hypercall? */ #define XENFEAT_mmu_pt_update_preserve_ad 5 +/* x86: Does this Xen host support the MMU_{CLEAR,COPY}_PAGE hypercall? */ +#define XENFEAT_highmem_assist 6 + #define XENFEAT_NR_SUBMAPS 1 #endif /* __XEN_PUBLIC_FEATURES_H__ */ diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/public/trace.h --- a/xen/include/public/trace.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/public/trace.h Tue Nov 04 12:43:19 2008 +0900 @@ -142,7 +142,9 @@ #define TRC_HVM_INVLPG64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x14) #define TRC_HVM_MCE (TRC_HVM_HANDLER + 0x15) #define TRC_HVM_IO_ASSIST (TRC_HVM_HANDLER + 0x16) +#define TRC_HVM_IO_ASSIST64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x16) #define TRC_HVM_MMIO_ASSIST (TRC_HVM_HANDLER + 0x17) +#define TRC_HVM_MMIO_ASSIST64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x17) #define TRC_HVM_CLTS (TRC_HVM_HANDLER + 0x18) #define TRC_HVM_LMSW (TRC_HVM_HANDLER + 0x19) #define TRC_HVM_LMSW64 (TRC_HVM_HANDLER + TRC_64_FLAG + 0x19) diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/public/xen.h --- a/xen/include/public/xen.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/public/xen.h Tue Nov 04 12:43:19 2008 +0900 @@ -231,6 +231,13 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); * cmd: MMUEXT_SET_LDT * linear_addr: Linear address of LDT base (NB. must be page-aligned). * nr_ents: Number of entries in LDT. + * + * cmd: MMUEXT_CLEAR_PAGE + * mfn: Machine frame number to be cleared. + * + * cmd: MMUEXT_COPY_PAGE + * mfn: Machine frame number of the destination page. + * src_mfn: Machine frame number of the source page. 
*/ #define MMUEXT_PIN_L1_TABLE 0 #define MMUEXT_PIN_L2_TABLE 1 @@ -247,12 +254,15 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t); #define MMUEXT_FLUSH_CACHE 12 #define MMUEXT_SET_LDT 13 #define MMUEXT_NEW_USER_BASEPTR 15 +#define MMUEXT_CLEAR_PAGE 16 +#define MMUEXT_COPY_PAGE 17 #ifndef __ASSEMBLY__ struct mmuext_op { unsigned int cmd; union { - /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */ + /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR + * CLEAR_PAGE, COPY_PAGE */ xen_pfn_t mfn; /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ unsigned long linear_addr; @@ -266,6 +276,8 @@ struct mmuext_op { #else void *vcpumask; #endif + /* COPY_PAGE */ + xen_pfn_t src_mfn; } arg2; }; typedef struct mmuext_op mmuext_op_t; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/cpuidle.h --- a/xen/include/xen/cpuidle.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/xen/cpuidle.h Tue Nov 04 12:43:19 2008 +0900 @@ -30,12 +30,18 @@ #define ACPI_PROCESSOR_MAX_POWER 8 #define CPUIDLE_NAME_LEN 16 +#define ACPI_CSTATE_EM_NONE 0 +#define ACPI_CSTATE_EM_SYSIO 1 +#define ACPI_CSTATE_EM_FFH 2 +#define ACPI_CSTATE_EM_HALT 3 + struct acpi_processor_cx { + u8 idx; u8 valid; u8 type; u32 address; - u8 space_id; + u8 entry_method; /* ACPI_CSTATE_EM_xxx */ u32 latency; u32 latency_ticks; u32 power; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/domain_page.h --- a/xen/include/xen/domain_page.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/xen/domain_page.h Tue Nov 04 12:43:19 2008 +0900 @@ -24,7 +24,7 @@ void *map_domain_page(unsigned long mfn) * Pass a VA within a page previously mapped in the context of the * currently-executing VCPU via a call to map_domain_page(). */ -void unmap_domain_page(void *va); +void unmap_domain_page(const void *va); /* * Similar to the above calls, except the mapping is accessible in all @@ -32,7 +32,7 @@ void unmap_domain_page(void *va); * mappings can also be unmapped from any context. 
*/ void *map_domain_page_global(unsigned long mfn); -void unmap_domain_page_global(void *va); +void unmap_domain_page_global(const void *va); #define DMCACHE_ENTRY_VALID 1U #define DMCACHE_ENTRY_HELD 2U @@ -75,7 +75,7 @@ map_domain_page_with_cache(unsigned long } static inline void -unmap_domain_page_with_cache(void *va, struct domain_mmap_cache *cache) +unmap_domain_page_with_cache(const void *va, struct domain_mmap_cache *cache) { ASSERT(cache != NULL); cache->flags &= ~DMCACHE_ENTRY_HELD; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/spinlock.h --- a/xen/include/xen/spinlock.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/xen/spinlock.h Tue Nov 04 12:43:19 2008 +0900 @@ -5,21 +5,38 @@ #include <asm/system.h> #include <asm/spinlock.h> +#ifndef NDEBUG +struct lock_debug { + int irq_safe; /* +1: IRQ-safe; 0: not IRQ-safe; -1: don't know yet */ +}; +#define _LOCK_DEBUG { -1 } +void spin_debug_enable(void); +void spin_debug_disable(void); +#else +struct lock_debug { }; +#define _LOCK_DEBUG { } +#define spin_debug_enable() ((void)0) +#define spin_debug_disable() ((void)0) +#endif + typedef struct { raw_spinlock_t raw; u16 recurse_cpu:12; u16 recurse_cnt:4; + struct lock_debug debug; } spinlock_t; -#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0 } + +#define SPIN_LOCK_UNLOCKED { _RAW_SPIN_LOCK_UNLOCKED, 0xfffu, 0, _LOCK_DEBUG } #define DEFINE_SPINLOCK(l) spinlock_t l = SPIN_LOCK_UNLOCKED #define spin_lock_init(l) (*(l) = (spinlock_t)SPIN_LOCK_UNLOCKED) typedef struct { raw_rwlock_t raw; + struct lock_debug debug; } rwlock_t; -#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED } +#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED, _LOCK_DEBUG } #define DEFINE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED #define rwlock_init(l) (*(l) = (rwlock_t)RW_LOCK_UNLOCKED) @@ -34,6 +51,7 @@ int _spin_is_locked(spinlock_t *lock); int _spin_is_locked(spinlock_t *lock); int _spin_trylock(spinlock_t *lock); void _spin_barrier(spinlock_t *lock); +void _spin_barrier_irq(spinlock_t *lock); void _spin_lock_recursive(spinlock_t *lock); void _spin_unlock_recursive(spinlock_t *lock); @@ -67,6 +85,7 @@ void _write_unlock_irqrestore(rwlock_t * /* Ensure a lock is quiescent between two critical operations. */ #define spin_barrier(l) _spin_barrier(l) +#define spin_barrier_irq(l) _spin_barrier_irq(l) /* * spin_[un]lock_recursive(): Use these forms when the lock can (safely!) be diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/time.h --- a/xen/include/xen/time.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/xen/time.h Tue Nov 04 12:43:19 2008 +0900 @@ -52,6 +52,7 @@ struct tm gmtime(unsigned long t); #define SECONDS(_s) ((s_time_t)((_s) * 1000000000ULL)) #define MILLISECS(_ms) ((s_time_t)((_ms) * 1000000ULL)) #define MICROSECS(_us) ((s_time_t)((_us) * 1000ULL)) +#define STIME_MAX ((s_time_t)((uint64_t)~0ull>>1)) extern void update_vcpu_system_time(struct vcpu *v); extern void update_domain_wallclock_time(struct domain *d); diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xen/timer.h --- a/xen/include/xen/timer.h Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/xen/timer.h Tue Nov 04 12:43:19 2008 +0900 @@ -15,12 +15,13 @@ struct timer { struct timer { /* System time expiry value (nanoseconds since boot). */ s_time_t expires; + s_time_t expires_end; /* Position in active-timer data structure. */ union { /* Timer-heap offset. */ unsigned int heap_offset; - /* Overflow linked list. */ + /* Linked list. 
*/ struct timer *list_next; }; diff -r 10f0e1bb8e5e -r e75cb35c798b xen/include/xlat.lst --- a/xen/include/xlat.lst Tue Nov 04 12:07:22 2008 +0900 +++ b/xen/include/xlat.lst Tue Nov 04 12:43:19 2008 +0900 @@ -56,6 +56,6 @@ ! processor_flags platform.h ! processor_power platform.h ! pct_register platform.h -! processor_px platform.h +? processor_px platform.h ! psd_package platform.h ! processor_performance platform.h _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog