[Xen-changelog] Merged.
# HG changeset patch # User emellor@xxxxxxxxxxxxxxxxxxxxxx # Node ID 8af1199488d3636135f3adf3f7302d4a04e9004e # Parent 25e3c8668f1f4769db8466b4af965a99503311ae # Parent 299d6ff8fdb2604dde767af2a2bee985602e9a46 Merged. diff -r 25e3c8668f1f -r 8af1199488d3 .hgignore --- a/.hgignore Mon Jan 9 11:19:55 2006 +++ b/.hgignore Mon Jan 9 11:22:17 2006 @@ -181,6 +181,7 @@ ^xen/TAGS$ ^xen/arch/x86/asm-offsets\.s$ ^xen/arch/x86/boot/mkelf32$ +^xen/arch/x86/xen\.lds$ ^xen/ddb/.*$ ^xen/include/asm$ ^xen/include/asm-.*/asm-offsets\.h$ diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/arch/ia64/xen/drivers/xenia64_init.c --- a/linux-2.6-xen-sparse/arch/ia64/xen/drivers/xenia64_init.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/arch/ia64/xen/drivers/xenia64_init.c Mon Jan 9 11:22:17 2006 @@ -25,8 +25,9 @@ xen_start_info = __va(s->arch.start_info_pfn << PAGE_SHIFT); xen_start_info->flags = s->arch.flags; - printk("Running on Xen! start_info_pfn=0x%lx lags=0x%x\n", - s->arch.start_info_pfn, xen_start_info->flags); + printk("Running on Xen! start_info_pfn=0x%lx nr_pages=%d flags=0x%x\n", + s->arch.start_info_pfn, xen_start_info->nr_pages, + xen_start_info->flags); evtchn_init(); initialized = 1; diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/arch/xen/Makefile --- a/linux-2.6-xen-sparse/arch/xen/Makefile Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/arch/xen/Makefile Mon Jan 9 11:22:17 2006 @@ -77,8 +77,6 @@ install -m0664 .config $(INSTALL_PATH)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX) install -m0664 System.map $(INSTALL_PATH)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX) ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_PATH)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX) - mkdir -p $(INSTALL_PATH)/usr/include/xen/linux - install -m0644 $(srctree)/include/asm-xen/linux-public/*.h $(INSTALL_PATH)/usr/include/xen/linux archclean: @if [ -e arch/xen/arch ]; then $(MAKE) $(clean)=arch/xen/arch; fi; diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/arch/xen/i386/kernel/acpi/boot.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/acpi/boot.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/acpi/boot.c Mon Jan 9 11:22:17 2006 @@ -39,8 +39,6 @@ #ifdef CONFIG_XEN #include <asm/fixmap.h> #endif - -void (*pm_power_off)(void) = NULL; #ifdef CONFIG_X86_64 diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c Mon Jan 9 11:22:17 2006 @@ -67,8 +67,11 @@ op.u.add_memtype.pfn = base; op.u.add_memtype.nr_pfns = size; op.u.add_memtype.type = type; - if ((error = HYPERVISOR_dom0_op(&op))) + error = HYPERVISOR_dom0_op(&op); + if (error) { + BUG_ON(error > 0); return error; + } if (increment) ++usage_table[op.u.add_memtype.reg]; @@ -121,8 +124,12 @@ if (--usage_table[reg] < 1) { op.cmd = DOM0_DEL_MEMTYPE; op.u.del_memtype.handle = 0; - op.u.add_memtype.reg = reg; - (void)HYPERVISOR_dom0_op(&op); + op.u.del_memtype.reg = reg; + error = HYPERVISOR_dom0_op(&op); + if (error) { + BUG_ON(error > 0); + goto out; + } } error = reg; out: diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c Mon Jan 9 11:22:17 2006 @@ -76,9 +76,7 @@ EXPORT_SYMBOL(iounmap); 
EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(pm_idle); -#ifdef CONFIG_ACPI_BOOT EXPORT_SYMBOL(pm_power_off); -#endif EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL(cpu_khz); EXPORT_SYMBOL(apm_info); diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c --- a/linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c Mon Jan 9 11:22:17 2006 @@ -389,6 +389,30 @@ return -ENOSYS; } +static int __init +gnttab_proc_init(void) +{ + /* + * /proc/xen/grant : used by libxc to access grant tables + */ + if ((grant_pde = create_xen_proc_entry("grant", 0600)) == NULL) { + WPRINTK("Unable to create grant xen proc entry\n"); + return -1; + } + + grant_file_ops.read = grant_pde->proc_fops->read; + grant_file_ops.write = grant_pde->proc_fops->write; + + grant_pde->proc_fops = &grant_file_ops; + + grant_pde->read_proc = &grant_read; + grant_pde->write_proc = &grant_write; + + return 0; +} + +device_initcall(gnttab_proc_init); + #endif /* CONFIG_PROC_FS */ int @@ -446,29 +470,11 @@ gnttab_free_count = NR_GRANT_ENTRIES - NR_RESERVED_ENTRIES; gnttab_free_head = NR_RESERVED_ENTRIES; -#ifdef CONFIG_PROC_FS - /* - * /proc/xen/grant : used by libxc to access grant tables - */ - if ((grant_pde = create_xen_proc_entry("grant", 0600)) == NULL) { - WPRINTK("Unable to create grant xen proc entry\n"); - return -1; - } - - grant_file_ops.read = grant_pde->proc_fops->read; - grant_file_ops.write = grant_pde->proc_fops->write; - - grant_pde->proc_fops = &grant_file_ops; - - grant_pde->read_proc = &grant_read; - grant_pde->write_proc = &grant_write; -#endif - printk("Grant table initialized\n"); return 0; } -__initcall(gnttab_init); +core_initcall(gnttab_init); /* * Local variables: diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/arch/xen/kernel/reboot.c --- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c Mon Jan 9 11:22:17 2006 @@ -16,6 +16,13 @@ #include <linux/cpu.h> #include <linux/kthread.h> #include <asm-xen/xencons.h> + +#if defined(__i386__) || defined(__x86_64__) +/* + * Power off function, if any + */ +void (*pm_power_off)(void); +#endif #define SHUTDOWN_INVALID -1 #define SHUTDOWN_POWEROFF 0 diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/x8664_ksyms.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/x8664_ksyms.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/x8664_ksyms.c Mon Jan 9 11:22:17 2006 @@ -59,9 +59,7 @@ EXPORT_SYMBOL(probe_irq_mask); EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(pm_idle); -#ifdef CONFIG_ACPI_BOOT EXPORT_SYMBOL(pm_power_off); -#endif EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL(__down_failed); diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Mon Jan 9 11:22:17 2006 @@ -540,6 +540,9 @@ pending_vaddrs = kmalloc(sizeof(pending_vaddrs[0]) * mmap_pages, GFP_KERNEL); if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) { + kfree(pending_reqs); + kfree(pending_grant_handles); + kfree(pending_vaddrs); printk("%s: out of memory\n", __FUNCTION__); return -1; } diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Mon Jan 9 11:19:55 2006 +++ 
b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Mon Jan 9 11:22:17 2006 @@ -331,7 +331,12 @@ return; } - xlvbd_add(sectors, info->vdevice, binfo, sector_size, info); + err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", + info->xbdev->otherend); + return; + } (void)xenbus_switch_state(info->xbdev, NULL, XenbusStateConnected); diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Mon Jan 9 11:22:17 2006 @@ -208,7 +208,7 @@ } struct vm_operations_struct blktap_vm_ops = { - nopage: blktap_nopage, + .nopage = blktap_nopage, }; /****************************************************************** @@ -225,7 +225,7 @@ /* Allocate the fe ring. */ sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); if (sring == NULL) - goto fail_nomem; + return -ENOMEM; SetPageReserved(virt_to_page(sring)); @@ -233,9 +233,6 @@ FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE); return 0; - - fail_nomem: - return -ENOMEM; } static int blktap_release(struct inode *inode, struct file *filp) @@ -391,12 +388,12 @@ } static struct file_operations blktap_fops = { - owner: THIS_MODULE, - poll: blktap_poll, - ioctl: blktap_ioctl, - open: blktap_open, - release: blktap_release, - mmap: blktap_mmap, + .owner = THIS_MODULE, + .poll = blktap_poll, + .ioctl = blktap_ioctl, + .open = blktap_open, + .release = blktap_release, + .mmap = blktap_mmap, }; diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/drivers/xen/console/console.c --- a/linux-2.6-xen-sparse/drivers/xen/console/console.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/console/console.c Mon Jan 9 11:22:17 2006 @@ -314,39 +314,31 @@ { int sent, sz, work_done = 0; - if (xen_start_info->flags & SIF_INITDOMAIN) { - if (x_char) { + if (x_char) { + if (xen_start_info->flags & SIF_INITDOMAIN) kcons_write_dom0(NULL, &x_char, 1); - x_char = 0; - work_done = 1; - } - - while (wc != wp) { - sz = wp - wc; - if (sz > (wbuf_size - WBUF_MASK(wc))) - sz = wbuf_size - WBUF_MASK(wc); + else + while (x_char) + if (xencons_ring_send(&x_char, 1) == 1) + break; + x_char = 0; + work_done = 1; + } + + while (wc != wp) { + sz = wp - wc; + if (sz > (wbuf_size - WBUF_MASK(wc))) + sz = wbuf_size - WBUF_MASK(wc); + if (xen_start_info->flags & SIF_INITDOMAIN) { kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz); wc += sz; - work_done = 1; - } - } else { - while (x_char) { - if (xencons_ring_send(&x_char, 1) == 1) { - x_char = 0; - work_done = 1; - } - } - - while (wc != wp) { - sz = wp - wc; - if (sz > (wbuf_size - WBUF_MASK(wc))) - sz = wbuf_size - WBUF_MASK(wc); + } else { sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); if (sent == 0) break; wc += sent; - work_done = 1; } + work_done = 1; } if (work_done && (xencons_tty != NULL)) { diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/drivers/xen/netback/common.h --- a/linux-2.6-xen-sparse/drivers/xen/netback/common.h Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/common.h Mon Jan 9 11:22:17 2006 @@ -82,7 +82,7 @@ #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) void netif_creditlimit(netif_t *netif); -int netif_disconnect(netif_t *netif); +void netif_disconnect(netif_t *netif); netif_t *alloc_netif(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]); void free_netif(netif_t *netif); diff -r 25e3c8668f1f -r 
8af1199488d3 linux-2.6-xen-sparse/drivers/xen/netback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/interface.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/interface.c Mon Jan 9 11:22:17 2006 @@ -183,7 +183,7 @@ int netif_map(netif_t *netif, unsigned long tx_ring_ref, unsigned long rx_ring_ref, unsigned int evtchn) { - int err; + int err = -ENOMEM; netif_tx_sring_t *txs; netif_rx_sring_t *rxs; evtchn_op_t op = { @@ -196,24 +196,19 @@ return 0; netif->tx_comms_area = alloc_vm_area(PAGE_SIZE); + if (netif->tx_comms_area == NULL) + return -ENOMEM; netif->rx_comms_area = alloc_vm_area(PAGE_SIZE); - if (netif->tx_comms_area == NULL || netif->rx_comms_area == NULL) - return -ENOMEM; + if (netif->rx_comms_area == NULL) + goto err_rx; err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref); - if (err) { - free_vm_area(netif->tx_comms_area); - free_vm_area(netif->rx_comms_area); - return err; - } + if (err) + goto err_map; err = HYPERVISOR_event_channel_op(&op); - if (err) { - unmap_frontend_pages(netif); - free_vm_area(netif->tx_comms_area); - free_vm_area(netif->rx_comms_area); - return err; - } + if (err) + goto err_hypervisor; netif->evtchn = op.u.bind_interdomain.local_port; @@ -241,19 +236,22 @@ rtnl_unlock(); return 0; +err_hypervisor: + unmap_frontend_pages(netif); +err_map: + free_vm_area(netif->rx_comms_area); +err_rx: + free_vm_area(netif->tx_comms_area); + return err; } static void free_netif_callback(void *arg) { netif_t *netif = (netif_t *)arg; - /* Already disconnected? */ - if (!netif->irq) - return; - - unbind_from_irqhandler(netif->irq, netif); - netif->irq = 0; - + if (netif->irq) + unbind_from_irqhandler(netif->irq, netif); + unregister_netdev(netif->dev); if (netif->tx.sring) { @@ -290,10 +288,10 @@ #endif } -int netif_disconnect(netif_t *netif) -{ - - if (netif->status == CONNECTED) { +void netif_disconnect(netif_t *netif) +{ + switch (netif->status) { + case CONNECTED: rtnl_lock(); netif->status = DISCONNECTING; wmb(); @@ -301,10 +299,14 @@ __netif_down(netif); rtnl_unlock(); netif_put(netif); - return 0; /* Caller should not send response message. 
*/ - } - - return 1; + break; + case DISCONNECTED: + BUG_ON(atomic_read(&netif->refcnt) != 0); + free_netif(netif); + break; + default: + BUG(); + } } /* diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/drivers/xen/netback/netback.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Mon Jan 9 11:22:17 2006 @@ -14,6 +14,7 @@ #include <asm-xen/balloon.h> #include <asm-xen/xen-public/memory.h> +/*#define NETBE_DEBUG_INTERRUPT*/ static void netif_idx_release(u16 pending_idx); static void netif_page_release(struct page *page); @@ -727,6 +728,7 @@ return notify; } +#ifdef NETBE_DEBUG_INTERRUPT static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) { struct list_head *ent; @@ -758,6 +760,7 @@ return IRQ_HANDLED; } +#endif static int __init netback_init(void) { @@ -794,6 +797,7 @@ netif_xenbus_init(); +#ifdef NETBE_DEBUG_INTERRUPT (void)bind_virq_to_irqhandler( VIRQ_DEBUG, 0, @@ -801,6 +805,7 @@ SA_SHIRQ, "net-be-dbg", &netif_be_dbg); +#endif return 0; } diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Mon Jan 9 11:22:17 2006 @@ -116,6 +116,8 @@ #define RX_MAX_TARGET NET_RX_RING_SIZE int rx_min_target, rx_max_target, rx_target; struct sk_buff_head rx_batch; + + struct timer_list rx_refill_timer; /* * {tx,rx}_skbs store outstanding skbuffs. The first entry in each @@ -517,6 +519,13 @@ } +static void rx_refill_timeout(unsigned long data) +{ + struct net_device *dev = (struct net_device *)data; + netif_rx_schedule(dev); +} + + static void network_alloc_rx_buffers(struct net_device *dev) { unsigned short id; @@ -534,7 +543,7 @@ * Allocate skbuffs greedily, even though we batch updates to the * receive ring. This creates a less bursty demand on the memory * allocator, so should reduce the chance of failed allocation requests - * both for ourself and for other kernel subsystems. + * both for ourself and for other kernel subsystems. */ batch_target = np->rx_target - (req_prod - np->rx.rsp_cons); for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) { @@ -545,8 +554,15 @@ skb = alloc_xen_skb( ((PAGE_SIZE - sizeof(struct skb_shared_info)) & (-SKB_DATA_ALIGN(1))) - 16); - if (skb == NULL) - break; + if (skb == NULL) { + /* Any skbuffs queued for refill? Force them out. */ + if (i != 0) + goto refill; + /* Could not allocate any skbuffs. Try again later. */ + mod_timer(&np->rx_refill_timer, + jiffies + (HZ/10)); + return; + } __skb_queue_tail(&np->rx_batch, skb); } @@ -554,6 +570,12 @@ if (i < (np->rx_target/2)) return; + /* Adjust our fill target if we risked running out of buffers. */ + if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) && + ((np->rx_target *= 2) > np->rx_max_target)) + np->rx_target = np->rx_max_target; + + refill: for (i = 0; ; i++) { if ((skb = __skb_dequeue(&np->rx_batch)) == NULL) break; @@ -608,11 +630,6 @@ /* Above is a suitable barrier to ensure backend will see requests. */ np->rx.req_prod_pvt = req_prod + i; RING_PUSH_REQUESTS(&np->rx); - - /* Adjust our fill target if we risked running out of buffers. 
*/ - if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) && - ((np->rx_target *= 2) > np->rx_max_target)) - np->rx_target = np->rx_max_target; } @@ -1077,6 +1094,10 @@ np->rx_min_target = RX_MIN_TARGET; np->rx_max_target = RX_MAX_TARGET; + init_timer(&np->rx_refill_timer); + np->rx_refill_timer.data = (unsigned long)netdev; + np->rx_refill_timer.function = rx_refill_timeout; + /* Initialise {tx,rx}_skbs as a free chain containing every entry. */ for (i = 0; i <= NET_TX_RING_SIZE; i++) { np->tx_skbs[i] = (void *)((unsigned long) i+1); @@ -1188,29 +1209,26 @@ DPRINTK("%s\n", dev->nodename); - netif_free(info); - kfree(info); + netif_disconnect_backend(info); + free_netdev(info->netdev); return 0; } -static void netif_free(struct netfront_info *info) -{ - netif_disconnect_backend(info); - close_netdev(info); -} - - static void close_netdev(struct netfront_info *info) { - if (info->netdev) { + spin_lock_irq(&info->netdev->xmit_lock); + netif_stop_queue(info->netdev); + spin_unlock_irq(&info->netdev->xmit_lock); + #ifdef CONFIG_PROC_FS - xennet_proc_delif(info->netdev); + xennet_proc_delif(info->netdev); #endif - unregister_netdev(info->netdev); - info->netdev = NULL; - } + + del_timer_sync(&info->rx_refill_timer); + + unregister_netdev(info->netdev); } @@ -1219,21 +1237,28 @@ /* Stop old i/f to prevent errors whilst we rebuild the state. */ spin_lock_irq(&info->tx_lock); spin_lock(&info->rx_lock); - netif_stop_queue(info->netdev); - /* info->backend_state = BEST_DISCONNECTED; */ + info->backend_state = BEST_DISCONNECTED; spin_unlock(&info->rx_lock); spin_unlock_irq(&info->tx_lock); - + + if (info->irq) + unbind_from_irqhandler(info->irq, info->netdev); + info->evtchn = info->irq = 0; + end_access(info->tx_ring_ref, info->tx.sring); end_access(info->rx_ring_ref, info->rx.sring); info->tx_ring_ref = GRANT_INVALID_REF; info->rx_ring_ref = GRANT_INVALID_REF; info->tx.sring = NULL; info->rx.sring = NULL; - - if (info->irq) - unbind_from_irqhandler(info->irq, info->netdev); - info->evtchn = info->irq = 0; +} + + +static void netif_free(struct netfront_info *info) +{ + close_netdev(info); + netif_disconnect_backend(info); + free_netdev(info->netdev); } diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_dev.c Mon Jan 9 11:22:17 2006 @@ -111,7 +111,6 @@ struct xenbus_dev_data *u = filp->private_data; struct xenbus_dev_transaction *trans; void *reply; - int err = 0; if ((len + u->len) > sizeof(u->u.buffer)) return -EINVAL; @@ -136,41 +135,36 @@ case XS_RM: case XS_SET_PERMS: reply = xenbus_dev_request_and_reply(&u->u.msg); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - } else { - if (u->u.msg.type == XS_TRANSACTION_START) { - trans = kmalloc(sizeof(*trans), GFP_KERNEL); - trans->handle = (struct xenbus_transaction *) - simple_strtoul(reply, NULL, 0); - list_add(&trans->list, &u->transactions); - } else if (u->u.msg.type == XS_TRANSACTION_END) { - list_for_each_entry(trans, &u->transactions, - list) - if ((unsigned long)trans->handle == - (unsigned long)u->u.msg.tx_id) - break; - BUG_ON(&trans->list == &u->transactions); - list_del(&trans->list); - kfree(trans); - } - queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg)); - queue_reply(u, (char *)reply, u->u.msg.len); - kfree(reply); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + if (u->u.msg.type == XS_TRANSACTION_START) { + trans = kmalloc(sizeof(*trans), 
GFP_KERNEL); + if (!trans) + return -ENOMEM; + trans->handle = (struct xenbus_transaction *) + simple_strtoul(reply, NULL, 0); + list_add(&trans->list, &u->transactions); + } else if (u->u.msg.type == XS_TRANSACTION_END) { + list_for_each_entry(trans, &u->transactions, list) + if ((unsigned long)trans->handle == + (unsigned long)u->u.msg.tx_id) + break; + BUG_ON(&trans->list == &u->transactions); + list_del(&trans->list); + kfree(trans); } + queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg)); + queue_reply(u, (char *)reply, u->u.msg.len); + kfree(reply); break; default: - err = -EINVAL; - break; + return -EINVAL; } - if (err == 0) { - u->len = 0; - err = len; - } - - return err; + u->len = 0; + return len; } static int xenbus_dev_open(struct inode *inode, struct file *filp) diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Mon Jan 9 11:22:17 2006 @@ -542,14 +542,6 @@ const char *type, const char *nodename) { -#define CHECK_FAIL \ - do { \ - if (err) \ - goto fail; \ - } \ - while (0) \ - - int err; struct xenbus_device *xendev; size_t stringlen; @@ -584,19 +576,18 @@ xendev->dev.release = xenbus_dev_release; err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename); - CHECK_FAIL; + if (err) + goto fail; /* Register with generic device framework. */ err = device_register(&xendev->dev); - CHECK_FAIL; + if (err) + goto fail; device_create_file(&xendev->dev, &dev_attr_nodename); device_create_file(&xendev->dev, &dev_attr_devtype); return 0; - -#undef CHECK_FAIL - fail: xenbus_dev_free(xendev); return err; diff -r 25e3c8668f1f -r 8af1199488d3 linux-2.6-xen-sparse/include/asm-xen/asm-ia64/hypercall.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-ia64/hypercall.h Mon Jan 9 11:19:55 2006 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-ia64/hypercall.h Mon Jan 9 11:22:17 2006 @@ -372,7 +372,7 @@ int ret; __asm__ __volatile__ ( ";; mov r14=%2 ; mov r15=%3 ; mov r2=%1 ; break 0x1000 ;; mov %0=r8 ;;" : "=r" (ret) - : "i" (__HYPERVISOR_console_io), "r"(cmd), "r"(arg) + : "i" (__HYPERVISOR_memory_op), "r"(cmd), "r"(arg) : "r14","r15","r2","r8","memory" ); return ret; } diff -r 25e3c8668f1f -r 8af1199488d3 tools/Makefile --- a/tools/Makefile Mon Jan 9 11:19:55 2006 +++ b/tools/Makefile Mon Jan 9 11:22:17 2006 @@ -12,6 +12,7 @@ SUBDIRS += security SUBDIRS += console SUBDIRS += xenmon +SUBDIRS += guest-headers ifeq ($(VTPM_TOOLS),y) SUBDIRS += vtpm_manager SUBDIRS += vtpm diff -r 25e3c8668f1f -r 8af1199488d3 tools/Rules.mk --- a/tools/Rules.mk Mon Jan 9 11:19:55 2006 +++ b/tools/Rules.mk Mon Jan 9 11:22:17 2006 @@ -35,6 +35,8 @@ mk-symlinks: mkdir -p xen ( cd xen && ln -sf ../$(XEN_ROOT)/xen/include/public/*.h . ) + mkdir -p xen/hvm + ( cd xen/hvm && ln -sf ../../$(XEN_ROOT)/xen/include/public/hvm/*.h . ) mkdir -p xen/io ( cd xen/io && ln -sf ../../$(XEN_ROOT)/xen/include/public/io/*.h . 
) mkdir -p xen/linux diff -r 25e3c8668f1f -r 8af1199488d3 tools/debugger/libxendebug/xendebug.c --- a/tools/debugger/libxendebug/xendebug.c Mon Jan 9 11:19:55 2006 +++ b/tools/debugger/libxendebug/xendebug.c Mon Jan 9 11:22:17 2006 @@ -119,8 +119,8 @@ if ( !ctxt->valid[vcpu] ) { - if ( (rc = xc_domain_get_vcpu_context(xc_handle, domid, vcpu, - &ctxt->context[vcpu])) ) + if ( (rc = xc_vcpu_getcontext(xc_handle, domid, vcpu, + &ctxt->context[vcpu])) ) return NULL; ctxt->valid[vcpu] = true; @@ -139,10 +139,10 @@ return -EINVAL; op.interface_version = DOM0_INTERFACE_VERSION; - op.cmd = DOM0_SETDOMAININFO; - op.u.setdomaininfo.domain = ctxt->domid; - op.u.setdomaininfo.vcpu = vcpu; - op.u.setdomaininfo.ctxt = &ctxt->context[vcpu]; + op.cmd = DOM0_SETVCPUCONTEXT; + op.u.setvcpucontext.domain = ctxt->domid; + op.u.setvcpucontext.vcpu = vcpu; + op.u.setvcpucontext.ctxt = &ctxt->context[vcpu]; if ( (rc = mlock(&ctxt->context[vcpu], sizeof(vcpu_guest_context_t))) ) return rc; diff -r 25e3c8668f1f -r 8af1199488d3 tools/examples/xmexample.vmx --- a/tools/examples/xmexample.vmx Mon Jan 9 11:19:55 2006 +++ b/tools/examples/xmexample.vmx Mon Jan 9 11:22:17 2006 @@ -28,7 +28,13 @@ #----------------------------------------------------------------------------- # the number of cpus guest platform has, default=1 -vcpus=1 +#vcpus=1 + +# enable/disalbe vmx guest ACPI, default=0 (disabled) +#acpi=0 + +# enable/disalbe vmx guest APIC, default=0 (disabled) +#apic=0 # List of which CPUS this domain is allowed to use, default Xen picks #cpus = "" # leave to Xen to pick diff -r 25e3c8668f1f -r 8af1199488d3 tools/firmware/vmxassist/Makefile --- a/tools/firmware/vmxassist/Makefile Mon Jan 9 11:19:55 2006 +++ b/tools/firmware/vmxassist/Makefile Mon Jan 9 11:22:17 2006 @@ -24,7 +24,7 @@ # The emulator code lives in ROM space TEXTADDR=0x000D0000 -DEFINES=-DDEBUG -D_ACPI_ -DTEXTADDR=$(TEXTADDR) +DEFINES=-DDEBUG -DTEXTADDR=$(TEXTADDR) XENINC=-I$(XEN_ROOT)/tools/libxc LD = ld diff -r 25e3c8668f1f -r 8af1199488d3 tools/firmware/vmxassist/acpi_madt.c --- a/tools/firmware/vmxassist/acpi_madt.c Mon Jan 9 11:19:55 2006 +++ b/tools/firmware/vmxassist/acpi_madt.c Mon Jan 9 11:22:17 2006 @@ -17,30 +17,73 @@ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple * Place - Suite 330, Boston, MA 02111-1307 USA. */ + #include "../acpi/acpi2_0.h" #include "../acpi/acpi_madt.h" + +#include <xen/hvm/hvm_info_table.h> #define NULL ((void*)0) extern int puts(const char *s); -#define VCPU_NR_PAGE 0x0009F000 -#define VCPU_NR_OFFSET 0x00000800 -#define VCPU_MAGIC 0x76637075 /* "vcpu" */ +static struct hvm_info_table *table = NULL; -/* xc_vmx_builder wrote vcpu block at 0x9F800. Return it. */ -static int +static int validate_hvm_info(struct hvm_info_table *t) +{ + char signature[] = "HVM INFO"; + uint8_t *ptr = (uint8_t *)t; + uint8_t sum = 0; + int i; + + /* strncmp(t->signature, "HVM INFO", 8) */ + for (i = 0; i < 8; i++) { + if (signature[i] != t->signature[i]) { + puts("Bad hvm info signature\n"); + return 0; + } + } + + for (i = 0; i < t->length; i++) + sum += ptr[i]; + + return (sum == 0); +} + +/* xc_vmx_builder wrote hvm info at 0x9F800. Return it. 
*/ +static struct hvm_info_table * +get_hvm_info_table(void) +{ + struct hvm_info_table *t; + int i; + + if (table != NULL) + return table; + + t = (struct hvm_info_table *)HVM_INFO_PADDR; + + if (!validate_hvm_info(t)) { + puts("Bad hvm info table\n"); + return NULL; + } + + table = t; + + return table; +} + +int get_vcpu_nr(void) { - unsigned int *vcpus; + struct hvm_info_table *t = get_hvm_info_table(); + return (t ? t->nr_vcpus : 1); /* default 1 vcpu */ +} - vcpus = (unsigned int *)(VCPU_NR_PAGE + VCPU_NR_OFFSET); - if (vcpus[0] != VCPU_MAGIC) { - puts("Bad vcpus magic, set vcpu number to 1 by default.\n"); - return 1; - } - - return vcpus[1]; +int +get_acpi_enabled(void) +{ + struct hvm_info_table *t = get_hvm_info_table(); + return (t ? t->acpi_enabled : 0); /* default no acpi */ } static void * @@ -74,10 +117,10 @@ return madt; } -static void +static void set_checksum(void *start, int checksum_offset, int len) { - unsigned char sum = 0; + unsigned char sum = 0; unsigned char *ptr; ptr = start; @@ -89,9 +132,9 @@ ptr[checksum_offset] = -sum; } -static int +static int acpi_madt_set_local_apics( - int nr_vcpu, + int nr_vcpu, ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE *madt) { int i; @@ -104,14 +147,14 @@ madt->LocalApic[i].Length = sizeof (ACPI_LOCAL_APIC_STRUCTURE); madt->LocalApic[i].AcpiProcessorId = i; madt->LocalApic[i].ApicId = i; - madt->LocalApic[i].Flags = 1; + madt->LocalApic[i].Flags = 1; } madt->Header.Header.Length = - sizeof(ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE) - + sizeof(ACPI_MULTIPLE_APIC_DESCRIPTION_TABLE) - (MAX_VIRT_CPUS - nr_vcpu)* sizeof(ACPI_LOCAL_APIC_STRUCTURE); - return 0; + return 0; } #define FIELD_OFFSET(TYPE,Field) ((unsigned int)(&(((TYPE *) 0)->Field))) @@ -133,7 +176,7 @@ madt, FIELD_OFFSET(ACPI_TABLE_HEADER, Checksum), madt->Header.Header.Length); - return 0; + return 0; } /* diff -r 25e3c8668f1f -r 8af1199488d3 tools/firmware/vmxassist/vm86.h --- a/tools/firmware/vmxassist/vm86.h Mon Jan 9 11:19:55 2006 +++ b/tools/firmware/vmxassist/vm86.h Mon Jan 9 11:22:17 2006 @@ -24,7 +24,7 @@ #include <stdint.h> #endif -#include <xen/vmx_assist.h> +#include <xen/hvm/vmx_assist.h> #define NR_EXCEPTION_HANDLER 32 #define NR_INTERRUPT_HANDLERS 16 diff -r 25e3c8668f1f -r 8af1199488d3 tools/firmware/vmxassist/vmxloader.c --- a/tools/firmware/vmxassist/vmxloader.c Mon Jan 9 11:19:55 2006 +++ b/tools/firmware/vmxassist/vmxloader.c Mon Jan 9 11:22:17 2006 @@ -24,12 +24,10 @@ #include "machine.h" #include "roms.h" -#ifdef _ACPI_ #include "acpi.h" #include "../acpi/acpi2_0.h" // for ACPI_PHYSICAL_ADDRESS int acpi_madt_update(unsigned char* acpi_start); -#endif - +int get_acpi_enabled(void); /* * C runtime start off @@ -120,18 +118,17 @@ memcpy((void *)0xC0000, vgabios_stdvga, sizeof(vgabios_stdvga)); } -#ifdef _ACPI_ - puts("Loading ACPI ...\n"); - acpi_madt_update(acpi); - - if (ACPI_PHYSICAL_ADDRESS+sizeof(acpi) <= 0xF0000) { - /* make sure acpi table does not overlap rombios - * currently acpi less than 8K will be OK. - */ - memcpy((void *)ACPI_PHYSICAL_ADDRESS, acpi, sizeof(acpi)); + if (get_acpi_enabled() != 0) { + puts("Loading ACPI ...\n"); + acpi_madt_update((unsigned char*)acpi); + if (ACPI_PHYSICAL_ADDRESS+sizeof(acpi) <= 0xF0000) { + /* make sure acpi table does not overlap rombios + * currently acpi less than 8K will be OK. 
+ */ + memcpy((void *)ACPI_PHYSICAL_ADDRESS, acpi, sizeof(acpi)); + } } -#endif puts("Loading VMXAssist ...\n"); memcpy((void *)TEXTADDR, vmxassist, sizeof(vmxassist)); diff -r 25e3c8668f1f -r 8af1199488d3 tools/ioemu/hw/i8254.c --- a/tools/ioemu/hw/i8254.c Mon Jan 9 11:19:55 2006 +++ b/tools/ioemu/hw/i8254.c Mon Jan 9 11:22:17 2006 @@ -23,7 +23,7 @@ */ #include "vl.h" #include <xenctrl.h> -#include <xen/io/ioreq.h> +#include <xen/hvm/ioreq.h> //#define DEBUG_PIT diff -r 25e3c8668f1f -r 8af1199488d3 tools/ioemu/hw/i8259.c --- a/tools/ioemu/hw/i8259.c Mon Jan 9 11:19:55 2006 +++ b/tools/ioemu/hw/i8259.c Mon Jan 9 11:22:17 2006 @@ -23,7 +23,7 @@ */ #include "vl.h" #include <xenctrl.h> -#include <xen/io/ioreq.h> +#include <xen/hvm/ioreq.h> /* debug PIC */ //#define DEBUG_PIC diff -r 25e3c8668f1f -r 8af1199488d3 tools/ioemu/hw/i8259_stub.c --- a/tools/ioemu/hw/i8259_stub.c Mon Jan 9 11:19:55 2006 +++ b/tools/ioemu/hw/i8259_stub.c Mon Jan 9 11:22:17 2006 @@ -22,7 +22,7 @@ * THE SOFTWARE. */ #include "xenctrl.h" -#include <xen/io/ioreq.h> +#include <xen/hvm/ioreq.h> #include <stdio.h> #include "cpu.h" #include "cpu-all.h" diff -r 25e3c8668f1f -r 8af1199488d3 tools/ioemu/target-i386-dm/helper2.c --- a/tools/ioemu/target-i386-dm/helper2.c Mon Jan 9 11:19:55 2006 +++ b/tools/ioemu/target-i386-dm/helper2.c Mon Jan 9 11:22:17 2006 @@ -48,7 +48,7 @@ #include <sys/ioctl.h> #include <xenctrl.h> -#include <xen/io/ioreq.h> +#include <xen/hvm/ioreq.h> #include <xen/linux/evtchn.h> #include "cpu.h" diff -r 25e3c8668f1f -r 8af1199488d3 tools/ioemu/vl.c --- a/tools/ioemu/vl.c Mon Jan 9 11:19:55 2006 +++ b/tools/ioemu/vl.c Mon Jan 9 11:22:17 2006 @@ -2948,6 +2948,7 @@ case QEMU_OPTION_vcpus: vcpus = atoi(optarg); fprintf(logfile, "qemu: the number of cpus is %d\n", vcpus); + break; case QEMU_OPTION_pci: pci_enabled = 1; break; diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xc_core.c --- a/tools/libxc/xc_core.c Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xc_core.c Mon Jan 9 11:22:17 2006 @@ -55,7 +55,7 @@ } for (i = 0; i < info.max_vcpu_id; i++) - if (xc_domain_get_vcpu_context(xc_handle, domid, + if (xc_vcpu_getcontext(xc_handle, domid, i, &ctxt[nr_vcpus]) == 0) nr_vcpus++; diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xc_domain.c Mon Jan 9 11:22:17 2006 @@ -58,16 +58,16 @@ return do_dom0_op(xc_handle, &op); } -int xc_domain_pincpu(int xc_handle, - uint32_t domid, - int vcpu, - cpumap_t cpumap) -{ - DECLARE_DOM0_OP; - op.cmd = DOM0_PINCPUDOMAIN; - op.u.pincpudomain.domain = (domid_t)domid; - op.u.pincpudomain.vcpu = vcpu; - op.u.pincpudomain.cpumap = cpumap; +int xc_vcpu_setaffinity(int xc_handle, + uint32_t domid, + int vcpu, + cpumap_t cpumap) +{ + DECLARE_DOM0_OP; + op.cmd = DOM0_SETVCPUAFFINITY; + op.u.setvcpuaffinity.domain = (domid_t)domid; + op.u.setvcpuaffinity.vcpu = vcpu; + op.u.setvcpuaffinity.cpumap = cpumap; return do_dom0_op(xc_handle, &op); } @@ -155,7 +155,7 @@ return ret; } -int xc_domain_get_vcpu_context(int xc_handle, +int xc_vcpu_getcontext(int xc_handle, uint32_t domid, uint32_t vcpu, vcpu_guest_context_t *ctxt) @@ -345,10 +345,10 @@ return do_dom0_op(xc_handle, &op); } -int xc_domain_get_vcpu_info(int xc_handle, - uint32_t domid, - uint32_t vcpu, - xc_vcpuinfo_t *info) +int xc_vcpu_getinfo(int xc_handle, + uint32_t domid, + uint32_t vcpu, + xc_vcpuinfo_t *info) { int rc; DECLARE_DOM0_OP; @@ -380,18 +380,18 @@ return do_dom0_op(xc_handle, &op); } -int xc_domain_setinfo(int xc_handle, - uint32_t domid, - 
uint32_t vcpu, - vcpu_guest_context_t *ctxt) +int xc_vcpu_setcontext(int xc_handle, + uint32_t domid, + uint32_t vcpu, + vcpu_guest_context_t *ctxt) { dom0_op_t op; int rc; - op.cmd = DOM0_SETDOMAININFO; - op.u.setdomaininfo.domain = domid; - op.u.setdomaininfo.vcpu = vcpu; - op.u.setdomaininfo.ctxt = ctxt; + op.cmd = DOM0_SETVCPUCONTEXT; + op.u.setvcpucontext.domain = domid; + op.u.setvcpucontext.vcpu = vcpu; + op.u.setvcpucontext.ctxt = ctxt; if ( (rc = mlock(ctxt, sizeof(*ctxt))) != 0 ) return rc; @@ -402,6 +402,38 @@ return rc; +} + +int xc_domain_irq_permission(int xc_handle, + uint32_t domid, + uint8_t pirq, + uint8_t allow_access) +{ + dom0_op_t op; + + op.cmd = DOM0_IRQ_PERMISSION; + op.u.irq_permission.domain = domid; + op.u.irq_permission.pirq = pirq; + op.u.irq_permission.allow_access = allow_access; + + return do_dom0_op(xc_handle, &op); +} + +int xc_domain_iomem_permission(int xc_handle, + uint32_t domid, + unsigned long first_pfn, + unsigned long nr_pfns, + uint8_t allow_access) +{ + dom0_op_t op; + + op.cmd = DOM0_IOMEM_PERMISSION; + op.u.iomem_permission.domain = domid; + op.u.iomem_permission.first_pfn = first_pfn; + op.u.iomem_permission.nr_pfns = nr_pfns; + op.u.iomem_permission.allow_access = allow_access; + + return do_dom0_op(xc_handle, &op); } /* diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xc_ia64_stubs.c --- a/tools/libxc/xc_ia64_stubs.c Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xc_ia64_stubs.c Mon Jan 9 11:22:17 2006 @@ -5,7 +5,7 @@ #include <stdlib.h> #include <zlib.h> #include "xen/arch-ia64.h" -#include <xen/io/ioreq.h> +#include <xen/hvm/ioreq.h> /* this is a very ugly way of getting FPSR_DEFAULT. struct ia64_fpreg is * mysteriously declared in two places: /usr/include/asm/fpu.h and @@ -627,6 +627,7 @@ unsigned int control_evtchn, unsigned int lapic, unsigned int vcpus, + unsigned int acpi, unsigned int store_evtchn, unsigned long *store_mfn) { @@ -663,7 +664,7 @@ goto error_out; } - if ( xc_domain_get_vcpu_context(xc_handle, domid, 0, ctxt) ){ + if ( xc_vcpu_getcontext(xc_handle, domid, 0, ctxt) ){ PERROR("Could not get vcpu context"); goto error_out; } @@ -687,11 +688,11 @@ memset( &launch_op, 0, sizeof(launch_op) ); - launch_op.u.setdomaininfo.domain = (domid_t)domid; - launch_op.u.setdomaininfo.vcpu = 0; - launch_op.u.setdomaininfo.ctxt = ctxt; - - launch_op.cmd = DOM0_SETDOMAININFO; + launch_op.u.setvcpucontext.domain = (domid_t)domid; + launch_op.u.setvcpucontext.vcpu = 0; + launch_op.u.setvcpucontext.ctxt = ctxt; + + launch_op.cmd = DOM0_SETVCPUCONTEXT; rc = do_dom0_op(xc_handle, &launch_op); return rc; diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xc_linux_build.c --- a/tools/libxc/xc_linux_build.c Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xc_linux_build.c Mon Jan 9 11:22:17 2006 @@ -393,10 +393,14 @@ start_info->store_evtchn = store_evtchn; start_info->console_mfn = nr_pages - 1; start_info->console_evtchn = console_evtchn; + start_info->nr_pages = nr_pages; // FIXME?: nr_pages - 2 ???? 
if ( initrd_len != 0 ) { ctxt->initrd.start = vinitrd_start; ctxt->initrd.size = initrd_len; + } else { + ctxt->initrd.start = 0; + ctxt->initrd.size = 0; } strncpy((char *)ctxt->cmdline, cmdline, IA64_COMMAND_LINE_SIZE); ctxt->cmdline[IA64_COMMAND_LINE_SIZE-1] = '\0'; @@ -790,7 +794,7 @@ goto error_out; } - if ( xc_domain_get_vcpu_context(xc_handle, domid, 0, ctxt) ) + if ( xc_vcpu_getcontext(xc_handle, domid, 0, ctxt) ) { PERROR("Could not get vcpu context"); goto error_out; @@ -893,11 +897,11 @@ memset( &launch_op, 0, sizeof(launch_op) ); - launch_op.u.setdomaininfo.domain = (domid_t)domid; - launch_op.u.setdomaininfo.vcpu = 0; - launch_op.u.setdomaininfo.ctxt = ctxt; - - launch_op.cmd = DOM0_SETDOMAININFO; + launch_op.u.setvcpucontext.domain = (domid_t)domid; + launch_op.u.setvcpucontext.vcpu = 0; + launch_op.u.setvcpucontext.ctxt = ctxt; + + launch_op.cmd = DOM0_SETVCPUCONTEXT; rc = xc_dom0_op(xc_handle, &launch_op); return rc; diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xc_linux_restore.c --- a/tools/libxc/xc_linux_restore.c Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xc_linux_restore.c Mon Jan 9 11:22:17 2006 @@ -171,7 +171,7 @@ /* Only have to worry about vcpu 0 even for SMP */ - if (xc_domain_get_vcpu_context( xc_handle, dom, 0, &ctxt)) { + if (xc_vcpu_getcontext( xc_handle, dom, 0, &ctxt)) { ERR("Could not get vcpu context"); goto out; } @@ -735,10 +735,10 @@ DPRINTF("Domain ready to be built.\n"); - op.cmd = DOM0_SETDOMAININFO; - op.u.setdomaininfo.domain = (domid_t)dom; - op.u.setdomaininfo.vcpu = 0; - op.u.setdomaininfo.ctxt = &ctxt; + op.cmd = DOM0_SETVCPUCONTEXT; + op.u.setvcpucontext.domain = (domid_t)dom; + op.u.setvcpucontext.vcpu = 0; + op.u.setvcpucontext.ctxt = &ctxt; rc = xc_dom0_op(xc_handle, &op); if (rc != 0) { diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xc_linux_save.c --- a/tools/libxc/xc_linux_save.c Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xc_linux_save.c Mon Jan 9 11:22:17 2006 @@ -382,7 +382,7 @@ return -1; } - if ( xc_domain_get_vcpu_context(xc_handle, dom, 0 /* XXX */, ctxt)) + if ( xc_vcpu_getcontext(xc_handle, dom, 0 /* XXX */, ctxt)) ERR("Could not get vcpu context"); @@ -643,7 +643,7 @@ } /* Only have to worry about vcpu 0 even for SMP */ - if (xc_domain_get_vcpu_context(xc_handle, dom, 0, &ctxt)) { + if (xc_vcpu_getcontext(xc_handle, dom, 0, &ctxt)) { ERR("Could not get vcpu context"); goto out; } diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xc_pagetab.c --- a/tools/libxc/xc_pagetab.c Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xc_pagetab.c Mon Jan 9 11:22:17 2006 @@ -74,7 +74,7 @@ #define pt_levels 4 #endif - if (xc_domain_get_vcpu_context(xc_handle, dom, vcpu, &ctx) != 0) { + if (xc_vcpu_getcontext(xc_handle, dom, vcpu, &ctx) != 0) { fprintf(stderr, "failed to retreive vcpu context\n"); goto out; } diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xc_ptrace.c --- a/tools/libxc/xc_ptrace.c Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xc_ptrace.c Mon Jan 9 11:22:17 2006 @@ -33,7 +33,7 @@ if (online) *online = 0; if ( !(regs_valid & (1 << cpu)) ) { - retval = xc_domain_get_vcpu_context(xc_handle, current_domid, + retval = xc_vcpu_getcontext(xc_handle, current_domid, cpu, &ctxt[cpu]); if ( retval ) goto done; @@ -43,8 +43,7 @@ if ( online == NULL ) goto done; - retval = xc_domain_get_vcpu_info(xc_handle, current_domid, - cpu, &info); + retval = xc_vcpu_getinfo(xc_handle, current_domid, cpu, &info); *online = info.online; done: @@ -395,7 +394,7 @@ case PTRACE_SETREGS: SET_XC_REGS(((struct gdb_regs *)data), ctxt[cpu].user_regs); - retval = 
xc_domain_setinfo(xc_handle, current_domid, cpu, &ctxt[cpu]); + retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu, &ctxt[cpu]); if (retval) goto error_out; break; @@ -405,7 +404,7 @@ * during single-stepping - but that just seems retarded */ ctxt[cpu].user_regs.eflags |= PSL_T; - retval = xc_domain_setinfo(xc_handle, current_domid, cpu, &ctxt[cpu]); + retval = xc_vcpu_setcontext(xc_handle, current_domid, cpu, &ctxt[cpu]); if ( retval ) { perror("dom0 op failed"); @@ -423,8 +422,8 @@ /* Clear trace flag */ if ( ctxt[cpu].user_regs.eflags & PSL_T ) { ctxt[cpu].user_regs.eflags &= ~PSL_T; - retval = xc_domain_setinfo(xc_handle, current_domid, - cpu, &ctxt[cpu]); + retval = xc_vcpu_setcontext(xc_handle, current_domid, + cpu, &ctxt[cpu]); if ( retval ) { perror("dom0 op failed"); goto error_out; diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xc_vmx_build.c --- a/tools/libxc/xc_vmx_build.c Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xc_vmx_build.c Mon Jan 9 11:22:17 2006 @@ -9,7 +9,8 @@ #include <stdlib.h> #include <unistd.h> #include <zlib.h> -#include <xen/io/ioreq.h> +#include <xen/hvm/hvm_info_table.h> +#include <xen/hvm/ioreq.h> #define VMX_LOADER_ENTR_ADDR 0x00100000 @@ -33,9 +34,6 @@ #define E820_MAP_NR_OFFSET 0x000001E8 #define E820_MAP_OFFSET 0x000002D0 -#define VCPU_NR_PAGE 0x0009F000 -#define VCPU_NR_OFFSET 0x00000800 - struct e820entry { uint64_t addr; uint64_t size; @@ -119,26 +117,50 @@ return (*(((unsigned char *)e820_page) + E820_MAP_NR_OFFSET) = nr_map); } +static void +set_hvm_info_checksum(struct hvm_info_table *t) +{ + uint8_t *ptr = (uint8_t *)t, sum = 0; + unsigned int i; + + t->checksum = 0; + + for (i = 0; i < t->length; i++) + sum += *ptr++; + + t->checksum = -sum; +} + /* - * Use E820 reserved memory 0x9F800 to pass number of vcpus to vmxloader - * vmxloader will use it to config ACPI MADT table + * Use E820 reserved memory 0x9F800 to pass HVM info to vmxloader + * vmxloader will use this info to set BIOS accordingly */ -#define VCPU_MAGIC 0x76637075 /* "vcpu" */ -static int set_vcpu_nr(int xc_handle, uint32_t dom, - unsigned long *pfn_list, unsigned int vcpus) -{ - char *va_map; - unsigned int *va_vcpus; - - va_map = xc_map_foreign_range(xc_handle, dom, - PAGE_SIZE, PROT_READ|PROT_WRITE, - pfn_list[VCPU_NR_PAGE >> PAGE_SHIFT]); +static int set_hvm_info(int xc_handle, uint32_t dom, + unsigned long *pfn_list, unsigned int vcpus, + unsigned int acpi, unsigned int apic) +{ + char *va_map; + struct hvm_info_table *va_hvm; + + va_map = xc_map_foreign_range( + xc_handle, + dom, + PAGE_SIZE, + PROT_READ|PROT_WRITE, + pfn_list[HVM_INFO_PFN]); + if ( va_map == NULL ) return -1; - va_vcpus = (unsigned int *)(va_map + VCPU_NR_OFFSET); - va_vcpus[0] = VCPU_MAGIC; - va_vcpus[1] = vcpus; + va_hvm = (struct hvm_info_table *)(va_map + HVM_INFO_OFFSET); + memset(va_hvm, 0, sizeof(*va_hvm)); + strncpy(va_hvm->signature, "HVM INFO", 8); + va_hvm->length = sizeof(struct hvm_info_table); + va_hvm->acpi_enabled = acpi; + va_hvm->apic_enabled = apic; + va_hvm->nr_vcpus = vcpus; + + set_hvm_info_checksum(va_hvm); munmap(va_map, PAGE_SIZE); @@ -279,8 +301,9 @@ vcpu_guest_context_t *ctxt, unsigned long shared_info_frame, unsigned int control_evtchn, - unsigned int lapic, unsigned int vcpus, + unsigned int acpi, + unsigned int apic, unsigned int store_evtchn, unsigned long *store_mfn) { @@ -490,20 +513,14 @@ goto error_out; } - if (set_vcpu_nr(xc_handle, dom, page_array, vcpus)) { - fprintf(stderr, "Couldn't set vcpu number for VMX guest.\n"); - goto error_out; - } - - *store_mfn = 
page_array[(v_end-2) >> PAGE_SHIFT]; - if ( xc_clear_domain_page(xc_handle, dom, *store_mfn) ) - goto error_out; - - shared_page_frame = (v_end - PAGE_SIZE) >> PAGE_SHIFT; - - if ((e820_page = xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, - page_array[E820_MAP_PAGE >> PAGE_SHIFT])) == 0) + if ( set_hvm_info(xc_handle, dom, page_array, vcpus, acpi, apic) ) { + fprintf(stderr, "Couldn't set hvm info for VMX guest.\n"); + goto error_out; + } + + if ( (e820_page = xc_map_foreign_range( + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, + page_array[E820_MAP_PAGE >> PAGE_SHIFT])) == 0 ) goto error_out; memset(e820_page, 0, PAGE_SIZE); e820_map_nr = build_e820map(e820_page, v_end); @@ -518,25 +535,29 @@ munmap(e820_page, PAGE_SIZE); /* shared_info page starts its life empty. */ - if ((shared_info = xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, - shared_info_frame)) == 0) + if ( (shared_info = xc_map_foreign_range( + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, + shared_info_frame)) == 0 ) goto error_out; memset(shared_info, 0, sizeof(shared_info_t)); /* Mask all upcalls... */ for ( i = 0; i < MAX_VIRT_CPUS; i++ ) shared_info->vcpu_info[i].evtchn_upcall_mask = 1; - munmap(shared_info, PAGE_SIZE); /* Populate the event channel port in the shared page */ - if ((sp = (shared_iopage_t *) xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, - page_array[shared_page_frame])) == 0) + shared_page_frame = page_array[(v_end >> PAGE_SHIFT) - 1]; + if ( (sp = (shared_iopage_t *) xc_map_foreign_range( + xc_handle, dom, PAGE_SIZE, PROT_READ|PROT_WRITE, + shared_page_frame)) == 0 ) goto error_out; memset(sp, 0, PAGE_SIZE); sp->sp_global.eport = control_evtchn; munmap(sp, PAGE_SIZE); + + *store_mfn = page_array[(v_end >> PAGE_SHIFT) - 2]; + if ( xc_clear_domain_page(xc_handle, dom, *store_mfn) ) + goto error_out; /* Send the page update requests down to the hypervisor. 
*/ if ( xc_finish_mmu_updates(xc_handle, mmu) ) @@ -559,7 +580,7 @@ ctxt->user_regs.eax = 0; ctxt->user_regs.esp = 0; ctxt->user_regs.ebx = 0; /* startup_32 expects this to be 0 to signal boot cpu */ - ctxt->user_regs.ecx = lapic; + ctxt->user_regs.ecx = 0; ctxt->user_regs.esi = 0; ctxt->user_regs.edi = 0; ctxt->user_regs.ebp = 0; @@ -572,29 +593,6 @@ free(mmu); free(page_array); return -1; -} - -#define VMX_FEATURE_FLAG 0x20 - -static int vmx_identify(void) -{ - int eax, ecx; - - __asm__ __volatile__ ( -#if defined(__i386__) - "push %%ebx; cpuid; pop %%ebx" -#elif defined(__x86_64__) - "push %%rbx; cpuid; pop %%rbx" -#endif - : "=a" (eax), "=c" (ecx) - : "0" (1) - : "dx"); - - if (!(ecx & VMX_FEATURE_FLAG)) { - return -1; - } - - return 0; } int xc_vmx_build(int xc_handle, @@ -602,8 +600,9 @@ int memsize, const char *image_name, unsigned int control_evtchn, - unsigned int lapic, unsigned int vcpus, + unsigned int acpi, + unsigned int apic, unsigned int store_evtchn, unsigned long *store_mfn) { @@ -613,10 +612,18 @@ unsigned long nr_pages; char *image = NULL; unsigned long image_size; - - if ( vmx_identify() < 0 ) - { - PERROR("CPU doesn't support VMX Extensions"); + xen_capabilities_info_t xen_caps; + + if ( (rc = xc_version(xc_handle, XENVER_capabilities, &xen_caps)) != 0 ) + { + PERROR("Failed to get xen version info"); + goto error_out; + } + + if ( !strstr(xen_caps, "hvm") ) + { + PERROR("CPU doesn't support VMX Extensions or " + "CPU VMX Extensions are not turned on"); goto error_out; } @@ -644,7 +651,7 @@ goto error_out; } - if ( xc_domain_get_vcpu_context(xc_handle, domid, 0, ctxt) ) + if ( xc_vcpu_getcontext(xc_handle, domid, 0, ctxt) ) { PERROR("Could not get vcpu context"); goto error_out; @@ -659,7 +666,7 @@ if ( setup_guest(xc_handle, domid, memsize, image, image_size, nr_pages, ctxt, op.u.getdomaininfo.shared_info_frame, control_evtchn, - lapic, vcpus, store_evtchn, store_mfn) < 0) + vcpus, acpi, apic, store_evtchn, store_mfn) < 0) { ERROR("Error constructing guest OS"); goto error_out; @@ -701,11 +708,11 @@ memset( &launch_op, 0, sizeof(launch_op) ); - launch_op.u.setdomaininfo.domain = (domid_t)domid; - launch_op.u.setdomaininfo.vcpu = 0; - launch_op.u.setdomaininfo.ctxt = ctxt; - - launch_op.cmd = DOM0_SETDOMAININFO; + launch_op.u.setvcpucontext.domain = (domid_t)domid; + launch_op.u.setvcpucontext.vcpu = 0; + launch_op.u.setvcpucontext.ctxt = ctxt; + + launch_op.cmd = DOM0_SETVCPUCONTEXT; rc = xc_dom0_op(xc_handle, &launch_op); return rc; diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xenctrl.h Mon Jan 9 11:22:17 2006 @@ -181,10 +181,11 @@ */ int xc_domain_destroy(int xc_handle, uint32_t domid); -int xc_domain_pincpu(int xc_handle, - uint32_t domid, - int vcpu, - cpumap_t cpumap); + +int xc_vcpu_setaffinity(int xc_handle, + uint32_t domid, + int vcpu, + cpumap_t cpumap); /** * This function will return information about one or more domains. It is @@ -208,7 +209,7 @@ /** - * This function will set the vcpu context for the specified domain. + * This function will set the execution context for the specified vcpu. 
* * @parm xc_handle a handle to an open hypervisor interface * @parm domid the domain to set the vcpu context for @@ -216,10 +217,10 @@ * @parm ctxt pointer to the the cpu context with the values to set * @return the number of domains enumerated or -1 on error */ -int xc_domain_setinfo(int xc_handle, - uint32_t domid, - uint32_t vcpu, - vcpu_guest_context_t *ctxt); +int xc_vcpu_setcontext(int xc_handle, + uint32_t domid, + uint32_t vcpu, + vcpu_guest_context_t *ctxt); /** * This function will return information about one or more domains, using a * single hypercall. The domain information will be stored into the supplied @@ -249,17 +250,16 @@ * domain * @return 0 on success, -1 on failure */ -int xc_domain_get_vcpu_context(int xc_handle, +int xc_vcpu_getcontext(int xc_handle, uint32_t domid, uint32_t vcpu, vcpu_guest_context_t *ctxt); typedef dom0_getvcpuinfo_t xc_vcpuinfo_t; -int xc_domain_get_vcpu_info(int xc_handle, - uint32_t domid, - uint32_t vcpu, - xc_vcpuinfo_t *info); - +int xc_vcpu_getinfo(int xc_handle, + uint32_t domid, + uint32_t vcpu, + xc_vcpuinfo_t *info); int xc_domain_setcpuweight(int xc_handle, uint32_t domid, @@ -379,6 +379,17 @@ uint32_t first_port, uint32_t nr_ports, uint32_t allow_access); + +int xc_domain_irq_permission(int xc_handle, + uint32_t domid, + uint8_t pirq, + uint8_t allow_access); + +int xc_domain_iomem_permission(int xc_handle, + uint32_t domid, + unsigned long first_pfn, + unsigned long nr_pfns, + uint8_t allow_access); unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid, unsigned long mfn); diff -r 25e3c8668f1f -r 8af1199488d3 tools/libxc/xenguest.h --- a/tools/libxc/xenguest.h Mon Jan 9 11:19:55 2006 +++ b/tools/libxc/xenguest.h Mon Jan 9 11:22:17 2006 @@ -56,8 +56,9 @@ int memsize, const char *image_name, unsigned int control_evtchn, - unsigned int lapic, unsigned int vcpus, + unsigned int acpi, + unsigned int apic, unsigned int store_evtchn, unsigned long *store_mfn); diff -r 25e3c8668f1f -r 8af1199488d3 tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Mon Jan 9 11:19:55 2006 +++ b/tools/python/xen/lowlevel/xc/xc.c Mon Jan 9 11:22:17 2006 @@ -135,9 +135,9 @@ } -static PyObject *pyxc_domain_pincpu(XcObject *self, - PyObject *args, - PyObject *kwds) +static PyObject *pyxc_vcpu_setaffinity(XcObject *self, + PyObject *args, + PyObject *kwds) { uint32_t dom; int vcpu = 0, i; @@ -157,7 +157,7 @@ cpumap |= (cpumap_t)1 << PyInt_AsLong(PyList_GetItem(cpulist, i)); } - if ( xc_domain_pincpu(self->xc_handle, dom, vcpu, cpumap) != 0 ) + if ( xc_vcpu_setaffinity(self->xc_handle, dom, vcpu, cpumap) != 0 ) return PyErr_SetFromErrno(xc_error); Py_INCREF(zero); @@ -297,7 +297,7 @@ &dom, &vcpu) ) return NULL; - rc = xc_domain_get_vcpu_info(self->xc_handle, dom, vcpu, &info); + rc = xc_vcpu_getinfo(self->xc_handle, dom, vcpu, &info); if ( rc < 0 ) return PyErr_SetFromErrno(xc_error); @@ -362,21 +362,23 @@ uint32_t dom; char *image; int control_evtchn, store_evtchn; + int memsize; int vcpus = 1; - int lapic = 0; - int memsize; + int acpi = 0; + int apic = 0; unsigned long store_mfn = 0; static char *kwd_list[] = { "dom", "control_evtchn", "store_evtchn", - "memsize", "image", "lapic", "vcpus", NULL }; - - if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiisii", kwd_list, + "memsize", "image", "vcpus", "acpi", "apic", + NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiisiii", kwd_list, &dom, &control_evtchn, &store_evtchn, - &memsize, &image, &lapic, &vcpus) ) + &memsize, &image, &vcpus, &acpi, &apic) ) return NULL; if 
( xc_vmx_build(self->xc_handle, dom, memsize, image, control_evtchn, - lapic, vcpus, store_evtchn, &store_mfn) != 0 ) + vcpus, acpi, apic, store_evtchn, &store_mfn) != 0 ) return PyErr_SetFromErrno(xc_error); return Py_BuildValue("{s:i}", "store_mfn", store_mfn); @@ -774,6 +776,52 @@ return zero; } +static PyObject *pyxc_domain_irq_permission(PyObject *self, + PyObject *args, + PyObject *kwds) +{ + XcObject *xc = (XcObject *)self; + uint32_t dom; + int pirq, allow_access, ret; + + static char *kwd_list[] = { "dom", "pirq", "allow_access", NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iii", kwd_list, + &dom, &pirq, &allow_access) ) + return NULL; + + ret = xc_domain_irq_permission( + xc->xc_handle, dom, pirq, allow_access); + if ( ret != 0 ) + return PyErr_SetFromErrno(xc_error); + + Py_INCREF(zero); + return zero; +} + +static PyObject *pyxc_domain_iomem_permission(PyObject *self, + PyObject *args, + PyObject *kwds) +{ + XcObject *xc = (XcObject *)self; + uint32_t dom; + unsigned long first_pfn, nr_pfns, allow_access, ret; + + static char *kwd_list[] = { "dom", "first_pfn", "nr_pfns", "allow_access", NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "illi", kwd_list, + &dom, &first_pfn, &nr_pfns, &allow_access) ) + return NULL; + + ret = xc_domain_iomem_permission( + xc->xc_handle, dom, first_pfn, nr_pfns, allow_access); + if ( ret != 0 ) + return PyErr_SetFromErrno(xc_error); + + Py_INCREF(zero); + return zero; +} + static PyObject *dom_op(XcObject *self, PyObject *args, int (*fn)(int, uint32_t)) @@ -842,8 +890,8 @@ " dom [int]: Identifier of domain to be destroyed.\n\n" "Returns: [int] 0 on success; -1 on error.\n" }, - { "domain_pincpu", - (PyCFunction)pyxc_domain_pincpu, + { "vcpu_setaffinity", + (PyCFunction)pyxc_vcpu_setaffinity, METH_VARARGS | METH_KEYWORDS, "\n" "Pin a VCPU to a specified set CPUs.\n" " dom [int]: Identifier of domain to which VCPU belongs.\n" @@ -1067,6 +1115,25 @@ " dom [int]: Identifier of domain to be allowed access.\n" " first_port [int]: First IO port\n" " nr_ports [int]: Number of IO ports\n" + " allow_access [int]: Non-zero means enable access; else disable access\n\n" + "Returns: [int] 0 on success; -1 on error.\n" }, + + { "domain_irq_permission", + (PyCFunction)pyxc_domain_irq_permission, + METH_VARARGS | METH_KEYWORDS, "\n" + "Allow a domain access to a physical IRQ\n" + " dom [int]: Identifier of domain to be allowed access.\n" + " pirq [int]: The Physical IRQ\n" + " allow_access [int]: Non-zero means enable access; else disable access\n\n" + "Returns: [int] 0 on success; -1 on error.\n" }, + + { "domain_iomem_permission", + (PyCFunction)pyxc_domain_iomem_permission, + METH_VARARGS | METH_KEYWORDS, "\n" + "Allow a domain access to a range of IO memory pages\n" + " dom [int]: Identifier of domain to be allowed access.\n" + " first_pfn [long]: First page of I/O Memory\n" + " nr_pfns [long]: Number of pages of I/O Memory (>0)\n" " allow_access [int]: Non-zero means enable access; else disable access\n\n" "Returns: [int] 0 on success; -1 on error.\n" }, diff -r 25e3c8668f1f -r 8af1199488d3 tools/python/xen/xend/XendDomain.py --- a/tools/python/xen/xend/XendDomain.py Mon Jan 9 11:19:55 2006 +++ b/tools/python/xen/xend/XendDomain.py Mon Jan 9 11:22:17 2006 @@ -443,7 +443,7 @@ cpumap = map(lambda x: int(x), cpumap.replace("[", "").replace("]", "").split(",")) try: - return xc.domain_pincpu(dominfo.getDomid(), vcpu, cpumap) + return xc.vcpu_setaffinity(dominfo.getDomid(), vcpu, cpumap) except Exception, ex: raise XendError(str(ex)) diff -r 
25e3c8668f1f -r 8af1199488d3 tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Mon Jan 9 11:19:55 2006 +++ b/tools/python/xen/xend/XendDomainInfo.py Mon Jan 9 11:22:17 2006 @@ -1179,7 +1179,7 @@ for v in range(0, self.info['max_vcpu_id']+1): # pincpu takes a list of ints cpu = [ int( cpus[v % len(cpus)] ) ] - xc.domain_pincpu(self.domid, v, cpu) + xc.vcpu_setaffinity(self.domid, v, cpu) m = self.image.getDomainMemory(self.info['memory'] * 1024) balloon.free(m) diff -r 25e3c8668f1f -r 8af1199488d3 tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Mon Jan 9 11:19:55 2006 +++ b/tools/python/xen/xend/image.py Mon Jan 9 11:22:17 2006 @@ -189,11 +189,16 @@ def configure(self, imageConfig, deviceConfig): ImageHandler.configure(self, imageConfig, deviceConfig) + info = xc.xeninfo() + if not 'hvm' in info['xen_caps']: + raise VmError("vmx: not an Intel VT platform, we stop creating!") + self.dmargs = self.parseDeviceModelArgs(imageConfig, deviceConfig) self.device_model = sxp.child_value(imageConfig, 'device_model') if not self.device_model: raise VmError("vmx: missing device model") self.display = sxp.child_value(imageConfig, 'display') + self.xauthority = sxp.child_value(imageConfig, 'xauthority') self.vm.storeVm(("image/dmargs", " ".join(self.dmargs)), ("image/device-model", self.device_model), @@ -204,10 +209,8 @@ self.dmargs += self.configVNC(imageConfig) - self.lapic = 0 - lapic = sxp.child_value(imageConfig, 'lapic') - if not lapic is None: - self.lapic = int(lapic) + self.acpi = int(sxp.child_value(imageConfig, 'acpi', 0)) + self.apic = int(sxp.child_value(imageConfig, 'apic', 0)) def buildDomain(self): # Create an event channel @@ -222,17 +225,18 @@ log.debug("control_evtchn = %d", self.device_channel) log.debug("store_evtchn = %d", store_evtchn) log.debug("memsize = %d", self.vm.getMemoryTarget() / 1024) - log.debug("lapic = %d", self.lapic) log.debug("vcpus = %d", self.vm.getVCpuCount()) + log.debug("acpi = %d", self.acpi) + log.debug("apic = %d", self.apic) return xc.vmx_build(dom = self.vm.getDomid(), image = self.kernel, control_evtchn = self.device_channel, store_evtchn = store_evtchn, memsize = self.vm.getMemoryTarget() / 1024, - lapic = self.lapic, - vcpus = self.vm.getVCpuCount()) - + vcpus = self.vm.getVCpuCount(), + acpi = self.acpi, + apic = self.apic) # Return a list of cmd line args to the device models based on the # xm config file @@ -264,44 +268,44 @@ nics = 0 for (name, info) in deviceConfig: if name == 'vbd': - uname = sxp.child_value(info, 'uname') - typedev = sxp.child_value(info, 'dev') - (_, vbdparam) = string.split(uname, ':', 1) - if re.match('^ioemu:', typedev): - (emtype, vbddev) = string.split(typedev, ':', 1) - else: - emtype = 'vbd' - vbddev = typedev - if emtype != 'ioemu': - continue; - vbddev_list = ['hda', 'hdb', 'hdc', 'hdd'] - if vbddev not in vbddev_list: - raise VmError("vmx: for qemu vbd type=file&dev=hda~hdd") - ret.append("-%s" % vbddev) - ret.append("%s" % vbdparam) + uname = sxp.child_value(info, 'uname') + typedev = sxp.child_value(info, 'dev') + (_, vbdparam) = string.split(uname, ':', 1) + if 'ioemu:' in typedev: + (emtype, vbddev) = string.split(typedev, ':', 1) + else: + emtype = 'vbd' + vbddev = typedev + if emtype == 'vbd': + continue; + vbddev_list = ['hda', 'hdb', 'hdc', 'hdd'] + if vbddev not in vbddev_list: + raise VmError("vmx: for qemu vbd type=file&dev=hda~hdd") + ret.append("-%s" % vbddev) + ret.append("%s" % vbdparam) if name == 'vif': - type = sxp.child_value(info, 'type') - if 
type != 'ioemu': - continue - nics += 1 - if mac != None: - continue - mac = sxp.child_value(info, 'mac') - bridge = sxp.child_value(info, 'bridge') - if mac == None: - mac = randomMAC() - if bridge == None: - bridge = 'xenbr0' - ret.append("-macaddr") - ret.append("%s" % mac) - ret.append("-bridge") - ret.append("%s" % bridge) + type = sxp.child_value(info, 'type') + if type != 'ioemu': + continue + nics += 1 + if mac != None: + continue + mac = sxp.child_value(info, 'mac') + bridge = sxp.child_value(info, 'bridge') + if mac == None: + mac = randomMAC() + if bridge == None: + bridge = 'xenbr0' + ret.append("-macaddr") + ret.append("%s" % mac) + ret.append("-bridge") + ret.append("%s" % bridge) if name == 'vtpm': - instance = sxp.child_value(info, 'pref_instance') - ret.append("-instance") - ret.append("%s" % instance) + instance = sxp.child_value(info, 'pref_instance') + ret.append("-instance") + ret.append("%s" % instance) ret.append("-nics") - ret.append("%d" % nics) + ret.append("%d" % nics) return ret def configVNC(self, config): @@ -340,6 +344,8 @@ env = dict(os.environ) if self.display: env['DISPLAY'] = self.display + if self.xauthority: + env['XAUTHORITY'] = self.xauthority log.info("spawning device models: %s %s", self.device_model, args) self.pid = os.spawnve(os.P_NOWAIT, self.device_model, args, env) log.info("device model pid: %d", self.pid) diff -r 25e3c8668f1f -r 8af1199488d3 tools/python/xen/xend/server/blkif.py --- a/tools/python/xen/xend/server/blkif.py Mon Jan 9 11:19:55 2006 +++ b/tools/python/xen/xend/server/blkif.py Mon Jan 9 11:22:17 2006 @@ -31,7 +31,7 @@ """Block device interface controller. Handles all block devices for a domain. """ - + def __init__(self, vm): """Create a block device controller. """ @@ -40,9 +40,9 @@ def getDeviceDetails(self, config): """@see DevController.getDeviceDetails""" - + dev = sxp.child_value(config, 'dev') - if re.match('^ioemu:', dev): + if 'ioemu:' in dev: return (None,{},{}) devid = blkif.blkdev_name_to_number(dev) diff -r 25e3c8668f1f -r 8af1199488d3 tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Mon Jan 9 11:19:55 2006 +++ b/tools/python/xen/xm/create.py Mon Jan 9 11:22:17 2006 @@ -160,9 +160,13 @@ fn=set_int, default=None, use="CPUS to run the domain on.") -gopts.var('lapic', val='LAPIC', +gopts.var('acpi', val='ACPI', fn=set_int, default=0, - use="Disable or enable local APIC of VMX domain.") + use="Disable or enable ACPI of VMX domain.") + +gopts.var('apic', val='APIC', + fn=set_int, default=0, + use="Disable or enable APIC of VMX domain.") gopts.var('vcpus', val='VCPUS', fn=set_int, default=1, @@ -387,6 +391,10 @@ gopts.var('display', val='DISPLAY', fn=set_value, default=None, use="X11 display to use") + +gopts.var('xauthority', val='XAUTHORITY', + fn=set_value, default=None, + use="X11 Authority to use") def err(msg): @@ -526,7 +534,8 @@ """ args = [ 'device_model', 'vcpus', 'cdrom', 'boot', 'fda', 'fdb', 'localtime', 'serial', 'stdvga', 'isa', 'nographic', 'audio', - 'vnc', 'vncviewer', 'sdl', 'display', 'ne2000', 'lapic'] + 'vnc', 'vncviewer', 'sdl', 'display', 'ne2000', 'acpi', 'apic', + 'xauthority' ] for a in args: if (vals.__dict__[a]): config_image.append([a, vals.__dict__[a]]) @@ -801,6 +810,9 @@ if not gopts.vals.display: gopts.vals.display = os.getenv("DISPLAY") + if not gopts.vals.xauthority: + gopts.vals.xauthority = os.getenv("XAUTHORITY") + # Process remaining args as config variables. 
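# Recap of the configuration hunks above (a sketch; the two assignments are
# quoted from the image.py hunk): the single 'lapic' toggle becomes
# independent 'acpi' and 'apic' toggles, both defaulting to 0 via
# sxp.child_value's third (default) argument:
#
#     self.acpi = int(sxp.child_value(imageConfig, 'acpi', 0))
#     self.apic = int(sxp.child_value(imageConfig, 'apic', 0))
#
# 'xauthority' mirrors the existing 'display' handling: the command-line
# option wins, os.getenv("XAUTHORITY") is the fallback, and the result is
# exported into the device model's environment as XAUTHORITY.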
for arg in args: if '=' in arg: diff -r 25e3c8668f1f -r 8af1199488d3 tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py Mon Jan 9 11:19:55 2006 +++ b/tools/python/xen/xm/main.py Mon Jan 9 11:22:17 2006 @@ -390,7 +390,6 @@ def xm_vcpu_list(args): - print 'Name ID VCPU CPU State Time(s) CPU Affinity' from xen.xend.XendClient import server if args: @@ -400,6 +399,8 @@ dominfo = map( lambda x: server.xend_domain_vcpuinfo(sxp.child_value(x, 'name')), doms) + + print 'Name ID VCPU CPU State Time(s) CPU Affinity' for dom in dominfo: def get_info(n): @@ -625,6 +626,8 @@ server.xend_domain_cpu_sedf_set(dom, *v) def xm_info(args): + arg_check(args, "info", 0) + from xen.xend.XendClient import server info = server.xend_node() @@ -645,9 +648,12 @@ def xm_top(args): + arg_check(args, "top", 0) + os.execvp('xentop', ['xentop']) def xm_dmesg(args): + arg_check(args, "dmesg", 0) gopts = Opts(use="""[-c|--clear] diff -r 25e3c8668f1f -r 8af1199488d3 tools/tests/test_x86_emulator.c --- a/tools/tests/test_x86_emulator.c Mon Jan 9 11:19:55 2006 +++ b/tools/tests/test_x86_emulator.c Mon Jan 9 11:22:17 2006 @@ -92,7 +92,7 @@ regs.ecx = 0x12345678; cr2 = (unsigned long)&res; res = 0x7FFFFFFF; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0x92345677) || (regs.eflags != 0xa94) || @@ -110,7 +110,7 @@ regs.ecx = 0x12345678UL; #endif cr2 = (unsigned long)&res; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0x92345677) || (regs.ecx != 0x8000000FUL) || @@ -125,7 +125,7 @@ regs.eax = 0x92345677UL; regs.ecx = 0xAA; cr2 = (unsigned long)&res; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0x923456AA) || (regs.eflags != 0x244) || @@ -141,7 +141,7 @@ regs.eax = 0xAABBCC77UL; regs.ecx = 0xFF; cr2 = (unsigned long)&res; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0x923456AA) || ((regs.eflags&0x240) != 0x200) || @@ -157,7 +157,7 @@ regs.eip = (unsigned long)&instr[0]; regs.ecx = 0x12345678; cr2 = (unsigned long)&res; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0x12345678) || (regs.eflags != 0x200) || @@ -174,7 +174,7 @@ regs.eax = 0x923456AAUL; regs.ecx = 0xDDEEFF00L; cr2 = (unsigned long)&res; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0xDDEEFF00) || (regs.eflags != 0x244) || @@ -193,7 +193,7 @@ regs.edi = (unsigned long)&res + 2; regs.error_code = 0; /* read fault */ cr2 = regs.esi; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0x44554455) || (regs.eflags != 0x200) || @@ -211,7 +211,7 @@ regs.eip = (unsigned long)&instr[0]; regs.edi = (unsigned long)&res; cr2 = regs.edi; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0x2233445D) || ((regs.eflags&0x201) != 0x201) || @@ -229,7 +229,7 @@ regs.eip = (unsigned long)&instr[0]; regs.edi = (unsigned long)cmpxchg8b_res; cr2 = regs.edi; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, 
X86EMUL_MODE_PROT32); if ( (rc != 0) || (cmpxchg8b_res[0] != 0x9999AAAA) || (cmpxchg8b_res[1] != 0xCCCCFFFF) || @@ -243,7 +243,7 @@ regs.eip = (unsigned long)&instr[0]; regs.edi = (unsigned long)cmpxchg8b_res; cr2 = regs.edi; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (cmpxchg8b_res[0] != 0x9999AAAA) || (cmpxchg8b_res[1] != 0xCCCCFFFF) || @@ -260,7 +260,7 @@ regs.ecx = 0x12345678; cr2 = (unsigned long)&res; res = 0x82; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0x82) || (regs.ecx != 0xFFFFFF82) || @@ -275,7 +275,7 @@ regs.ecx = 0x12345678; cr2 = (unsigned long)&res; res = 0x1234aa82; - rc = x86_emulate_memop(®s, cr2, &emulops, 4); + rc = x86_emulate_memop(®s, cr2, &emulops, X86EMUL_MODE_PROT32); if ( (rc != 0) || (res != 0x1234aa82) || (regs.ecx != 0xaa82) || diff -r 25e3c8668f1f -r 8af1199488d3 tools/vtpm_manager/README --- a/tools/vtpm_manager/README Mon Jan 9 11:19:55 2006 +++ b/tools/vtpm_manager/README Mon Jan 9 11:22:17 2006 @@ -53,11 +53,6 @@ MANUAL_DM_LAUNCH -> Must manually launch & kill VTPMs -WELL_KNOWN_SRK_AUTH -> Rather than randomly generating the password for the SRK, - use a well known value. This is necessary for sharing use - of the SRK across applications. Such as VTPM and Dom0 - measurement software. - WELL_KNOWN_OWNER_AUTH -> Rather than randomly generating the password for the owner, use a well known value. This is useful for debugging and for poor bios which do not support clearing TPM if OwnerAuth is diff -r 25e3c8668f1f -r 8af1199488d3 tools/vtpm_manager/Rules.mk --- a/tools/vtpm_manager/Rules.mk Mon Jan 9 11:19:55 2006 +++ b/tools/vtpm_manager/Rules.mk Mon Jan 9 11:22:17 2006 @@ -56,8 +56,7 @@ # Do not have manager launch DMs. 
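The test_x86_emulator.c hunks above all make the same mechanical change: the
final argument of x86_emulate_memop() is no longer the literal operand size 4
but a symbolic execution mode. A minimal sketch of the idea — every constant
name below other than X86EMUL_MODE_PROT32 is an assumption, not taken from the
real emulator header:

    /* Illustrative mode set; only X86EMUL_MODE_PROT32 appears above. */
    enum x86_emulate_mode {
        X86EMUL_MODE_REAL,    /* real mode               */
        X86EMUL_MODE_PROT16,  /* 16-bit protected mode   */
        X86EMUL_MODE_PROT32,  /* 32-bit protected mode   */
        X86EMUL_MODE_PROT64   /* 64-bit long mode        */
    };

    /* Old call baked the width in; the new call names the mode. */
    rc = x86_emulate_memop(&regs, cr2, &emulops, X86EMUL_MODE_PROT32);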
#CFLAGS += -DMANUAL_DM_LAUNCH -# Fixed SRK -CFLAGS += -DWELL_KNOWN_SRK_AUTH +# Fixed OwnerAuth #CFLAGS += -DWELL_KNOWN_OWNER_AUTH # TPM Hardware Device or TPM Simulator diff -r 25e3c8668f1f -r 8af1199488d3 tools/vtpm_manager/manager/securestorage.c --- a/tools/vtpm_manager/manager/securestorage.c Mon Jan 9 11:19:55 2006 +++ b/tools/vtpm_manager/manager/securestorage.c Mon Jan 9 11:22:17 2006 @@ -65,7 +65,7 @@ UINT32 i; struct pack_constbuf_t symkey_cipher32, data_cipher32; - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Enveloping[%d]: 0x", buffer_len(inbuf)); + vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Enveloping Input[%d]: 0x", buffer_len(inbuf)); for (i=0; i< buffer_len(inbuf); i++) vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", inbuf->bytes[i]); vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); @@ -94,6 +94,12 @@ BSG_TPM_SIZE32_DATA, &data_cipher32); vtpmloginfo(VTPM_LOG_VTPM, "Saved %d bytes of E(symkey) + %d bytes of E(data)\n", buffer_len(&symkey_cipher), buffer_len(&data_cipher)); + + vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Enveloping Output[%d]: 0x", buffer_len(sealed_data)); + for (i=0; i< buffer_len(sealed_data); i++) + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", sealed_data->bytes[i]); + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); + goto egress; abort_egress: @@ -125,7 +131,7 @@ memset(&symkey, 0, sizeof(symkey_t)); - vtpmloginfo(VTPM_LOG_VTPM_DEEP, "envelope decrypting[%ld]: 0x", cipher_size); + vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Envelope Decrypt Input[%ld]: 0x", cipher_size); for (i=0; i< cipher_size; i++) vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", cipher[i]); vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); @@ -155,6 +161,11 @@ // Decrypt State TPMTRY(TPM_DECRYPT_ERROR, Crypto_symcrypto_decrypt (&symkey, &data_cipher, unsealed_data) ); + + vtpmloginfo(VTPM_LOG_VTPM_DEEP, "Envelope Decrypte Output[%d]: 0x", buffer_len(unsealed_data)); + for (i=0; i< buffer_len(unsealed_data); i++) + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "%x ", unsealed_data->bytes[i]); + vtpmloginfomore(VTPM_LOG_VTPM_DEEP, "\n"); goto egress; @@ -291,124 +302,175 @@ return status; } + TPM_RESULT VTPM_SaveService(void) { TPM_RESULT status=TPM_SUCCESS; int fh, dmis=-1; - - BYTE *flat_global; - int flat_global_size, bytes_written; + + BYTE *flat_boot_key, *flat_dmis, *flat_enc; + buffer_t clear_flat_global, enc_flat_global; UINT32 storageKeySize = buffer_len(&vtpm_globals->storageKeyWrap); + UINT32 bootKeySize = buffer_len(&vtpm_globals->bootKeyWrap); struct pack_buf_t storage_key_pack = {storageKeySize, vtpm_globals->storageKeyWrap.bytes}; - + struct pack_buf_t boot_key_pack = {bootKeySize, vtpm_globals->bootKeyWrap.bytes}; + struct hashtable_itr *dmi_itr; VTPM_DMI_RESOURCE *dmi_res; - - UINT32 flat_global_full_size; - - // Global Values needing to be saved - flat_global_full_size = 3*sizeof(TPM_DIGEST) + // Auths - sizeof(UINT32) + // storagekeysize - storageKeySize + // storage key - hashtable_count(vtpm_globals->dmi_map) * // num DMIS - (sizeof(UINT32) + 2*sizeof(TPM_DIGEST)); // Per DMI info - - - flat_global = (BYTE *) malloc( flat_global_full_size); - - flat_global_size = BSG_PackList(flat_global, 4, - BSG_TPM_AUTHDATA, &vtpm_globals->owner_usage_auth, - BSG_TPM_AUTHDATA, &vtpm_globals->srk_usage_auth, - BSG_TPM_SECRET, &vtpm_globals->storage_key_usage_auth, - BSG_TPM_SIZE32_DATA, &storage_key_pack); - + + UINT32 boot_key_size, flat_dmis_size; + + // Initially fill these with buffer sizes for each data type. Later fill + // in actual size, once flattened. 
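For orientation, the record sequence that VTPM_SaveService writes and
VTPM_LoadService parses back can be reconstructed from the pack/unpack calls;
the field names below are descriptive, not from the source:

    /*
     * STATE_FILE layout (sketch):
     *
     *   UINT32     boot_key_size;                  size of wrapped boot key
     *   BYTE       boot_key[boot_key_size];        TPM-wrapped boot key blob
     *   UINT32     enc_size;                       size of sealed block
     *   BYTE       enc_globals[enc_size];          envelope-encrypted owner
     *                                              auth, storage-key auth and
     *                                              wrapped storage key
     *   struct {                                   one per DMI, dom0 omitted
     *       UINT32     dmi_id;
     *       TPM_DIGEST NVM_measurement;
     *       TPM_DIGEST DMI_measurement;
     *   } dmis[];
     */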
+ boot_key_size = sizeof(UINT32) + // bootkeysize + bootKeySize; // boot key + + TPMTRYRETURN(buffer_init(&clear_flat_global, 3*sizeof(TPM_DIGEST) + // Auths + sizeof(UINT32) +// storagekeysize + storageKeySize, NULL) ); // storage key + + flat_dmis_size = (hashtable_count(vtpm_globals->dmi_map) - 1) * // num DMIS (-1 for Dom0) + (sizeof(UINT32) + 2*sizeof(TPM_DIGEST)); // Per DMI info + + flat_boot_key = (BYTE *) malloc( boot_key_size ); + flat_enc = (BYTE *) malloc( sizeof(UINT32) ); + flat_dmis = (BYTE *) malloc( flat_dmis_size ); + + boot_key_size = BSG_PackList(flat_boot_key, 1, + BSG_TPM_SIZE32_DATA, &boot_key_pack); + + BSG_PackList(clear_flat_global.bytes, 3, + BSG_TPM_AUTHDATA, &vtpm_globals->owner_usage_auth, + BSG_TPM_SECRET, &vtpm_globals->storage_key_usage_auth, + BSG_TPM_SIZE32_DATA, &storage_key_pack); + + TPMTRYRETURN(envelope_encrypt(&clear_flat_global, + &vtpm_globals->bootKey, + &enc_flat_global) ); + + BSG_PackConst(buffer_len(&enc_flat_global), 4, flat_enc); + // Per DMI values to be saved if (hashtable_count(vtpm_globals->dmi_map) > 0) { - + dmi_itr = hashtable_iterator(vtpm_globals->dmi_map); do { dmi_res = (VTPM_DMI_RESOURCE *) hashtable_iterator_value(dmi_itr); dmis++; // No need to save dmi0. - if (dmi_res->dmi_id == 0) - continue; - - - flat_global_size += BSG_PackList( flat_global + flat_global_size, 3, - BSG_TYPE_UINT32, &dmi_res->dmi_id, - BSG_TPM_DIGEST, &dmi_res->NVM_measurement, - BSG_TPM_DIGEST, &dmi_res->DMI_measurement); - + if (dmi_res->dmi_id == 0) + continue; + + + flat_dmis_size += BSG_PackList( flat_dmis + flat_dmis_size, 3, + BSG_TYPE_UINT32, &dmi_res->dmi_id, + BSG_TPM_DIGEST, &dmi_res->NVM_measurement, + BSG_TPM_DIGEST, &dmi_res->DMI_measurement); + } while (hashtable_iterator_advance(dmi_itr)); } - - //FIXME: Once we have a way to protect a TPM key, we should use it to - // encrypt this blob. BUT, unless there is a way to ensure the key is - // not used by other apps, this encryption is useless. + fh = open(STATE_FILE, O_WRONLY | O_CREAT, S_IREAD | S_IWRITE); if (fh == -1) { vtpmlogerror(VTPM_LOG_VTPM, "Unable to open %s file for write.\n", STATE_FILE); status = TPM_IOERROR; goto abort_egress; } - - if ( (bytes_written = write(fh, flat_global, flat_global_size)) != flat_global_size ) { - vtpmlogerror(VTPM_LOG_VTPM, "Failed to save service data. 
%d/%d bytes written.\n", bytes_written, flat_global_size); - status = TPM_IOERROR; - goto abort_egress; - } - vtpm_globals->DMI_table_dirty = FALSE; - + + if ( ( write(fh, flat_boot_key, boot_key_size) != boot_key_size ) || + ( write(fh, flat_enc, sizeof(UINT32)) != sizeof(UINT32) ) || + ( write(fh, enc_flat_global.bytes, buffer_len(&enc_flat_global)) != buffer_len(&enc_flat_global) ) || + ( write(fh, flat_dmis, flat_dmis_size) != flat_dmis_size ) ) { + vtpmlogerror(VTPM_LOG_VTPM, "Failed to completely write service data.\n"); + status = TPM_IOERROR; + goto abort_egress; + } + + vtpm_globals->DMI_table_dirty = FALSE; + goto egress; - + abort_egress: egress: - - free(flat_global); + + free(flat_boot_key); + free(flat_enc); + buffer_free(&enc_flat_global); + free(flat_dmis); close(fh); - + vtpmloginfo(VTPM_LOG_VTPM, "Saved VTPM Service state (status = %d, dmis = %d)\n", (int) status, dmis); return status; } TPM_RESULT VTPM_LoadService(void) { - + TPM_RESULT status=TPM_SUCCESS; int fh, stat_ret, dmis=0; long fh_size = 0, step_size; - BYTE *flat_global=NULL; - struct pack_buf_t storage_key_pack; - UINT32 *dmi_id_key; - + BYTE *flat_table=NULL; + buffer_t unsealed_data; + struct pack_buf_t storage_key_pack, boot_key_pack; + UINT32 *dmi_id_key, enc_size; + VTPM_DMI_RESOURCE *dmi_res; struct stat file_stat; - + + TPM_HANDLE boot_key_handle; + TPM_AUTHDATA boot_usage_auth; + memset(&boot_usage_auth, 0, sizeof(TPM_AUTHDATA)); + fh = open(STATE_FILE, O_RDONLY ); stat_ret = fstat(fh, &file_stat); - if (stat_ret == 0) + if (stat_ret == 0) fh_size = file_stat.st_size; else { status = TPM_IOERROR; goto abort_egress; } - - flat_global = (BYTE *) malloc(fh_size); - - if ((long) read(fh, flat_global, fh_size) != fh_size ) { - status = TPM_IOERROR; - goto abort_egress; - } - + + flat_table = (BYTE *) malloc(fh_size); + + if ((long) read(fh, flat_table, fh_size) != fh_size ) { + status = TPM_IOERROR; + goto abort_egress; + } + + // Read Boot Key + step_size = BSG_UnpackList( flat_table, 2, + BSG_TPM_SIZE32_DATA, &boot_key_pack, + BSG_TYPE_UINT32, &enc_size); + + TPMTRYRETURN(buffer_init(&vtpm_globals->bootKeyWrap, 0, 0) ); + TPMTRYRETURN(buffer_append_raw(&vtpm_globals->bootKeyWrap, boot_key_pack.size, boot_key_pack.data) ); + + //Load Boot Key + TPMTRYRETURN( VTSP_LoadKey( vtpm_globals->manager_tcs_handle, + TPM_SRK_KEYHANDLE, + &vtpm_globals->bootKeyWrap, + &SRK_AUTH, + &boot_key_handle, + &vtpm_globals->keyAuth, + &vtpm_globals->bootKey, + FALSE) ); + + TPMTRYRETURN( envelope_decrypt(enc_size, + flat_table + step_size, + vtpm_globals->manager_tcs_handle, + boot_key_handle, + (const TPM_AUTHDATA*) &boot_usage_auth, + &unsealed_data) ); + step_size += enc_size; + // Global Values needing to be saved - step_size = BSG_UnpackList( flat_global, 4, - BSG_TPM_AUTHDATA, &vtpm_globals->owner_usage_auth, - BSG_TPM_AUTHDATA, &vtpm_globals->srk_usage_auth, - BSG_TPM_SECRET, &vtpm_globals->storage_key_usage_auth, - BSG_TPM_SIZE32_DATA, &storage_key_pack); - + BSG_UnpackList( unsealed_data.bytes, 3, + BSG_TPM_AUTHDATA, &vtpm_globals->owner_usage_auth, + BSG_TPM_SECRET, &vtpm_globals->storage_key_usage_auth, + BSG_TPM_SIZE32_DATA, &storage_key_pack); + TPMTRYRETURN(buffer_init(&vtpm_globals->storageKeyWrap, 0, 0) ); TPMTRYRETURN(buffer_append_raw(&vtpm_globals->storageKeyWrap, storage_key_pack.size, storage_key_pack.data) ); - + // Per DMI values to be saved while ( step_size < fh_size ){ if (fh_size - step_size < (long) (sizeof(UINT32) + 2*sizeof(TPM_DIGEST))) { @@ -417,35 +479,38 @@ } else { dmi_res = 
(VTPM_DMI_RESOURCE *) malloc(sizeof(VTPM_DMI_RESOURCE)); dmis++; - + dmi_res->connected = FALSE; - - step_size += BSG_UnpackList(flat_global + step_size, 3, - BSG_TYPE_UINT32, &dmi_res->dmi_id, - BSG_TPM_DIGEST, &dmi_res->NVM_measurement, - BSG_TPM_DIGEST, &dmi_res->DMI_measurement); - + + step_size += BSG_UnpackList(flat_table + step_size, 3, + BSG_TYPE_UINT32, &dmi_res->dmi_id, + BSG_TPM_DIGEST, &dmi_res->NVM_measurement, + BSG_TPM_DIGEST, &dmi_res->DMI_measurement); + // install into map dmi_id_key = (UINT32 *) malloc (sizeof(UINT32)); *dmi_id_key = dmi_res->dmi_id; if (!hashtable_insert(vtpm_globals->dmi_map, dmi_id_key, dmi_res)) { - status = TPM_FAIL; - goto abort_egress; + status = TPM_FAIL; + goto abort_egress; } - + } - - } - + + } + vtpmloginfo(VTPM_LOG_VTPM, "Loaded saved state (dmis = %d).\n", dmis); goto egress; - + abort_egress: vtpmlogerror(VTPM_LOG_VTPM, "Failed to load service data with error = %s\n", tpm_get_error_name(status)); egress: - - free(flat_global); + + free(flat_table); close(fh); - + + // TODO: Could be nice and evict BootKey. (Need to add EvictKey to VTSP. + return status; } + diff -r 25e3c8668f1f -r 8af1199488d3 tools/vtpm_manager/manager/vtpm_manager.c --- a/tools/vtpm_manager/manager/vtpm_manager.c Mon Jan 9 11:19:55 2006 +++ b/tools/vtpm_manager/manager/vtpm_manager.c Mon Jan 9 11:22:17 2006 @@ -74,16 +74,15 @@ #endif // --------------------------- Well Known Auths -------------------------- -#ifdef WELL_KNOWN_SRK_AUTH -static BYTE FIXED_SRK_AUTH[20] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, +const TPM_AUTHDATA SRK_AUTH = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; -#endif #ifdef WELL_KNOWN_OWNER_AUTH static BYTE FIXED_OWNER_AUTH[20] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; #endif - + + // -------------------------- Hash table functions -------------------- static unsigned int hashfunc32(void *ky) { @@ -100,13 +99,7 @@ TPM_RESULT status = TPM_SUCCESS; - // Generate Auth's for SRK & Owner -#ifdef WELL_KNOWN_SRK_AUTH - memcpy(vtpm_globals->srk_usage_auth, FIXED_SRK_AUTH, sizeof(TPM_AUTHDATA)); -#else - Crypto_GetRandom(vtpm_globals->srk_usage_auth, sizeof(TPM_AUTHDATA) ); -#endif - + // Generate Auth for Owner #ifdef WELL_KNOWN_OWNER_AUTH memcpy(vtpm_globals->owner_usage_auth, FIXED_OWNER_AUTH, sizeof(TPM_AUTHDATA)); #else @@ -116,14 +109,14 @@ // Take Owership of TPM CRYPTO_INFO ek_cryptoInfo; - vtpmloginfo(VTPM_LOG_VTPM, "Attempting Pubek Read. NOTE: Failure is ok.\n"); status = VTSP_ReadPubek(vtpm_globals->manager_tcs_handle, &ek_cryptoInfo); // If we can read PubEK then there is no owner and we should take it. if (status == TPM_SUCCESS) { + vtpmloginfo(VTPM_LOG_VTPM, "Failed to readEK meaning TPM has an owner. 
Creating Keys off existing SRK.\n"); TPMTRYRETURN(VTSP_TakeOwnership(vtpm_globals->manager_tcs_handle, (const TPM_AUTHDATA*)&vtpm_globals->owner_usage_auth, - (const TPM_AUTHDATA*)&vtpm_globals->srk_usage_auth, + &SRK_AUTH, &ek_cryptoInfo, &vtpm_globals->keyAuth)); @@ -142,7 +135,7 @@ TPMTRYRETURN( VTSP_OSAP(vtpm_globals->manager_tcs_handle, TPM_ET_KEYHANDLE, TPM_SRK_KEYHANDLE, - (const TPM_AUTHDATA*)&vtpm_globals->srk_usage_auth, + &SRK_AUTH, &sharedsecret, &osap) ); @@ -157,8 +150,43 @@ &vtpm_globals->storageKeyWrap, &osap) ); - vtpm_globals->keyAuth.fContinueAuthSession = TRUE; - + // Generate boot key's auth + Crypto_GetRandom( &vtpm_globals->storage_key_usage_auth, + sizeof(TPM_AUTHDATA) ); + + TPM_AUTHDATA bootKeyWrapAuth; + memset(&bootKeyWrapAuth, 0, sizeof(bootKeyWrapAuth)); + + TPMTRYRETURN( VTSP_OSAP(vtpm_globals->manager_tcs_handle, + TPM_ET_KEYHANDLE, + TPM_SRK_KEYHANDLE, + &SRK_AUTH, + &sharedsecret, + &osap) ); + + osap.fContinueAuthSession = FALSE; + + // FIXME: This key protects the global secrets on disk. It should use TPM + // PCR bindings to limit its use to legit configurations. + // Current binds are open, implying a Trusted VM contains this code. + // If this VM is not Trusted, use measurement and PCR bindings. + TPMTRYRETURN( VTSP_CreateWrapKey( vtpm_globals->manager_tcs_handle, + TPM_KEY_BIND, + (const TPM_AUTHDATA*)&bootKeyWrapAuth, + TPM_SRK_KEYHANDLE, + (const TPM_AUTHDATA*)&sharedsecret, + &vtpm_globals->bootKeyWrap, + &osap) ); + + // Populate CRYPTO_INFO vtpm_globals->bootKey. This does not load it into the TPM + TPMTRYRETURN( VTSP_LoadKey( vtpm_globals->manager_tcs_handle, + TPM_SRK_KEYHANDLE, + &vtpm_globals->bootKeyWrap, + NULL, + NULL, + NULL, + &vtpm_globals->bootKey, + TRUE ) ); goto egress; abort_egress: @@ -278,24 +306,26 @@ #endif // Check status of rx_fh. If necessary attempt to re-open it. + char* s = NULL; if (*rx_fh < 0) { #ifdef VTPM_MULTI_VM - *rx_fh = open(VTPM_BE_DEV, O_RDWR); + s = VTPM_BE_DEV; #else if (threadType == BE_LISTENER_THREAD) #ifdef DUMMY_BACKEND - *rx_fh = open("/tmp/in.fifo", O_RDWR); + s = "/tmp/in.fifo"; #else - *rx_fh = open(VTPM_BE_DEV, O_RDWR); + s = VTPM_BE_DEV; #endif else // DMI Listener - *rx_fh = open(VTPM_RX_FIFO, O_RDWR); + s = VTPM_RX_FIFO; + *rx_fh = open(s, O_RDWR); #endif } // Respond to failures to open rx_fh if (*rx_fh < 0) { - vtpmhandlerlogerror(VTPM_LOG_VTPM, "Can't open inbound fh.\n"); + vtpmhandlerlogerror(VTPM_LOG_VTPM, "Can't open inbound fh for %s.\n", s); #ifdef VTPM_MULTI_VM return TPM_IOERROR; #else @@ -713,7 +743,7 @@ /////////////////////////////////////////////////////////////////////////////// TPM_RESULT VTPM_Init_Service() { - TPM_RESULT status = TPM_FAIL; + TPM_RESULT status = TPM_FAIL, serviceStatus; BYTE *randomsead; UINT32 randomsize; @@ -737,7 +767,7 @@ // Create new TCS Object vtpm_globals->manager_tcs_handle = 0; - + TPMTRYRETURN(TCS_create()); // Create TCS Context for service @@ -756,17 +786,24 @@ vtpm_globals->keyAuth.fContinueAuthSession = TRUE; // If failed, create new Service. - if (VTPM_LoadService() != TPM_SUCCESS) + serviceStatus = VTPM_LoadService(); + if (serviceStatus == TPM_IOERROR) { + vtpmloginfo(VTPM_LOG_VTPM, "Failed to read service file. 
Assuming first time initialization.\n"); TPMTRYRETURN( VTPM_Create_Service() ); + } else if (serviceStatus != TPM_SUCCESS) { + vtpmlogerror(VTPM_LOG_VTPM, "Failed to read existing service file"); + exit(1); + } //Load Storage Key TPMTRYRETURN( VTSP_LoadKey( vtpm_globals->manager_tcs_handle, TPM_SRK_KEYHANDLE, &vtpm_globals->storageKeyWrap, - (const TPM_AUTHDATA*)&vtpm_globals->srk_usage_auth, + &SRK_AUTH, &vtpm_globals->storageKeyHandle, &vtpm_globals->keyAuth, - &vtpm_globals->storageKey) ); + &vtpm_globals->storageKey, + FALSE ) ); // Create entry for Dom0 for control messages TPMTRYRETURN( VTPM_Handle_New_DMI(NULL) ); @@ -797,12 +834,11 @@ free (dmi_itr); } - + if ( (vtpm_globals->DMI_table_dirty) && (VTPM_SaveService() != TPM_SUCCESS) ) + vtpmlogerror(VTPM_LOG_VTPM, "Unable to save manager data.\n"); + TCS_CloseContext(vtpm_globals->manager_tcs_handle); - - if ( (vtpm_globals->DMI_table_dirty) && - (VTPM_SaveService() != TPM_SUCCESS) ) - vtpmlogerror(VTPM_LOG_VTPM, "Unable to save manager data.\n"); + TCS_destroy(); hashtable_destroy(vtpm_globals->dmi_map, 1); free(vtpm_globals); diff -r 25e3c8668f1f -r 8af1199488d3 tools/vtpm_manager/manager/vtpmpriv.h --- a/tools/vtpm_manager/manager/vtpmpriv.h Mon Jan 9 11:19:55 2006 +++ b/tools/vtpm_manager/manager/vtpmpriv.h Mon Jan 9 11:22:17 2006 @@ -108,6 +108,7 @@ TCS_CONTEXT_HANDLE manager_tcs_handle; // TCS Handle used by manager TPM_HANDLE storageKeyHandle; // Key used by persistent store CRYPTO_INFO storageKey; // For software encryption + CRYPTO_INFO bootKey; // For saving table TCS_AUTH keyAuth; // OIAP session for storageKey BOOL DMI_table_dirty; // Indicates that a command // has updated the DMI table @@ -115,15 +116,17 @@ // Persistent Data TPM_AUTHDATA owner_usage_auth; // OwnerAuth of real TPM - TPM_AUTHDATA srk_usage_auth; // SRK Auth of real TPM buffer_t storageKeyWrap; // Wrapped copy of storageKey + TPM_AUTHDATA srk_usage_auth; + TPM_AUTHDATA storage_key_usage_auth; - TPM_AUTHDATA storage_key_usage_auth; - + buffer_t bootKeyWrap; // Wrapped copy of boot key + }VTPM_GLOBALS; -//Global dmi map -extern VTPM_GLOBALS *vtpm_globals; +// --------------------------- Global Values -------------------------- +extern VTPM_GLOBALS *vtpm_globals; // Key info and DMI states +extern const TPM_AUTHDATA SRK_AUTH; // SRK Well Known Auth Value // ********************** Command Handler Prototypes *********************** TPM_RESULT VTPM_Handle_Load_NVM( VTPM_DMI_RESOURCE *myDMI, diff -r 25e3c8668f1f -r 8af1199488d3 tools/vtpm_manager/manager/vtsp.c --- a/tools/vtpm_manager/manager/vtsp.c Mon Jan 9 11:19:55 2006 +++ b/tools/vtpm_manager/manager/vtsp.c Mon Jan 9 11:22:17 2006 @@ -144,7 +144,10 @@ if (memcmp (&hm, &(auth->HMAC), sizeof(TPM_DIGEST)) == 0) // 0 indicates equality return (TPM_SUCCESS); else { - VTSP_OIAP( hContext, auth); + // If specified, reconnect the OIAP session. + // NOTE: This only works for TCS's that never have a 0 context. 
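Spelling out the note above: the verifier can only re-arm the OIAP session
when it was handed a usable context, because this TCS treats a zero handle as
"no context". Condensed, with the names as they appear in the hunk:

    /* Sketch of the HMAC check in VerifyAuth. */
    if (memcmp(&hm, &auth->HMAC, sizeof(TPM_DIGEST)) == 0)
        return TPM_SUCCESS;          /* response HMAC verifies      */
    if (hContext)                    /* reconnect only when usable  */
        VTSP_OIAP(hContext, auth);   /* re-open the OIAP session    */
    return TPM_AUTHFAIL;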
+ if (hContext) + VTSP_OIAP( hContext, auth); return (TPM_AUTHFAIL); } } @@ -157,6 +160,10 @@ TPMTRYRETURN( TCSP_OIAP(hContext, &auth->AuthHandle, &auth->NonceEven) ); + + memset(&auth->HMAC, 0, sizeof(TPM_DIGEST)); + auth->fContinueAuthSession = FALSE; + goto egress; abort_egress: @@ -195,6 +202,9 @@ BSG_TPM_NONCE, &nonceOddOSAP); Crypto_HMAC(sharedSecretText, sizeof(sharedSecretText), (BYTE *) usageAuth, TPM_DIGEST_SIZE, (BYTE *) sharedSecret); + + memset(&auth->HMAC, 0, sizeof(TPM_DIGEST)); + auth->fContinueAuthSession = FALSE; goto egress; @@ -287,9 +297,6 @@ srkKeyInfo.parms = (BYTE *) &srkRSAkeyInfo; struct pack_buf_t srkText; - - // GenerateAuth new nonceOdd - Crypto_GetRandom(&auth->NonceOdd, sizeof(TPM_NONCE) ); //These values are accurate for an enc(AuthData). struct pack_buf_t encOwnerAuth, encSrkAuth; @@ -383,9 +390,6 @@ BYTE *paramText; // Digest to make Auth. UINT32 paramTextSize; - // Generate HMAC - Crypto_GetRandom(&auth->NonceOdd, sizeof(TPM_NONCE) ); - paramText = (BYTE *) malloc(sizeof(BYTE) * TCPA_MAX_BUFFER_LENGTH); paramTextSize = BSG_PackList(paramText, 1, @@ -504,9 +508,6 @@ newKeyText.data = flatKey; newKeyText.size = flatKeySize; - // GenerateAuth new nonceOdd - Crypto_GetRandom(&auth->NonceOdd, sizeof(TPM_NONCE) ); - // Generate HMAC paramText = (BYTE *) malloc(sizeof(BYTE) * TCPA_MAX_BUFFER_LENGTH); @@ -563,63 +564,66 @@ const TPM_AUTHDATA *parentAuth, TPM_HANDLE *newKeyHandle, TCS_AUTH *auth, - CRYPTO_INFO *cryptoinfo /*= NULL*/) { - - - vtpmloginfo(VTPM_LOG_VTSP, "Loading Key.\n%s",""); + CRYPTO_INFO *cryptoinfo, + const BOOL skipTPMLoad) { + + + vtpmloginfo(VTPM_LOG_VTSP, "Loading Key %s.\n", (!skipTPMLoad ? "into TPM" : "only into memory")); TPM_RESULT status = TPM_SUCCESS; TPM_COMMAND_CODE command = TPM_ORD_LoadKey; - - BYTE *paramText; // Digest to make Auth. + + BYTE *paramText=NULL; // Digest to make Auth. UINT32 paramTextSize; - - if ((rgbWrappedKeyBlob == NULL) || (parentAuth == NULL) || - (newKeyHandle==NULL) || (auth==NULL)) { - status = TPM_BAD_PARAMETER; - goto abort_egress; - } - - // Generate Extra TCS Parameters - TPM_HANDLE phKeyHMAC; - - // Generate HMAC - Crypto_GetRandom(&auth->NonceOdd, sizeof(TPM_NONCE) ); - - paramText = (BYTE *) malloc(sizeof(BYTE) * TCPA_MAX_BUFFER_LENGTH); - - paramTextSize = BSG_PackList(paramText, 1, - BSG_TPM_COMMAND_CODE, &command); - - memcpy(paramText + paramTextSize, rgbWrappedKeyBlob->bytes, buffer_len(rgbWrappedKeyBlob)); - paramTextSize += buffer_len(rgbWrappedKeyBlob); - - TPMTRYRETURN( GenerateAuth( paramText, paramTextSize, + + // SkipTPMLoad stops key from being loaded into TPM, but still generates CRYPTO_INFO for it + if (! 
skipTPMLoad) { + + if ((rgbWrappedKeyBlob == NULL) || (parentAuth == NULL) || + (newKeyHandle==NULL) || (auth==NULL)) { + status = TPM_BAD_PARAMETER; + goto abort_egress; + } + + // Generate Extra TCS Parameters + TPM_HANDLE phKeyHMAC; + + paramText = (BYTE *) malloc(sizeof(BYTE) * TCPA_MAX_BUFFER_LENGTH); + + paramTextSize = BSG_PackList(paramText, 1, + BSG_TPM_COMMAND_CODE, &command); + + memcpy(paramText + paramTextSize, rgbWrappedKeyBlob->bytes, buffer_len(rgbWrappedKeyBlob)); + paramTextSize += buffer_len(rgbWrappedKeyBlob); + + TPMTRYRETURN( GenerateAuth( paramText, paramTextSize, parentAuth, auth) ); - // Call TCS - TPMTRYRETURN( TCSP_LoadKeyByBlob( hContext, - hUnwrappingKey, - buffer_len(rgbWrappedKeyBlob), - rgbWrappedKeyBlob->bytes, - auth, - newKeyHandle, - &phKeyHMAC) ); - - // Verify Auth - paramTextSize = BSG_PackList(paramText, 3, - BSG_TPM_RESULT, &status, - BSG_TPM_COMMAND_CODE, &command, - BSG_TPM_HANDLE, newKeyHandle); - - TPMTRYRETURN( VerifyAuth( paramText, paramTextSize, - parentAuth, auth, - hContext) ); - - // Unpack/return key structure + // Call TCS + TPMTRYRETURN( TCSP_LoadKeyByBlob( hContext, + hUnwrappingKey, + buffer_len(rgbWrappedKeyBlob), + rgbWrappedKeyBlob->bytes, + auth, + newKeyHandle, + &phKeyHMAC) ); + + // Verify Auth + paramTextSize = BSG_PackList(paramText, 3, + BSG_TPM_RESULT, &status, + BSG_TPM_COMMAND_CODE, &command, + BSG_TPM_HANDLE, newKeyHandle); + + TPMTRYRETURN( VerifyAuth( paramText, paramTextSize, + parentAuth, auth, + hContext) ); + } + + // Build cryptoinfo structure for software crypto function. if (cryptoinfo != NULL) { TPM_KEY newKey; + // Unpack/return key structure BSG_Unpack(BSG_TPM_KEY, rgbWrappedKeyBlob->bytes , &newKey); TPM_RSA_KEY_PARMS rsaKeyParms; @@ -669,9 +673,6 @@ struct pack_buf_t clear_data32; BYTE *clear_data_text; UINT32 clear_data_size; - - // Generate HMAC - Crypto_GetRandom(&auth->NonceOdd, sizeof(TPM_NONCE) ); struct pack_buf_t bound_data32 = {bound_data->size, bound_data->bytes}; @@ -781,6 +782,196 @@ return TPM_SUCCESS; } +TPM_RESULT VTSP_Seal(const TCS_CONTEXT_HANDLE hContext, + const TPM_KEY_HANDLE keyHandle, + const TPM_AUTHDATA *sealDataAuth, + const TPM_PCR_COMPOSITE *pcrComp, + const buffer_t *inData, + TPM_STORED_DATA *sealedData, + const TPM_SECRET *osapSharedSecret, + TCS_AUTH *auth) { + + TPM_RESULT status = TPM_SUCCESS; + TPM_COMMAND_CODE command = TPM_ORD_Seal; + + BYTE *paramText; // Digest to make Auth. 
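Stepping back to the VTSP_LoadKey rework above: the new skipTPMLoad flag
splits the routine into an optional hardware load and an unconditional
software unpack, which is what lets the boot key's CRYPTO_INFO be populated
without the key ever occupying a TPM slot. In outline (comments only, error
paths elided):

    /* Outline of the reworked VTSP_LoadKey. */
    if (!skipTPMLoad) {
        /* authorize, TCSP_LoadKeyByBlob() into the TPM,
         * then verify the response auth */
    }
    if (cryptoinfo != NULL) {
        /* unpack the wrapped TPM_KEY and build the software
         * CRYPTO_INFO used by envelope_encrypt()/decrypt() */
    }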
+ UINT32 paramTextSize; + + // Generate PCR_Info Struct from Comp + TPM_PCR_INFO pcrInfo; + UINT32 pcrInfoSize, flatpcrSize; + BYTE flatpcr[3 + // PCR_Select = 3 1 byte banks + sizeof(UINT16) + // 2 byte UINT16 + sizeof(UINT32) + // PCR_Comp = 4 byte UINT32 + 24 * sizeof(TPM_PCRVALUE) ]; // up to 24 PCRs + + if (pcrComp != NULL) { + //printf("\n\tBinding to PCRs: "); + //for(int i = 0 ; i < pcrComp->select.sizeOfSelect ; i++) + //printf("%2.2x", pcrComp->select.pcrSelect[i]); + + memcpy(&pcrInfo.pcrSelection, &pcrComp->select, sizeof(TPM_PCR_SELECTION)); + + flatpcrSize = BSG_Pack(BSG_TPM_PCR_COMPOSITE, (BYTE *) pcrComp, flatpcr); + Crypto_SHA1Full((BYTE *) flatpcr, flatpcrSize, (BYTE *) &(pcrInfo.digestAtRelease)); + memset(&(pcrInfo.digestAtCreation), 0, sizeof(TPM_DIGEST)); + pcrInfoSize = BSG_Pack(BSG_TPM_PCR_INFO, (BYTE *) &pcrInfo, flatpcr); + } else { + //printf("\n\tBinding to no PCRS."); + pcrInfoSize = 0; + } + + // Calculate encUsageAuth + BYTE XORbuffer[sizeof(TPM_SECRET) + sizeof(TPM_NONCE)]; + UINT32 XORbufferSize = sizeof(XORbuffer); + TPM_DIGEST XORKey; + TPM_ENCAUTH encAuth; + + BSG_PackList( XORbuffer, 2, + BSG_TPM_SECRET, osapSharedSecret, + BSG_TPM_NONCE, &auth->NonceEven ); + + Crypto_SHA1Full(XORbuffer, XORbufferSize, (BYTE *) &XORKey); + + int i; + for (i=0; i < TPM_DIGEST_SIZE; i++) + ((BYTE *) &encAuth)[i] = ((BYTE *) &XORKey)[i] ^ ((BYTE *) sealDataAuth)[i]; + + // Generate Extra TCS Parameters + UINT32 inDataSize = buffer_len(inData); + struct pack_buf_t inData_pack = {inDataSize, inData->bytes}; + struct pack_buf_t pcrInfo_pack = {pcrInfoSize, flatpcr}; + + UINT32 sealedDataSize; + BYTE *flatSealedData=NULL; + + paramText = (BYTE *) malloc(sizeof(BYTE) * TCPA_MAX_BUFFER_LENGTH); + + paramTextSize = BSG_PackList(paramText, 4, + BSG_TPM_COMMAND_CODE, &command, + BSG_TPM_ENCAUTH, &encAuth, + BSG_TPM_SIZE32_DATA, &pcrInfo_pack, + BSG_TPM_SIZE32_DATA, &inData_pack); + + TPMTRYRETURN( GenerateAuth( paramText, paramTextSize, + osapSharedSecret, auth) ); + + // Call TCS + TPMTRYRETURN( TCSP_Seal( hContext, + keyHandle, + encAuth, + pcrInfoSize, + flatpcr, + inDataSize, + inData->bytes, + auth, + &sealedDataSize, + &flatSealedData) ); + + // Unpack/return key structure + BSG_Unpack( BSG_TPM_STORED_DATA, flatSealedData, sealedData ); + + paramTextSize = BSG_PackList(paramText, 3, + BSG_TPM_RESULT, &status, + BSG_TPM_COMMAND_CODE, &command, + BSG_TPM_STORED_DATA, sealedData); + + TPMTRYRETURN( VerifyAuth( paramText, paramTextSize, + osapSharedSecret, auth, + 0) ); + + + goto egress; + + abort_egress: + egress: + + if (flatSealedData) + TCS_FreeMemory( hContext, flatSealedData); + + free(paramText); + return status; +} + + +TPM_RESULT VTSP_Unseal(const TCS_CONTEXT_HANDLE hContext, + const TPM_KEY_HANDLE keyHandle, + const TPM_STORED_DATA *sealedData, + const TPM_AUTHDATA *key_usage_auth, + const TPM_AUTHDATA *data_usage_auth, + buffer_t *outData, + TCS_AUTH *auth, + TCS_AUTH *dataAuth) { + + TPM_RESULT status = TPM_SUCCESS; + TPM_COMMAND_CODE command = TPM_ORD_Unseal; + + BYTE *paramText; // Digest to make Auth. 
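One detail in VTSP_Seal above deserves a callout: the sealed data's usage auth
never crosses to the TPM in the clear. It is XORed with a one-time pad derived
from the OSAP shared secret and the TPM's even nonce, the TPM 1.x ADIP
construction. Reduced to its core (local names changed for clarity, the calls
are the ones used in the hunk):

    /* encAuth = SHA1(osapSharedSecret || NonceEven) XOR sealDataAuth */
    BYTE xor_in[sizeof(TPM_SECRET) + sizeof(TPM_NONCE)];
    TPM_DIGEST pad;
    TPM_ENCAUTH encAuth;
    int i;

    BSG_PackList(xor_in, 2,
                 BSG_TPM_SECRET, osapSharedSecret,
                 BSG_TPM_NONCE,  &auth->NonceEven);
    Crypto_SHA1Full(xor_in, sizeof(xor_in), (BYTE *)&pad);
    for (i = 0; i < TPM_DIGEST_SIZE; i++)
        ((BYTE *)&encAuth)[i] =
            ((BYTE *)&pad)[i] ^ ((BYTE *)sealDataAuth)[i];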
+ UINT32 paramTextSize; + + // Generate Extra TCS Parameters + UINT32 sealDataSize, clearDataSize; + BYTE *flatSealedData= (BYTE *) malloc(sizeof(TPM_VERSION) + + 2 * sizeof(UINT32) + + sealedData->sealInfoSize + + sealedData->encDataSize), + *clearData=NULL; + + sealDataSize = BSG_Pack(BSG_TPM_STORED_DATA, sealedData, flatSealedData ); + + paramText = (BYTE *) malloc(sizeof(BYTE) * TCPA_MAX_BUFFER_LENGTH); + + paramTextSize = BSG_PackList(paramText, 2, + BSG_TPM_COMMAND_CODE, &command, + BSG_TPM_STORED_DATA, sealedData); + + TPMTRYRETURN( GenerateAuth( paramText, paramTextSize, + key_usage_auth, auth) ); + + TPMTRYRETURN( GenerateAuth( paramText, paramTextSize, + data_usage_auth, dataAuth) ); + // Call TCS + TPMTRYRETURN( TCSP_Unseal( hContext, + keyHandle, + sealDataSize, + flatSealedData, + auth, + dataAuth, + &clearDataSize, + &clearData) ); + + // Verify Auth + struct pack_buf_t clearData_pack = {clearDataSize, clearData}; + + paramTextSize = BSG_PackList(paramText, 3, + BSG_TPM_RESULT, &status, + BSG_TPM_COMMAND_CODE, &command, + BSG_TPM_SIZE32_DATA, &clearData_pack); + + TPMTRYRETURN( VerifyAuth( paramText, paramTextSize, + key_usage_auth, auth, + hContext) ); + + TPMTRYRETURN( VerifyAuth( paramText, paramTextSize, + data_usage_auth, dataAuth, + hContext) ); + + // Unpack/return key structure + TPMTRYRETURN( buffer_init(outData, clearDataSize, clearData) ); + + goto egress; + + abort_egress: + egress: + + if (flatSealedData) + TCS_FreeMemory( hContext, clearData); + + free(paramText); + return status; +} + + // Function Reaches into unsupported TCS command, beware. TPM_RESULT VTSP_RawTransmit(const TCS_CONTEXT_HANDLE hContext, const buffer_t *inbuf, diff -r 25e3c8668f1f -r 8af1199488d3 tools/vtpm_manager/manager/vtsp.h --- a/tools/vtpm_manager/manager/vtsp.h Mon Jan 9 11:19:55 2006 +++ b/tools/vtpm_manager/manager/vtsp.h Mon Jan 9 11:22:17 2006 @@ -86,7 +86,8 @@ const TPM_AUTHDATA *parentAuth, TPM_HANDLE *newKeyHandle, TCS_AUTH *pAuth, - CRYPTO_INFO *cryptoinfo); + CRYPTO_INFO *cryptoinfo, + const BOOL skipTPMLoad); TPM_RESULT VTSP_Unbind( const TCS_CONTEXT_HANDLE hContext, const TPM_KEY_HANDLE key_handle, @@ -99,4 +100,22 @@ const buffer_t *inData, buffer_t *outData); +TPM_RESULT VTSP_Seal(const TCS_CONTEXT_HANDLE hContext, + const TPM_KEY_HANDLE keyHandle, + const TPM_AUTHDATA *sealDataAuth, + const TPM_PCR_COMPOSITE *pcrComp, + const buffer_t *inData, + TPM_STORED_DATA *sealedData, + const TPM_SECRET *osapSharedSecret, + TCS_AUTH *auth); + +TPM_RESULT VTSP_Unseal(const TCS_CONTEXT_HANDLE hContext, + const TPM_KEY_HANDLE keyHandle, + const TPM_STORED_DATA *sealedData, + const TPM_AUTHDATA *key_usage_auth, + const TPM_AUTHDATA *data_usage_auth, + buffer_t *outData, + TCS_AUTH *auth, + TCS_AUTH *dataAuth); + #endif //_VTSP_H_ diff -r 25e3c8668f1f -r 8af1199488d3 tools/vtpm_manager/tcs/tcs.c --- a/tools/vtpm_manager/tcs/tcs.c Mon Jan 9 11:19:55 2006 +++ b/tools/vtpm_manager/tcs/tcs.c Mon Jan 9 11:22:17 2006 @@ -636,7 +636,7 @@ TDDL_UINT32 OutLength = TCPA_MAX_BUFFER_LENGTH; // check input params - if (inData == NULL || pubAuth == NULL || SealedDataSize == NULL || *SealedData == NULL) + if (inData == NULL || pubAuth == NULL || SealedDataSize == NULL || SealedData == NULL) return TPM_BAD_PARAMETER; // Convert Byte Input parameter in the input byte stream InBuf diff -r 25e3c8668f1f -r 8af1199488d3 tools/xentrace/Makefile --- a/tools/xentrace/Makefile Mon Jan 9 11:19:55 2006 +++ b/tools/xentrace/Makefile Mon Jan 9 11:22:17 2006 @@ -15,24 +15,32 @@ OBJS = $(patsubst %.c,%.o,$(wildcard 
*.c)) BIN = xentrace tbctl setsize -LIBBIN = xenctx +LIBBIN = SCRIPTS = xentrace_format MAN1 = $(wildcard *.1) MAN8 = $(wildcard *.8) + +ifeq ($(XEN_TARGET_ARCH),x86_32) +LIBBIN += xenctx +endif + +ifeq ($(XEN_TARGET_ARCH),x86_64) +LIBBIN += xenctx +endif all: build build: $(BIN) $(LIBBIN) install: build [ -d $(DESTDIR)/usr/bin ] || $(INSTALL_DIR) $(DESTDIR)/usr/bin - [ -d $(DESTDIR)/usr/$(LIBDIR)/xen/bin ] || \ + [ -z "$(LIBBIN)"] || [ -d $(DESTDIR)/usr/$(LIBDIR)/xen/bin ] || \ $(INSTALL_DIR) $(DESTDIR)/usr/$(LIBDIR)/xen/bin [ -d $(DESTDIR)/usr/share/man/man1 ] || \ $(INSTALL_DIR) $(DESTDIR)/usr/share/man/man1 [ -d $(DESTDIR)/usr/share/man/man8 ] || \ $(INSTALL_DIR) $(DESTDIR)/usr/share/man/man8 $(INSTALL_PROG) $(BIN) $(SCRIPTS) $(DESTDIR)/usr/bin - $(INSTALL_PROG) $(LIBBIN) $(DESTDIR)/usr/$(LIBDIR)/xen/bin + [ -z "$(LIBBIN)"] || $(INSTALL_PROG) $(LIBBIN) $(DESTDIR)/usr/$(LIBDIR)/xen/bin $(INSTALL_DATA) $(MAN1) $(DESTDIR)/usr/share/man/man1 $(INSTALL_DATA) $(MAN8) $(DESTDIR)/usr/share/man/man8 diff -r 25e3c8668f1f -r 8af1199488d3 tools/xentrace/xenctx.c --- a/tools/xentrace/xenctx.c Mon Jan 9 11:19:55 2006 +++ b/tools/xentrace/xenctx.c Mon Jan 9 11:22:17 2006 @@ -380,10 +380,10 @@ exit(-1); } - ret = xc_domain_get_vcpu_context(xc_handle, domid, vcpu, &ctx); + ret = xc_vcpu_getcontext(xc_handle, domid, vcpu, &ctx); if (ret < 0) { xc_domain_unpause(xc_handle, domid); - perror("xc_domain_get_vcpu_context"); + perror("xc_vcpu_getcontext"); exit(-1); } diff -r 25e3c8668f1f -r 8af1199488d3 tools/xm-test/tests/network-attach/Makefile.am --- a/tools/xm-test/tests/network-attach/Makefile.am Mon Jan 9 11:19:55 2006 +++ b/tools/xm-test/tests/network-attach/Makefile.am Mon Jan 9 11:22:17 2006 @@ -6,7 +6,7 @@ 03_network_attach_detach_multiple_pos.test \ 04_network_attach_baddomain_neg.test -XFAIL_TESTS = 03_network_attach_detach_multiple_pos.test +XFAIL_TESTS = EXTRA_DIST = $(TESTS) $(XFAIL_TESTS) network_utils.py diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/Makefile --- a/xen/arch/ia64/Makefile Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/Makefile Mon Jan 9 11:22:17 2006 @@ -23,6 +23,13 @@ __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o +# xen stack unwinder +# unwind_decoder.c is included in unwind.c +OBJS += unwind.o +#unwind.o: CFLAGS += -DUNW_DEBUG=4 + +OBJS += process-linux-xen.o + # perfmon.o # unwind.o needed for kernel unwinding (rare) @@ -31,11 +38,26 @@ # remove following line if not privifying in memory # OBJS += privify.o -default: $(OBJS) head.o xen.lds.s - $(LD) -r -o arch.o $(OBJS) +default: $(TARGET) + +$(CURDIR)/arch.o: $(OBJS) + $(LD) -r -o $@ $(OBJS) + +$(TARGET)-syms: $(ALL_OBJS) head.o xen.lds.s $(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \ - -Map map.out head.o $(ALL_OBJS) -o $(TARGET)-syms - $(OBJCOPY) -R .note -R .comment -S $(TARGET)-syms $(TARGET) + -Map map.out head.o $(ALL_OBJS) -o $@ + $(NM) -n $@ | $(BASEDIR)/tools/symbols > $(BASEDIR)/xen-syms.S + $(MAKE) $(BASEDIR)/xen-syms.o + $(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \ + -Map map.out head.o $(ALL_OBJS) $(BASEDIR)/xen-syms.o -o $@ + $(NM) -n $@ | $(BASEDIR)/tools/symbols >$(BASEDIR)/xen-syms.S + $(MAKE) $(BASEDIR)/xen-syms.o + $(LD) $(LDFLAGS) -T $(BASEDIR)/arch/$(TARGET_ARCH)/xen.lds.s -N \ + -Map map.out head.o $(ALL_OBJS) $(BASEDIR)/xen-syms.o -o $@ + rm -f $(BASEDIR)/xen-syms.S $(BASEDIR)/xen-syms.o + +$(TARGET): $(TARGET)-syms + $(OBJCOPY) -R .note -R .comment -S $(TARGET)-syms $@ $(NM) -n $(TARGET)-syms | grep -v '\( 
[aUw] \)\|\(__crc_\)\|\( \$[adt]\)'\ > $(BASEDIR)/System.map diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/linux-xen/entry.S --- a/xen/arch/ia64/linux-xen/entry.S Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/linux-xen/entry.S Mon Jan 9 11:22:17 2006 @@ -1417,7 +1417,6 @@ br.cond.sptk.many rp // goes to ia64_leave_kernel END(ia64_prepare_handle_unaligned) -#ifndef XEN // // unw_init_running(void (*callback)(info, arg), void *arg) // @@ -1463,6 +1462,7 @@ br.ret.sptk.many rp END(unw_init_running) +#ifndef XEN .rodata .align 8 .globl sys_call_table diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/vmx/mmio.c --- a/xen/arch/ia64/vmx/mmio.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/vmx/mmio.c Mon Jan 9 11:22:17 2006 @@ -29,7 +29,7 @@ #include <asm/vmx_vcpu.h> #include <asm/privop.h> #include <asm/types.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <asm/mm.h> #include <asm/vmx.h> diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/vmx/vlsapic.c --- a/xen/arch/ia64/vmx/vlsapic.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/vmx/vlsapic.c Mon Jan 9 11:22:17 2006 @@ -218,7 +218,7 @@ */ void vtm_domain_out(VCPU *vcpu) { - if(!is_idle_task(vcpu->domain)) + if(!is_idle_domain(vcpu->domain)) rem_ac_timer(&vcpu->arch.arch_vmx.vtm.vtm_timer); } @@ -230,7 +230,7 @@ { vtime_t *vtm; - if(!is_idle_task(vcpu->domain)) { + if(!is_idle_domain(vcpu->domain)) { vtm=&(vcpu->arch.arch_vmx.vtm); vtm_interruption_update(vcpu, vtm); } diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/vmx/vmx_init.c --- a/xen/arch/ia64/vmx/vmx_init.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/vmx/vmx_init.c Mon Jan 9 11:22:17 2006 @@ -42,7 +42,7 @@ #include <xen/lib.h> #include <asm/vmmu.h> #include <public/arch-ia64.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <asm/vmx_phy_mode.h> #include <asm/processor.h> #include <asm/vmx.h> diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/vmx/vmx_process.c --- a/xen/arch/ia64/vmx/vmx_process.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/vmx/vmx_process.c Mon Jan 9 11:22:17 2006 @@ -53,6 +53,7 @@ #define INITIAL_PSR_VALUE_AT_INTERRUPTION 0x0000001808028034 +extern void die_if_kernel(char *str, struct pt_regs *regs, long err); extern void rnat_consumption (VCPU *vcpu); #define DOMN_PAL_REQUEST 0x110000 @@ -185,8 +186,11 @@ }else if(iim == DOMN_PAL_REQUEST){ pal_emul(current); vmx_vcpu_increment_iip(current); - } else + } else { + if (iim == 0) + die_if_kernel("bug check", regs, iim); vmx_reflect_interruption(ifa,isr,iim,11,regs); + } } @@ -227,7 +231,7 @@ struct domain *d = current->domain; struct vcpu *v = current; // FIXME: Will this work properly if doing an RFI??? 
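The vmx_process.c change above (and its twin in process.c further down) stops
reflecting a zero break immediate into the guest: judging by the "bug check"
string, break 0 is being treated as the kernel's bug-check trap, so a kernel
that hits it now crashes with a register dump instead of having the fault
bounced back at it. The dispatch, condensed from the hunk:

    /* A zero break immediate from kernel context is a bug check. */
    if (iim == 0)
        die_if_kernel("bug check", regs, iim);
    /* anything else is still reflected to the guest as before */
    vmx_reflect_interruption(ifa, isr, iim, 11, regs);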
- if (!is_idle_task(d) ) { // always comes from guest + if (!is_idle_domain(d) ) { // always comes from guest extern void vmx_dorfirfi(void); struct pt_regs *user_regs = vcpu_regs(current); if (local_softirq_pending()) diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/vmx/vmx_support.c --- a/xen/arch/ia64/vmx/vmx_support.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/vmx/vmx_support.c Mon Jan 9 11:22:17 2006 @@ -21,7 +21,7 @@ */ #include <xen/config.h> #include <xen/sched.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <asm/vmx.h> #include <asm/vmx_vcpu.h> diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/dom_fw.c --- a/xen/arch/ia64/xen/dom_fw.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/dom_fw.c Mon Jan 9 11:22:17 2006 @@ -861,12 +861,16 @@ bp->console_info.orig_x = 0; bp->console_info.orig_y = 24; bp->fpswa = 0; - bp->initrd_start = (dom0_start+dom0_size) - - (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024); - bp->initrd_size = ia64_boot_param->initrd_size; - printf(" initrd start %0xlx", bp->initrd_start); - printf(" initrd size %0xlx", bp->initrd_size); - - + if (d == dom0) { + bp->initrd_start = (dom0_start+dom0_size) - + (PAGE_ALIGN(ia64_boot_param->initrd_size) + 4*1024*1024); + bp->initrd_size = ia64_boot_param->initrd_size; + } + else { + bp->initrd_start = d->arch.initrd_start; + bp->initrd_size = d->arch.initrd_len; + } + printf(" initrd start %0xlx", bp->initrd_start); + printf(" initrd size %0xlx", bp->initrd_size); return bp; } diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/domain.c --- a/xen/arch/ia64/xen/domain.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/domain.c Mon Jan 9 11:22:17 2006 @@ -19,6 +19,7 @@ #include <xen/delay.h> #include <xen/softirq.h> #include <xen/mm.h> +#include <xen/iocap.h> #include <asm/ptrace.h> #include <asm/system.h> #include <asm/io.h> @@ -45,7 +46,7 @@ #include <asm/vmx_vcpu.h> #include <asm/vmx_vpd.h> #include <asm/pal.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #define CONFIG_DOMAIN0_CONTIGUOUS unsigned long dom0_start = -1L; @@ -181,7 +182,7 @@ memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96); } -void arch_do_createdomain(struct vcpu *v) +int arch_do_createdomain(struct vcpu *v) { struct domain *d = v->domain; struct thread_info *ti = alloc_thread_info(v); @@ -248,7 +249,9 @@ } } else d->arch.mm = NULL; - printf ("arch_do_create_domain: domain=%p\n", d); + printf ("arch_do_create_domain: domain=%p\n", d); + + return 0; } void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c) @@ -291,16 +294,7 @@ d->arch.cmdline = c->cmdline; new_thread(v, regs->cr_iip, 0, 0); -#ifdef CONFIG_IA64_SPLIT_CACHE - /* Sync d/i cache conservatively */ - if (!running_on_sim) { - ret = ia64_pal_cache_flush(4, 0, &progress, NULL); - if ((ret!=PAL_STATUS_SUCCESS)&& (ret!=PAL_STATUS_UNIMPLEMENTED)) - printk("PAL CACHE FLUSH failed for dom0.\n"); - else - printk("Sync i/d cache for guest SUCC\n"); - } -#endif + sync_split_caches(); v->vcpu_info->arch.evtchn_vector = c->vcpu.evtchn_vector; if ( c->vcpu.privregs && copy_from_user(v->arch.privregs, c->vcpu.privregs, sizeof(mapped_regs_t))) { @@ -428,7 +422,7 @@ { p = alloc_domheap_page(d); // zero out pages for security reasons - memset(__va(page_to_phys(p)),0,PAGE_SIZE); + if (p) memset(__va(page_to_phys(p)),0,PAGE_SIZE); } if (unlikely(!p)) { printf("map_new_domain_page: Can't alloc!!!! 
Aaaargh!\n"); @@ -763,7 +757,10 @@ */ void physdev_init_dom0(struct domain *d) { - set_bit(_DOMF_physdev_access, &d->domain_flags); + if (iomem_permit_access(d, 0UL, ~0UL)) + BUG(); + if (irqs_permit_access(d, 0, NR_PIRQS-1)) + BUG(); } unsigned int vmx_dom0 = 0; @@ -912,9 +909,9 @@ memset(si, 0, PAGE_SIZE); d->shared_info->arch.start_info_pfn = __pa(si) >> PAGE_SHIFT; sprintf(si->magic, "xen-%i.%i-ia64", XEN_VERSION, XEN_SUBVERSION); + si->nr_pages = d->tot_pages; #if 0 - si->nr_pages = d->tot_pages; si->shared_info = virt_to_phys(d->shared_info); si->flags = SIF_PRIVILEGED | SIF_INITDOMAIN; //si->pt_base = vpt_start; @@ -959,16 +956,7 @@ new_thread(v, pkern_entry, 0, 0); physdev_init_dom0(d); -#ifdef CONFIG_IA64_SPLIT_CACHE - /* Sync d/i cache conservatively */ - if (!running_on_sim) { - ret = ia64_pal_cache_flush(4, 0, &progress, NULL); - if ((ret!=PAL_STATUS_SUCCESS)&& (ret!=PAL_STATUS_UNIMPLEMENTED)) - printk("PAL CACHE FLUSH failed for dom0.\n"); - else - printk("Sync i/d cache for guest SUCC\n"); - } -#endif + sync_split_caches(); // FIXME: Hack for keyboard input #ifdef CLONE_DOMAIN0 @@ -1027,16 +1015,7 @@ #endif new_thread(v, pkern_entry, 0, 0); printk("new_thread returns\n"); -#ifdef CONFIG_IA64_SPLIT_CACHE - /* Sync d/i cache conservatively */ - if (!running_on_sim) { - ret = ia64_pal_cache_flush(4, 0, &progress, NULL); - if ((ret!=PAL_STATUS_SUCCESS)&& (ret!=PAL_STATUS_UNIMPLEMENTED)) - printk("PAL CACHE FLUSH failed for dom0.\n"); - else - printk("Sync i/d cache for guest SUCC\n"); - } -#endif + sync_split_caches(); __set_bit(0x30, VCPU(v, delivery_mask)); return 0; @@ -1050,16 +1029,7 @@ v->domain->domain_id); loaddomainelfimage(v->domain,v->domain->arch.image_start); new_thread(v, v->domain->arch.entry, 0, 0); -#ifdef CONFIG_IA64_SPLIT_CACHE - /* Sync d/i cache conservatively */ - if (!running_on_sim) { - ret = ia64_pal_cache_flush(4, 0, &progress, NULL); - if ((ret!=PAL_STATUS_SUCCESS)&& (ret!=PAL_STATUS_UNIMPLEMENTED)) - printk("PAL CACHE FLUSH failed for dom0.\n"); - else - printk("Sync i/d cache for guest SUCC\n"); - } -#endif + sync_split_caches(); } #endif @@ -1098,15 +1068,6 @@ void domain_pend_keyboard_interrupt(int irq) { vcpu_pend_interrupt(dom0->vcpu[0],irq); -} - -void vcpu_migrate_cpu(struct vcpu *v, int newcpu) -{ - if ( v->processor == newcpu ) - return; - - set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags); - v->processor = newcpu; } void sync_vcpu_execstate(struct vcpu *v) diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/hyperprivop.S --- a/xen/arch/ia64/xen/hyperprivop.S Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/hyperprivop.S Mon Jan 9 11:22:17 2006 @@ -543,6 +543,13 @@ extr.u r21=r30,IA64_PSR_PP_BIT,1 ;; cmp.ne p7,p0=r21,r0 ;; (p7) br.spnt.few dispatch_break_fault ;; + movl r20=IA64_PSR_CPL ;; + and r22=r20,r30 ;; + cmp.ne p7,p0=r22,r0 +(p7) br.spnt.many 1f ;; + cmp.eq p7,p0=r17,r0 +(p7) br.spnt.few dispatch_break_fault ;; +1: #if 1 /* special handling in case running on simulator */ movl r20=first_break;; ld4 r23=[r20];; diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/irq.c --- a/xen/arch/ia64/xen/irq.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/irq.c Mon Jan 9 11:22:17 2006 @@ -1377,9 +1377,6 @@ irq_guest_action_t *action; unsigned long flags; int rc = 0; - - if ( !IS_CAPABLE_PHYSDEV(d->domain) ) - return -EPERM; spin_lock_irqsave(&desc->lock, flags); diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/ivt.S --- a/xen/arch/ia64/xen/ivt.S Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/ivt.S Mon Jan 9 11:22:17 2006 @@ -839,6 +839,8 
@@ mov r17=cr.iim mov r31=pr ;; + cmp.eq p7,p0=r17,r0 +(p7) br.spnt.few dispatch_break_fault ;; movl r18=XSI_PSR_IC ;; ld8 r19=[r18] diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/process.c --- a/xen/arch/ia64/xen/process.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/process.c Mon Jan 9 11:22:17 2006 @@ -33,6 +33,7 @@ #include <xen/multicall.h> extern unsigned long vcpu_get_itir_on_fault(struct vcpu *, UINT64); +extern void die_if_kernel(char *str, struct pt_regs *regs, long err); extern unsigned long dom0_start, dom0_size; @@ -64,26 +65,16 @@ extern struct schedule_data schedule_data[NR_CPUS]; -void schedule_tail(struct vcpu *next) -{ - unsigned long rr7; - //printk("current=%lx,shared_info=%lx\n",current,current->vcpu_info); - //printk("next=%lx,shared_info=%lx\n",next,next->vcpu_info); - - // TG: Real HACK FIXME. - // This is currently necessary because when a new domain is started, - // the context_switch function of xen/common/schedule.c(__enter_scheduler) - // never returns. Therefore, the lock must be released. - // schedule_tail is only called when a domain is started. - spin_unlock_irq(&schedule_data[current->processor].schedule_lock); - - /* rr7 will be postponed to last point when resuming back to guest */ - if(VMX_DOMAIN(current)){ - vmx_load_all_rr(current); - }else{ - load_region_regs(current); - vcpu_load_kernel_regs(current); - } +void schedule_tail(struct vcpu *prev) +{ + context_saved(prev); + + if (VMX_DOMAIN(current)) { + vmx_load_all_rr(current); + } else { + load_region_regs(current); + vcpu_load_kernel_regs(current); + } } void tdpfoo(void) { } @@ -251,7 +242,7 @@ struct domain *d = current->domain; struct vcpu *v = current; // FIXME: Will this work properly if doing an RFI??? - if (!is_idle_task(d) && user_mode(regs)) { + if (!is_idle_domain(d) && user_mode(regs)) { //vcpu_poke_timer(v); if (vcpu_deliverable_interrupts(v)) reflect_extint(regs); @@ -686,6 +677,8 @@ vcpu_increment_iip(current); } else { + if (iim == 0) + die_if_kernel("bug check", regs, iim); PSCB(v,iim) = iim; reflect_interruption(isr,regs,IA64_BREAK_VECTOR); } diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/vcpu.c --- a/xen/arch/ia64/xen/vcpu.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/vcpu.c Mon Jan 9 11:22:17 2006 @@ -1085,7 +1085,7 @@ /* gloss over the wraparound problem for now... 
we know it exists * but it doesn't matter right now */ - if (is_idle_task(vcpu->domain)) { + if (is_idle_domain(vcpu->domain)) { // printf("****** vcpu_set_next_timer called during idle!!\n"); vcpu_safe_set_itm(s); return; diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/xenmisc.c --- a/xen/arch/ia64/xen/xenmisc.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/xenmisc.c Mon Jan 9 11:22:17 2006 @@ -25,7 +25,6 @@ int phys_proc_id[NR_CPUS]; unsigned long loops_per_jiffy = (1<<12); // from linux/init/main.c -void unw_init(void) { printf("unw_init() skipped (NEED FOR KERNEL UNWIND)\n"); } void ia64_mca_init(void) { printf("ia64_mca_init() skipped (Machine check abort handling)\n"); } void ia64_mca_cpu_init(void *x) { } void ia64_patch_mckinley_e9(unsigned long a, unsigned long b) { } @@ -180,11 +179,6 @@ // from arch/ia64/traps.c /////////////////////////////// -void show_registers(struct pt_regs *regs) -{ - printf("*** ADD REGISTER DUMP HERE FOR DEBUGGING\n"); -} - int is_kernel_text(unsigned long addr) { extern char _stext[], _etext[]; @@ -236,7 +230,13 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err) /* __attribute__ ((noreturn)) */ { - printk("die_if_kernel: called, not implemented\n"); + if (user_mode(regs)) + return; + + printk("%s: %s %ld\n", __func__, str, err); + debugtrace_dump(); + show_registers(regs); + domain_crash_synchronous(); } long @@ -320,18 +320,15 @@ ia64_set_iva(&ia64_ivt); ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) | VHPT_ENABLED); - if (!is_idle_task(current->domain)) { + if (!is_idle_domain(current->domain)) { load_region_regs(current); vcpu_load_kernel_regs(current); if (vcpu_timer_expired(current)) vcpu_pend_timer(current); } if (vcpu_timer_expired(current)) vcpu_pend_timer(current); } -} - -void context_switch_finalise(struct vcpu *next) -{ - /* nothing to do */ + + context_saved(prev); } void continue_running(struct vcpu *same) @@ -368,3 +365,23 @@ goto loop; } } + +/* FIXME: for the forseeable future, all cpu's that enable VTi have split + * caches and all cpu's that have split caches enable VTi. This may + * eventually be untrue though. */ +#define cpu_has_split_cache vmx_enabled +extern unsigned int vmx_enabled; + +void sync_split_caches(void) +{ + unsigned long ret, progress = 0; + + if (cpu_has_split_cache) { + /* Sync d/i cache conservatively */ + ret = ia64_pal_cache_flush(4, 0, &progress, NULL); + if ((ret!=PAL_STATUS_SUCCESS)&& (ret!=PAL_STATUS_UNIMPLEMENTED)) + printk("PAL CACHE FLUSH failed\n"); + else printk("Sync i/d cache for guest SUCC\n"); + } + else printk("sync_split_caches ignored for CPU with no split cache\n"); +} diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/xensetup.c --- a/xen/arch/ia64/xen/xensetup.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/xensetup.c Mon Jan 9 11:22:17 2006 @@ -26,7 +26,7 @@ char saved_command_line[COMMAND_LINE_SIZE]; -struct vcpu *idle_task[NR_CPUS] = { &idle0_vcpu }; +struct vcpu *idle_domain[NR_CPUS] = { &idle0_vcpu }; cpumask_t cpu_present_map; @@ -382,8 +382,7 @@ panic("Could not set up DOM0 guest OS\n"); /* PIN domain0 on CPU 0. 
*/ - dom0->vcpu[0]->cpumap=1; - set_bit(_VCPUF_cpu_pinned, &dom0->vcpu[0]->vcpu_flags); + dom0->vcpu[0]->cpu_affinity = cpumask_of_cpu(0); #ifdef CLONE_DOMAIN0 { diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/xen/xentime.c --- a/xen/arch/ia64/xen/xentime.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/xen/xentime.c Mon Jan 9 11:22:17 2006 @@ -127,7 +127,7 @@ vcpu_wake(dom0->vcpu[0]); } } - if (!is_idle_task(current->domain)) { + if (!is_idle_domain(current->domain)) { if (vcpu_timer_expired(current)) { vcpu_pend_timer(current); // ensure another timer interrupt happens even if domain doesn't diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/Makefile --- a/xen/arch/x86/Makefile Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/Makefile Mon Jan 9 11:22:17 2006 @@ -29,6 +29,7 @@ endif OBJS := $(subst $(TARGET_SUBARCH)/asm-offsets.o,,$(OBJS)) +OBJS := $(subst $(TARGET_SUBARCH)/xen.lds.o,,$(OBJS)) ifneq ($(crash_debug),y) OBJS := $(patsubst cdb%.o,,$(OBJS)) @@ -43,21 +44,24 @@ $(CURDIR)/arch.o: $(OBJS) $(LD) $(LDFLAGS) -r -o $@ $(OBJS) -$(TARGET)-syms: boot/$(TARGET_SUBARCH).o $(ALL_OBJS) $(TARGET_SUBARCH)/xen.lds - $(LD) $(LDFLAGS) -T $(TARGET_SUBARCH)/xen.lds -N \ +$(TARGET)-syms: boot/$(TARGET_SUBARCH).o $(ALL_OBJS) xen.lds + $(LD) $(LDFLAGS) -T xen.lds -N \ boot/$(TARGET_SUBARCH).o $(ALL_OBJS) -o $@ $(NM) -n $@ | $(BASEDIR)/tools/symbols >$(BASEDIR)/xen-syms.S $(MAKE) $(BASEDIR)/xen-syms.o - $(LD) $(LDFLAGS) -T $(TARGET_SUBARCH)/xen.lds -N \ + $(LD) $(LDFLAGS) -T xen.lds -N \ boot/$(TARGET_SUBARCH).o $(ALL_OBJS) $(BASEDIR)/xen-syms.o -o $@ $(NM) -n $@ | $(BASEDIR)/tools/symbols >$(BASEDIR)/xen-syms.S $(MAKE) $(BASEDIR)/xen-syms.o - $(LD) $(LDFLAGS) -T $(TARGET_SUBARCH)/xen.lds -N \ + $(LD) $(LDFLAGS) -T xen.lds -N \ boot/$(TARGET_SUBARCH).o $(ALL_OBJS) $(BASEDIR)/xen-syms.o -o $@ rm -f $(BASEDIR)/xen-syms.S $(BASEDIR)/xen-syms.o asm-offsets.s: $(TARGET_SUBARCH)/asm-offsets.c $(HDRS) $(CC) $(CFLAGS) -S -o $@ $< + +xen.lds: $(TARGET_SUBARCH)/xen.lds.S $(HDRS) + $(CC) $(CFLAGS) -P -E -Ui386 -D__ASSEMBLY__ -o $@ $< boot/mkelf32: boot/mkelf32.c $(HOSTCC) $(HOSTCFLAGS) -o $@ $< @@ -73,5 +77,6 @@ rm -f dm/*.o dm/*~ dm/core rm -f genapic/*.o genapic/*~ genapic/core rm -f cpu/*.o cpu/*~ cpu/core + rm -f xen.lds .PHONY: default clean diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/boot/x86_32.S --- a/xen/arch/x86/boot/x86_32.S Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/boot/x86_32.S Mon Jan 9 11:22:17 2006 @@ -1,5 +1,6 @@ #include <xen/config.h> #include <public/xen.h> +#include <asm/asm_defns.h> #include <asm/desc.h> #include <asm/page.h> #include <asm/msr.h> @@ -53,6 +54,7 @@ mov %ecx,%gs ljmp $(__HYPERVISOR_CS),$(1f)-__PAGE_OFFSET 1: lss stack_start-__PAGE_OFFSET,%esp + add $(STACK_SIZE-CPUINFO_sizeof-__PAGE_OFFSET),%esp /* Reset EFLAGS (subsumes CLI and CLD). 
*/ pushl $0 @@ -98,7 +100,7 @@ 1: stosl /* low mappings cover as much physmem as possible */ add $4,%edi add $(1<<L2_PAGETABLE_SHIFT),%eax - cmp $__HYPERVISOR_VIRT_START+0xe3,%eax + cmp $HYPERVISOR_VIRT_START+0xe3,%eax jne 1b #else /* Initialize low and high mappings of all memory with 4MB pages */ @@ -111,7 +113,7 @@ jne 1b 1: stosl /* low mappings cover as much physmem as possible */ add $(1<<L2_PAGETABLE_SHIFT),%eax - cmp $__HYPERVISOR_VIRT_START+0xe3,%eax + cmp $HYPERVISOR_VIRT_START+0xe3,%eax jne 1b #endif @@ -189,7 +191,7 @@ /*** STACK LOCATION ***/ ENTRY(stack_start) - .long cpu0_stack + STACK_SIZE - 200 - __PAGE_OFFSET + .long cpu0_stack .long __HYPERVISOR_DS /*** DESCRIPTOR TABLES ***/ @@ -256,10 +258,6 @@ .fill 1*PAGE_SIZE,1,0 #endif -#if (STACK_ORDER == 0) -.section ".bss.page_aligned","w" -#else -.section ".bss.twopage_aligned","w" -#endif +.section ".bss.stack_aligned","w" ENTRY(cpu0_stack) .fill STACK_SIZE,1,0 diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/boot/x86_64.S --- a/xen/arch/x86/boot/x86_64.S Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/boot/x86_64.S Mon Jan 9 11:22:17 2006 @@ -1,5 +1,6 @@ #include <xen/config.h> #include <public/xen.h> +#include <asm/asm_defns.h> #include <asm/desc.h> #include <asm/page.h> #include <asm/msr.h> @@ -121,7 +122,8 @@ mov %rcx,%cr4 mov stack_start(%rip),%rsp - + or $(STACK_SIZE-CPUINFO_sizeof),%rsp + /* Reset EFLAGS (subsumes CLI and CLD). */ pushq $0 popf @@ -140,7 +142,7 @@ mov %ecx,%ss lidt idt_descr(%rip) - + cmp $(SECONDARY_CPU_FLAG),%ebx je start_secondary @@ -219,7 +221,7 @@ .quad idt_table ENTRY(stack_start) - .quad cpu0_stack + STACK_SIZE - 200 + .quad cpu0_stack high_start: .quad __high_start @@ -265,10 +267,6 @@ .org 0x4000 + PAGE_SIZE .code64 -#if (STACK_ORDER == 0) -.section ".bss.page_aligned","w" -#else -.section ".bss.twopage_aligned","w" -#endif +.section ".bss.stack_aligned","w" ENTRY(cpu0_stack) .fill STACK_SIZE,1,0 diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/dm/i8259.c --- a/xen/arch/x86/dm/i8259.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/dm/i8259.c Mon Jan 9 11:22:17 2006 @@ -29,7 +29,7 @@ #include <xen/lib.h> #include <xen/errno.h> #include <xen/sched.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <asm/vmx.h> #include <asm/vmx_vpic.h> #include <asm/current.h> diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/dm/vmx_vioapic.c --- a/xen/arch/x86/dm/vmx_vioapic.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/dm/vmx_vioapic.c Mon Jan 9 11:22:17 2006 @@ -37,7 +37,7 @@ #include <xen/lib.h> #include <xen/errno.h> #include <xen/sched.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <asm/vmx.h> #include <asm/vmx_vpic.h> #include <asm/current.h> diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/dom0_ops.c --- a/xen/arch/x86/dom0_ops.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/dom0_ops.c Mon Jan 9 11:22:17 2006 @@ -17,6 +17,7 @@ #include <asm/msr.h> #include <xen/trace.h> #include <xen/console.h> +#include <xen/iocap.h> #include <asm/shadow.h> #include <asm/irq.h> #include <asm/processor.h> @@ -35,13 +36,13 @@ static void write_msr_for(void *unused) { - if ( ((1 << current->processor) & msr_cpu_mask) ) + if ( ((1 << smp_processor_id()) & msr_cpu_mask) ) (void)wrmsr_user(msr_addr, msr_lo, msr_hi); } static void read_msr_for(void *unused) { - if ( ((1 << current->processor) & msr_cpu_mask) ) + if ( ((1 << smp_processor_id()) & msr_cpu_mask) ) (void)rdmsr_user(msr_addr, msr_lo, msr_hi); } @@ -102,12 +103,27 @@ op->u.add_memtype.nr_pfns, op->u.add_memtype.type, 1); + if 
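A note on the memtype hunks here: combined with the mtrr/main.c change earlier in this patch, the convention becomes that a successful DOM0_ADD_MEMTYPE hands the allocated MTRR register index back to the caller (copied out through u_dom0_op, with a zero handle), and DOM0_DEL_MEMTYPE honours a request only when that same zero handle and a sane reg come back. A minimal standalone sketch of the round trip; the fixed-size table and both helpers below are illustrative stand-ins, not the hypervisor code:

    #include <stdio.h>

    struct memtype_op { unsigned long handle; int reg; };

    static int mtrr_reg_in_use[8];    /* stand-in for the real MTRR state */

    static int add_memtype(struct memtype_op *op)
    {
        int reg;
        for (reg = 0; reg < 8; reg++) {
            if (!mtrr_reg_in_use[reg]) {
                mtrr_reg_in_use[reg] = 1;
                op->handle = 0;       /* handle is always 0 for now */
                op->reg = reg;        /* caller passes this back to delete */
                return 0;
            }
        }
        return -1;                    /* no free register */
    }

    static int del_memtype(const struct memtype_op *op)
    {
        /* Same checks as the DOM0_DEL_MEMTYPE hunk: zero handle, sane reg. */
        if (op->handle != 0 || op->reg < 0 || op->reg >= 8 ||
            !mtrr_reg_in_use[op->reg])
            return -1;                /* -EINVAL in the real code */
        mtrr_reg_in_use[op->reg] = 0;
        return 0;
    }

    int main(void)
    {
        struct memtype_op op;
        if (add_memtype(&op) == 0 && del_memtype(&op) == 0)
            printf("add/del round trip ok (reg=%d)\n", op.reg);
        return 0;
    }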
(ret > 0) + { + (void)__put_user(0, &u_dom0_op->u.add_memtype.handle); + (void)__put_user(ret, &u_dom0_op->u.add_memtype.reg); + ret = 0; + } } break; case DOM0_DEL_MEMTYPE: { - ret = mtrr_del_page(op->u.del_memtype.reg, 0, 0); + if (op->u.del_memtype.handle == 0 + /* mtrr/main.c otherwise does a lookup */ + && (int)op->u.del_memtype.reg >= 0) + { + ret = mtrr_del_page(op->u.del_memtype.reg, 0, 0); + if (ret > 0) + ret = 0; + } + else + ret = -EINVAL; } break; @@ -141,7 +157,6 @@ struct domain *d; unsigned int fp = op->u.ioport_permission.first_port; unsigned int np = op->u.ioport_permission.nr_ports; - unsigned int p; ret = -EINVAL; if ( (fp + np) > 65536 ) @@ -152,26 +167,12 @@ op->u.ioport_permission.domain)) == NULL) ) break; - ret = -ENOMEM; - if ( d->arch.iobmp_mask != NULL ) - { - if ( (d->arch.iobmp_mask = xmalloc_array( - u8, IOBMP_BYTES)) == NULL ) - { - put_domain(d); - break; - } - memset(d->arch.iobmp_mask, 0xFF, IOBMP_BYTES); - } - - ret = 0; - for ( p = fp; p < (fp + np); p++ ) - { - if ( op->u.ioport_permission.allow_access ) - clear_bit(p, d->arch.iobmp_mask); - else - set_bit(p, d->arch.iobmp_mask); - } + if ( np == 0 ) + ret = 0; + else if ( op->u.ioport_permission.allow_access ) + ret = ioports_permit_access(d, fp, fp + np - 1); + else + ret = ioports_deny_access(d, fp, fp + np - 1); put_domain(d); } @@ -193,7 +194,7 @@ memcpy(pi->hw_cap, boot_cpu_data.x86_capability, NCAPINTS*4); ret = 0; if ( copy_to_user(u_dom0_op, op, sizeof(*op)) ) - ret = -EFAULT; + ret = -EFAULT; } break; diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/domain.c Mon Jan 9 11:22:17 2006 @@ -20,6 +20,7 @@ #include <xen/delay.h> #include <xen/softirq.h> #include <xen/grant_table.h> +#include <xen/iocap.h> #include <asm/regs.h> #include <asm/mc146818rtc.h> #include <asm/system.h> @@ -35,9 +36,7 @@ #include <xen/console.h> #include <xen/elf.h> #include <asm/vmx.h> -#include <asm/vmx_vmcs.h> #include <asm/msr.h> -#include <asm/physdev.h> #include <xen/kernel.h> #include <xen/multicall.h> @@ -47,17 +46,16 @@ struct percpu_ctxt { struct vcpu *curr_vcpu; - unsigned int context_not_finalised; unsigned int dirty_segment_mask; } __cacheline_aligned; static struct percpu_ctxt percpu_ctxt[NR_CPUS]; -static void continue_idle_task(struct vcpu *v) +static void continue_idle_domain(struct vcpu *v) { reset_stack_and_jump(idle_loop); } -static void continue_nonidle_task(struct vcpu *v) +static void continue_nonidle_domain(struct vcpu *v) { reset_stack_and_jump(ret_from_intr); } @@ -93,12 +91,13 @@ { struct vcpu *v = current; - ASSERT(is_idle_task(v->domain)); + ASSERT(is_idle_domain(v->domain)); percpu_ctxt[smp_processor_id()].curr_vcpu = v; - cpu_set(smp_processor_id(), v->domain->cpumask); - v->arch.schedule_tail = continue_idle_task; - - idle_loop(); + cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask); + cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask); + v->arch.schedule_tail = continue_idle_domain; + + reset_stack_and_jump(idle_loop); } static long no_idt[2]; @@ -185,11 +184,17 @@ { struct pfn_info *page; - if ( d->tot_pages < 10 ) + printk("Memory pages belonging to domain %u:\n", d->domain_id); + + if ( d->tot_pages >= 10 ) + { + printk(" DomPage list too long to display\n"); + } + else { list_for_each_entry ( page, &d->page_list, list ) { - printk("Page %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n", + printk(" DomPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n", _p(page_to_phys(page)), _p(page_to_pfn(page)), 
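The ioport_permission rewrite above drops the per-domain iobmp_mask bitmap in favour of the rangeset-backed calls introduced by this patch (ioports_permit_access and ioports_deny_access here, ioports_access_permitted in the traps.c hunk further down). The bounds are inclusive, fp through fp + np - 1. A toy model of that interface, with no domain argument and the deny/split logic elided, just to show the calling convention:

    #include <stdio.h>
    #include <stdlib.h>

    /* Toy stand-in for Xen's rangeset: an unsorted list of inclusive
     * ranges, enough to demonstrate permit and test. */
    struct range { unsigned int s, e; struct range *next; };
    static struct range *ioports;

    static int ioports_permit_access(unsigned int s, unsigned int e)
    {
        struct range *r = malloc(sizeof(*r));
        if (r == NULL)
            return -1;                    /* -ENOMEM in the real API */
        r->s = s; r->e = e; r->next = ioports; ioports = r;
        return 0;
    }

    static int ioports_access_permitted(unsigned int s, unsigned int e)
    {
        struct range *r;
        for (r = ioports; r != NULL; r = r->next)
            if (s >= r->s && e <= r->e)
                return 1;                 /* span covered by one range */
        return 0;
    }

    int main(void)
    {
        unsigned int fp = 0x3f8, np = 8;  /* same shape as the hunk */
        ioports_permit_access(fp, fp + np - 1);
        printf("0x3f8-0x3ff: %d\n", ioports_access_permitted(0x3f8, 0x3ff));
        printf("0x400:       %d\n", ioports_access_permitted(0x400, 0x400));
        return 0;
    }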
page->count_info, page->u.inuse.type_info); } @@ -197,15 +202,10 @@ list_for_each_entry ( page, &d->xenpage_list, list ) { - printk("XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n", + printk(" XenPage %p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n", _p(page_to_phys(page)), _p(page_to_pfn(page)), page->count_info, page->u.inuse.type_info); } - - page = virt_to_page(d->shared_info); - printk("Shared_info@%p: mfn=%p, caf=%08x, taf=%" PRtype_info "\n", - _p(page_to_phys(page)), _p(page_to_pfn(page)), page->count_info, - page->u.inuse.type_info); } struct vcpu *alloc_vcpu_struct(struct domain *d, unsigned int vcpu_id) @@ -250,24 +250,36 @@ #endif } -void arch_do_createdomain(struct vcpu *v) +int arch_do_createdomain(struct vcpu *v) { struct domain *d = v->domain; l1_pgentry_t gdt_l1e; - int vcpuid, pdpt_order; + int vcpuid, pdpt_order, rc; #ifdef __x86_64__ int i; #endif - if ( is_idle_task(d) ) - return; - - v->arch.schedule_tail = continue_nonidle_task; - - d->shared_info = alloc_xenheap_page(); + if ( is_idle_domain(d) ) + return 0; + + d->arch.ioport_caps = + rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex); + if ( d->arch.ioport_caps == NULL ) + return -ENOMEM; + + if ( (d->shared_info = alloc_xenheap_page()) == NULL ) + return -ENOMEM; + + if ( (rc = ptwr_init(d)) != 0 ) + { + free_xenheap_page(d->shared_info); + return rc; + } + + v->arch.schedule_tail = continue_nonidle_domain; + memset(d->shared_info, 0, PAGE_SIZE); v->vcpu_info = &d->shared_info->vcpu_info[v->vcpu_id]; - v->cpumap = CPUMAP_RUNANYWHERE; SHARE_PFN_WITH_DOMAIN(virt_to_page(d->shared_info), d); pdpt_order = get_order_from_bytes(PDPT_L1_ENTRIES * sizeof(l1_pgentry_t)); @@ -308,25 +320,10 @@ __PAGE_HYPERVISOR); #endif - (void)ptwr_init(d); - shadow_lock_init(d); INIT_LIST_HEAD(&d->arch.free_shadow_frames); -} - -void vcpu_migrate_cpu(struct vcpu *v, int newcpu) -{ - if ( v->processor == newcpu ) - return; - - set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags); - v->processor = newcpu; - - if ( VMX_DOMAIN(v) ) - { - __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs)); - v->arch.schedule_tail = arch_vmx_do_relaunch; - } + + return 0; } /* This is called by arch_final_setup_guest and do_boot_vcpu */ @@ -348,6 +345,8 @@ ((c->user_regs.ss & 3) == 0) ) return -EINVAL; } + else if ( !hvm_enabled ) + return -EINVAL; clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags); if ( c->flags & VGCF_I387_VALID ) @@ -690,7 +689,7 @@ struct vcpu *p = percpu_ctxt[cpu].curr_vcpu; struct vcpu *n = current; - if ( !is_idle_task(p->domain) ) + if ( !is_idle_domain(p->domain) ) { memcpy(&p->arch.guest_context.user_regs, stack_regs, @@ -699,7 +698,7 @@ save_segments(p); } - if ( !is_idle_task(n->domain) ) + if ( !is_idle_domain(n->domain) ) { memcpy(stack_regs, &n->arch.guest_context.user_regs, @@ -725,7 +724,8 @@ } if ( p->domain != n->domain ) - cpu_set(cpu, n->domain->cpumask); + cpu_set(cpu, n->domain->domain_dirty_cpumask); + cpu_set(cpu, n->vcpu_dirty_cpumask); write_ptbase(n); @@ -738,7 +738,8 @@ } if ( p->domain != n->domain ) - cpu_clear(cpu, p->domain->cpumask); + cpu_clear(cpu, p->domain->domain_dirty_cpumask); + cpu_clear(cpu, p->vcpu_dirty_cpumask); percpu_ctxt[cpu].curr_vcpu = n; } @@ -748,28 +749,24 @@ { unsigned int cpu = smp_processor_id(); - ASSERT(!local_irq_is_enabled()); + ASSERT(local_irq_is_enabled()); set_current(next); - if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) ) - { + if ( (percpu_ctxt[cpu].curr_vcpu != next) && + !is_idle_domain(next->domain) ) + { + /* This may happen if next has been migrated 
by the scheduler. */ + if ( unlikely(!cpus_empty(next->vcpu_dirty_cpumask)) ) + { + ASSERT(!cpu_isset(cpu, next->vcpu_dirty_cpumask)); + sync_vcpu_execstate(next); + ASSERT(cpus_empty(next->vcpu_dirty_cpumask)); + } + + local_irq_disable(); __context_switch(); - percpu_ctxt[cpu].context_not_finalised = 1; - } -} - -void context_switch_finalise(struct vcpu *next) -{ - unsigned int cpu = smp_processor_id(); - - ASSERT(local_irq_is_enabled()); - - if ( percpu_ctxt[cpu].context_not_finalised ) - { - percpu_ctxt[cpu].context_not_finalised = 0; - - BUG_ON(percpu_ctxt[cpu].curr_vcpu != next); + local_irq_enable(); if ( VMX_DOMAIN(next) ) { @@ -783,6 +780,8 @@ } } + context_saved(prev); + schedule_tail(next); BUG(); } @@ -812,20 +811,11 @@ void sync_vcpu_execstate(struct vcpu *v) { - unsigned int cpu = v->processor; - - if ( !cpu_isset(cpu, v->domain->cpumask) ) - return; - - if ( cpu == smp_processor_id() ) - { + if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) ) (void)__sync_lazy_execstate(); - } - else - { - /* Other cpus call __sync_lazy_execstate from flush ipi handler. */ - flush_tlb_mask(cpumask_of_cpu(cpu)); - } + + /* Other cpus call __sync_lazy_execstate from flush ipi handler. */ + flush_tlb_mask(v->vcpu_dirty_cpumask); } unsigned long __hypercall_create_continuation( @@ -951,9 +941,7 @@ struct vcpu *v; unsigned long pfn; - BUG_ON(!cpus_empty(d->cpumask)); - - physdev_destroy_state(d); + BUG_ON(!cpus_empty(d->domain_dirty_cpumask)); ptwr_destroy(d); diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/domain_build.c Mon Jan 9 11:22:17 2006 @@ -16,13 +16,13 @@ #include <xen/kernel.h> #include <xen/domain.h> #include <xen/compile.h> +#include <xen/iocap.h> #include <asm/regs.h> #include <asm/system.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/desc.h> #include <asm/i387.h> -#include <asm/physdev.h> #include <asm/shadow.h> static long dom0_nrpages; @@ -94,9 +94,9 @@ return page; } -static void process_dom0_ioports_disable() +static void process_dom0_ioports_disable(void) { - unsigned long io_from, io_to, io_nr; + unsigned long io_from, io_to; char *t, *u, *s = opt_dom0_ioports_disable; if ( *s == '\0' ) @@ -126,8 +126,8 @@ printk("Disabling dom0 access to ioport range %04lx-%04lx\n", io_from, io_to); - io_nr = io_to - io_from + 1; - physdev_modify_ioport_access_range(dom0, 0, io_from, io_nr); + if ( ioports_deny_access(dom0, io_from, io_to) != 0 ) + BUG(); } } @@ -183,7 +183,6 @@ /* Machine address of next candidate page-table page. */ unsigned long mpt_alloc; - extern void physdev_init_dom0(struct domain *); extern void translate_l2pgtable( struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn); @@ -692,9 +691,6 @@ zap_low_mappings(l2start); zap_low_mappings(idle_pg_table_l2); #endif - - /* DOM0 gets access to everything. */ - physdev_init_dom0(d); init_domain_time(d); @@ -746,19 +742,28 @@ printk("dom0: shadow setup done\n"); } + i = 0; + + /* DOM0 is permitted full I/O capabilities. */ + i |= ioports_permit_access(dom0, 0, 0xFFFF); + i |= iomem_permit_access(dom0, 0UL, ~0UL); + i |= irqs_permit_access(dom0, 0, NR_PIRQS-1); + /* * Modify I/O port access permissions. */ /* Master Interrupt Controller (PIC). */ - physdev_modify_ioport_access_range(dom0, 0, 0x20, 2); + i |= ioports_deny_access(dom0, 0x20, 0x21); /* Slave Interrupt Controller (PIC). 
*/ - physdev_modify_ioport_access_range(dom0, 0, 0xA0, 2); + i |= ioports_deny_access(dom0, 0xA0, 0xA1); /* Interval Timer (PIT). */ - physdev_modify_ioport_access_range(dom0, 0, 0x40, 4); + i |= ioports_deny_access(dom0, 0x40, 0x43); /* PIT Channel 2 / PC Speaker Control. */ - physdev_modify_ioport_access_range(dom0, 0, 0x61, 1); - /* Command-line passed i/o ranges */ + i |= ioports_deny_access(dom0, 0x61, 0x61); + /* Command-line I/O ranges. */ process_dom0_ioports_disable(); + + BUG_ON(i != 0); return 0; } diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/idle0_task.c --- a/xen/arch/x86/idle0_task.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/idle0_task.c Mon Jan 9 11:22:17 2006 @@ -11,6 +11,7 @@ struct vcpu idle0_vcpu = { processor: 0, + cpu_affinity:CPU_MASK_CPU0, domain: &idle0_domain }; diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/io_apic.c --- a/xen/arch/x86/io_apic.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/io_apic.c Mon Jan 9 11:22:17 2006 @@ -1807,3 +1807,47 @@ return 0; } + +void dump_ioapic_irq_info(void) +{ + struct irq_pin_list *entry; + struct IO_APIC_route_entry rte; + unsigned int irq, pin, printed = 0; + unsigned long flags; + + for ( irq = 0; irq < NR_IRQS; irq++ ) + { + entry = &irq_2_pin[irq]; + if ( entry->pin == -1 ) + continue; + + if ( !printed++ ) + printk("IO-APIC interrupt information:\n"); + + printk(" IRQ%3d Vec%3d:\n", irq, irq_to_vector(irq)); + + for ( ; ; ) + { + pin = entry->pin; + + printk(" Apic 0x%02x, Pin %2d: ", entry->apic, pin); + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&rte) + 0) = io_apic_read(entry->apic, 0x10 + 2 * pin); + *(((int *)&rte) + 1) = io_apic_read(entry->apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("vector=%u, delivery_mode=%u, dest_mode=%s, " + "delivery_status=%d, polarity=%d, irr=%d, " + "trigger=%s, mask=%d\n", + rte.vector, rte.delivery_mode, + rte.dest_mode ? "logical" : "physical", + rte.delivery_status, rte.polarity, rte.irr, + rte.trigger ? 
"level" : "edge", rte.mask); + + if ( entry->next == 0 ) + break; + entry = &irq_2_pin[entry->next]; + } + } +} diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/irq.c --- a/xen/arch/x86/irq.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/irq.c Mon Jan 9 11:22:17 2006 @@ -12,6 +12,7 @@ #include <xen/irq.h> #include <xen/perfc.h> #include <xen/sched.h> +#include <xen/keyhandler.h> #include <asm/current.h> #include <asm/smpboot.h> @@ -198,19 +199,21 @@ int pirq_guest_bind(struct vcpu *v, int irq, int will_share) { - unsigned int vector = irq_to_vector(irq); - struct domain *d = v->domain; - irq_desc_t *desc = &irq_desc[vector]; + unsigned int vector; + irq_desc_t *desc; irq_guest_action_t *action; unsigned long flags; int rc = 0; cpumask_t cpumask = CPU_MASK_NONE; - if ( !IS_CAPABLE_PHYSDEV(d) ) - return -EPERM; - + if ( (irq < 0) || (irq >= NR_IRQS) ) + return -EINVAL; + + vector = irq_to_vector(irq); if ( vector == 0 ) - return -EBUSY; + return -EINVAL; + + desc = &irq_desc[vector]; spin_lock_irqsave(&desc->lock, flags); @@ -309,3 +312,71 @@ spin_unlock_irqrestore(&desc->lock, flags); return 0; } + +extern void dump_ioapic_irq_info(void); + +static void dump_irqs(unsigned char key) +{ + int i, irq, vector; + irq_desc_t *desc; + irq_guest_action_t *action; + struct domain *d; + unsigned long flags; + + printk("Guest interrupt information:\n"); + + for ( irq = 0; irq < NR_IRQS; irq++ ) + { + vector = irq_to_vector(irq); + if ( vector == 0 ) + continue; + + desc = &irq_desc[vector]; + + spin_lock_irqsave(&desc->lock, flags); + + if ( desc->status & IRQ_GUEST ) + { + action = (irq_guest_action_t *)desc->action; + + printk(" IRQ%3d Vec%3d: type=%-15s status=%08x " + "in-flight=%d domain-list=", + irq, vector, desc->handler->typename, + desc->status, action->in_flight); + + for ( i = 0; i < action->nr_guests; i++ ) + { + d = action->guest[i]; + printk("%u(%c%c%c%c)", + d->domain_id, + (test_bit(d->pirq_to_evtchn[irq], + &d->shared_info->evtchn_pending[0]) ? + 'P' : '-'), + (test_bit(d->pirq_to_evtchn[irq]/BITS_PER_LONG, + &d->shared_info->vcpu_info[0]. + evtchn_pending_sel) ? + 'S' : '-'), + (test_bit(d->pirq_to_evtchn[irq], + &d->shared_info->evtchn_mask[0]) ? + 'M' : '-'), + (test_bit(irq, &d->pirq_mask) ? + 'M' : '-')); + if ( i != action->nr_guests ) + printk(","); + } + + printk("\n"); + } + + spin_unlock_irqrestore(&desc->lock, flags); + } + + dump_ioapic_irq_info(); +} + +static int __init setup_dump_irqs(void) +{ + register_keyhandler('i', dump_irqs, "dump interrupt bindings"); + return 0; +} +__initcall(setup_dump_irqs); diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/mm.c Mon Jan 9 11:22:17 2006 @@ -96,6 +96,7 @@ #include <xen/softirq.h> #include <xen/domain_page.h> #include <xen/event.h> +#include <xen/iocap.h> #include <asm/shadow.h> #include <asm/page.h> #include <asm/flushtlb.h> @@ -437,7 +438,6 @@ unsigned long mfn = l1e_get_pfn(l1e); struct pfn_info *page = pfn_to_page(mfn); int okay; - extern int domain_iomem_in_pfn(struct domain *d, unsigned long pfn); if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) return 1; @@ -455,8 +455,7 @@ if ( d == dom_io ) d = current->domain; - if ( (!IS_PRIV(d)) && - (!IS_CAPABLE_PHYSDEV(d) || !domain_iomem_in_pfn(d, mfn)) ) + if ( !iomem_access_permitted(d, mfn, mfn) ) { MEM_LOG("Non-privileged attempt to map I/O space %08lx", mfn); return 0; @@ -1458,7 +1457,8 @@ * was GDT/LDT) but those circumstances should be * very rare. 
*/ - cpumask_t mask = page_get_owner(page)->cpumask; + cpumask_t mask = + page_get_owner(page)->domain_dirty_cpumask; tlbflush_filter(mask, page->tlbflush_timestamp); if ( unlikely(!cpus_empty(mask)) ) @@ -1620,7 +1620,7 @@ if ( shadow_mode_enabled(d) ) shadow_sync_all(d); if ( deferred_ops & DOP_FLUSH_ALL_TLBS ) - flush_tlb_mask(d->cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); else local_flush_tlb(); } @@ -1692,7 +1692,7 @@ struct domain *d, unsigned long vmask) { unsigned int vcpu_id; - cpumask_t pmask; + cpumask_t pmask = CPU_MASK_NONE; struct vcpu *v; while ( vmask != 0 ) @@ -1701,7 +1701,7 @@ vmask &= ~(1UL << vcpu_id); if ( (vcpu_id < MAX_VIRT_CPUS) && ((v = d->vcpu[vcpu_id]) != NULL) ) - cpu_set(v->processor, pmask); + cpus_or(pmask, pmask, v->vcpu_dirty_cpumask); } return pmask; @@ -1870,7 +1870,6 @@ break; } pmask = vcpumask_to_pcpumask(d, vmask); - cpus_and(pmask, pmask, d->cpumask); if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI ) flush_tlb_mask(pmask); else @@ -1879,15 +1878,15 @@ } case MMUEXT_TLB_FLUSH_ALL: - flush_tlb_mask(d->cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); break; case MMUEXT_INVLPG_ALL: - flush_tlb_one_mask(d->cpumask, op.arg1.linear_addr); + flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr); break; case MMUEXT_FLUSH_CACHE: - if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) ) + if ( unlikely(!cache_flush_permitted(d)) ) { MEM_LOG("Non-physdev domain tried to FLUSH_CACHE."); okay = 0; @@ -2498,7 +2497,7 @@ l1_pgentry_t val = l1e_from_intpte(val64); struct vcpu *v = current; struct domain *d = v->domain; - unsigned int cpu = v->processor; + unsigned int cpu = smp_processor_id(); unsigned long vmask, bmap_ptr; cpumask_t pmask; int rc = 0; @@ -2549,13 +2548,12 @@ local_flush_tlb(); break; case UVMF_ALL: - flush_tlb_mask(d->cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); break; default: if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) ) rc = -EFAULT; pmask = vcpumask_to_pcpumask(d, vmask); - cpus_and(pmask, pmask, d->cpumask); flush_tlb_mask(pmask); break; } @@ -2570,13 +2568,12 @@ local_flush_tlb_one(va); break; case UVMF_ALL: - flush_tlb_one_mask(d->cpumask, va); + flush_tlb_one_mask(d->domain_dirty_cpumask, va); break; default: if ( unlikely(get_user(vmask, (unsigned long *)bmap_ptr)) ) rc = -EFAULT; pmask = vcpumask_to_pcpumask(d, vmask); - cpus_and(pmask, pmask, d->cpumask); flush_tlb_one_mask(pmask, va); break; } @@ -3019,7 +3016,7 @@ /* Ensure that there are no stale writable mappings in any TLB. */ /* NB. INVLPG is a serialising instruction: flushes pending updates. */ - flush_tlb_one_mask(d->cpumask, l1va); + flush_tlb_one_mask(d->domain_dirty_cpumask, l1va); PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n", PTWR_PRINT_WHICH, ptep, pte.l1); @@ -3343,7 +3340,7 @@ if ( which == PTWR_PT_ACTIVE ) { l2e_remove_flags(*pl2e, _PAGE_PRESENT); - flush_tlb_mask(d->cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); } /* Temporarily map the L1 page, and make a copy of it. 
*/ @@ -3370,7 +3367,7 @@ emulate: if ( x86_emulate_memop(guest_cpu_user_regs(), addr, - &ptwr_mem_emulator, BITS_PER_LONG/8) ) + &ptwr_mem_emulator, X86EMUL_MODE_HOST) ) return 0; perfc_incrc(ptwr_emulations); return EXCRET_fault_fixed; diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/physdev.c --- a/xen/arch/x86/physdev.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/physdev.c Mon Jan 9 11:22:17 2006 @@ -13,27 +13,6 @@ extern int ioapic_guest_read(int apicid, int address, u32 *pval); extern int ioapic_guest_write(int apicid, int address, u32 pval); - -void physdev_modify_ioport_access_range( - struct domain *d, int enable, int port, int num) -{ - int i; - for ( i = port; i < (port + num); i++ ) - (enable ? clear_bit : set_bit)(i, d->arch.iobmp_mask); -} - -void physdev_destroy_state(struct domain *d) -{ - xfree(d->arch.iobmp_mask); - d->arch.iobmp_mask = NULL; -} - -/* Check if a domain controls a device with IO memory within frame @pfn. - * Returns: 1 if the domain should be allowed to map @pfn, 0 otherwise. */ -int domain_iomem_in_pfn(struct domain *p, unsigned long pfn) -{ - return 0; -} /* * Demuxing hypercall. @@ -120,18 +99,6 @@ return ret; } -/* Domain 0 has read access to all devices. */ -void physdev_init_dom0(struct domain *d) -{ - /* Access to all I/O ports. */ - d->arch.iobmp_mask = xmalloc_array(u8, IOBMP_BYTES); - BUG_ON(d->arch.iobmp_mask == NULL); - memset(d->arch.iobmp_mask, 0, IOBMP_BYTES); - - set_bit(_DOMF_physdev_access, &d->domain_flags); -} - - /* * Local variables: * mode: C diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/setup.c Mon Jan 9 11:22:17 2006 @@ -92,7 +92,7 @@ #endif EXPORT_SYMBOL(mmu_cr4_features); -struct vcpu *idle_task[NR_CPUS] = { &idle0_vcpu }; +struct vcpu *idle_domain[NR_CPUS] = { &idle0_vcpu }; int acpi_disabled; @@ -138,131 +138,19 @@ (*call)(); } -static void __init start_of_day(void) -{ - int i; - unsigned long vgdt, gdt_pfn; - - early_cpu_init(); - - paging_init(); - - /* Unmap the first page of CPU0's stack. */ - memguard_guard_stack(cpu0_stack); - - open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period); - - if ( opt_watchdog ) - nmi_watchdog = NMI_LOCAL_APIC; - - sort_exception_tables(); - - arch_do_createdomain(current); - - /* - * Map default GDT into its final positions in the idle page table. As - * noted in arch_do_createdomain(), we must map for every possible VCPU#. - */ - vgdt = GDT_VIRT_START(current) + FIRST_RESERVED_GDT_BYTE; - gdt_pfn = virt_to_phys(gdt_table) >> PAGE_SHIFT; - for ( i = 0; i < MAX_VIRT_CPUS; i++ ) - { - map_pages_to_xen(vgdt, gdt_pfn, 1, PAGE_HYPERVISOR); - vgdt += 1 << PDPT_VCPU_VA_SHIFT; - } - - find_smp_config(); - - smp_alloc_memory(); - - dmi_scan_machine(); - - generic_apic_probe(); - - acpi_boot_table_init(); - acpi_boot_init(); - - if ( smp_found_config ) - get_smp_config(); - - init_apic_mappings(); - - init_IRQ(); - - trap_init(); - - ac_timer_init(); - - early_time_init(); - - arch_init_memory(); - - scheduler_init(); - - identify_cpu(&boot_cpu_data); - if ( cpu_has_fxsr ) - set_in_cr4(X86_CR4_OSFXSR); - if ( cpu_has_xmm ) - set_in_cr4(X86_CR4_OSXMMEXCPT); - - if ( opt_nosmp ) - { - max_cpus = 0; - smp_num_siblings = 1; - boot_cpu_data.x86_num_cores = 1; - } - - smp_prepare_cpus(max_cpus); - - /* We aren't hotplug-capable yet. */ - BUG_ON(!cpus_empty(cpu_present_map)); - for_each_cpu ( i ) - cpu_set(i, cpu_present_map); - - /* - * Initialise higher-level timer functions. 
We do this fairly late - * (post-SMP) because the time bases and scale factors need to be updated - * regularly, and SMP initialisation can cause a long delay with - * interrupts not yet enabled. - */ - init_xen_time(); - - initialize_keytable(); - - serial_init_postirq(); - - BUG_ON(!local_irq_is_enabled()); - - for_each_present_cpu ( i ) - { - if ( num_online_cpus() >= max_cpus ) - break; - if ( !cpu_online(i) ) - __cpu_up(i); - } - - printk("Brought up %ld CPUs\n", (long)num_online_cpus()); - smp_cpus_done(max_cpus); - - do_initcalls(); - - schedulers_start(); - - watchdog_enable(); -} - #define EARLY_FAIL() for ( ; ; ) __asm__ __volatile__ ( "hlt" ) static struct e820entry e820_raw[E820MAX]; void __init __start_xen(multiboot_info_t *mbi) { + unsigned long vgdt, gdt_pfn; char *cmdline; + unsigned long _initrd_start = 0, _initrd_len = 0; + unsigned int initrdidx = 1; module_t *mod = (module_t *)__va(mbi->mods_addr); unsigned long nr_pages, modules_length; unsigned long initial_images_start, initial_images_end; - unsigned long _initrd_start = 0, _initrd_len = 0; - unsigned int initrdidx = 1; physaddr_t s, e; int i, e820_warn = 0, e820_raw_nr = 0, bytes = 0; struct ns16550_defaults ns16550 = { @@ -455,6 +343,12 @@ BUG_ON(sizeof(shared_info_t) > PAGE_SIZE); BUG_ON(sizeof(vcpu_info_t) != 64); + /* __foo are defined in public headers. Check they match internal defs. */ + BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START); +#ifdef HYPERVISOR_VIRT_END + BUG_ON(__HYPERVISOR_VIRT_END != HYPERVISOR_VIRT_END); +#endif + init_frametable(); end_boot_allocator(); @@ -486,7 +380,113 @@ early_boot = 0; - start_of_day(); + early_cpu_init(); + + paging_init(); + + /* Unmap the first page of CPU0's stack. */ + memguard_guard_stack(cpu0_stack); + + open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period); + + if ( opt_watchdog ) + nmi_watchdog = NMI_LOCAL_APIC; + + sort_exception_tables(); + + if ( arch_do_createdomain(current) != 0 ) + BUG(); + + /* + * Map default GDT into its final positions in the idle page table. As + * noted in arch_do_createdomain(), we must map for every possible VCPU#. + */ + vgdt = GDT_VIRT_START(current) + FIRST_RESERVED_GDT_BYTE; + gdt_pfn = virt_to_phys(gdt_table) >> PAGE_SHIFT; + for ( i = 0; i < MAX_VIRT_CPUS; i++ ) + { + map_pages_to_xen(vgdt, gdt_pfn, 1, PAGE_HYPERVISOR); + vgdt += 1 << PDPT_VCPU_VA_SHIFT; + } + + find_smp_config(); + + smp_alloc_memory(); + + dmi_scan_machine(); + + generic_apic_probe(); + + acpi_boot_table_init(); + acpi_boot_init(); + + if ( smp_found_config ) + get_smp_config(); + + init_apic_mappings(); + + init_IRQ(); + + trap_init(); + + ac_timer_init(); + + early_time_init(); + + arch_init_memory(); + + scheduler_init(); + + identify_cpu(&boot_cpu_data); + if ( cpu_has_fxsr ) + set_in_cr4(X86_CR4_OSFXSR); + if ( cpu_has_xmm ) + set_in_cr4(X86_CR4_OSXMMEXCPT); + + if ( opt_nosmp ) + { + max_cpus = 0; + smp_num_siblings = 1; + boot_cpu_data.x86_num_cores = 1; + } + + smp_prepare_cpus(max_cpus); + + /* We aren't hotplug-capable yet. */ + BUG_ON(!cpus_empty(cpu_present_map)); + for_each_cpu ( i ) + cpu_set(i, cpu_present_map); + + /* + * Initialise higher-level timer functions. We do this fairly late + * (post-SMP) because the time bases and scale factors need to be updated + * regularly, and SMP initialisation can cause a long delay with + * interrupts not yet enabled. 
+ */ + init_xen_time(); + + initialize_keytable(); + + serial_init_postirq(); + + BUG_ON(!local_irq_is_enabled()); + + for_each_present_cpu ( i ) + { + if ( num_online_cpus() >= max_cpus ) + break; + if ( !cpu_online(i) ) + __cpu_up(i); + } + + printk("Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(max_cpus); + + do_initcalls(); + + schedulers_start(); + + watchdog_enable(); shadow_mode_init(); diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/shadow.c --- a/xen/arch/x86/shadow.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/shadow.c Mon Jan 9 11:22:17 2006 @@ -1800,7 +1800,7 @@ } /* Other VCPUs mustn't use the revoked writable mappings. */ - other_vcpus_mask = d->cpumask; + other_vcpus_mask = d->domain_dirty_cpumask; cpu_clear(smp_processor_id(), other_vcpus_mask); flush_tlb_mask(other_vcpus_mask); diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/shadow32.c --- a/xen/arch/x86/shadow32.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/shadow32.c Mon Jan 9 11:22:17 2006 @@ -2586,7 +2586,7 @@ } /* Other VCPUs mustn't use the revoked writable mappings. */ - other_vcpus_mask = d->cpumask; + other_vcpus_mask = d->domain_dirty_cpumask; cpu_clear(smp_processor_id(), other_vcpus_mask); flush_tlb_mask(other_vcpus_mask); diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/smpboot.c --- a/xen/arch/x86/smpboot.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/smpboot.c Mon Jan 9 11:22:17 2006 @@ -435,7 +435,7 @@ extern void percpu_traps_init(void); - set_current(idle_task[cpu]); + set_current(idle_domain[cpu]); set_processor_id(cpu); percpu_traps_init(); @@ -763,7 +763,6 @@ { struct domain *idle; struct vcpu *v; - void *stack; unsigned long boot_error; int timeout, cpu; unsigned long start_eip; @@ -774,7 +773,7 @@ if ( (idle = do_createdomain(IDLE_DOMAIN_ID, cpu)) == NULL ) panic("failed 'createdomain' for CPU %d", cpu); - v = idle_task[cpu] = idle->vcpu[0]; + v = idle_domain[cpu] = idle->vcpu[0]; set_bit(_DOMF_idle_domain, &idle->domain_flags); @@ -786,16 +785,10 @@ /* So we see what's up */ printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); - stack = alloc_xenheap_pages(STACK_ORDER); -#if defined(__i386__) - stack_start.esp = (void *)__pa(stack); -#elif defined(__x86_64__) - stack_start.esp = stack; -#endif - stack_start.esp += STACK_SIZE - sizeof(struct cpu_info); + stack_start.esp = alloc_xenheap_pages(STACK_ORDER); /* Debug build: detect stack overflow by setting up a guard page. */ - memguard_guard_stack(stack); + memguard_guard_stack(stack_start.esp); /* * This grunge runs the startup process for diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/traps.c Mon Jan 9 11:22:17 2006 @@ -41,6 +41,7 @@ #include <xen/softirq.h> #include <xen/domain_page.h> #include <xen/symbols.h> +#include <xen/iocap.h> #include <asm/shadow.h> #include <asm/system.h> #include <asm/io.h> @@ -192,7 +193,8 @@ /* Bounds for range of valid frame pointer. */ low = (unsigned long)(ESP_BEFORE_EXCEPTION(regs) - 2); - high = (low & ~(STACK_SIZE - 1)) + (STACK_SIZE - sizeof(struct cpu_info)); + high = (low & ~(STACK_SIZE - 1)) + + (STACK_SIZE - sizeof(struct cpu_info) - 2*sizeof(unsigned long)); /* The initial frame pointer. */ next = regs->ebp; @@ -200,14 +202,14 @@ for ( ; ; ) { /* Valid frame pointer? */ - if ( (next < low) || (next > high) ) + if ( (next < low) || (next >= high) ) { /* * Exception stack frames have a different layout, denoted by an * inverted frame pointer. 
*/ next = ~next; - if ( (next < low) || (next > high) ) + if ( (next < low) || (next >= high) ) break; frame = (unsigned long *)next; next = frame[0]; @@ -621,17 +623,7 @@ unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { - struct domain *d = v->domain; - u16 x; - - if ( d->arch.iobmp_mask != NULL ) - { - x = *(u16 *)(d->arch.iobmp_mask + (port >> 3)); - if ( (x & (((1<<bytes)-1) << (port&7))) == 0 ) - return 1; - } - - return 0; + return ioports_access_permitted(v->domain, port, port + bytes - 1); } /* Check admin limits. Silently fail the access if it is disallowed. */ @@ -871,7 +863,7 @@ case 0x09: /* WBINVD */ /* Ignore the instruction if unprivileged. */ - if ( !IS_CAPABLE_PHYSDEV(v->domain) ) + if ( !cache_flush_permitted(v->domain) ) DPRINTK("Non-physdev domain attempted WBINVD.\n"); else wbinvd(); @@ -885,7 +877,8 @@ switch ( modrm_reg ) { case 0: /* Read CR0 */ - *reg = v->arch.guest_context.ctrlreg[0]; + *reg = (read_cr0() & ~X86_CR0_TS) | + v->arch.guest_context.ctrlreg[0]; break; case 2: /* Read CR2 */ @@ -927,6 +920,11 @@ switch ( modrm_reg ) { case 0: /* Write CR0 */ + if ( (*reg ^ read_cr0()) & ~X86_CR0_TS ) + { + DPRINTK("Attempt to change unmodifiable CR0 flags.\n"); + goto fail; + } (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS)); break; @@ -939,6 +937,14 @@ LOCK_BIGLOCK(v->domain); (void)new_guest_cr3(*reg); UNLOCK_BIGLOCK(v->domain); + break; + + case 4: + if ( *reg != (read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE)) ) + { + DPRINTK("Attempt to change CR4 flags.\n"); + goto fail; + } break; default: diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/vmx.c --- a/xen/arch/x86/vmx.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/vmx.c Mon Jan 9 11:22:17 2006 @@ -42,7 +42,7 @@ #include <asm/shadow_64.h> #endif #include <public/sched.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <asm/vmx_vpic.h> #include <asm/vmx_vlapic.h> @@ -53,7 +53,7 @@ integer_param("vmx_debug", opt_vmx_debug_level); static unsigned long trace_values[NR_CPUS][4]; -#define TRACE_VMEXIT(index,value) trace_values[current->processor][index]=value +#define TRACE_VMEXIT(index,value) trace_values[smp_processor_id()][index]=value static int vmx_switch_on; @@ -65,11 +65,6 @@ { struct domain *d = v->domain; struct vcpu *vc; - - d->arch.vmx_platform.lapic_enable = v->arch.guest_context.user_regs.ecx; - v->arch.guest_context.user_regs.ecx = 0; - VMX_DBG_LOG(DBG_LEVEL_VLAPIC, "lapic enable is %d.\n", - d->arch.vmx_platform.lapic_enable); /* Initialize monitor page table */ for_each_vcpu(d, vc) @@ -95,7 +90,7 @@ void vmx_relinquish_resources(struct vcpu *v) { struct vmx_virpit *vpit; - + if ( !VMX_DOMAIN(v) ) return; @@ -1955,9 +1950,12 @@ asmlinkage void trace_vmentry (void) { - TRACE_5D(TRC_VMENTRY,trace_values[current->processor][0], - trace_values[current->processor][1],trace_values[current->processor][2], - trace_values[current->processor][3],trace_values[current->processor][4]); + TRACE_5D(TRC_VMENTRY, + trace_values[smp_processor_id()][0], + trace_values[smp_processor_id()][1], + trace_values[smp_processor_id()][2], + trace_values[smp_processor_id()][3], + trace_values[smp_processor_id()][4]); TRACE_VMEXIT(0,9); TRACE_VMEXIT(1,9); TRACE_VMEXIT(2,9); diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/vmx_intercept.c --- a/xen/arch/x86/vmx_intercept.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/vmx_intercept.c Mon Jan 9 11:22:17 2006 @@ -24,7 +24,7 @@ #include <asm/vmx_vpit.h> #include <asm/vmx_intercept.h> #include <asm/vmx_vlapic.h> -#include <public/io/ioreq.h> 
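The traps.c hunks above tighten the privileged-operation emulator: a guest CR0 write may change nothing but the TS bit, and a CR4 write must match the live value with the Xen-owned PGE/PSE bits masked off. The CR0 check is a one-line XOR-and-mask test; a standalone rendering of just that predicate (X86_CR0_TS is the architectural bit 3):

    #include <stdio.h>

    #define X86_CR0_TS 0x00000008UL

    /* Mirrors the new guard: a CR0 write is rejected if any bit
     * other than TS would change. */
    static int cr0_write_ok(unsigned long cur, unsigned long new_val)
    {
        return ((new_val ^ cur) & ~X86_CR0_TS) == 0;
    }

    int main(void)
    {
        unsigned long cr0 = 0x8005003bUL;   /* a typical live value */
        printf("toggle TS: %d\n", cr0_write_ok(cr0, cr0 ^ X86_CR0_TS));
        printf("clear PG:  %d\n", cr0_write_ok(cr0, cr0 & ~0x80000000UL));
        return 0;
    }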
+#include <public/hvm/ioreq.h> #include <xen/lib.h> #include <xen/sched.h> #include <asm/current.h> diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/vmx_io.c --- a/xen/arch/x86/vmx_io.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/vmx_io.c Mon Jan 9 11:22:17 2006 @@ -37,7 +37,7 @@ #include <asm/shadow.h> #include <asm/vmx_vpic.h> #include <asm/vmx_vlapic.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #ifdef CONFIG_VMX #if defined (__i386__) diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/vmx_platform.c --- a/xen/arch/x86/vmx_platform.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/vmx_platform.c Mon Jan 9 11:22:17 2006 @@ -27,7 +27,7 @@ #include <xen/trace.h> #include <asm/vmx.h> #include <asm/vmx_platform.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #include <xen/lib.h> #include <xen/sched.h> diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/vmx_vlapic.c --- a/xen/arch/x86/vmx_vlapic.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/vmx_vlapic.c Mon Jan 9 11:22:17 2006 @@ -32,7 +32,7 @@ #include <xen/lib.h> #include <xen/sched.h> #include <asm/current.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #ifdef CONFIG_VMX @@ -62,7 +62,7 @@ int vmx_apic_support(struct domain *d) { - return d->arch.vmx_platform.lapic_enable; + return d->arch.vmx_platform.apic_enabled; } s_time_t get_apictime_scheduled(struct vcpu *v) diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/vmx_vmcs.c --- a/xen/arch/x86/vmx_vmcs.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/vmx_vmcs.c Mon Jan 9 11:22:17 2006 @@ -32,7 +32,7 @@ #include <asm/flushtlb.h> #include <xen/event.h> #include <xen/kernel.h> -#include <public/io/ioreq.h> +#include <public/hvm/hvm_info_table.h> #if CONFIG_PAGING_LEVELS >= 4 #include <asm/shadow_64.h> #endif @@ -206,35 +206,55 @@ &d->shared_info->evtchn_mask[0]); } -#define VCPU_NR_PAGE 0x0009F000 -#define VCPU_NR_OFFSET 0x00000800 -#define VCPU_MAGIC 0x76637075 /* "vcpu" */ - -static void vmx_set_vcpu_nr(struct domain *d) +static int validate_hvm_info(struct hvm_info_table *t) +{ + char signature[] = "HVM INFO"; + uint8_t *ptr = (uint8_t *)t; + uint8_t sum = 0; + int i; + + /* strncmp(t->signature, "HVM INFO", 8) */ + for ( i = 0; i < 8; i++ ) { + if ( signature[i] != t->signature[i] ) { + printk("Bad hvm info signature\n"); + return 0; + } + } + + for ( i = 0; i < t->length; i++ ) + sum += ptr[i]; + + return (sum == 0); +} + +static void vmx_get_hvm_info(struct domain *d) { unsigned char *p; unsigned long mpfn; - unsigned int *vcpus; - - mpfn = get_mfn_from_pfn(VCPU_NR_PAGE >> PAGE_SHIFT); - if (mpfn == INVALID_MFN) { - printk("Can not get vcpu number page mfn for VMX domain.\n"); + struct hvm_info_table *t; + + mpfn = get_mfn_from_pfn(HVM_INFO_PFN); + if ( mpfn == INVALID_MFN ) { + printk("Can not get hvm info page mfn for VMX domain.\n"); domain_crash_synchronous(); } p = map_domain_page(mpfn); - if (p == NULL) { - printk("Can not map vcpu number page for VMX domain.\n"); - domain_crash_synchronous(); - } - - vcpus = (unsigned int *)(p + VCPU_NR_OFFSET); - if (vcpus[0] != VCPU_MAGIC) { - printk("Bad vcpus magic, set vcpu number to 1 by default.\n"); - d->arch.vmx_platform.nr_vcpu = 1; - } - - d->arch.vmx_platform.nr_vcpu = vcpus[1]; + if ( p == NULL ) { + printk("Can not map hvm info page for VMX domain.\n"); + domain_crash_synchronous(); + } + + t = (struct hvm_info_table *)(p + HVM_INFO_OFFSET); + + if ( validate_hvm_info(t) ) { + d->arch.vmx_platform.nr_vcpus = t->nr_vcpus; + d->arch.vmx_platform.apic_enabled = t->apic_enabled; + } else { + 
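validate_hvm_info() below accepts the table only when the byte-wise sum over its first length bytes is zero, so whichever component builds the table must store the two's complement of the running sum in a checksum byte. A self-contained sketch of both sides; the struct here is an abbreviated stand-in, not the real hvm_info_table layout from public/hvm/hvm_info_table.h:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Abbreviated stand-in for struct hvm_info_table. */
    struct hvm_info {
        char     signature[8];    /* "HVM INFO" */
        uint32_t length;
        uint8_t  checksum;
        uint8_t  apic_enabled;
        uint32_t nr_vcpus;
    };

    static uint8_t byte_sum(const void *p, uint32_t len)
    {
        const uint8_t *b = p;
        uint8_t sum = 0;
        while (len--)
            sum += *b++;
        return sum;
    }

    int main(void)
    {
        struct hvm_info t;
        memset(&t, 0, sizeof(t));
        memcpy(t.signature, "HVM INFO", 8);
        t.length = sizeof(t);
        t.nr_vcpus = 2;
        t.apic_enabled = 1;

        /* Builder side: make the whole-table byte sum come out to zero. */
        t.checksum = -byte_sum(&t, t.length);

        /* Consumer side: the same test validate_hvm_info() applies. */
        printf("valid: %d\n", byte_sum(&t, t.length) == 0);
        return 0;
    }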
printk("Bad hvm info table\n"); + d->arch.vmx_platform.nr_vcpus = 1; + d->arch.vmx_platform.apic_enabled = 0; + } unmap_domain_page(p); } @@ -244,10 +264,10 @@ struct vmx_platform *platform; vmx_map_io_shared_page(d); - vmx_set_vcpu_nr(d); + vmx_get_hvm_info(d); platform = &d->arch.vmx_platform; - pic_init(&platform->vmx_pic, pic_irq_request, + pic_init(&platform->vmx_pic, pic_irq_request, &platform->interrupt_request); register_pic_io_hook(); @@ -335,6 +355,8 @@ __vmwrite(HOST_RSP, (unsigned long)get_stack_bottom()); v->arch.schedule_tail = arch_vmx_do_resume; + v->arch.arch_vmx.launch_cpu = smp_processor_id(); + /* init guest tsc to start from 0 */ rdtscll(host_tsc); v->arch.arch_vmx.tsc_offset = 0 - host_tsc; @@ -617,11 +639,21 @@ void arch_vmx_do_resume(struct vcpu *v) { - u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs); - - load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr); - vmx_do_resume(v); - reset_stack_and_jump(vmx_asm_do_resume); + if ( v->arch.arch_vmx.launch_cpu == smp_processor_id() ) + { + load_vmcs(&v->arch.arch_vmx, virt_to_phys(v->arch.arch_vmx.vmcs)); + vmx_do_resume(v); + reset_stack_and_jump(vmx_asm_do_resume); + } + else + { + __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs)); + load_vmcs(&v->arch.arch_vmx, virt_to_phys(v->arch.arch_vmx.vmcs)); + vmx_do_resume(v); + vmx_set_host_env(v); + v->arch.arch_vmx.launch_cpu = smp_processor_id(); + reset_stack_and_jump(vmx_asm_do_relaunch); + } } void arch_vmx_do_launch(struct vcpu *v) @@ -641,18 +673,6 @@ } vmx_do_launch(v); reset_stack_and_jump(vmx_asm_do_launch); -} - -void arch_vmx_do_relaunch(struct vcpu *v) -{ - u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs); - - load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr); - vmx_do_resume(v); - vmx_set_host_env(v); - v->arch.schedule_tail = arch_vmx_do_resume; - - reset_stack_and_jump(vmx_asm_do_relaunch); } #endif /* CONFIG_VMX */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/x86_emulate.c --- a/xen/arch/x86/x86_emulate.c Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/x86_emulate.c Mon Jan 9 11:22:17 2006 @@ -371,6 +371,21 @@ (_type)_x; \ }) +/* Access/update address held in a register, based on addressing mode. */ +#define register_address(sel, reg) \ + ((ad_bytes == sizeof(unsigned long)) ? (reg) : \ + ((mode == X86EMUL_MODE_REAL) ? /* implies ad_bytes == 2 */ \ + (((unsigned long)(sel) << 4) + ((reg) & 0xffff)) : \ + ((reg) & ((1UL << (ad_bytes << 3)) - 1)))) +#define register_address_increment(reg, inc) \ +do { \ + if ( ad_bytes == sizeof(unsigned long) ) \ + (reg) += (inc); \ + else \ + (reg) = ((reg) & ~((1UL << (ad_bytes << 3)) - 1)) | \ + (((reg) + (inc)) & ((1UL << (ad_bytes << 3)) - 1)); \ +} while (0) + void * decode_register( uint8_t modrm_reg, struct cpu_user_regs *regs, int highbyte_regs) @@ -420,32 +435,64 @@ { uint8_t b, d, sib, twobyte = 0, rex_prefix = 0; uint8_t modrm, modrm_mod = 0, modrm_reg = 0, modrm_rm = 0; - unsigned int op_bytes = (mode == 8) ? 4 : mode, ad_bytes = mode; - unsigned int lock_prefix = 0, rep_prefix = 0, i; + uint16_t *seg = NULL; /* override segment */ + unsigned int op_bytes, ad_bytes, lock_prefix = 0, rep_prefix = 0, i; int rc = 0; struct operand src, dst; /* Shadow copy of register state. Committed on successful emulation. 
*/ struct cpu_user_regs _regs = *regs; + switch ( mode ) + { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_PROT16: + op_bytes = ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + op_bytes = ad_bytes = 4; + break; +#ifdef __x86_64__ + case X86EMUL_MODE_PROT64: + op_bytes = 4; + ad_bytes = 8; + break; +#endif + default: + return -1; + } + /* Legacy prefixes. */ for ( i = 0; i < 8; i++ ) { switch ( b = insn_fetch(uint8_t, 1, _regs.eip) ) { case 0x66: /* operand-size override */ - op_bytes ^= 6; /* switch between 2/4 bytes */ + op_bytes ^= 6; /* switch between 2/4 bytes */ break; case 0x67: /* address-size override */ - ad_bytes ^= (mode == 8) ? 12 : 6; /* switch between 2/4/8 bytes */ + if ( mode == X86EMUL_MODE_PROT64 ) + ad_bytes ^= 12; /* switch between 4/8 bytes */ + else + ad_bytes ^= 6; /* switch between 2/4 bytes */ break; case 0x2e: /* CS override */ + seg = &_regs.cs; + break; case 0x3e: /* DS override */ + seg = &_regs.ds; + break; case 0x26: /* ES override */ + seg = &_regs.es; + break; case 0x64: /* FS override */ + seg = &_regs.fs; + break; case 0x65: /* GS override */ + seg = &_regs.gs; + break; case 0x36: /* SS override */ - DPRINTF("Warning: ignoring a segment override.\n"); + seg = &_regs.ss; break; case 0xf0: /* LOCK */ lock_prefix = 1; @@ -461,8 +508,12 @@ } done_prefixes: + /* Not quite the same as 80386 real mode, but hopefully good enough. */ + if ( (mode == X86EMUL_MODE_REAL) && (ad_bytes != 2) ) + goto cannot_emulate; + /* REX prefix. */ - if ( (mode == 8) && ((b & 0xf0) == 0x40) ) + if ( (mode == X86EMUL_MODE_PROT64) && ((b & 0xf0) == 0x40) ) { rex_prefix = b; if ( b & 8 ) @@ -674,7 +725,7 @@ emulate_2op_SrcV("cmp", src, dst, _regs.eflags); break; case 0x63: /* movsxd */ - if ( mode != 8 ) /* x86/64 long mode only */ + if ( mode != X86EMUL_MODE_PROT64 ) goto cannot_emulate; dst.val = (int32_t)src.val; break; @@ -721,12 +772,13 @@ dst.val = src.val; break; case 0x8f: /* pop (sole member of Grp1a) */ - /* 64-bit mode: POP defaults to 64-bit operands. */ - if ( (mode == 8) && (dst.bytes == 4) ) + /* 64-bit mode: POP always pops a 64-bit operand. */ + if ( mode == X86EMUL_MODE_PROT64 ) dst.bytes = 8; - if ( (rc = ops->read_std(_regs.esp, &dst.val, dst.bytes)) != 0 ) + if ( (rc = ops->read_std(register_address(_regs.ss, _regs.esp), + &dst.val, dst.bytes)) != 0 ) goto done; - _regs.esp += dst.bytes; + register_address_increment(_regs.esp, dst.bytes); break; case 0xc0 ... 0xc1: grp2: /* Grp2 */ switch ( modrm_reg ) @@ -797,16 +849,17 @@ emulate_1op("dec", dst, _regs.eflags); break; case 6: /* push */ - /* 64-bit mode: PUSH defaults to 64-bit operands. */ - if ( (mode == 8) && (dst.bytes == 4) ) + /* 64-bit mode: PUSH always pushes a 64-bit operand. */ + if ( mode == X86EMUL_MODE_PROT64 ) { dst.bytes = 8; if ( (rc = ops->read_std((unsigned long)dst.ptr, &dst.val, 8)) != 0 ) goto done; } - _regs.esp -= dst.bytes; - if ( (rc = ops->write_std(_regs.esp, dst.val, dst.bytes)) != 0 ) + register_address_increment(_regs.esp, -dst.bytes); + if ( (rc = ops->write_std(register_address(_regs.ss, _regs.esp), + dst.val, dst.bytes)) != 0 ) goto done; dst.val = dst.orig_val; /* skanky: disable writeback */ break; @@ -873,19 +926,22 @@ { /* Write fault: destination is special memory. */ dst.ptr = (unsigned long *)cr2; - if ( (rc = ops->read_std(_regs.esi - _regs.edi + cr2, + if ( (rc = ops->read_std(register_address(seg ? *seg : _regs.ds, + _regs.esi), &dst.val, dst.bytes)) != 0 ) goto done; } else { /* Read fault: source is special memory.
*/ - dst.ptr = (unsigned long *)(_regs.edi - _regs.esi + cr2); + dst.ptr = (unsigned long *)register_address(_regs.es, _regs.edi); if ( (rc = ops->read_emulated(cr2, &dst.val, dst.bytes)) != 0 ) goto done; } - _regs.esi += (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes; - _regs.edi += (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes; + register_address_increment( + _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); + register_address_increment( + _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); break; case 0xa6 ... 0xa7: /* cmps */ DPRINTF("Urk! I don't handle CMPS.\n"); @@ -895,7 +951,8 @@ dst.bytes = (d & ByteOp) ? 1 : op_bytes; dst.ptr = (unsigned long *)cr2; dst.val = _regs.eax; - _regs.edi += (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes; + register_address_increment( + _regs.edi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); break; case 0xac ... 0xad: /* lods */ dst.type = OP_REG; @@ -903,7 +960,8 @@ dst.ptr = (unsigned long *)&_regs.eax; if ( (rc = ops->read_emulated(cr2, &dst.val, dst.bytes)) != 0 ) goto done; - _regs.esi += (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes; + register_address_increment( + _regs.esi, (_regs.eflags & EFLG_DF) ? -dst.bytes : dst.bytes); break; case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/bitmap.c --- a/xen/common/bitmap.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/bitmap.c Mon Jan 9 11:22:17 2006 @@ -282,6 +282,111 @@ #endif EXPORT_SYMBOL(__bitmap_weight); +/* + * Bitmap printing & parsing functions: first version by Bill Irwin, + * second version by Paul Jackson, third by Joe Korty. + */ + +#define CHUNKSZ 32 +#define nbits_to_hold_value(val) fls(val) +#define roundup_power2(val,modulus) (((val) + (modulus) - 1) & ~((modulus) - 1)) +#define unhex(c) (isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10)) +#define BASEDEC 10 /* fancier cpuset lists input in decimal */ + +/** + * bitmap_scnprintf - convert bitmap to an ASCII hex string. + * @buf: byte buffer into which string is placed + * @buflen: reserved size of @buf, in bytes + * @maskp: pointer to bitmap to convert + * @nmaskbits: size of bitmap, in bits + * + * Exactly @nmaskbits bits are displayed. Hex digits are grouped into + * comma-separated sets of eight digits per set. + */ +int bitmap_scnprintf(char *buf, unsigned int buflen, + const unsigned long *maskp, int nmaskbits) +{ + int i, word, bit, len = 0; + unsigned long val; + const char *sep = ""; + int chunksz; + u32 chunkmask; + + chunksz = nmaskbits & (CHUNKSZ - 1); + if (chunksz == 0) + chunksz = CHUNKSZ; + + i = roundup_power2(nmaskbits, CHUNKSZ) - CHUNKSZ; + for (; i >= 0; i -= CHUNKSZ) { + chunkmask = ((1ULL << chunksz) - 1); + word = i / BITS_PER_LONG; + bit = i % BITS_PER_LONG; + val = (maskp[word] >> bit) & chunkmask; + len += scnprintf(buf+len, buflen-len, "%s%0*lx", sep, + (chunksz+3)/4, val); + chunksz = CHUNKSZ; + sep = ","; + } + return len; +} +EXPORT_SYMBOL(bitmap_scnprintf); + +/* + * bscnl_emit(buf, buflen, rbot, rtop, bp) + * + * Helper routine for bitmap_scnlistprintf(). Write decimal number + * or range to buf, suppressing output past buf+buflen, with optional + * comma-prefix. Return len of what would be written to buf, if it + * all fit. 
+ */ +static inline int bscnl_emit(char *buf, int buflen, int rbot, int rtop, int len) +{ + if (len > 0) + len += scnprintf(buf + len, buflen - len, ","); + if (rbot == rtop) + len += scnprintf(buf + len, buflen - len, "%d", rbot); + else + len += scnprintf(buf + len, buflen - len, "%d-%d", rbot, rtop); + return len; +} + +/** + * bitmap_scnlistprintf - convert bitmap to list format ASCII string + * @buf: byte buffer into which string is placed + * @buflen: reserved size of @buf, in bytes + * @maskp: pointer to bitmap to convert + * @nmaskbits: size of bitmap, in bits + * + * Output format is a comma-separated list of decimal numbers and + * ranges. Consecutively set bits are shown as two hyphen-separated + * decimal numbers, the smallest and largest bit numbers set in + * the range. Output format is compatible with the format + * accepted as input by bitmap_parselist(). + * + * The return value is the number of characters which would be + * generated for the given input, excluding the trailing '\0', as + * per ISO C99. + */ +int bitmap_scnlistprintf(char *buf, unsigned int buflen, + const unsigned long *maskp, int nmaskbits) +{ + int len = 0; + /* current bit is 'cur', most recently seen range is [rbot, rtop] */ + int cur, rbot, rtop; + + rbot = cur = find_first_bit(maskp, nmaskbits); + while (cur < nmaskbits) { + rtop = cur; + cur = find_next_bit(maskp, nmaskbits, cur+1); + if (cur >= nmaskbits || cur > rtop + 1) { + len = bscnl_emit(buf, buflen, rbot, rtop, len); + rbot = cur; + } + } + return len; +} +EXPORT_SYMBOL(bitmap_scnlistprintf); + /** * bitmap_find_free_region - find a contiguous aligned mem region * @bitmap: an array of unsigned longs corresponding to the bitmap diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/dom0_ops.c --- a/xen/common/dom0_ops.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/dom0_ops.c Mon Jan 9 11:22:17 2006 @@ -16,6 +16,7 @@ #include <xen/domain_page.h> #include <xen/trace.h> #include <xen/console.h> +#include <xen/iocap.h> #include <asm/current.h> #include <public/dom0_ops.h> #include <public/sched_ctl.h> @@ -109,13 +110,13 @@ switch ( op->cmd ) { - case DOM0_SETDOMAININFO: - { - struct domain *d = find_domain_by_id(op->u.setdomaininfo.domain); + case DOM0_SETVCPUCONTEXT: + { + struct domain *d = find_domain_by_id(op->u.setvcpucontext.domain); ret = -ESRCH; if ( d != NULL ) { - ret = set_info_guest(d, &op->u.setdomaininfo); + ret = set_info_guest(d, &op->u.setvcpucontext); put_domain(d); } } @@ -283,11 +284,12 @@ } break; - case DOM0_PINCPUDOMAIN: - { - domid_t dom = op->u.pincpudomain.domain; + case DOM0_SETVCPUAFFINITY: + { + domid_t dom = op->u.setvcpuaffinity.domain; struct domain *d = find_domain_by_id(dom); struct vcpu *v; + cpumask_t new_affinity; if ( d == NULL ) { @@ -295,15 +297,15 @@ break; } - if ( (op->u.pincpudomain.vcpu >= MAX_VIRT_CPUS) || - !d->vcpu[op->u.pincpudomain.vcpu] ) + if ( (op->u.setvcpuaffinity.vcpu >= MAX_VIRT_CPUS) || + !d->vcpu[op->u.setvcpuaffinity.vcpu] ) { ret = -EINVAL; put_domain(d); break; } - v = d->vcpu[op->u.pincpudomain.vcpu]; + v = d->vcpu[op->u.setvcpuaffinity.vcpu]; if ( v == NULL ) { ret = -ESRCH; @@ -318,22 +320,13 @@ break; } - v->cpumap = op->u.pincpudomain.cpumap; - - if ( v->cpumap == CPUMAP_RUNANYWHERE ) - { - clear_bit(_VCPUF_cpu_pinned, &v->vcpu_flags); - } - else - { - /* pick a new cpu from the usable map */ - int new_cpu; - new_cpu = (int)find_first_set_bit(v->cpumap) % num_online_cpus(); - vcpu_pause(v); - vcpu_migrate_cpu(v, new_cpu); - set_bit(_VCPUF_cpu_pinned, &v->vcpu_flags); - vcpu_unpause(v); - } 
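bitmap_scnlistprintf() above emits the cpuset-style list form, e.g. "0,3-5,8", which the keyhandler changes at the end of this patch consume via cpulist_scnprintf() when printing dirty-CPU sets. The rbot/rtop coalescing walk is the subtle part; here is the same walk over a single word, standalone:

    #include <stdio.h>

    /* Same coalescing walk as bitmap_scnlistprintf(), over one word. */
    static void print_bit_list(unsigned long mask, int nbits)
    {
        int cur = 0, rbot, rtop;
        const char *sep = "";

        while (cur < nbits) {
            if (!(mask & (1UL << cur))) { cur++; continue; }
            rbot = cur;
            while (cur + 1 < nbits && (mask & (1UL << (cur + 1))))
                cur++;
            rtop = cur;
            if (rbot == rtop)
                printf("%s%d", sep, rbot);
            else
                printf("%s%d-%d", sep, rbot, rtop);
            sep = ",";
            cur++;
        }
        printf("\n");
    }

    int main(void)
    {
        print_bit_list(0x139, 16);   /* bits 0,3,4,5,8 -> "0,3-5,8" */
        return 0;
    }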
+ new_affinity = v->cpu_affinity; + memcpy(cpus_addr(new_affinity), + &op->u.setvcpuaffinity.cpumap, + min((int)BITS_TO_LONGS(NR_CPUS), + (int)sizeof(op->u.setvcpuaffinity.cpumap))); + + ret = vcpu_set_affinity(v, &new_affinity); put_domain(d); } @@ -505,7 +498,11 @@ op->u.getvcpuinfo.running = test_bit(_VCPUF_running, &v->vcpu_flags); op->u.getvcpuinfo.cpu_time = v->cpu_time; op->u.getvcpuinfo.cpu = v->processor; - op->u.getvcpuinfo.cpumap = v->cpumap; + op->u.getvcpuinfo.cpumap = 0; + memcpy(&op->u.getvcpuinfo.cpumap, + cpus_addr(v->cpu_affinity), + min((int)BITS_TO_LONGS(NR_CPUS), + (int)sizeof(op->u.getvcpuinfo.cpumap))); ret = 0; if ( copy_to_user(u_dom0_op, op, sizeof(*op)) ) @@ -582,6 +579,7 @@ } } break; + case DOM0_SETDEBUGGING: { struct domain *d; @@ -596,6 +594,53 @@ put_domain(d); ret = 0; } + } + break; + + case DOM0_IRQ_PERMISSION: + { + struct domain *d; + unsigned int pirq = op->u.irq_permission.pirq; + + ret = -EINVAL; + if ( pirq >= NR_PIRQS ) + break; + + ret = -ESRCH; + d = find_domain_by_id(op->u.irq_permission.domain); + if ( d == NULL ) + break; + + if ( op->u.irq_permission.allow_access ) + ret = irq_permit_access(d, pirq); + else + ret = irq_deny_access(d, pirq); + + put_domain(d); + } + break; + + case DOM0_IOMEM_PERMISSION: + { + struct domain *d; + unsigned long pfn = op->u.iomem_permission.first_pfn; + unsigned long nr_pfns = op->u.iomem_permission.nr_pfns; + + ret = -EINVAL; + if ( (pfn + nr_pfns - 1) < pfn ) /* wrap? */ + break; + + ret = -ESRCH; + d = find_domain_by_id(op->u.iomem_permission.domain); + if ( d == NULL ) + break; + + if ( op->u.iomem_permission.allow_access ) + ret = iomem_permit_access(d, pfn, pfn + nr_pfns - 1); + else + ret = iomem_deny_access(d, pfn, pfn + nr_pfns - 1); + + put_domain(d); } break; diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/domain.c --- a/xen/common/domain.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/domain.c Mon Jan 9 11:22:17 2006 @@ -16,6 +16,7 @@ #include <xen/console.h> #include <xen/softirq.h> #include <xen/domain_page.h> +#include <xen/rangeset.h> #include <asm/debugger.h> #include <public/dom0_ops.h> #include <public/sched.h> @@ -50,25 +51,24 @@ else set_bit(_DOMF_ctrl_pause, &d->domain_flags); - if ( !is_idle_task(d) && + if ( !is_idle_domain(d) && ((evtchn_init(d) != 0) || (grant_table_create(d) != 0)) ) - { - evtchn_destroy(d); - free_domain(d); - return NULL; - } + goto fail1; if ( (v = alloc_vcpu(d, 0, cpu)) == NULL ) - { - grant_table_destroy(d); - evtchn_destroy(d); - free_domain(d); - return NULL; - } - - arch_do_createdomain(v); - - if ( !is_idle_task(d) ) + goto fail2; + + rangeset_domain_initialise(d); + + d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex); + d->irq_caps = rangeset_new(d, "Interrupts", 0); + + if ( (d->iomem_caps == NULL) || + (d->irq_caps == NULL) || + (arch_do_createdomain(v) != 0) ) + goto fail3; + + if ( !is_idle_domain(d) ) { write_lock(&domlist_lock); pd = &domain_list; /* NB. domain_list maintained in order of dom_id. */ @@ -83,6 +83,15 @@ } return d; + + fail3: + rangeset_domain_destroy(d); + fail2: + grant_table_destroy(d); + fail1: + evtchn_destroy(d); + free_domain(d); + return NULL; } @@ -164,20 +173,23 @@ BUG_ON(d == NULL); BUG_ON(d == current->domain); - BUG_ON(!test_bit(_DOMF_shuttingdown, &d->domain_flags)); - BUG_ON(test_bit(_DOMF_shutdown, &d->domain_flags)); + + LOCK_BIGLOCK(d); /* Make sure that every vcpu is descheduled before we finalise. 
*/ for_each_vcpu ( d, v ) vcpu_sleep_sync(v); - BUG_ON(!cpus_empty(d->cpumask)); + BUG_ON(!cpus_empty(d->domain_dirty_cpumask)); sync_pagetable_state(d); - set_bit(_DOMF_shutdown, &d->domain_flags); - clear_bit(_DOMF_shuttingdown, &d->domain_flags); - - send_guest_virq(dom0->vcpu[0], VIRQ_DOM_EXC); + /* Don't set DOMF_shutdown until execution contexts are sync'ed. */ + if ( !test_and_set_bit(_DOMF_shutdown, &d->domain_flags) ) + send_guest_virq(dom0->vcpu[0], VIRQ_DOM_EXC); + + UNLOCK_BIGLOCK(d); + + put_domain(d); } static __init int domain_shutdown_finaliser_init(void) @@ -213,16 +225,17 @@ /* Mark the domain as shutting down. */ d->shutdown_code = reason; - if ( !test_and_set_bit(_DOMF_shuttingdown, &d->domain_flags) ) - { - /* This vcpu won the race to finalise the shutdown. */ - domain_shuttingdown[smp_processor_id()] = d; - raise_softirq(DOMAIN_SHUTDOWN_FINALISE_SOFTIRQ); - } /* Put every vcpu to sleep, but don't wait (avoids inter-vcpu deadlock). */ for_each_vcpu ( d, v ) + { + atomic_inc(&v->pausecnt); vcpu_sleep_nosync(v); + } + + get_knownalive_domain(d); + domain_shuttingdown[smp_processor_id()] = d; + raise_softirq(DOMAIN_SHUTDOWN_FINALISE_SOFTIRQ); } @@ -271,6 +284,8 @@ *pd = d->next_in_hashbucket; write_unlock(&domlist_lock); + rangeset_domain_destroy(d); + evtchn_destroy(d); grant_table_destroy(d); @@ -346,11 +361,11 @@ * of domains other than domain 0. ie. the domains that are being built by * the userspace dom0 domain builder. */ -int set_info_guest(struct domain *d, dom0_setdomaininfo_t *setdomaininfo) +int set_info_guest(struct domain *d, dom0_setvcpucontext_t *setvcpucontext) { int rc = 0; struct vcpu_guest_context *c = NULL; - unsigned long vcpu = setdomaininfo->vcpu; + unsigned long vcpu = setvcpucontext->vcpu; struct vcpu *v; if ( (vcpu >= MAX_VIRT_CPUS) || ((v = d->vcpu[vcpu]) == NULL) ) @@ -363,7 +378,7 @@ return -ENOMEM; rc = -EFAULT; - if ( copy_from_user(c, setdomaininfo->ctxt, sizeof(*c)) == 0 ) + if ( copy_from_user(c, setvcpucontext->ctxt, sizeof(*c)) == 0 ) rc = arch_set_info_guest(v, c); xfree(c); diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/event_channel.c --- a/xen/common/event_channel.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/event_channel.c Mon Jan 9 11:22:17 2006 @@ -22,6 +22,7 @@ #include <xen/sched.h> #include <xen/event.h> #include <xen/irq.h> +#include <xen/iocap.h> #include <asm/current.h> #include <public/xen.h> @@ -241,6 +242,9 @@ if ( pirq >= ARRAY_SIZE(d->pirq_to_evtchn) ) return -EINVAL; + + if ( !irq_access_permitted(d, pirq) ) + return -EPERM; spin_lock(&d->evtchn_lock); diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/grant_table.c --- a/xen/common/grant_table.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/grant_table.c Mon Jan 9 11:22:17 2006 @@ -469,7 +469,7 @@ for ( i = 0; i < count; i++ ) (void)__gnttab_unmap_grant_ref(&uop[i]); - flush_tlb_mask(current->domain->cpumask); + flush_tlb_mask(current->domain->domain_dirty_cpumask); return 0; } diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/keyhandler.c --- a/xen/common/keyhandler.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/keyhandler.c Mon Jan 9 11:22:17 2006 @@ -11,6 +11,7 @@ #include <xen/sched.h> #include <xen/softirq.h> #include <xen/domain.h> +#include <xen/rangeset.h> #include <asm/debugger.h> #define KEY_MAX 256 @@ -96,44 +97,60 @@ machine_restart(NULL); } -static void do_task_queues(unsigned char key) +static void cpuset_print(char *set, int size, cpumask_t mask) +{ + *set++ = '{'; + set += cpulist_scnprintf(set, size-2, mask); + *set++ = '}'; + *set++ = '\0'; +} + +static void 
dump_domains(unsigned char key) { struct domain *d; struct vcpu *v; s_time_t now = NOW(); - - printk("'%c' pressed -> dumping task queues (now=0x%X:%08X)\n", key, + char cpuset[100]; + + printk("'%c' pressed -> dumping domain info (now=0x%X:%08X)\n", key, (u32)(now>>32), (u32)now); read_lock(&domlist_lock); for_each_domain ( d ) { - printk("Xen: DOM %u, flags=%lx refcnt=%d nr_pages=%d " - "xenheap_pages=%d\n", d->domain_id, d->domain_flags, - atomic_read(&d->refcnt), d->tot_pages, d->xenheap_pages); - /* The handle is printed according to the OSF DCE UUID spec., even - though it is not necessarily such a thing, for ease of use when it - _is_ one of those. */ - printk(" handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-" + printk("General information for domain %u:\n", d->domain_id); + cpuset_print(cpuset, sizeof(cpuset), d->domain_dirty_cpumask); + printk(" flags=%lx refcnt=%d nr_pages=%d xenheap_pages=%d " + "dirty_cpus=%s\n", + d->domain_flags, atomic_read(&d->refcnt), + d->tot_pages, d->xenheap_pages, cpuset); + printk(" handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-" "%02x%02x-%02x%02x%02x%02x%02x%02x\n", d->handle[ 0], d->handle[ 1], d->handle[ 2], d->handle[ 3], d->handle[ 4], d->handle[ 5], d->handle[ 6], d->handle[ 7], d->handle[ 8], d->handle[ 9], d->handle[10], d->handle[11], d->handle[12], d->handle[13], d->handle[14], d->handle[15]); + rangeset_domain_printk(d); + dump_pageframe_info(d); + printk("VCPU information and callbacks for domain %u:\n", + d->domain_id); for_each_vcpu ( d, v ) { - printk("Guest: %p CPU %d [has=%c] flags=%lx " - "upcall_pend = %02x, upcall_mask = %02x\n", v, - v->processor, + printk(" VCPU%d: CPU%d [has=%c] flags=%lx " + "upcall_pend = %02x, upcall_mask = %02x ", + v->vcpu_id, v->processor, test_bit(_VCPUF_running, &v->vcpu_flags) ? 'T':'F', v->vcpu_flags, v->vcpu_info->evtchn_upcall_pending, v->vcpu_info->evtchn_upcall_mask); - printk("Notifying guest... 
%d/%d\n", d->domain_id, v->vcpu_id); - printk("port %d/%d stat %d %d %d\n", + cpuset_print(cpuset, sizeof(cpuset), v->vcpu_dirty_cpumask); + printk("dirty_cpus=%s ", cpuset); + cpuset_print(cpuset, sizeof(cpuset), v->cpu_affinity); + printk("cpu_affinity=%s\n", cpuset); + printk(" Notifying guest (virq %d, port %d, stat %d/%d/%d)\n", VIRQ_DEBUG, v->virq_to_evtchn[VIRQ_DEBUG], test_bit(v->virq_to_evtchn[VIRQ_DEBUG], &d->shared_info->evtchn_pending[0]), @@ -191,7 +208,7 @@ register_keyhandler( 'L', reset_sched_histo, "reset sched latency histogram"); register_keyhandler( - 'q', do_task_queues, "dump task queues + guest state"); + 'q', dump_domains, "dump domain (and guest debug) info"); register_keyhandler( 'r', dump_runq, "dump run queues"); register_irq_keyhandler( diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/memory.c --- a/xen/common/memory.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/memory.c Mon Jan 9 11:22:17 2006 @@ -15,6 +15,7 @@ #include <xen/sched.h> #include <xen/event.h> #include <xen/shadow.h> +#include <xen/iocap.h> #include <asm/current.h> #include <asm/hardirq.h> #include <public/memory.h> @@ -35,7 +36,8 @@ !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) ) return 0; - if ( (extent_order != 0) && !IS_CAPABLE_PHYSDEV(current->domain) ) + if ( (extent_order != 0) && + !multipage_allocation_permitted(current->domain) ) { DPRINTK("Only I/O-capable domains may allocate multi-page extents.\n"); return 0; diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/page_alloc.c --- a/xen/common/page_alloc.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/page_alloc.c Mon Jan 9 11:22:17 2006 @@ -615,7 +615,7 @@ shadow_drop_references(d, &pg[i]); ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0); pg[i].tlbflush_timestamp = tlbflush_current_time(); - pg[i].u.free.cpumask = d->cpumask; + pg[i].u.free.cpumask = d->domain_dirty_cpumask; list_del(&pg[i].list); } diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/sched_bvt.c --- a/xen/common/sched_bvt.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/sched_bvt.c Mon Jan 9 11:22:17 2006 @@ -31,7 +31,8 @@ struct list_head run_list; /* runqueue list pointers */ u32 avt; /* actual virtual time */ u32 evt; /* effective virtual time */ - struct vcpu *vcpu; + int migrated; /* migrated to a new CPU */ + struct vcpu *vcpu; struct bvt_dom_info *inf; }; @@ -219,7 +220,7 @@ einf->vcpu = v; - if ( is_idle_task(v->domain) ) + if ( is_idle_domain(v->domain) ) { einf->avt = einf->evt = ~0U; BUG_ON(__task_on_runqueue(v)); @@ -250,9 +251,11 @@ /* Set the BVT parameters. AVT should always be updated if CPU migration ocurred.*/ - if ( einf->avt < CPU_SVT(cpu) || - unlikely(test_bit(_VCPUF_cpu_migrated, &v->vcpu_flags)) ) + if ( (einf->avt < CPU_SVT(cpu)) || einf->migrated ) + { einf->avt = CPU_SVT(cpu); + einf->migrated = 0; + } /* Deal with warping here. 
*/ einf->evt = calc_evt(v, einf->avt); @@ -265,7 +268,7 @@ ((einf->evt - curr_evt) / BVT_INFO(curr->domain)->mcu_advance) + ctx_allow; - if ( is_idle_task(curr->domain) || (einf->evt <= curr_evt) ) + if ( is_idle_domain(curr->domain) || (einf->evt <= curr_evt) ) cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ); else if ( schedule_data[cpu].s_timer.expires > r_time ) set_ac_timer(&schedule_data[cpu].s_timer, r_time); @@ -274,11 +277,27 @@ static void bvt_sleep(struct vcpu *v) { - if ( test_bit(_VCPUF_running, &v->vcpu_flags) ) + if ( schedule_data[v->processor].curr == v ) cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ); else if ( __task_on_runqueue(v) ) __del_from_runqueue(v); } + + +static int bvt_set_affinity(struct vcpu *v, cpumask_t *affinity) +{ + if ( v == current ) + return cpu_isset(v->processor, *affinity) ? 0 : -EBUSY; + + vcpu_pause(v); + v->cpu_affinity = *affinity; + v->processor = first_cpu(v->cpu_affinity); + EBVT_INFO(v)->migrated = 1; + vcpu_unpause(v); + + return 0; +} + /** * bvt_free_task - free BVT private structures for a task @@ -380,7 +399,7 @@ ASSERT(prev_einf != NULL); ASSERT(__task_on_runqueue(prev)); - if ( likely(!is_idle_task(prev->domain)) ) + if ( likely(!is_idle_domain(prev->domain)) ) { prev_einf->avt = calc_avt(prev, now); prev_einf->evt = calc_evt(prev, prev_einf->avt); @@ -390,7 +409,7 @@ __del_from_runqueue(prev); - if ( domain_runnable(prev) ) + if ( vcpu_runnable(prev) ) __add_to_runqueue_tail(prev); } @@ -471,13 +490,13 @@ } /* work out time for next run through scheduler */ - if ( is_idle_task(next->domain) ) + if ( is_idle_domain(next->domain) ) { r_time = ctx_allow; goto sched_done; } - if ( (next_prime == NULL) || is_idle_task(next_prime->domain) ) + if ( (next_prime == NULL) || is_idle_domain(next_prime->domain) ) { /* We have only one runnable task besides the idle task. 
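The new bvt_set_affinity() above follows a pause/update/unpause discipline: a remote VCPU is quiesced, its mask and home CPU are rewritten, and the migrated flag tells the next scheduling pass to resynchronise its virtual time (the current VCPU is special-cased with -EBUSY instead). A stripped-down sketch of that discipline, with a word-sized bitmask standing in for cpumask_t:

    #include <stdio.h>

    struct vcpu {
        int processor;
        unsigned long cpu_affinity;   /* one bit per CPU */
        int migrated;
        int paused;
    };

    static void vcpu_pause(struct vcpu *v)   { v->paused = 1; }
    static void vcpu_unpause(struct vcpu *v) { v->paused = 0; }

    static int first_cpu(unsigned long mask)
    {
        int cpu = 0;
        while (!(mask & (1UL << cpu)))
            cpu++;
        return cpu;
    }

    static int set_affinity(struct vcpu *v, unsigned long affinity)
    {
        if (affinity == 0)
            return -1;                      /* nothing to run on */

        vcpu_pause(v);                      /* quiesce before rewriting */
        v->cpu_affinity = affinity;
        v->processor = first_cpu(affinity); /* move to a permitted CPU */
        v->migrated = 1;                    /* resync virtual time on wake */
        vcpu_unpause(v);
        return 0;
    }

    int main(void)
    {
        struct vcpu v = { .processor = 0 };
        set_affinity(&v, 0xC);              /* CPUs 2 and 3 */
        printf("now on CPU %d\n", v.processor);
        return 0;
    }
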
*/ r_time = 10 * ctx_allow; /* RN: random constant */ @@ -557,6 +576,7 @@ .dump_cpu_state = bvt_dump_cpu_state, .sleep = bvt_sleep, .wake = bvt_wake, + .set_affinity = bvt_set_affinity }; /* diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/sched_sedf.c --- a/xen/common/sched_sedf.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/sched_sedf.c Mon Jan 9 11:22:17 2006 @@ -325,21 +325,29 @@ list_insert_sort(RUNQ(d->processor), LIST(d), runq_comp); } + /* Allocates memory for per domain private scheduling data*/ -static int sedf_alloc_task(struct vcpu *d) { - PRINT(2,"sedf_alloc_task was called, domain-id %i.%i\n",d->domain->domain_id, - d->vcpu_id); - if (d->domain->sched_priv == NULL) { - if ((d->domain->sched_priv = - xmalloc(struct sedf_dom_info)) == NULL ) +static int sedf_alloc_task(struct vcpu *d) +{ + PRINT(2, "sedf_alloc_task was called, domain-id %i.%i\n", + d->domain->domain_id, d->vcpu_id); + + if ( d->domain->sched_priv == NULL ) + { + d->domain->sched_priv = xmalloc(struct sedf_dom_info); + if ( d->domain->sched_priv == NULL ) return -1; memset(d->domain->sched_priv, 0, sizeof(struct sedf_dom_info)); } - if ((d->sched_priv = xmalloc(struct sedf_vcpu_info)) == NULL ) + + if ( (d->sched_priv = xmalloc(struct sedf_vcpu_info)) == NULL ) return -1; + memset(d->sched_priv, 0, sizeof(struct sedf_vcpu_info)); + return 0; } + /* Setup the sedf_dom_info */ static void sedf_add_task(struct vcpu *d) @@ -363,14 +371,17 @@ INIT_LIST_HEAD(EXTRAQ(d->processor,EXTRA_UTIL_Q)); } - if (d->domain->domain_id==0) { + if ( d->domain->domain_id == 0 ) + { /*set dom0 to something useful to boot the machine*/ inf->period = MILLISECS(20); inf->slice = MILLISECS(15); inf->latency = 0; inf->deadl_abs = 0; inf->status = EXTRA_AWARE | SEDF_ASLEEP; - } else { + } + else + { /*other domains run in best effort mode*/ inf->period = WEIGHT_PERIOD; inf->slice = 0; @@ -379,14 +390,18 @@ inf->status = EXTRA_AWARE | SEDF_ASLEEP; inf->extraweight = 1; } + inf->period_orig = inf->period; inf->slice_orig = inf->slice; INIT_LIST_HEAD(&(inf->list)); INIT_LIST_HEAD(&(inf->extralist[EXTRA_PEN_Q])); INIT_LIST_HEAD(&(inf->extralist[EXTRA_UTIL_Q])); - if (!is_idle_task(d->domain)) { + if ( !is_idle_domain(d->domain) ) + { extraq_check(d); - } else { + } + else + { EDOM_INFO(d)->deadl_abs = 0; EDOM_INFO(d)->status &= ~SEDF_ASLEEP; } @@ -396,19 +411,28 @@ static void sedf_free_task(struct domain *d) { int i; + PRINT(2,"sedf_free_task was called, domain-id %i\n",d->domain_id); + ASSERT(d->sched_priv != NULL); xfree(d->sched_priv); - for (i = 0; i < MAX_VIRT_CPUS; i++) - if ( d->vcpu[i] ) { + for ( i = 0; i < MAX_VIRT_CPUS; i++ ) + { + if ( d->vcpu[i] ) + { ASSERT(d->vcpu[i]->sched_priv != NULL); xfree(d->vcpu[i]->sched_priv); } -} - -/* handles the rescheduling, bookkeeping of domains running in their realtime-time :)*/ -static inline void desched_edf_dom (s_time_t now, struct vcpu* d) { + } +} + +/* + * Handles the rescheduling & bookkeeping of domains running in their + * guaranteed timeslice. + */ +static void desched_edf_dom(s_time_t now, struct vcpu* d) +{ struct sedf_vcpu_info* inf = EDOM_INFO(d); /*current domain is running in real time mode*/ @@ -418,27 +442,30 @@ /*scheduling decisions, which don't remove the running domain from the runq*/ - if ((inf->cputime < inf->slice) && sedf_runnable(d)) + if ( (inf->cputime < inf->slice) && sedf_runnable(d) ) return; __del_from_queue(d); /*manage bookkeeping (i.e. 
calculate next deadline, memorize overun-time of slice) of finished domains*/ - if (inf->cputime >= inf->slice) { + if ( inf->cputime >= inf->slice ) + { inf->cputime -= inf->slice; - if (inf->period < inf->period_orig) { + if ( inf->period < inf->period_orig ) + { /*this domain runs in latency scaling or burst mode*/ #if (UNBLOCK == UNBLOCK_BURST) /*if we are runnig in burst scaling wait for two periods before scaling periods up again*/ - if (now - inf->unblock_abs >= 2 * inf->period) + if ( (now - inf->unblock_abs) >= (2 * inf->period) ) #endif { inf->period *= 2; inf->slice *= 2; - if ((inf->period > inf->period_orig) || - (inf->slice > inf->slice_orig)) { + if ( (inf->period > inf->period_orig) || + (inf->slice > inf->slice_orig) ) + { /*reset slice & period*/ inf->period = inf->period_orig; inf->slice = inf->slice_orig; @@ -450,36 +477,46 @@ } /*add a runnable domain to the waitqueue*/ - if (sedf_runnable(d)) + if ( sedf_runnable(d) ) + { __add_to_waitqueue_sort(d); - else { + } + else + { /*we have a blocked realtime task -> remove it from exqs too*/ #if (EXTRA > EXTRA_OFF) #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (extraq_on(d, EXTRA_PEN_Q)) extraq_del(d, EXTRA_PEN_Q); -#endif - if (extraq_on(d, EXTRA_UTIL_Q)) extraq_del(d, EXTRA_UTIL_Q); -#endif - } + if ( extraq_on(d, EXTRA_PEN_Q) ) + extraq_del(d, EXTRA_PEN_Q); +#endif + if ( extraq_on(d, EXTRA_UTIL_Q) ) + extraq_del(d, EXTRA_UTIL_Q); +#endif + } + ASSERT(EQ(sedf_runnable(d), __task_on_queue(d))); ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), sedf_runnable(d))); } + /* Update all elements on the queues */ -static inline void update_queues(s_time_t now, struct list_head* runq, - struct list_head* waitq) { - struct list_head *cur,*tmp; +static void update_queues( + s_time_t now, struct list_head *runq, struct list_head *waitq) +{ + struct list_head *cur, *tmp; struct sedf_vcpu_info *curinf; PRINT(3,"Updating waitq..\n"); + /*check for the first elements of the waitqueue, whether their next period has already started*/ list_for_each_safe(cur, tmp, waitq) { curinf = list_entry(cur, struct sedf_vcpu_info, list); PRINT(4,"\tLooking @ dom %i.%i\n", curinf->vcpu->domain->domain_id, curinf->vcpu->vcpu_id); - if (PERIOD_BEGIN(curinf) <= now) { + if ( PERIOD_BEGIN(curinf) <= now ) + { __del_from_queue(curinf->vcpu); __add_to_runqueue_sort(curinf->vcpu); } @@ -488,13 +525,16 @@ } PRINT(3,"Updating runq..\n"); + /*process the runq, find domains that are on the runqueue which shouldn't be there*/ list_for_each_safe(cur, tmp, runq) { curinf = list_entry(cur,struct sedf_vcpu_info,list); PRINT(4,"\tLooking @ dom %i.%i\n", curinf->vcpu->domain->domain_id, curinf->vcpu->vcpu_id); - if (unlikely(curinf->slice == 0)) { + + if ( unlikely(curinf->slice == 0) ) + { /*ignore domains with empty slice*/ PRINT(4,"\tUpdating zero-slice domain %i.%i\n", curinf->vcpu->domain->domain_id, @@ -504,7 +544,8 @@ /*move them to their next period*/ curinf->deadl_abs += curinf->period; /*ensure that the start of the next period is in the future*/ - if (unlikely(PERIOD_BEGIN(curinf) < now)) { + if ( unlikely(PERIOD_BEGIN(curinf) < now) ) + { curinf->deadl_abs += (DIV_UP(now - PERIOD_BEGIN(curinf), curinf->period)) * curinf->period; @@ -513,8 +554,10 @@ __add_to_waitqueue_sort(curinf->vcpu); continue; } - if (unlikely((curinf->deadl_abs < now) || - (curinf->cputime > curinf->slice))) { + + if ( unlikely((curinf->deadl_abs < now) || + (curinf->cputime > curinf->slice)) ) + { /*we missed the deadline or the slice was already finished... 
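The zero-slice handling above pushes a domain's absolute deadline forward by whole periods until the start of its next period lies in the future; DIV_UP rounds the number of periods needed upward so the deadline never lands in the past. A worked version of that arithmetic, assuming PERIOD_BEGIN() is deadline minus period as the surrounding code suggests (all numbers invented):

    #include <stdio.h>

    #define DIV_UP(x, y) (((x) + (y) - 1) / (y))

    int main(void)
    {
        long long period = 10;
        long long deadl_abs = 25, now = 61;
        long long period_begin = deadl_abs - period;   /* 15, in the past */

        if (period_begin < now) {
            long long periods = DIV_UP(now - period_begin, period); /* 5 */
            deadl_abs += periods * period;             /* 25 + 50 = 75 */
        }
        printf("deadline %lld, period begins %lld\n",
               deadl_abs, deadl_abs - period);         /* 75, 65 */
        return 0;
    }
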
might hapen because of dom_adj.*/ @@ -550,6 +593,7 @@ PRINT(3,"done updating the queues\n"); } + #if (EXTRA > EXTRA_OFF) /* removes a domain from the head of the according extraQ and requeues it at a specified position: @@ -557,9 +601,10 @@ weighted ext.: insert in sorted list by score if the domain is blocked / has regained its short-block-loss time it is not put on any queue */ -static inline void desched_extra_dom(s_time_t now, struct vcpu* d) { +static void desched_extra_dom(s_time_t now, struct vcpu* d) +{ struct sedf_vcpu_info *inf = EDOM_INFO(d); - int i = extra_get_cur_q(inf); + int i = extra_get_cur_q(inf); #if (EXTRA == EXTRA_SLICE_WEIGHT || EXTRA == EXTRA_BLOCK_WEIGHT) unsigned long oldscore; @@ -575,14 +620,15 @@ extraq_del(d, i); #if (EXTRA == EXTRA_ROUNDR) - if (sedf_runnable(d) && (inf->status & EXTRA_AWARE)) + if ( sedf_runnable(d) && (inf->status & EXTRA_AWARE) ) /*add to the tail if it is runnable => round-robin*/ extraq_add_tail(d, EXTRA_UTIL_Q); #elif (EXTRA == EXTRA_SLICE_WEIGHT || EXTRA == EXTRA_BLOCK_WEIGHT) /*update the score*/ - oldscore = inf->score[i]; + oldscore = inf->score[i]; #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (i == EXTRA_PEN_Q) { + if ( i == EXTRA_PEN_Q ) + { /*domain was running in L0 extraq*/ /*reduce block lost, probably more sophistication here!*/ /*inf->short_block_lost_tot -= EXTRA_QUANTUM;*/ @@ -605,12 +651,13 @@ inf->score[EXTRA_PEN_Q] = (inf->period << 10) / inf->short_block_lost_tot; oldscore = 0; - } else + } + else #endif { /*domain was running in L1 extraq => score is inverse of utilization and is used somewhat incremental!*/ - if (!inf->extraweight) + if ( !inf->extraweight ) /*NB: use fixed point arithmetic with 10 bits*/ inf->score[EXTRA_UTIL_Q] = (inf->period << 10) / inf->slice; @@ -619,24 +666,32 @@ full (ie 100%) utilization is equivalent to 128 extraweight*/ inf->score[EXTRA_UTIL_Q] = (1<<17) / inf->extraweight; } + check_extra_queues: /* Adding a runnable domain to the right queue and removing blocked ones*/ - if (sedf_runnable(d)) { + if ( sedf_runnable(d) ) + { /*add according to score: weighted round robin*/ if (((inf->status & EXTRA_AWARE) && (i == EXTRA_UTIL_Q)) || ((inf->status & EXTRA_WANT_PEN_Q) && (i == EXTRA_PEN_Q))) extraq_add_sort_update(d, i, oldscore); } - else { + else + { /*remove this blocked domain from the waitq!*/ __del_from_queue(d); #if (EXTRA == EXTRA_BLOCK_WEIGHT) /*make sure that we remove a blocked domain from the other extraq too*/ - if (i == EXTRA_PEN_Q) { - if (extraq_on(d, EXTRA_UTIL_Q)) extraq_del(d, EXTRA_UTIL_Q); - } else { - if (extraq_on(d, EXTRA_PEN_Q)) extraq_del(d, EXTRA_PEN_Q); + if ( i == EXTRA_PEN_Q ) + { + if ( extraq_on(d, EXTRA_UTIL_Q) ) + extraq_del(d, EXTRA_UTIL_Q); + } + else + { + if ( extraq_on(d, EXTRA_PEN_Q) ) + extraq_del(d, EXTRA_PEN_Q); } #endif } @@ -647,16 +702,21 @@ } #endif -static inline struct task_slice sedf_do_extra_schedule (s_time_t now, - s_time_t end_xt, struct list_head *extraq[], int cpu) { + +static struct task_slice sedf_do_extra_schedule( + s_time_t now, s_time_t end_xt, struct list_head *extraq[], int cpu) +{ struct task_slice ret; struct sedf_vcpu_info *runinf; ASSERT(end_xt > now); + /* Enough time left to use for extratime? 
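The extra-queue scores above are plain fixed-point arithmetic: shifting the period left by 10 bits before dividing by the slice keeps ten fractional bits of the period/slice ratio without touching floating point (and (1<<17)/extraweight applies the same trick to weight-only domains). A quick illustration with arbitrary values:

    #include <stdio.h>

    int main(void)
    {
        unsigned long period = 20000, slice = 1500;

        /* period/slice with 10 fractional bits, i.e. scaled by 1024 */
        unsigned long score = (period << 10) / slice;   /* 13653 */

        printf("score = %lu (~%lu.%03lu)\n", score,
               score >> 10, ((score & 1023) * 1000) >> 10);  /* ~13.333 */
        return 0;
    }
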
*/ - if (end_xt - now < EXTRA_QUANTUM) + if ( end_xt - now < EXTRA_QUANTUM ) goto return_idle; + #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (!list_empty(extraq[EXTRA_PEN_Q])) { + if ( !list_empty(extraq[EXTRA_PEN_Q]) ) + { /*we still have elements on the level 0 extraq => let those run first!*/ runinf = list_entry(extraq[EXTRA_PEN_Q]->next, @@ -667,9 +727,12 @@ #ifdef SEDF_STATS runinf->pen_extra_slices++; #endif - } else -#endif - if (!list_empty(extraq[EXTRA_UTIL_Q])) { + } + else +#endif + { + if ( !list_empty(extraq[EXTRA_UTIL_Q]) ) + { /*use elements from the normal extraqueue*/ runinf = list_entry(extraq[EXTRA_UTIL_Q]->next, struct sedf_vcpu_info, @@ -680,6 +743,7 @@ } else goto return_idle; + } ASSERT(ret.time > 0); ASSERT(sedf_runnable(ret.task)); @@ -692,6 +756,8 @@ ASSERT(sedf_runnable(ret.task)); return ret; } + + /* Main scheduling function Reasons for calling this function are: -timeslice for the current period used up @@ -699,7 +765,7 @@ -and various others ;) in general: determine which domain to run next*/ static struct task_slice sedf_do_schedule(s_time_t now) { - int cpu = current->processor; + int cpu = smp_processor_id(); struct list_head *runq = RUNQ(cpu); struct list_head *waitq = WAITQ(cpu); #if (EXTRA > EXTRA_OFF) @@ -711,20 +777,21 @@ struct task_slice ret; /*idle tasks don't need any of the following stuf*/ - if (is_idle_task(current->domain)) + if (is_idle_domain(current->domain)) goto check_waitq; /* create local state of the status of the domain, in order to avoid inconsistent state during scheduling decisions, because data for - domain_runnable is not protected by the scheduling lock!*/ - if(!domain_runnable(current)) + vcpu_runnable is not protected by the scheduling lock!*/ + if ( !vcpu_runnable(current) ) inf->status |= SEDF_ASLEEP; - if (inf->status & SEDF_ASLEEP) + if ( inf->status & SEDF_ASLEEP ) inf->block_abs = now; #if (EXTRA > EXTRA_OFF) - if (unlikely(extra_runs(inf))) { + if ( unlikely(extra_runs(inf)) ) + { /*special treatment of domains running in extra time*/ desched_extra_dom(now, current); } @@ -739,10 +806,12 @@ /*now simply pick the first domain from the runqueue, which has the earliest deadline, because the list is sorted*/ - if (!list_empty(runq)) { + if ( !list_empty(runq) ) + { runinf = list_entry(runq->next,struct sedf_vcpu_info,list); ret.task = runinf->vcpu; - if (!list_empty(waitq)) { + if ( !list_empty(waitq) ) + { waitinf = list_entry(waitq->next, struct sedf_vcpu_info,list); /*rerun scheduler, when scheduled domain reaches it's @@ -751,14 +820,16 @@ ret.time = MIN(now + runinf->slice - runinf->cputime, PERIOD_BEGIN(waitinf)) - now; } - else { + else + { ret.time = runinf->slice - runinf->cputime; } CHECK(ret.time > 0); goto sched_done; } - if (!list_empty(waitq)) { + if ( !list_empty(waitq) ) + { waitinf = list_entry(waitq->next,struct sedf_vcpu_info, list); /*we could not find any suitable domain => look for domains that are aware of extratime*/ @@ -771,7 +842,8 @@ #endif CHECK(ret.time > 0); } - else { + else + { /*this could probably never happen, but one never knows...*/ /*it can... imagine a second CPU, which is pure scifi ATM, but one never knows ;)*/ @@ -782,11 +854,13 @@ sched_done: /*TODO: Do something USEFUL when this happens and find out, why it still can happen!!!*/ - if (ret.time<0) { + if ( ret.time < 0) + { printk("Ouch! We are seriously BEHIND schedule! 
%"PRIi64"\n", ret.time); ret.time = EXTRA_QUANTUM; } + EDOM_INFO(ret.task)->sched_start_abs = now; CHECK(ret.time > 0); ASSERT(sedf_runnable(ret.task)); @@ -794,30 +868,36 @@ return ret; } -static void sedf_sleep(struct vcpu *d) { - PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id); - - if (is_idle_task(d->domain)) + +static void sedf_sleep(struct vcpu *d) +{ + PRINT(2,"sedf_sleep was called, domain-id %i.%i\n", + d->domain->domain_id, d->vcpu_id); + + if ( is_idle_domain(d->domain) ) return; EDOM_INFO(d)->status |= SEDF_ASLEEP; - if ( test_bit(_VCPUF_running, &d->vcpu_flags) ) { + if ( schedule_data[d->processor].curr == d ) + { cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ); } - else { + else + { if ( __task_on_queue(d) ) __del_from_queue(d); #if (EXTRA > EXTRA_OFF) - if (extraq_on(d, EXTRA_UTIL_Q)) + if ( extraq_on(d, EXTRA_UTIL_Q) ) extraq_del(d, EXTRA_UTIL_Q); #endif #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (extraq_on(d, EXTRA_PEN_Q)) + if ( extraq_on(d, EXTRA_PEN_Q) ) extraq_del(d, EXTRA_PEN_Q); #endif } } + /* This function wakes up a domain, i.e. moves them into the waitqueue * things to mention are: admission control is taking place nowhere at @@ -890,17 +970,21 @@ * -either behaviour can lead to missed deadlines in other domains as * opposed to approaches 1,2a,2b */ -static inline void unblock_short_vcons -(struct sedf_vcpu_info* inf, s_time_t now) { +#if (UNBLOCK <= UNBLOCK_SHORT_RESUME) +static void unblock_short_vcons(struct sedf_vcpu_info* inf, s_time_t now) +{ inf->deadl_abs += inf->period; inf->cputime = 0; } - -static inline void unblock_short_cons(struct sedf_vcpu_info* inf, s_time_t now) +#endif + +#if (UNBLOCK == UNBLOCK_SHORT_RESUME) +static void unblock_short_cons(struct sedf_vcpu_info* inf, s_time_t now) { /*treat blocked time as consumed by the domain*/ inf->cputime += now - inf->block_abs; - if (inf->cputime + EXTRA_QUANTUM > inf->slice) { + if ( (inf->cputime + EXTRA_QUANTUM) > inf->slice ) + { /*we don't have a reasonable amount of time in our slice left :( => start in next period!*/ unblock_short_vcons(inf, now); @@ -910,8 +994,11 @@ inf->short_cont++; #endif } -static inline void unblock_short_extra_support (struct sedf_vcpu_info* inf, - s_time_t now) { +#endif + +static void unblock_short_extra_support( + struct sedf_vcpu_info* inf, s_time_t now) +{ /*this unblocking scheme tries to support the domain, by assigning it a priority in extratime distribution according to the loss of time in this slice due to blocking*/ @@ -919,26 +1006,29 @@ /*no more realtime execution in this period!*/ inf->deadl_abs += inf->period; - if (likely(inf->block_abs)) { + if ( likely(inf->block_abs) ) + { //treat blocked time as consumed by the domain*/ /*inf->cputime += now - inf->block_abs;*/ /*penalty is time the domain would have had if it continued to run */ pen = (inf->slice - inf->cputime); - if (pen < 0) pen = 0; + if ( pen < 0 ) + pen = 0; /*accumulate all penalties over the periods*/ /*inf->short_block_lost_tot += pen;*/ /*set penalty to the current value*/ inf->short_block_lost_tot = pen; /*not sure which one is better.. 
but seems to work well...*/ - if (inf->short_block_lost_tot) { + if ( inf->short_block_lost_tot ) + { inf->score[0] = (inf->period << 10) / inf->short_block_lost_tot; #ifdef SEDF_STATS inf->pen_extra_blocks++; #endif - if (extraq_on(inf->vcpu, EXTRA_PEN_Q)) + if ( extraq_on(inf->vcpu, EXTRA_PEN_Q) ) /*remove domain for possible resorting!*/ extraq_del(inf->vcpu, EXTRA_PEN_Q); else @@ -951,36 +1041,53 @@ extraq_add_sort_update(inf->vcpu, EXTRA_PEN_Q, 0); } } + /*give it a fresh slice in the next period!*/ inf->cputime = 0; } -static inline void unblock_long_vcons(struct sedf_vcpu_info* inf, s_time_t now) + + +#if (UNBLOCK == UNBLOCK_ISOCHRONOUS_EDF) +static void unblock_long_vcons(struct sedf_vcpu_info* inf, s_time_t now) { /* align to next future period */ inf->deadl_abs += (DIV_UP(now - inf->deadl_abs, inf->period) +1) * inf->period; inf->cputime = 0; } - -static inline void unblock_long_cons_a (struct sedf_vcpu_info* inf, - s_time_t now) { +#endif + + +#if 0 +static void unblock_long_cons_a (struct sedf_vcpu_info* inf, s_time_t now) +{ /*treat the time the domain was blocked in the - CURRENT period as consumed by the domain*/ + CURRENT period as consumed by the domain*/ inf->cputime = (now - inf->deadl_abs) % inf->period; - if (inf->cputime + EXTRA_QUANTUM > inf->slice) { + if ( (inf->cputime + EXTRA_QUANTUM) > inf->slice ) + { /*we don't have a reasonable amount of time in our slice left :( => start in next period!*/ unblock_long_vcons(inf, now); } } -static inline void unblock_long_cons_b(struct sedf_vcpu_info* inf,s_time_t now) { +#endif + + +static void unblock_long_cons_b(struct sedf_vcpu_info* inf,s_time_t now) +{ /*Conservative 2b*/ /*Treat the unblocking time as a start of a new period */ inf->deadl_abs = now + inf->period; inf->cputime = 0; } -static inline void unblock_long_cons_c(struct sedf_vcpu_info* inf,s_time_t now) { - if (likely(inf->latency)) { + + +#if (UNBLOCK == UNBLOCK_ATROPOS) +static void unblock_long_cons_c(struct sedf_vcpu_info* inf,s_time_t now) +{ + if ( likely(inf->latency) ) + { /*scale the slice and period accordingly to the latency hint*/ /*reduce period temporarily to the latency hint*/ inf->period = inf->latency; @@ -993,18 +1100,24 @@ inf->deadl_abs = now + inf->period; inf->cputime = 0; } - else { + else + { /*we don't have a latency hint.. 
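The latency-scaling unblock paths above shrink the period to the latency hint and scale the slice by the same factor, so bandwidth (slice/period) is preserved while the domain gets scheduled sooner; the real code additionally guards the multiplication against overflow. A sketch of that proportional rescale, assuming the slice is scaled by latency/period as the comments describe:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long period_orig = 20000, slice_orig = 5000;
        unsigned long long latency = 4000;

        /* keep slice/period constant: 25% bandwidth either way */
        unsigned long long period = latency;
        unsigned long long slice  = (slice_orig * latency) / period_orig;

        printf("period %llu -> %llu, slice %llu -> %llu\n",
               period_orig, period, slice_orig, slice);  /* -> 4000, 1000 */
        return 0;
    }
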
use some other technique*/ unblock_long_cons_b(inf, now); } } +#endif + + +#if (UNBLOCK == UNBLOCK_BURST) /*a new idea of dealing with short blocks: burst period scaling*/ -static inline void unblock_short_burst(struct sedf_vcpu_info* inf, s_time_t now) +static void unblock_short_burst(struct sedf_vcpu_info* inf, s_time_t now) { /*treat blocked time as consumed by the domain*/ inf->cputime += now - inf->block_abs; - if (inf->cputime + EXTRA_QUANTUM <= inf->slice) { + if ( (inf->cputime + EXTRA_QUANTUM) <= inf->slice ) + { /*if we can still use some time in the current slice then use it!*/ #ifdef SEDF_STATS @@ -1012,10 +1125,12 @@ inf->short_cont++; #endif } - else { + else + { /*we don't have a reasonable amount of time in our slice left => switch to burst mode*/ - if (likely(inf->unblock_abs)) { + if ( likely(inf->unblock_abs) ) + { /*set the period-length to the current blocking interval, possible enhancements: average over last blocking intervals, user-specified minimum,...*/ @@ -1030,17 +1145,23 @@ /*set new (shorter) deadline*/ inf->deadl_abs += inf->period; } - else { + else + { /*in case we haven't unblocked before start in next period!*/ inf->cputime=0; inf->deadl_abs += inf->period; } } + inf->unblock_abs = now; } -static inline void unblock_long_burst(struct sedf_vcpu_info* inf, s_time_t now) { - if (unlikely(inf->latency && (inf->period > inf->latency))) { + + +static void unblock_long_burst(struct sedf_vcpu_info* inf, s_time_t now) +{ + if ( unlikely(inf->latency && (inf->period > inf->latency)) ) + { /*scale the slice and period accordingly to the latency hint*/ inf->period = inf->latency; /*check for overflows on multiplication*/ @@ -1052,23 +1173,28 @@ inf->deadl_abs = now + inf->period; inf->cputime = 0; } - else { + else + { /*we don't have a latency hint.. or we are currently in "burst mode": use some other technique NB: this should be in fact the normal way of operation, when we are in sync with the device!*/ unblock_long_cons_b(inf, now); } + inf->unblock_abs = now; } +#endif /* UNBLOCK == UNBLOCK_BURST */ + #define DOMAIN_EDF 1 #define DOMAIN_EXTRA_PEN 2 #define DOMAIN_EXTRA_UTIL 3 #define DOMAIN_IDLE 4 -static inline int get_run_type(struct vcpu* d) { +static inline int get_run_type(struct vcpu* d) +{ struct sedf_vcpu_info* inf = EDOM_INFO(d); - if (is_idle_task(d->domain)) + if (is_idle_domain(d->domain)) return DOMAIN_IDLE; if (inf->status & EXTRA_RUN_PEN) return DOMAIN_EXTRA_PEN; @@ -1076,6 +1202,8 @@ return DOMAIN_EXTRA_UTIL; return DOMAIN_EDF; } + + /*Compares two domains in the relation of whether the one is allowed to interrupt the others execution. It returns true (!=0) if a switch to the other domain is good. 
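get_run_type() above collapses a VCPU's state into four ordered priority classes, and should_switch() in the hunk that follows compares classes first, falling back to per-class rules: earlier deadline wins inside EDF, lower score wins on the L0 extra queue. A compact sketch of that two-level comparison; the real function has more cases, and the fields here are simplified:

    #include <stdio.h>

    enum run_type { DOMAIN_EDF = 1, DOMAIN_EXTRA_PEN,
                    DOMAIN_EXTRA_UTIL, DOMAIN_IDLE };

    struct sv {
        enum run_type type;
        long long deadline;   /* used by the EDF class */
        long score;           /* used by the extra classes */
    };

    /* Nonzero if 'other' should preempt 'cur'. */
    static int should_switch(const struct sv *cur, const struct sv *other)
    {
        if (cur->type != other->type)
            return other->type < cur->type;  /* smaller class = higher prio */
        switch (cur->type) {
        case DOMAIN_EDF:
            return other->deadline < cur->deadline;
        case DOMAIN_EXTRA_PEN:
            return other->score < cur->score;
        default:
            return 0;                        /* util/idle: don't preempt */
        }
    }

    int main(void)
    {
        struct sv cur   = { DOMAIN_EDF, 100, 0 };
        struct sv other = { DOMAIN_EDF,  80, 0 };
        printf("preempt? %d\n", should_switch(&cur, &other));   /* 1 */
        return 0;
    }
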
@@ -1085,8 +1213,10 @@ In the same class priorities are assigned as following: EDF: early deadline > late deadline L0 extra-time: lower score > higher score*/ -static inline int should_switch(struct vcpu* cur, - struct vcpu* other, s_time_t now) { +static inline int should_switch(struct vcpu *cur, + struct vcpu *other, + s_time_t now) +{ struct sedf_vcpu_info *cur_inf, *other_inf; cur_inf = EDOM_INFO(cur); other_inf = EDOM_INFO(other); @@ -1119,41 +1249,51 @@ } return 1; } -void sedf_wake(struct vcpu *d) { + +void sedf_wake(struct vcpu *d) +{ s_time_t now = NOW(); struct sedf_vcpu_info* inf = EDOM_INFO(d); PRINT(3, "sedf_wake was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id); - if (unlikely(is_idle_task(d->domain))) + if ( unlikely(is_idle_domain(d->domain)) ) return; - if ( unlikely(__task_on_queue(d)) ) { + if ( unlikely(__task_on_queue(d)) ) + { PRINT(3,"\tdomain %i.%i is already in some queue\n", d->domain->domain_id, d->vcpu_id); return; } + ASSERT(!sedf_runnable(d)); inf->status &= ~SEDF_ASLEEP; ASSERT(!extraq_on(d, EXTRA_UTIL_Q)); ASSERT(!extraq_on(d, EXTRA_PEN_Q)); - if (unlikely(inf->deadl_abs == 0)) + if ( unlikely(inf->deadl_abs == 0) ) + { /*initial setup of the deadline*/ inf->deadl_abs = now + inf->slice; + } - PRINT(3,"waking up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\ - "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, inf->deadl_abs, - inf->period, now); + PRINT(3, "waking up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64 + "now= %"PRIu64")\n", + d->domain->domain_id, d->vcpu_id, inf->deadl_abs, inf->period, now); + #ifdef SEDF_STATS inf->block_tot++; #endif - if (unlikely(now < PERIOD_BEGIN(inf))) { + + if ( unlikely(now < PERIOD_BEGIN(inf)) ) + { PRINT(4,"extratime unblock\n"); /* unblocking in extra-time! */ #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (inf->status & EXTRA_WANT_PEN_Q) { + if ( inf->status & EXTRA_WANT_PEN_Q ) + { /*we have a domain that wants compensation for block penalty and did just block in its compensation time. 
Give it another @@ -1163,8 +1303,10 @@ #endif extraq_check_add_unblocked(d, 0); } - else { - if (now < inf->deadl_abs) { + else + { + if ( now < inf->deadl_abs ) + { PRINT(4,"short unblocking\n"); /*short blocking*/ #ifdef SEDF_STATS @@ -1182,7 +1324,8 @@ extraq_check_add_unblocked(d, 1); } - else { + else + { PRINT(4,"long unblocking\n"); /*long unblocking*/ #ifdef SEDF_STATS @@ -1197,7 +1340,6 @@ unblock_long_cons_c(inf, now); #elif (UNBLOCK == UNBLOCK_SHORT_RESUME) unblock_long_cons_b(inf, now); - /*unblock_short_cons_c(inf, now);*/ #elif (UNBLOCK == UNBLOCK_BURST) unblock_long_burst(inf, now); #endif @@ -1205,26 +1347,33 @@ extraq_check_add_unblocked(d, 1); } } - PRINT(3,"woke up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\ - "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, inf->deadl_abs, + + PRINT(3, "woke up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64 + "now= %"PRIu64")\n", + d->domain->domain_id, d->vcpu_id, inf->deadl_abs, inf->period, now); - if (PERIOD_BEGIN(inf) > now) { + + if ( PERIOD_BEGIN(inf) > now ) + { __add_to_waitqueue_sort(d); PRINT(3,"added to waitq\n"); } - else { + else + { __add_to_runqueue_sort(d); PRINT(3,"added to runq\n"); } #ifdef SEDF_STATS /*do some statistics here...*/ - if (inf->block_abs != 0) { + if ( inf->block_abs != 0 ) + { inf->block_time_tot += now - inf->block_abs; inf->penalty_time_tot += PERIOD_BEGIN(inf) + inf->cputime - inf->block_abs; } #endif + /*sanity check: make sure each extra-aware domain IS on the util-q!*/ ASSERT(IMPLY(inf->status & EXTRA_AWARE, extraq_on(d, EXTRA_UTIL_Q))); ASSERT(__task_on_queue(d)); @@ -1234,27 +1383,48 @@ ASSERT(d->processor >= 0); ASSERT(d->processor < NR_CPUS); ASSERT(schedule_data[d->processor].curr); - if (should_switch(schedule_data[d->processor].curr, d, now)) + + if ( should_switch(schedule_data[d->processor].curr, d, now) ) cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ); } -/*Print a lot of use-{full, less} information about a domains in the system*/ -static void sedf_dump_domain(struct vcpu *d) { + +static int sedf_set_affinity(struct vcpu *v, cpumask_t *affinity) +{ + if ( v == current ) + return cpu_isset(v->processor, *affinity) ? 0 : -EBUSY; + + vcpu_pause(v); + v->cpu_affinity = *affinity; + v->processor = first_cpu(v->cpu_affinity); + vcpu_unpause(v); + + return 0; +} + + +/* Print a lot of useful information about a domains in the system */ +static void sedf_dump_domain(struct vcpu *d) +{ printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id, test_bit(_VCPUF_running, &d->vcpu_flags) ? 'T':'F'); - printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64" sc=%i xtr(%s)=%"PRIu64" ew=%hu", + printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64 + " sc=%i xtr(%s)=%"PRIu64" ew=%hu", EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs, - EDOM_INFO(d)->weight, d->cpu_time, EDOM_INFO(d)->score[EXTRA_UTIL_Q], + EDOM_INFO(d)->weight, d->cpu_time, + EDOM_INFO(d)->score[EXTRA_UTIL_Q], (EDOM_INFO(d)->status & EXTRA_AWARE) ? 
"yes" : "no", EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight); - if (d->cpu_time !=0) + + if ( d->cpu_time != 0 ) printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100) / d->cpu_time); + #ifdef SEDF_STATS - if (EDOM_INFO(d)->block_time_tot!=0) + if ( EDOM_INFO(d)->block_time_tot != 0 ) printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) / EDOM_INFO(d)->block_time_tot); - if (EDOM_INFO(d)->block_tot!=0) + if ( EDOM_INFO(d)->block_tot != 0 ) printf("\n blks=%u sh=%u (%u%%) (shc=%u (%u%%) shex=%i "\ "shexsl=%i) l=%u (%u%%) avg: b=%"PRIu64" p=%"PRIu64"", EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_block_tot, @@ -1271,7 +1441,8 @@ printf("\n"); } -/*dumps all domains on hte specified cpu*/ + +/* dumps all domains on hte specified cpu */ static void sedf_dump_cpu_state(int i) { struct list_head *list, *queue, *tmp; @@ -1284,7 +1455,8 @@ queue = RUNQ(i); printk("RUNQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); - list_for_each_safe ( list, tmp, queue ) { + list_for_each_safe ( list, tmp, queue ) + { printk("%3d: ",loop++); d_inf = list_entry(list, struct sedf_vcpu_info, list); sedf_dump_domain(d_inf->vcpu); @@ -1293,7 +1465,8 @@ queue = WAITQ(i); loop = 0; printk("\nWAITQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); - list_for_each_safe ( list, tmp, queue ) { + list_for_each_safe ( list, tmp, queue ) + { printk("%3d: ",loop++); d_inf = list_entry(list, struct sedf_vcpu_info, list); sedf_dump_domain(d_inf->vcpu); @@ -1303,7 +1476,8 @@ printk("\nEXTRAQ (penalty) rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); - list_for_each_safe ( list, tmp, queue ) { + list_for_each_safe ( list, tmp, queue ) + { d_inf = list_entry(list, struct sedf_vcpu_info, extralist[EXTRA_PEN_Q]); printk("%3d: ",loop++); @@ -1314,7 +1488,8 @@ printk("\nEXTRAQ (utilization) rq %lx n: %lx, p: %lx\n", (unsigned long)queue, (unsigned long) queue->next, (unsigned long) queue->prev); - list_for_each_safe ( list, tmp, queue ) { + list_for_each_safe ( list, tmp, queue ) + { d_inf = list_entry(list, struct sedf_vcpu_info, extralist[EXTRA_UTIL_Q]); printk("%3d: ",loop++); @@ -1323,69 +1498,93 @@ loop = 0; printk("\nnot on Q\n"); - for_each_domain(d) + + for_each_domain ( d ) + { for_each_vcpu(d, ed) - { - if (!__task_on_queue(ed) && (ed->processor == i)) { - printk("%3d: ",loop++); - sedf_dump_domain(ed); - } - } -} -/*Adjusts periods and slices of the domains accordingly to their weights*/ -static inline int sedf_adjust_weights(struct sched_adjdom_cmd *cmd) { + { + if ( !__task_on_queue(ed) && (ed->processor == i) ) + { + printk("%3d: ",loop++); + sedf_dump_domain(ed); + } + } + } +} + + +/* Adjusts periods and slices of the domains accordingly to their weights. 
*/ +static int sedf_adjust_weights(struct sched_adjdom_cmd *cmd) +{ struct vcpu *p; struct domain *d; int sumw[NR_CPUS]; s_time_t sumt[NR_CPUS]; int cpu; - for (cpu=0; cpu < NR_CPUS; cpu++) { + for ( cpu = 0; cpu < NR_CPUS; cpu++ ) + { sumw[cpu] = 0; sumt[cpu] = 0; } - /*sum up all weights*/ - for_each_domain(d) - for_each_vcpu(d, p) { - if (EDOM_INFO(p)->weight) - sumw[p->processor] += EDOM_INFO(p)->weight; - else { - /*don't modify domains who don't have a weight, but sum - up the time they need, projected to a WEIGHT_PERIOD, - so that this time is not given to the weight-driven - domains*/ - /*check for overflows*/ - ASSERT((WEIGHT_PERIOD < ULONG_MAX) - && (EDOM_INFO(p)->slice_orig < ULONG_MAX)); - sumt[p->processor] += - (WEIGHT_PERIOD * EDOM_INFO(p)->slice_orig) / - EDOM_INFO(p)->period_orig; - } - } - /*adjust all slices (and periods) to the new weight*/ - for_each_domain(d) - for_each_vcpu(d, p) { - if (EDOM_INFO(p)->weight) { - EDOM_INFO(p)->period_orig = - EDOM_INFO(p)->period = WEIGHT_PERIOD; - EDOM_INFO(p)->slice_orig = - EDOM_INFO(p)->slice = - (EDOM_INFO(p)->weight * - (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[p->processor])) / - sumw[p->processor]; - } - } + + /* sum up all weights */ + for_each_domain( d ) + { + for_each_vcpu( d, p ) + { + if ( EDOM_INFO(p)->weight ) + { + sumw[p->processor] += EDOM_INFO(p)->weight; + } + else + { + /*don't modify domains who don't have a weight, but sum + up the time they need, projected to a WEIGHT_PERIOD, + so that this time is not given to the weight-driven + domains*/ + /*check for overflows*/ + ASSERT((WEIGHT_PERIOD < ULONG_MAX) + && (EDOM_INFO(p)->slice_orig < ULONG_MAX)); + sumt[p->processor] += + (WEIGHT_PERIOD * EDOM_INFO(p)->slice_orig) / + EDOM_INFO(p)->period_orig; + } + } + } + + /* adjust all slices (and periods) to the new weight */ + for_each_domain( d ) + { + for_each_vcpu ( d, p ) + { + if ( EDOM_INFO(p)->weight ) + { + EDOM_INFO(p)->period_orig = + EDOM_INFO(p)->period = WEIGHT_PERIOD; + EDOM_INFO(p)->slice_orig = + EDOM_INFO(p)->slice = + (EDOM_INFO(p)->weight * + (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[p->processor])) / + sumw[p->processor]; + } + } + } + return 0; } + /* set or fetch domain scheduling parameters */ -static int sedf_adjdom(struct domain *p, struct sched_adjdom_cmd *cmd) { +static int sedf_adjdom(struct domain *p, struct sched_adjdom_cmd *cmd) +{ struct vcpu *v; PRINT(2,"sedf_adjdom was called, domain-id %i new period %"PRIu64" "\ "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n", p->domain_id, cmd->u.sedf.period, cmd->u.sedf.slice, cmd->u.sedf.latency, (cmd->u.sedf.extratime)?"yes":"no"); + if ( cmd->direction == SCHED_INFO_PUT ) { /*check for sane parameters*/ @@ -1458,6 +1657,7 @@ .sleep = sedf_sleep, .wake = sedf_wake, .adjdom = sedf_adjdom, + .set_affinity = sedf_set_affinity }; /* diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/schedule.c --- a/xen/common/schedule.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/schedule.c Mon Jan 9 11:22:17 2006 @@ -100,7 +100,9 @@ v->vcpu_id = vcpu_id; v->processor = cpu_id; atomic_set(&v->pausecnt, 0); - v->cpumap = CPUMAP_RUNANYWHERE; + + v->cpu_affinity = is_idle_domain(d) ? + cpumask_of_cpu(cpu_id) : CPU_MASK_ALL; d->vcpu[vcpu_id] = v; @@ -143,7 +145,7 @@ /* Initialise the per-domain timer. 
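The allocation-time default above pins each idle VCPU to its own CPU (idle VCPUs are per-CPU objects and must never migrate) while ordinary VCPUs start out allowed on every CPU. A tiny sketch of choosing that initial mask, with a plain bitmask standing in for cpumask_t:

    #include <stdio.h>

    int main(void)
    {
        int cpu_id = 2;
        unsigned long CPU_MASK_ALL = ~0UL;

        unsigned long idle_affinity  = 1UL << cpu_id;  /* cpumask_of_cpu() */
        unsigned long guest_affinity = CPU_MASK_ALL;

        printf("idle: %#lx, guest: %#lx\n", idle_affinity, guest_affinity);
        return 0;
    }
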
*/ init_ac_timer(&v->timer, dom_timer_fn, v, v->processor); - if ( is_idle_task(d) ) + if ( is_idle_domain(d) ) { schedule_data[v->processor].curr = v; schedule_data[v->processor].idle = v; @@ -166,7 +168,7 @@ unsigned long flags; spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags); - if ( likely(!domain_runnable(v)) ) + if ( likely(!vcpu_runnable(v)) ) SCHED_OP(sleep, v); spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags); @@ -182,7 +184,7 @@ * flag is cleared and the scheduler lock is released. We also check that * the domain continues to be unrunnable, in case someone else wakes it. */ - while ( !domain_runnable(v) && + while ( !vcpu_runnable(v) && (test_bit(_VCPUF_running, &v->vcpu_flags) || spin_is_locked(&schedule_data[v->processor].schedule_lock)) ) cpu_relax(); @@ -195,15 +197,22 @@ unsigned long flags; spin_lock_irqsave(&schedule_data[v->processor].schedule_lock, flags); - if ( likely(domain_runnable(v)) ) + if ( likely(vcpu_runnable(v)) ) { SCHED_OP(wake, v); v->wokenup = NOW(); } - clear_bit(_VCPUF_cpu_migrated, &v->vcpu_flags); spin_unlock_irqrestore(&schedule_data[v->processor].schedule_lock, flags); TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); +} + +int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity) +{ + if ( cpus_empty(*affinity) ) + return -EINVAL; + + return SCHED_OP(set_affinity, v, affinity); } /* Block the currently-executing domain until a pertinent event occurs. */ @@ -330,18 +339,23 @@ do { succ = 0; __clear_cpu_bits(have_lock); - for_each_vcpu(d, v) { + for_each_vcpu ( d, v ) + { cpu = v->processor; - if (!__get_cpu_bit(cpu, have_lock)) { + if ( !__get_cpu_bit(cpu, have_lock) ) + { /* if we don't have a lock on this CPU: acquire it*/ - if (spin_trylock(&schedule_data[cpu].schedule_lock)) { + if ( spin_trylock(&schedule_data[cpu].schedule_lock) ) + { /*we have this lock!*/ __set_cpu_bit(cpu, have_lock); succ = 1; - } else { + } + else + { /*we didn,t get this lock -> free all other locks too!*/ - for (cpu = 0; cpu < NR_CPUS; cpu++) - if (__get_cpu_bit(cpu, have_lock)) + for ( cpu = 0; cpu < NR_CPUS; cpu++ ) + if ( __get_cpu_bit(cpu, have_lock) ) spin_unlock(&schedule_data[cpu].schedule_lock); /* and start from the beginning! */ succ = 0; @@ -354,8 +368,8 @@ SCHED_OP(adjdom, d, cmd); - for (cpu = 0; cpu < NR_CPUS; cpu++) - if (__get_cpu_bit(cpu, have_lock)) + for ( cpu = 0; cpu < NR_CPUS; cpu++ ) + if ( __get_cpu_bit(cpu, have_lock) ) spin_unlock(&schedule_data[cpu].schedule_lock); __clear_cpu_bits(have_lock); @@ -371,22 +385,20 @@ */ static void __enter_scheduler(void) { - struct vcpu *prev = current, *next = NULL; - int cpu = prev->processor; - s_time_t now; + struct vcpu *prev = current, *next = NULL; + int cpu = smp_processor_id(); + s_time_t now = NOW(); struct task_slice next_slice; s32 r_time; /* time for new dom to run */ + ASSERT(!in_irq()); + perfc_incrc(sched_run); - + spin_lock_irq(&schedule_data[cpu].schedule_lock); - - now = NOW(); rem_ac_timer(&schedule_data[cpu].s_timer); - ASSERT(!in_irq()); - prev->cpu_time += now - prev->lastschd; /* get policy-specific decision on scheduling... 
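The common-layer vcpu_set_affinity() above validates the mask once and then defers to the scheduler's set_affinity hook, the same SCHED_OP indirection the file already uses for sleep and wake. A sketch of that dispatch pattern; the ops table and error value are invented for the demo:

    #include <stdio.h>

    struct scheduler_ops {
        int (*set_affinity)(int vcpu, unsigned long mask);
    };

    static int demo_set_affinity(int vcpu, unsigned long mask)
    {
        printf("vcpu %d -> mask %#lx\n", vcpu, mask);
        return 0;
    }

    static struct scheduler_ops ops = { .set_affinity = demo_set_affinity };

    /* Generic layer validates; the per-scheduler hook does the work. */
    static int vcpu_set_affinity(int vcpu, unsigned long mask)
    {
        if (mask == 0)
            return -22;        /* -EINVAL: an empty mask is meaningless */
        return ops.set_affinity(vcpu, mask);
    }

    int main(void)
    {
        vcpu_set_affinity(0, 0x6);
        printf("empty mask -> %d\n", vcpu_set_affinity(0, 0));
        return 0;
    }
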
*/ @@ -394,7 +406,7 @@ r_time = next_slice.time; next = next_slice.task; - + schedule_data[cpu].curr = next; next->lastschd = now; @@ -411,11 +423,6 @@ prev->domain->domain_id, now - prev->lastschd); TRACE_3D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id, now - next->wokenup, r_time); - - clear_bit(_VCPUF_running, &prev->vcpu_flags); - set_bit(_VCPUF_running, &next->vcpu_flags); - - perfc_incrc(sched_ctx); /* * Logic of wokenup field in domain struct: @@ -425,10 +432,10 @@ * also set here then a preempted runnable domain will get a screwed up * "waiting time" value next time it is scheduled. */ - prev->wokenup = NOW(); + prev->wokenup = now; #if defined(WAKE_HISTO) - if ( !is_idle_task(next->domain) && next->wokenup ) + if ( !is_idle_domain(next->domain) && next->wokenup ) { ulong diff = (ulong)(now - next->wokenup); diff /= (ulong)MILLISECS(1); @@ -438,7 +445,7 @@ next->wokenup = (s_time_t)0; #elif defined(BLOCKTIME_HISTO) prev->lastdeschd = now; - if ( !is_idle_task(next->domain) ) + if ( !is_idle_domain(next->domain) ) { ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10)); if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++; @@ -446,10 +453,16 @@ } #endif + set_bit(_VCPUF_running, &next->vcpu_flags); + + spin_unlock_irq(&schedule_data[cpu].schedule_lock); + + perfc_incrc(sched_ctx); + prev->sleep_tick = schedule_data[cpu].tick; /* Ensure that the domain has an up-to-date time base. */ - if ( !is_idle_task(next->domain) ) + if ( !is_idle_domain(next->domain) ) { update_dom_time(next); if ( next->sleep_tick != schedule_data[cpu].tick ) @@ -461,17 +474,6 @@ next->domain->domain_id, next->vcpu_id); context_switch(prev, next); - - spin_unlock_irq(&schedule_data[cpu].schedule_lock); - - context_switch_finalise(next); -} - -/* No locking needed -- pointer comparison is safe :-) */ -int idle_cpu(int cpu) -{ - struct vcpu *p = schedule_data[cpu].curr; - return p == idle_task[cpu]; } @@ -493,11 +495,11 @@ static void t_timer_fn(void *unused) { struct vcpu *v = current; - unsigned int cpu = v->processor; + unsigned int cpu = smp_processor_id(); schedule_data[cpu].tick++; - if ( !is_idle_task(v->domain) ) + if ( !is_idle_domain(v->domain) ) { update_dom_time(v); send_guest_virq(v, VIRQ_TIMER); @@ -531,8 +533,8 @@ init_ac_timer(&t_timer[i], t_timer_fn, NULL, i); } - schedule_data[0].curr = idle_task[0]; - schedule_data[0].idle = idle_task[0]; + schedule_data[0].curr = idle_domain[0]; + schedule_data[0].idle = idle_domain[0]; for ( i = 0; schedulers[i] != NULL; i++ ) { @@ -546,10 +548,10 @@ printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); - rc = SCHED_OP(alloc_task, idle_task[0]); + rc = SCHED_OP(alloc_task, idle_domain[0]); BUG_ON(rc < 0); - sched_add_domain(idle_task[0]); + sched_add_domain(idle_domain[0]); } /* diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/vsprintf.c --- a/xen/common/vsprintf.c Mon Jan 9 11:19:55 2006 +++ b/xen/common/vsprintf.c Mon Jan 9 11:22:17 2006 @@ -12,11 +12,15 @@ /* * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@xxxxxxxxxxxxxx> * - changed to provide snprintf and vsnprintf functions + * So Feb 1 16:51:32 CET 2004 Juergen Quade <quade@xxxxxxx> + * - scnprintf and vscnprintf */ #include <stdarg.h> #include <xen/ctype.h> #include <xen/lib.h> +#include <asm/div64.h> +#include <asm/page.h> /** * simple_strtoul - convert a string to an unsigned long @@ -33,11 +37,14 @@ if (*cp == '0') { base = 8; cp++; - if ((*cp == 'x') && isxdigit(cp[1])) { + if ((toupper(*cp) == 'X') && isxdigit(cp[1])) { cp++; base = 16; } } + } else if (base == 16) { 
+ if (cp[0] == '0' && toupper(cp[1]) == 'X') + cp += 2; } while (isxdigit(*cp) && (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) { @@ -49,6 +56,8 @@ return result; } +EXPORT_SYMBOL(simple_strtoul); + /** * simple_strtol - convert a string to a signed long * @cp: The start of the string @@ -61,6 +70,8 @@ return -simple_strtoul(cp+1,endp,base); return simple_strtoul(cp,endp,base); } + +EXPORT_SYMBOL(simple_strtol); /** * simple_strtoull - convert a string to an unsigned long long @@ -77,11 +88,14 @@ if (*cp == '0') { base = 8; cp++; - if ((*cp == 'x') && isxdigit(cp[1])) { + if ((toupper(*cp) == 'X') && isxdigit(cp[1])) { cp++; base = 16; } } + } else if (base == 16) { + if (cp[0] == '0' && toupper(cp[1]) == 'X') + cp += 2; } while (isxdigit(*cp) && (value = isdigit(*cp) ? *cp-'0' : (islower(*cp) ? toupper(*cp) : *cp)-'A'+10) < base) { @@ -92,6 +106,8 @@ *endp = (char *)cp; return result; } + +EXPORT_SYMBOL(simple_strtoull); /** * simple_strtoll - convert a string to a signed long long @@ -123,25 +139,25 @@ #define SPECIAL 32 /* 0x */ #define LARGE 64 /* use 'ABCDEF' instead of 'abcdef' */ -static char * number(char * buf, char * end, long long num, int base, int size, int precision, int type) +static char * number(char * buf, char * end, unsigned long long num, int base, int size, int precision, int type) { char c,sign,tmp[66]; const char *digits; - const char small_digits[] = "0123456789abcdefghijklmnopqrstuvwxyz"; - const char large_digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + static const char small_digits[] = "0123456789abcdefghijklmnopqrstuvwxyz"; + static const char large_digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; int i; digits = (type & LARGE) ? large_digits : small_digits; if (type & LEFT) type &= ~ZEROPAD; if (base < 2 || base > 36) - return buf; + return NULL; c = (type & ZEROPAD) ? '0' : ' '; sign = 0; if (type & SIGN) { - if (num < 0) { + if ((signed long long) num < 0) { sign = '-'; - num = -num; + num = - (signed long long) num; size--; } else if (type & PLUS) { sign = '+'; @@ -160,6 +176,9 @@ i = 0; if (num == 0) tmp[i++]='0'; + else while (num != 0) + tmp[i++] = digits[do_div(num,base)]; +#if 0 else { /* XXX KAF: force unsigned mod and div. */ @@ -167,6 +186,7 @@ unsigned int base2=(unsigned int)base; while (num2 != 0) { tmp[i++] = digits[num2%base2]; num2 /= base2; } } +#endif if (i > precision) precision = i; size -= precision; @@ -222,14 +242,22 @@ } /** -* vsnprintf - Format a string and place it in a buffer -* @buf: The buffer to place the result into -* @size: The size of the buffer, including the trailing null space -* @fmt: The format string to use -* @args: Arguments for the format string -* -* Call this function if you are already dealing with a va_list. -* You probably want snprintf instead. + * vsnprintf - Format a string and place it in a buffer + * @buf: The buffer to place the result into + * @size: The size of the buffer, including the trailing null space + * @fmt: The format string to use + * @args: Arguments for the format string + * + * The return value is the number of characters which would + * be generated for the given input, excluding the trailing + * '\0', as per ISO C99. If you want to have the exact + * number of characters written into @buf as return value + * (not including the trailing '\0'), use vscnprintf. If the + * return is greater than or equal to @size, the resulting + * string is truncated. + * + * Call this function if you are already dealing with a va_list. + * You probably want snprintf instead. 
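The strtoul fixes above make an explicit base of 16 skip a leading "0x", matching what the base-autodetect path already did. Standard C strtoul has the same semantics, so the behaviour is easy to check in user space:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        /* explicit base 16: the "0x" prefix is consumed */
        printf("%lu\n", strtoul("0x1a", NULL, 16));   /* 26 */
        /* base 0: the prefix itself selects hex */
        printf("%lu\n", strtoul("0x1a", NULL, 0));    /* 26 */
        /* base 0 with a bare leading zero selects octal */
        printf("%lu\n", strtoul("017", NULL, 0));     /* 15 */
        return 0;
    }
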
*/ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) { @@ -247,6 +275,9 @@ int qualifier; /* 'h', 'l', or 'L' for integer fields */ /* 'z' support added 23/7/1999 S.H. */ /* 'z' changed to 'Z' --davidm 1/25/99 */ + + /* Reject out-of-range values early */ + BUG_ON((int)size < 0); str = buf; end = buf + size - 1; @@ -307,17 +338,14 @@ /* get the conversion qualifier */ qualifier = -1; - if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || *fmt =='Z') { + if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || + *fmt =='Z' || *fmt == 'z') { qualifier = *fmt; ++fmt; if (qualifier == 'l' && *fmt == 'l') { qualifier = 'L'; ++fmt; } - } - if (*fmt == 'q') { - qualifier = 'L'; - ++fmt; } /* default base */ @@ -345,7 +373,7 @@ case 's': s = va_arg(args, char *); - if (!s) + if ((unsigned long)s < PAGE_SIZE) s = "<NULL>"; len = strnlen(s, precision); @@ -386,7 +414,7 @@ if (qualifier == 'l') { long * ip = va_arg(args, long *); *ip = (str - buf); - } else if (qualifier == 'Z') { + } else if (qualifier == 'Z' || qualifier == 'z') { size_t * ip = va_arg(args, size_t *); *ip = (str - buf); } else { @@ -437,7 +465,7 @@ num = va_arg(args, unsigned long); if (flags & SIGN) num = (signed long) num; - } else if (qualifier == 'Z') { + } else if (qualifier == 'Z' || qualifier == 'z') { num = va_arg(args, size_t); } else if (qualifier == 'h') { num = (unsigned short) va_arg(args, int); @@ -463,12 +491,43 @@ return str-buf; } +EXPORT_SYMBOL(vsnprintf); + +/** + * vscnprintf - Format a string and place it in a buffer + * @buf: The buffer to place the result into + * @size: The size of the buffer, including the trailing null space + * @fmt: The format string to use + * @args: Arguments for the format string + * + * The return value is the number of characters which have been written into + * the @buf not including the trailing '\0'. If @size is <= 0 the function + * returns 0. + * + * Call this function if you are already dealing with a va_list. + * You probably want scnprintf instead. + */ +int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) +{ + int i; + + i=vsnprintf(buf,size,fmt,args); + return (i >= size) ? (size - 1) : i; +} + +EXPORT_SYMBOL(vscnprintf); + /** * snprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string + * + * The return value is the number of characters which would be + * generated for the given input, excluding the trailing null, + * as per ISO C99. If the return is greater than or equal to + * @size, the resulting string is truncated. */ int snprintf(char * buf, size_t size, const char *fmt, ...) { @@ -481,26 +540,61 @@ return i; } +EXPORT_SYMBOL(snprintf); + +/** + * scnprintf - Format a string and place it in a buffer + * @buf: The buffer to place the result into + * @size: The size of the buffer, including the trailing null space + * @fmt: The format string to use + * @...: Arguments for the format string + * + * The return value is the number of characters written into @buf not including + * the trailing '\0'. If @size is <= 0 the function returns 0. If the return is + * greater than or equal to @size, the resulting string is truncated. + */ + +int scnprintf(char * buf, size_t size, const char *fmt, ...) +{ + va_list args; + int i; + + va_start(args, fmt); + i = vsnprintf(buf, size, fmt, args); + va_end(args); + return (i >= size) ? 
(size - 1) : i; +} +EXPORT_SYMBOL(scnprintf); + /** * vsprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @args: Arguments for the format string * + * The function returns the number of characters written + * into @buf. Use vsnprintf or vscnprintf in order to avoid + * buffer overflows. + * * Call this function if you are already dealing with a va_list. * You probably want sprintf instead. */ int vsprintf(char *buf, const char *fmt, va_list args) { - return vsnprintf(buf, 0xFFFFFFFFUL, fmt, args); -} - + return vsnprintf(buf, INT_MAX, fmt, args); +} + +EXPORT_SYMBOL(vsprintf); /** * sprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @...: Arguments for the format string + * + * The function returns the number of characters written + * into @buf. Use snprintf or scnprintf in order to avoid + * buffer overflows. */ int sprintf(char * buf, const char *fmt, ...) { @@ -508,11 +602,12 @@ int i; va_start(args, fmt); - i=vsprintf(buf,fmt,args); + i=vsnprintf(buf, INT_MAX, fmt, args); va_end(args); return i; } +EXPORT_SYMBOL(sprintf); /* * Local variables: diff -r 25e3c8668f1f -r 8af1199488d3 xen/drivers/char/ns16550.c --- a/xen/drivers/char/ns16550.c Mon Jan 9 11:19:55 2006 +++ b/xen/drivers/char/ns16550.c Mon Jan 9 11:22:17 2006 @@ -13,6 +13,7 @@ #include <xen/irq.h> #include <xen/sched.h> #include <xen/serial.h> +#include <xen/iocap.h> #include <asm/io.h> /* @@ -233,11 +234,11 @@ } #ifdef CONFIG_X86 -#include <asm/physdev.h> static void ns16550_endboot(struct serial_port *port) { struct ns16550 *uart = port->uart; - physdev_modify_ioport_access_range(dom0, 0, uart->io_base, 8); + if ( ioports_deny_access(dom0, uart->io_base, uart->io_base + 7) != 0 ) + BUG(); } #else #define ns16550_endboot NULL diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-ia64/domain.h --- a/xen/include/asm-ia64/domain.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-ia64/domain.h Mon Jan 9 11:22:17 2006 @@ -10,7 +10,7 @@ #include <asm/vmx_platform.h> #include <xen/list.h> -extern void arch_do_createdomain(struct vcpu *); +extern int arch_do_createdomain(struct vcpu *); extern void domain_relinquish_resources(struct domain *); diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-ia64/linux-xen/asm/pal.h --- a/xen/include/asm-ia64/linux-xen/asm/pal.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-ia64/linux-xen/asm/pal.h Mon Jan 9 11:22:17 2006 @@ -925,7 +925,11 @@ ia64_pal_cache_flush (u64 cache_type, u64 invalidate, u64 *progress, u64 *vector) { struct ia64_pal_retval iprv; +#ifdef XEN /* fix a bug in Linux... 
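scnprintf() above differs from snprintf() only in what it returns: the number of characters actually stored rather than the length the full output would have had. That is what makes the 'len += scnprintf(...)' accumulation in the bitmap printers safe against overrunning the buffer. A user-space comparison applying the same clamp:

    #include <stdio.h>

    int main(void)
    {
        char buf[8];

        int would_be = snprintf(buf, sizeof(buf), "abcdefghij");
        int stored = (would_be >= (int)sizeof(buf)) ? (int)sizeof(buf) - 1
                                                    : would_be;

        printf("snprintf: %d, scnprintf-style: %d, buf=\"%s\"\n",
               would_be, stored, buf);   /* 10, 7, "abcdefg" */
        return 0;
    }
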
PAL has changed */ + PAL_CALL(iprv, PAL_CACHE_FLUSH, cache_type, invalidate, *progress); +#else PAL_CALL_IC_OFF(iprv, PAL_CACHE_FLUSH, cache_type, invalidate, *progress); +#endif if (vector) *vector = iprv.v0; *progress = iprv.v1; diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-ia64/vmx.h --- a/xen/include/asm-ia64/vmx.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-ia64/vmx.h Mon Jan 9 11:22:17 2006 @@ -23,7 +23,7 @@ #define _ASM_IA64_VT_H #define RR7_SWITCH_SHIFT 12 /* 4k enough */ -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> extern void identify_vmx_feature(void); extern unsigned int vmx_enabled; diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/config.h --- a/xen/include/asm-x86/config.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/config.h Mon Jan 9 11:22:17 2006 @@ -248,12 +248,10 @@ #ifdef CONFIG_X86_PAE /* Hypervisor owns top 168MB of virtual address space. */ -# define __HYPERVISOR_VIRT_START 0xF5800000 -# define HYPERVISOR_VIRT_START (0xF5800000UL) +#define HYPERVISOR_VIRT_START mk_unsigned_long(0xF5800000) #else /* Hypervisor owns top 64MB of virtual address space. */ -# define __HYPERVISOR_VIRT_START 0xFC000000 -# define HYPERVISOR_VIRT_START (0xFC000000UL) +#define HYPERVISOR_VIRT_START mk_unsigned_long(0xFC000000) #endif #define L2_PAGETABLE_FIRST_XEN_SLOT \ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/current.h --- a/xen/include/asm-x86/current.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/current.h Mon Jan 9 11:22:17 2006 @@ -49,7 +49,7 @@ #define reset_stack_and_jump(__fn) \ __asm__ __volatile__ ( \ "mov %0,%%"__OP"sp; jmp "STR(__fn) \ - : : "r" (guest_cpu_user_regs()) ) + : : "r" (guest_cpu_user_regs()) : "memory" ) #define schedule_tail(_ed) (((_ed)->arch.schedule_tail)(_ed)) diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/domain.h Mon Jan 9 11:22:17 2006 @@ -24,8 +24,8 @@ /* Writable pagetables. */ struct ptwr_info ptwr[2]; - /* I/O-port access bitmap mask. */ - u8 *iobmp_mask; /* Address of IO bitmap mask, or NULL. */ + /* I/O-port admin-specified access capabilities. */ + struct rangeset *ioport_caps; /* Shadow mode status and controls. */ struct shadow_ops *ops; diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/mm.h Mon Jan 9 11:22:17 2006 @@ -336,11 +336,13 @@ int revalidate_l1(struct domain *, l1_pgentry_t *, l1_pgentry_t *); void cleanup_writable_pagetable(struct domain *d); -#define sync_pagetable_state(d) \ - do { \ - LOCK_BIGLOCK(d); \ - cleanup_writable_pagetable(d); \ - UNLOCK_BIGLOCK(d); \ +#define sync_pagetable_state(d) \ + do { \ + LOCK_BIGLOCK(d); \ + /* Avoid racing with ptwr_destroy(). 
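Once _DOMF_dying is set the ptwr state may already have been torn down, so the cleanup is skipped for dying domains.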
*/ \ + if ( !test_bit(_DOMF_dying, &(d)->domain_flags) ) \ + cleanup_writable_pagetable(d); \ + UNLOCK_BIGLOCK(d); \ } while ( 0 ) int audit_adjust_pgtables(struct domain *d, int dir, int noisy); diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/msr.h --- a/xen/include/asm-x86/msr.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/msr.h Mon Jan 9 11:22:17 2006 @@ -12,7 +12,7 @@ __asm__ __volatile__("rdmsr" \ : "=a" (a__), "=d" (b__) \ : "c" (msr)); \ - val = a__ | (b__<<32); \ + val = a__ | ((u64)b__<<32); \ } while(0); #define wrmsr(msr,val1,val2) \ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/processor.h --- a/xen/include/asm-x86/processor.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/processor.h Mon Jan 9 11:22:17 2006 @@ -190,7 +190,7 @@ #ifdef CONFIG_X86_HT extern void detect_ht(struct cpuinfo_x86 *c); #else -static inline void detect_ht(struct cpuinfo_x86 *c) {} +static always_inline void detect_ht(struct cpuinfo_x86 *c) {} #endif /* @@ -209,7 +209,7 @@ /* * CPUID functions returning a single datum */ -static inline unsigned int cpuid_eax(unsigned int op) +static always_inline unsigned int cpuid_eax(unsigned int op) { unsigned int eax; @@ -219,7 +219,7 @@ : "bx", "cx", "dx"); return eax; } -static inline unsigned int cpuid_ebx(unsigned int op) +static always_inline unsigned int cpuid_ebx(unsigned int op) { unsigned int eax, ebx; @@ -229,7 +229,7 @@ : "cx", "dx" ); return ebx; } -static inline unsigned int cpuid_ecx(unsigned int op) +static always_inline unsigned int cpuid_ecx(unsigned int op) { unsigned int eax, ecx; @@ -239,7 +239,7 @@ : "bx", "dx" ); return ecx; } -static inline unsigned int cpuid_edx(unsigned int op) +static always_inline unsigned int cpuid_edx(unsigned int op) { unsigned int eax, edx; @@ -281,7 +281,7 @@ */ extern unsigned long mmu_cr4_features; -static inline void set_in_cr4 (unsigned long mask) +static always_inline void set_in_cr4 (unsigned long mask) { unsigned long dummy; mmu_cr4_features |= mask; @@ -292,7 +292,7 @@ : "=&r" (dummy) : "irg" (mask) ); } -static inline void clear_in_cr4 (unsigned long mask) +static always_inline void clear_in_cr4 (unsigned long mask) { unsigned long dummy; mmu_cr4_features &= ~mask; @@ -334,7 +334,7 @@ outb((data), 0x23); \ } while (0) -static inline void __monitor(const void *eax, unsigned long ecx, +static always_inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { /* "monitor %eax,%ecx,%edx;" */ @@ -343,7 +343,7 @@ : :"a" (eax), "c" (ecx), "d"(edx)); } -static inline void __mwait(unsigned long eax, unsigned long ecx) +static always_inline void __mwait(unsigned long eax, unsigned long ecx) { /* "mwait %eax,%ecx;" */ asm volatile( @@ -460,7 +460,7 @@ }; /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
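The rep;nop byte sequence decodes as PAUSE on processors that support it and as an ordinary NOP on older ones, so it is safe everywhere.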
*/ -static inline void rep_nop(void) +static always_inline void rep_nop(void) { __asm__ __volatile__ ( "rep;nop" : : : "memory" ); } @@ -471,7 +471,7 @@ #ifdef CONFIG_MPENTIUMIII #define ARCH_HAS_PREFETCH -extern inline void prefetch(const void *x) +extern always_inline void prefetch(const void *x) { __asm__ __volatile__ ("prefetchnta (%0)" : : "r"(x)); } @@ -482,12 +482,12 @@ #define ARCH_HAS_PREFETCHW #define ARCH_HAS_SPINLOCK_PREFETCH -extern inline void prefetch(const void *x) +extern always_inline void prefetch(const void *x) { __asm__ __volatile__ ("prefetch (%0)" : : "r"(x)); } -extern inline void prefetchw(const void *x) +extern always_inline void prefetchw(const void *x) { __asm__ __volatile__ ("prefetchw (%0)" : : "r"(x)); } diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/shadow.h --- a/xen/include/asm-x86/shadow.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/shadow.h Mon Jan 9 11:22:17 2006 @@ -341,10 +341,10 @@ #if SHADOW_VERBOSE_DEBUG #define SH_LOG(_f, _a...) \ printk("DOM%uP%u: SH_LOG(%d): " _f "\n", \ - current->domain->domain_id , current->processor, __LINE__ , ## _a ) + current->domain->domain_id , smp_processor_id(), __LINE__ , ## _a ) #define SH_VLOG(_f, _a...) \ printk("DOM%uP%u: SH_VLOG(%d): " _f "\n", \ - current->domain->domain_id, current->processor, __LINE__ , ## _a ) + current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) #else #define SH_LOG(_f, _a...) ((void)0) #define SH_VLOG(_f, _a...) ((void)0) @@ -353,7 +353,7 @@ #if SHADOW_VVERBOSE_DEBUG #define SH_VVLOG(_f, _a...) \ printk("DOM%uP%u: SH_VVLOG(%d): " _f "\n", \ - current->domain->domain_id, current->processor, __LINE__ , ## _a ) + current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) #else #define SH_VVLOG(_f, _a...) ((void)0) #endif @@ -361,7 +361,7 @@ #if SHADOW_VVVERBOSE_DEBUG #define SH_VVVLOG(_f, _a...) \ printk("DOM%uP%u: SH_VVVLOG(%d): " _f "\n", \ - current->domain->domain_id, current->processor, __LINE__ , ## _a ) + current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) #else #define SH_VVVLOG(_f, _a...) ((void)0) #endif @@ -369,7 +369,7 @@ #if FULLSHADOW_DEBUG #define FSH_LOG(_f, _a...) \ printk("DOM%uP%u: FSH_LOG(%d): " _f "\n", \ - current->domain->domain_id, current->processor, __LINE__ , ## _a ) + current->domain->domain_id, smp_processor_id(), __LINE__ , ## _a ) #else #define FSH_LOG(_f, _a...) 
((void)0) #endif @@ -591,7 +591,7 @@ if ( need_flush ) { perfc_incrc(update_hl2e_invlpg); - flush_tlb_one_mask(v->domain->cpumask, + flush_tlb_one_mask(v->domain->domain_dirty_cpumask, &linear_pg_table[l1_linear_offset(va)]); } } diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/vmx.h --- a/xen/include/asm-x86/vmx.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/vmx.h Mon Jan 9 11:22:17 2006 @@ -26,7 +26,7 @@ #include <asm/vmx_vmcs.h> #include <asm/i387.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> extern int hvm_enabled; @@ -38,7 +38,6 @@ extern void arch_vmx_do_launch(struct vcpu *); extern void arch_vmx_do_resume(struct vcpu *); -extern void arch_vmx_do_relaunch(struct vcpu *); extern unsigned int cpu_rev; @@ -506,7 +505,7 @@ static inline unsigned int vmx_get_vcpu_nr(struct domain *d) { - return d->arch.vmx_platform.nr_vcpu; + return d->arch.vmx_platform.nr_vcpus; } static inline shared_iopage_t *get_sp(struct domain *d) diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/vmx_intercept.h --- a/xen/include/asm-x86/vmx_intercept.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/vmx_intercept.h Mon Jan 9 11:22:17 2006 @@ -6,7 +6,7 @@ #include <xen/lib.h> #include <xen/time.h> #include <xen/errno.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #define MAX_IO_HANDLER 8 diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/vmx_platform.h --- a/xen/include/asm-x86/vmx_platform.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/vmx_platform.h Mon Jan 9 11:22:17 2006 @@ -33,10 +33,10 @@ (((size_reg) << 24) | ((index) << 16) | ((seg) << 8) | (flag)) #define operand_size(operand) \ - ((operand >> 24) & 0xFF) + ((operand >> 24) & 0xFF) #define operand_index(operand) \ - ((operand >> 16) & 0xFF) + ((operand >> 16) & 0xFF) /* for instruction.operand[].size */ #define BYTE 1 @@ -81,13 +81,13 @@ struct vmx_platform { unsigned long shared_page_va; - unsigned int nr_vcpu; - unsigned int lapic_enable; + unsigned int nr_vcpus; + unsigned int apic_enabled; struct vmx_virpit vmx_pit; struct vmx_io_handler vmx_io_handler; struct vmx_virpic vmx_pic; - struct vmx_vioapic vmx_vioapic; + struct vmx_vioapic vmx_vioapic; unsigned char round_info[256]; spinlock_t round_robin_lock; int interrupt_request; diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/vmx_vlapic.h --- a/xen/include/asm-x86/vmx_vlapic.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/vmx_vlapic.h Mon Jan 9 11:22:17 2006 @@ -21,7 +21,7 @@ #define VMX_VLAPIC_H #include <asm/msr.h> -#include <public/io/ioreq.h> +#include <public/hvm/ioreq.h> #if defined(__i386__) || defined(__x86_64__) static inline int __fls(uint32_t word) diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/vmx_vmcs.h --- a/xen/include/asm-x86/vmx_vmcs.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/vmx_vmcs.h Mon Jan 9 11:22:17 2006 @@ -23,7 +23,7 @@ #include <asm/vmx_cpu.h> #include <asm/vmx_platform.h> #include <asm/vmx_vlapic.h> -#include <public/vmx_assist.h> +#include <public/hvm/vmx_assist.h> extern int start_vmx(void); extern void stop_vmx(void); @@ -86,7 +86,8 @@ #define PC_DEBUG_PORT 0x80 struct arch_vmx_struct { - struct vmcs_struct *vmcs; /* VMCS pointer in virtual */ + struct vmcs_struct *vmcs; /* VMCS pointer in virtual. */ + unsigned int launch_cpu; /* VMCS is valid on this CPU. 
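A VMCS can only be resumed on the CPU where it was last loaded, so this presumably lets the context-switch path detect that a VCPU has migrated and perform a fresh launch.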
*/ unsigned long flags; /* VMCS flags */ unsigned long cpu_cr0; /* copy of guest CR0 */ unsigned long cpu_shadow_cr0; /* copy of guest read shadow CR0 */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/x86_emulate.h --- a/xen/include/asm-x86/x86_emulate.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/x86_emulate.h Mon Jan 9 11:22:17 2006 @@ -18,10 +18,11 @@ * special treatment or emulation (*_emulated). * * The emulator assumes that an instruction accesses only one 'emulated memory' - * location, and that this is one of its data operands. Instruction fetches and + * location, that this location is the given linear faulting address (cr2), and + * that this is one of the instruction's data operands. Instruction fetches and * stack operations are assumed never to access emulated memory. The emulator * automatically deduces which operand of a string-move operation is accessing - * emulated memory, and requires that the other operand accesses normal memory. + * emulated memory, and assumes that the other operand accesses normal memory. * * NOTES: * 1. The emulator isn't very smart about emulated vs. standard memory. @@ -36,6 +37,7 @@ * then immediately bail. * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only * cmpxchg8b_emulated need support 8-byte accesses. + * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. */ /* Access completed successfully: continue emulation as normal. */ #define X86EMUL_CONTINUE 0 @@ -141,14 +143,27 @@ struct cpu_user_regs; +/* Execution mode, passed to the emulator. */ +#define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ +#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ +#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ + +/* Host execution mode. */ +#if defined(__i386__) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 +#elif defined(__x86_64__) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 +#endif + /* * x86_emulate_memop: Emulate an instruction that faulted attempting to * read/write a 'special' memory area. * @regs: Register state at time of fault. - * @cr2: Linear faulting address. + * @cr2: Linear faulting address within an emulated/special memory area. * @ops: Interface to access special memory. - * @mode: Current execution mode, represented by the default size of memory - * addresses, in bytes. Valid values are 2, 4 and 8 (x86/64 only). + * @mode: Emulated execution mode, represented by an X86EMUL_MODE value. + * Returns -1 on failure, 0 on success. */ extern int x86_emulate_memop( diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/public/arch-x86_32.h --- a/xen/include/public/arch-x86_32.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/public/arch-x86_32.h Mon Jan 9 11:22:17 2006 @@ -49,10 +49,15 @@ * machine->physical mapping table starts at this address, read-only. 
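* Guests read it to translate machine frame numbers into their own pseudo-physical frame numbers.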
*/ #ifdef CONFIG_X86_PAE -# define HYPERVISOR_VIRT_START (0xF5800000UL) +#define __HYPERVISOR_VIRT_START 0xF5800000 #else -# define HYPERVISOR_VIRT_START (0xFC000000UL) +#define __HYPERVISOR_VIRT_START 0xFC000000 #endif + +#ifndef HYPERVISOR_VIRT_START +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +#endif + #ifndef machine_to_phys_mapping #define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) #endif @@ -137,7 +142,7 @@ unsigned long pad[5]; /* sizeof(vcpu_info_t) == 64 */ } arch_vcpu_info_t; -#endif +#endif /* !__ASSEMBLY__ */ #endif diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/public/arch-x86_64.h --- a/xen/include/public/arch-x86_64.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/public/arch-x86_64.h Mon Jan 9 11:22:17 2006 @@ -59,9 +59,12 @@ /* And the trap vector is... */ #define TRAP_INSTR "syscall" +#define __HYPERVISOR_VIRT_START 0xFFFF800000000000 +#define __HYPERVISOR_VIRT_END 0xFFFF880000000000 + #ifndef HYPERVISOR_VIRT_START -#define HYPERVISOR_VIRT_START (0xFFFF800000000000UL) -#define HYPERVISOR_VIRT_END (0xFFFF880000000000UL) +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END) #endif /* Maximum number of virtual CPUs in multi-processor guests. */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/public/dom0_ops.h --- a/xen/include/public/dom0_ops.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/public/dom0_ops.h Mon Jan 9 11:22:17 2006 @@ -94,14 +94,14 @@ xen_domain_handle_t handle; } dom0_getdomaininfo_t; -#define DOM0_SETDOMAININFO 13 +#define DOM0_SETVCPUCONTEXT 13 typedef struct { /* IN variables. */ domid_t domain; uint32_t vcpu; /* IN/OUT parameters */ vcpu_guest_context_t *ctxt; -} dom0_setdomaininfo_t; +} dom0_setvcpucontext_t; #define DOM0_MSR 15 typedef struct { @@ -163,13 +163,13 @@ /* * Set which physical cpus a vcpu can execute on. */ -#define DOM0_PINCPUDOMAIN 20 +#define DOM0_SETVCPUAFFINITY 20 typedef struct { /* IN variables. */ domid_t domain; uint32_t vcpu; cpumap_t cpumap; -} dom0_pincpudomain_t; +} dom0_setvcpuaffinity_t; /* Get trace buffers machine base address */ #define DOM0_TBUFCONTROL 21 @@ -410,6 +410,21 @@ uint8_t enable; } dom0_setdebugging_t; +#define DOM0_IRQ_PERMISSION 46 +typedef struct { + domid_t domain; /* domain to be affected */ + uint8_t pirq; + uint8_t allow_access; /* flag to specify enable/disable of IRQ access */ +} dom0_irq_permission_t; + +#define DOM0_IOMEM_PERMISSION 47 +typedef struct { + domid_t domain; /* domain to be affected */ + unsigned long first_pfn; /* first page (physical page number) in range */ + unsigned long nr_pfns; /* number of pages in range (>0) */ + uint8_t allow_access; /* allow (!0) or deny (0) access to range? 
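Issued by the control domain, e.g. to grant a driver domain access to a device's memory-mapped I/O range.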
*/ +} dom0_iomem_permission_t; + typedef struct { uint32_t cmd; uint32_t interface_version; /* DOM0_INTERFACE_VERSION */ @@ -421,13 +436,13 @@ dom0_getmemlist_t getmemlist; dom0_schedctl_t schedctl; dom0_adjustdom_t adjustdom; - dom0_setdomaininfo_t setdomaininfo; + dom0_setvcpucontext_t setvcpucontext; dom0_getdomaininfo_t getdomaininfo; dom0_getpageframeinfo_t getpageframeinfo; dom0_msr_t msr; dom0_settime_t settime; dom0_readconsole_t readconsole; - dom0_pincpudomain_t pincpudomain; + dom0_setvcpuaffinity_t setvcpuaffinity; dom0_tbufcontrol_t tbufcontrol; dom0_physinfo_t physinfo; dom0_sched_id_t sched_id; @@ -448,6 +463,8 @@ dom0_max_vcpus_t max_vcpus; dom0_setdomainhandle_t setdomainhandle; dom0_setdebugging_t setdebugging; + dom0_irq_permission_t irq_permission; + dom0_iomem_permission_t iomem_permission; uint8_t pad[128]; } u; } dom0_op_t; diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/public/xen.h --- a/xen/include/public/xen.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/public/xen.h Mon Jan 9 11:22:17 2006 @@ -426,6 +426,15 @@ typedef uint8_t xen_domain_handle_t[16]; +/* Turn a plain number into a C unsigned long constant. */ +#define __mk_unsigned_long(x) x ## UL +#define mk_unsigned_long(x) __mk_unsigned_long(x) + +#else /* __ASSEMBLY__ */ + +/* In assembly code we cannot use C numeric constant suffixes. */ +#define mk_unsigned_long(x) x + #endif /* !__ASSEMBLY__ */ #endif /* __XEN_PUBLIC_XEN_H__ */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/bitmap.h --- a/xen/include/xen/bitmap.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/bitmap.h Mon Jan 9 11:22:17 2006 @@ -41,6 +41,8 @@ * bitmap_weight(src, nbits) Hamming Weight: number set bits * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n + * bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf + * bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf */ /* @@ -93,6 +95,10 @@ const unsigned long *bitmap2, int bits); extern int __bitmap_weight(const unsigned long *bitmap, int bits); +extern int bitmap_scnprintf(char *buf, unsigned int len, + const unsigned long *src, int nbits); +extern int bitmap_scnlistprintf(char *buf, unsigned int len, + const unsigned long *src, int nbits); extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/compiler.h --- a/xen/include/xen/compiler.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/compiler.h Mon Jan 9 11:22:17 2006 @@ -19,4 +19,10 @@ #define __attribute_used__ __attribute__((__unused__)) #endif +#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4) +#define __must_check __attribute__((warn_unused_result)) +#else +#define __must_check +#endif + #endif /* __LINUX_COMPILER_H */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/config.h --- a/xen/include/xen/config.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/config.h Mon Jan 9 11:22:17 2006 @@ -43,4 +43,13 @@ #define __STR(...) #__VA_ARGS__ #define STR(...) __STR(__VA_ARGS__) +#ifndef __ASSEMBLY__ +/* Turn a plain number into a C unsigned long constant. */ +#define __mk_unsigned_long(x) x ## UL +#define mk_unsigned_long(x) __mk_unsigned_long(x) +#else /* __ASSEMBLY__ */ +/* In assembly code we cannot use C numeric constant suffixes. 
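For example, mk_unsigned_long(0xFC000000) expands to 0xFC000000UL in C but to the bare constant 0xFC000000 in assembly.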
*/ +#define mk_unsigned_long(x) x +#endif /* !__ASSEMBLY__ */ + #endif /* __XEN_CONFIG_H__ */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/cpumask.h --- a/xen/include/xen/cpumask.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/cpumask.h Mon Jan 9 11:22:17 2006 @@ -8,8 +8,8 @@ * See detailed comments in the file xen/bitmap.h describing the * data type on which these cpumasks are based. * - * For details of cpumask_scnprintf() and cpumask_parse(), - * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. + * For details of cpumask_scnprintf() and cpulist_scnprintf(), + * see bitmap_scnprintf() and bitmap_scnlistprintf() in lib/bitmap.c. * * The available cpumask operations are: * @@ -36,8 +36,8 @@ * void cpus_shift_right(dst, src, n) Shift right * void cpus_shift_left(dst, src, n) Shift left * - * int first_cpu(mask) Number lowest set bit, or >= NR_CPUS - * int next_cpu(cpu, mask) Next cpu past 'cpu', or >= NR_CPUS + * int first_cpu(mask) Number lowest set bit, or NR_CPUS + * int next_cpu(cpu, mask) Next cpu past 'cpu', or NR_CPUS * * cpumask_t cpumask_of_cpu(cpu) Return cpumask with bit 'cpu' set * CPU_MASK_ALL Initializer - all bits set @@ -45,7 +45,7 @@ * unsigned long *cpus_addr(mask) Array of unsigned long's in mask * * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing - * int cpumask_parse(ubuf, ulen, mask) Parse ascii string as cpumask + * int cpulist_scnprintf(buf, len, mask) Format cpumask as list for printing * * for_each_cpu_mask(cpu, mask) for-loop cpu over mask * @@ -207,13 +207,13 @@ #define first_cpu(src) __first_cpu(&(src), NR_CPUS) static inline int __first_cpu(const cpumask_t *srcp, int nbits) { - return find_first_bit(srcp->bits, nbits); + return min_t(int, nbits, find_first_bit(srcp->bits, nbits)); } #define next_cpu(n, src) __next_cpu((n), &(src), NR_CPUS) static inline int __next_cpu(int n, const cpumask_t *srcp, int nbits) { - return find_next_bit(srcp->bits, nbits, n+1); + return min_t(int, nbits, find_next_bit(srcp->bits, nbits, n+1)); } #define cpumask_of_cpu(cpu) \ @@ -259,7 +259,6 @@ #define cpus_addr(src) ((src).bits) -/* #define cpumask_scnprintf(buf, len, src) \ __cpumask_scnprintf((buf), (len), &(src), NR_CPUS) static inline int __cpumask_scnprintf(char *buf, int len, @@ -268,14 +267,13 @@ return bitmap_scnprintf(buf, len, srcp->bits, nbits); } -#define cpumask_parse(ubuf, ulen, src) \ - __cpumask_parse((ubuf), (ulen), &(src), NR_CPUS) -static inline int __cpumask_parse(const char __user *buf, int len, - cpumask_t *dstp, int nbits) -{ - return bitmap_parse(buf, len, dstp->bits, nbits); -} -*/ +#define cpulist_scnprintf(buf, len, src) \ + __cpulist_scnprintf((buf), (len), &(src), NR_CPUS) +static inline int __cpulist_scnprintf(char *buf, int len, + const cpumask_t *srcp, int nbits) +{ + return bitmap_scnlistprintf(buf, len, srcp->bits, nbits); +} #if NR_CPUS > 1 #define for_each_cpu_mask(cpu, mask) \ @@ -368,7 +366,7 @@ for_each_cpu_mask(cpu, (mask)) \ if (cpu_online(cpu)) \ break; \ - min_t(int, NR_CPUS, cpu); \ + cpu; \ }) #define for_each_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map) diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/domain.h --- a/xen/include/xen/domain.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/domain.h Mon Jan 9 11:22:17 2006 @@ -13,12 +13,10 @@ extern void free_vcpu_struct(struct vcpu *v); -extern void arch_do_createdomain(struct vcpu *v); +extern int arch_do_createdomain(struct vcpu *v); -extern int arch_set_info_guest( +extern int arch_set_info_guest( struct vcpu *v, struct vcpu_guest_context 
*c); - -extern void vcpu_migrate_cpu(struct vcpu *v, int newcpu); extern void free_perdomain_pt(struct domain *d); diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/lib.h --- a/xen/include/xen/lib.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/lib.h Mon Jan 9 11:22:17 2006 @@ -53,10 +53,16 @@ /* vsprintf.c */ extern int sprintf(char * buf, const char * fmt, ...) __attribute__ ((format (printf, 2, 3))); -extern int vsprintf(char *buf, const char *, va_list); +extern int vsprintf(char *buf, const char *, va_list) + __attribute__ ((format (printf, 2, 0))); extern int snprintf(char * buf, size_t size, const char * fmt, ...) __attribute__ ((format (printf, 3, 4))); -extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args); +extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) + __attribute__ ((format (printf, 3, 0))); +extern int scnprintf(char * buf, size_t size, const char * fmt, ...) + __attribute__ ((format (printf, 3, 4))); +extern int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) + __attribute__ ((format (printf, 3, 0))); long simple_strtol( const char *cp,char **endp, unsigned int base); diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/sched-if.h --- a/xen/include/xen/sched-if.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/sched-if.h Mon Jan 9 11:22:17 2006 @@ -13,8 +13,8 @@ struct schedule_data { spinlock_t schedule_lock; /* spinlock protecting curr */ - struct vcpu *curr; /* current task */ - struct vcpu *idle; /* idle task for this cpu */ + struct vcpu *curr; /* current task */ + struct vcpu *idle; /* idle task for this cpu */ void *sched_priv; struct ac_timer s_timer; /* scheduling timer */ unsigned long tick; /* current periodic 'tick' */ @@ -39,6 +39,7 @@ void (*rem_task) (struct vcpu *); void (*sleep) (struct vcpu *); void (*wake) (struct vcpu *); + int (*set_affinity) (struct vcpu *, cpumask_t *); struct task_slice (*do_schedule) (s_time_t); int (*control) (struct sched_ctl_cmd *); int (*adjdom) (struct domain *, diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/sched.h --- a/xen/include/xen/sched.h Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/sched.h Mon Jan 9 11:22:17 2006 @@ -11,6 +11,7 @@ #include <xen/time.h> #include <xen/ac_timer.h> #include <xen/grant_table.h> +#include <xen/rangeset.h> #include <asm/domain.h> extern unsigned long volatile jiffies; @@ -50,8 +51,6 @@ int evtchn_init(struct domain *d); void evtchn_destroy(struct domain *d); -#define CPUMAP_RUNANYWHERE 0xFFFFFFFF - struct vcpu { int vcpu_id; @@ -79,7 +78,11 @@ atomic_t pausecnt; - cpumap_t cpumap; /* which cpus this domain can run on */ + /* Bitmask of CPUs on which this VCPU may run. */ + cpumask_t cpu_affinity; + + /* Bitmask of CPUs which are holding onto this VCPU's state. */ + cpumask_t vcpu_dirty_cpumask; struct arch_vcpu arch; }; @@ -109,6 +112,9 @@ struct domain *next_in_list; struct domain *next_in_hashbucket; + + struct list_head rangesets; + spinlock_t rangesets_lock; /* Event channel information. */ struct evtchn *evtchn[NR_EVTCHN_BUCKETS]; @@ -125,6 +131,10 @@ u16 pirq_to_evtchn[NR_PIRQS]; u32 pirq_mask[NR_PIRQS/32]; + /* I/O capabilities (access to IRQs and memory-mapped I/O). */ + struct rangeset *iomem_caps; + struct rangeset *irq_caps; + unsigned long domain_flags; unsigned long vm_assist; @@ -133,7 +143,7 @@ struct vcpu *vcpu[MAX_VIRT_CPUS]; /* Bitmask of CPUs which are holding onto this domain's state. 
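Such CPUs may hold stale TLB entries or lazily-saved context (cf. the flush_tlb_one_mask() call on domain_dirty_cpumask above).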
*/ - cpumask_t cpumask; + cpumask_t domain_dirty_cpumask; struct arch_domain arch; @@ -165,9 +175,9 @@ extern struct domain idle0_domain; extern struct vcpu idle0_vcpu; -extern struct vcpu *idle_task[NR_CPUS]; +extern struct vcpu *idle_domain[NR_CPUS]; #define IDLE_DOMAIN_ID (0x7FFFU) -#define is_idle_task(_d) (test_bit(_DOMF_idle_domain, &(_d)->domain_flags)) +#define is_idle_domain(_d) (test_bit(_DOMF_idle_domain, &(_d)->domain_flags)) struct vcpu *alloc_vcpu( struct domain *d, unsigned int vcpu_id, unsigned int cpu_id); @@ -215,7 +225,7 @@ unsigned long image_start, unsigned long image_len, unsigned long initrd_start, unsigned long initrd_len, char *cmdline); -extern int set_info_guest(struct domain *d, dom0_setdomaininfo_t *); +extern int set_info_guest(struct domain *d, dom0_setvcpucontext_t *); struct domain *find_domain_by_id(domid_t dom); extern void domain_destruct(struct domain *d); @@ -261,36 +271,27 @@ extern void sync_vcpu_execstate(struct vcpu *v); /* - * Called by the scheduler to switch to another VCPU. On entry, although - * VCPUF_running is no longer asserted for @prev, its context is still running - * on the local CPU and is not committed to memory. The local scheduler lock - * is therefore still held, and interrupts are disabled, because the local CPU - * is in an inconsistent state. - * - * The callee must ensure that the local CPU is no longer running in @prev's - * context, and that the context is saved to memory, before returning. - * Alternatively, if implementing lazy context switching, it suffices to ensure - * that invoking sync_vcpu_execstate() will switch and commit @prev's state. + * Called by the scheduler to switch to another VCPU. This function must + * call context_saved(@prev) when the local CPU is no longer running in + * @prev's context, and that context is saved to memory. Alternatively, if + * implementing lazy context switching, it suffices to ensure that invoking + * sync_vcpu_execstate() will switch and commit @prev's state. */ extern void context_switch( struct vcpu *prev, struct vcpu *next); /* - * On some architectures (notably x86) it is not possible to entirely load - * @next's context with interrupts disabled. These may implement a function to - * finalise loading the new context after interrupts are re-enabled. This - * function is not given @prev and is not permitted to access it. - */ -extern void context_switch_finalise( - struct vcpu *next); + * As described above, context_switch() must call this function when the + * local CPU is no longer running in @prev's context, and @prev's context is + * saved to memory. Alternatively, if implementing lazy context switching, + * ensure that invoking sync_vcpu_execstate() will switch and commit @prev. + */ +#define context_saved(prev) (clear_bit(_VCPUF_running, &(prev)->vcpu_flags)) /* Called by the scheduler to continue running the current VCPU. */ extern void continue_running( struct vcpu *same); - -/* Is CPU 'cpu' idle right now? */ -int idle_cpu(int cpu); void startup_cpu_idle_loop(void); @@ -356,17 +357,11 @@ /* Currently running on a CPU? */ #define _VCPUF_running 3 #define VCPUF_running (1UL<<_VCPUF_running) - /* Disables auto-migration between CPUs. */ -#define _VCPUF_cpu_pinned 4 -#define VCPUF_cpu_pinned (1UL<<_VCPUF_cpu_pinned) - /* Domain migrated between CPUs. */ -#define _VCPUF_cpu_migrated 5 -#define VCPUF_cpu_migrated (1UL<<_VCPUF_cpu_migrated) /* Initialization completed. 
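That is, an initial guest context has been supplied for this VCPU (cf. arch_set_info_guest()).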
*/ -#define _VCPUF_initialised 6 +#define _VCPUF_initialised 4 #define VCPUF_initialised (1UL<<_VCPUF_initialised) /* VCPU is not-runnable */ -#define _VCPUF_down 7 +#define _VCPUF_down 5 #define VCPUF_down (1UL<<_VCPUF_down) /* @@ -378,32 +373,25 @@ /* Is this domain privileged? */ #define _DOMF_privileged 1 #define DOMF_privileged (1UL<<_DOMF_privileged) - /* May this domain do IO to physical devices? */ -#define _DOMF_physdev_access 2 -#define DOMF_physdev_access (1UL<<_DOMF_physdev_access) /* Guest shut itself down for some reason. */ -#define _DOMF_shutdown 3 +#define _DOMF_shutdown 2 #define DOMF_shutdown (1UL<<_DOMF_shutdown) - /* Guest is in process of shutting itself down (becomes DOMF_shutdown). */ -#define _DOMF_shuttingdown 4 -#define DOMF_shuttingdown (1UL<<_DOMF_shuttingdown) /* Death rattle. */ -#define _DOMF_dying 5 +#define _DOMF_dying 3 #define DOMF_dying (1UL<<_DOMF_dying) /* Domain is paused by controller software. */ -#define _DOMF_ctrl_pause 6 +#define _DOMF_ctrl_pause 4 #define DOMF_ctrl_pause (1UL<<_DOMF_ctrl_pause) /* Domain is being debugged by controller software. */ -#define _DOMF_debugging 7 +#define _DOMF_debugging 5 #define DOMF_debugging (1UL<<_DOMF_debugging) -static inline int domain_runnable(struct vcpu *v) +static inline int vcpu_runnable(struct vcpu *v) { return ( (atomic_read(&v->pausecnt) == 0) && !(v->vcpu_flags & (VCPUF_blocked|VCPUF_down)) && - !(v->domain->domain_flags & - (DOMF_shutdown|DOMF_shuttingdown|DOMF_ctrl_pause)) ); + !(v->domain->domain_flags & (DOMF_shutdown|DOMF_ctrl_pause)) ); } void vcpu_pause(struct vcpu *v); @@ -414,6 +402,8 @@ void domain_unpause_by_systemcontroller(struct domain *d); void cpu_init(void); +int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity); + static inline void vcpu_unblock(struct vcpu *v) { if ( test_and_clear_bit(_VCPUF_blocked, &v->vcpu_flags) ) @@ -422,8 +412,6 @@ #define IS_PRIV(_d) \ (test_bit(_DOMF_privileged, &(_d)->domain_flags)) -#define IS_CAPABLE_PHYSDEV(_d) \ - (test_bit(_DOMF_physdev_access, &(_d)->domain_flags)) #define VM_ASSIST(_d,_t) (test_bit((_t), &(_d)->vm_assist)) diff -r 25e3c8668f1f -r 8af1199488d3 tools/guest-headers/Makefile --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/tools/guest-headers/Makefile Mon Jan 9 11:22:17 2006 @@ -0,0 +1,11 @@ + +XEN_ROOT=../.. +linuxsparsetree = $(XEN_ROOT)/linux-2.6-xen-sparse + +check: + +install: + mkdir -p $(DESTDIR)/usr/include/xen/linux + install -m0644 $(linuxsparsetree)/include/asm-xen/linux-public/*.h $(DESTDIR)/usr/include/xen/linux + +clean: diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/linux-xen/process-linux-xen.c --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/linux-xen/process-linux-xen.c Mon Jan 9 11:22:17 2006 @@ -0,0 +1,848 @@ +/* + * Architecture-specific setup. 
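+ * (Process creation, register dumps, idle handling and thread teardown for ia64; built for both Linux and Xen via #ifdef XEN.)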
+ * + * Copyright (C) 1998-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@xxxxxxxxxx> + * 04/11/17 Ashok Raj <ashok.raj@xxxxxxxxx> Added CPU Hotplug Support + */ +#ifdef XEN +#include <xen/types.h> +#include <xen/lib.h> +#include <xen/symbols.h> +#include <xen/smp.h> +#include <asm/uaccess.h> +#include <asm/processor.h> +#include <asm/ptrace.h> +#include <asm/unwind.h> +#else +#define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */ +#include <linux/config.h> + +#include <linux/cpu.h> +#include <linux/pm.h> +#include <linux/elf.h> +#include <linux/errno.h> +#include <linux/kallsyms.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/notifier.h> +#include <linux/personality.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/thread_info.h> +#include <linux/unistd.h> +#include <linux/efi.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/kprobes.h> + +#include <asm/cpu.h> +#include <asm/delay.h> +#include <asm/elf.h> +#include <asm/ia32.h> +#include <asm/irq.h> +#include <asm/pgalloc.h> +#include <asm/processor.h> +#include <asm/sal.h> +#include <asm/tlbflush.h> +#include <asm/uaccess.h> +#include <asm/unwind.h> +#include <asm/user.h> + +#include "entry.h" + +#ifdef CONFIG_PERFMON +# include <asm/perfmon.h> +#endif + +#include "sigframe.h" + +void (*ia64_mark_idle)(int); +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); +#endif + +void +ia64_do_show_stack (struct unw_frame_info *info, void *arg) +{ + unsigned long ip, sp, bsp; + char buf[128]; /* don't make it so big that it overflows the stack! */ + + printk("\nCall Trace:\n"); + do { + unw_get_ip(info, &ip); + if (ip == 0) + break; + + unw_get_sp(info, &sp); + unw_get_bsp(info, &bsp); + snprintf(buf, sizeof(buf), + " [<%016lx>] %%s\n" + " sp=%016lx bsp=%016lx\n", + ip, sp, bsp); + print_symbol(buf, ip); + } while (unw_unwind(info) >= 0); +} + +void +show_stack (struct task_struct *task, unsigned long *sp) +{ + if (!task) + unw_init_running(ia64_do_show_stack, NULL); + else { + struct unw_frame_info info; + + unw_init_from_blocked_task(&info, task); + ia64_do_show_stack(&info, NULL); + } +} + +#ifndef XEN +void +dump_stack (void) +{ + show_stack(NULL, NULL); +} + +EXPORT_SYMBOL(dump_stack); +#endif + +#ifdef XEN +void +show_registers(struct pt_regs *regs) +#else +void +show_regs (struct pt_regs *regs) +#endif +{ + unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri; + +#ifndef XEN + print_modules(); + printk("\nPid: %d, CPU %d, comm: %20s\n", current->pid, smp_processor_id(), current->comm); + printk("psr : %016lx ifs : %016lx ip : [<%016lx>] %s\n", + regs->cr_ipsr, regs->cr_ifs, ip, print_tainted()); +#else + printk("\nCPU %d\n", smp_processor_id()); + printk("psr : %016lx ifs : %016lx ip : [<%016lx>]\n", + regs->cr_ipsr, regs->cr_ifs, ip); +#endif + print_symbol("ip is at %s\n", ip); + printk("unat: %016lx pfs : %016lx rsc : %016lx\n", + regs->ar_unat, regs->ar_pfs, regs->ar_rsc); + printk("rnat: %016lx bsps: %016lx pr : %016lx\n", + regs->ar_rnat, regs->ar_bspstore, regs->pr); + printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n", + regs->loadrs, regs->ar_ccv, regs->ar_fpsr); + printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd); + printk("b0 : %016lx b6 : %016lx b7 : %016lx\n", regs->b0, regs->b6, regs->b7); + printk("f6 : %05lx%016lx f7 : %05lx%016lx\n", + regs->f6.u.bits[1], 
regs->f6.u.bits[0], + regs->f7.u.bits[1], regs->f7.u.bits[0]); + printk("f8 : %05lx%016lx f9 : %05lx%016lx\n", + regs->f8.u.bits[1], regs->f8.u.bits[0], + regs->f9.u.bits[1], regs->f9.u.bits[0]); + printk("f10 : %05lx%016lx f11 : %05lx%016lx\n", + regs->f10.u.bits[1], regs->f10.u.bits[0], + regs->f11.u.bits[1], regs->f11.u.bits[0]); + + printk("r1 : %016lx r2 : %016lx r3 : %016lx\n", regs->r1, regs->r2, regs->r3); + printk("r8 : %016lx r9 : %016lx r10 : %016lx\n", regs->r8, regs->r9, regs->r10); + printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11, regs->r12, regs->r13); + printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14, regs->r15, regs->r16); + printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17, regs->r18, regs->r19); + printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20, regs->r21, regs->r22); + printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23, regs->r24, regs->r25); + printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26, regs->r27, regs->r28); + printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29, regs->r30, regs->r31); + + if (user_mode(regs)) { + /* print the stacked registers */ + unsigned long val, *bsp, ndirty; + int i, sof, is_nat = 0; + + sof = regs->cr_ifs & 0x7f; /* size of frame */ + ndirty = (regs->loadrs >> 19); + bsp = ia64_rse_skip_regs((unsigned long *) regs->ar_bspstore, ndirty); + for (i = 0; i < sof; ++i) { + get_user(val, (unsigned long __user *) ia64_rse_skip_regs(bsp, i)); + printk("r%-3u:%c%016lx%s", 32 + i, is_nat ? '*' : ' ', val, + ((i == sof - 1) || (i % 3) == 2) ? "\n" : " "); + } + } else + show_stack(NULL, NULL); +} + +#ifndef XEN +void +do_notify_resume_user (sigset_t *oldset, struct sigscratch *scr, long in_syscall) +{ + if (fsys_mode(current, &scr->pt)) { + /* defer signal-handling etc. until we return to privilege-level 0. */ + if (!ia64_psr(&scr->pt)->lp) + ia64_psr(&scr->pt)->lp = 1; + return; + } + +#ifdef CONFIG_PERFMON + if (current->thread.pfm_needs_checking) + pfm_handle_work(); +#endif + + /* deal with pending signal delivery */ + if (test_thread_flag(TIF_SIGPENDING)) + ia64_do_signal(oldset, scr, in_syscall); +} + +static int pal_halt = 1; +static int can_do_pal_halt = 1; + +static int __init nohalt_setup(char * str) +{ + pal_halt = can_do_pal_halt = 0; + return 1; +} +__setup("nohalt", nohalt_setup); + +void +update_pal_halt_status(int status) +{ + can_do_pal_halt = pal_halt && status; +} + +/* + * We use this if we don't have any better idle routine.. + */ +void +default_idle (void) +{ + local_irq_enable(); + while (!need_resched()) + if (can_do_pal_halt) + safe_halt(); + else + cpu_relax(); +} + +#ifdef CONFIG_HOTPLUG_CPU +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + extern void ia64_cpu_local_tick (void); + unsigned int this_cpu = smp_processor_id(); + + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + max_xtp(); + local_irq_disable(); + idle_domain_exit(); + ia64_jump_to_sal(&sal_boot_rendez_state[this_cpu]); + /* + * The above is a point of no-return, the processor is + * expected to be in SAL loop now. 
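+ * If execution ever returns here, something has gone badly wrong; hence the BUG() below.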
+ */ + BUG(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + } while (!cpus_empty(map)); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +void __attribute__((noreturn)) +cpu_idle (void) +{ + void (*mark_idle)(int) = ia64_mark_idle; + + /* endless idle loop with no priority at all */ + while (1) { +#ifdef CONFIG_SMP + if (!need_resched()) + min_xtp(); +#endif + while (!need_resched()) { + void (*idle)(void); + + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; + + rmb(); + if (mark_idle) + (*mark_idle)(1); + + idle = pm_idle; + if (!idle) + idle = default_idle; + (*idle)(); + } + + if (mark_idle) + (*mark_idle)(0); + +#ifdef CONFIG_SMP + normal_xtp(); +#endif + schedule(); + check_pgt_cache(); + if (cpu_is_offline(smp_processor_id())) + play_dead(); + } +} + +void +ia64_save_extra (struct task_struct *task) +{ +#ifdef CONFIG_PERFMON + unsigned long info; +#endif + + if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) + ia64_save_debug_regs(&task->thread.dbr[0]); + +#ifdef CONFIG_PERFMON + if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) + pfm_save_regs(task); + + info = __get_cpu_var(pfm_syst_info); + if (info & PFM_CPUINFO_SYST_WIDE) + pfm_syst_wide_update_task(task, info, 0); +#endif + +#ifdef CONFIG_IA32_SUPPORT + if (IS_IA32_PROCESS(ia64_task_regs(task))) + ia32_save_state(task); +#endif +} + +void +ia64_load_extra (struct task_struct *task) +{ +#ifdef CONFIG_PERFMON + unsigned long info; +#endif + + if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) + ia64_load_debug_regs(&task->thread.dbr[0]); + +#ifdef CONFIG_PERFMON + if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) + pfm_load_regs(task); + + info = __get_cpu_var(pfm_syst_info); + if (info & PFM_CPUINFO_SYST_WIDE) + pfm_syst_wide_update_task(task, info, 1); +#endif + +#ifdef CONFIG_IA32_SUPPORT + if (IS_IA32_PROCESS(ia64_task_regs(task))) + ia32_load_state(task); +#endif +} + +/* + * Copy the state of an ia-64 thread. + * + * We get here through the following call chain: + * + * from user-level: from kernel: + * + * <clone syscall> <some kernel call frames> + * sys_clone : + * do_fork do_fork + * copy_thread copy_thread + * + * This means that the stack layout is as follows: + * + * +---------------------+ (highest addr) + * | struct pt_regs | + * +---------------------+ + * | struct switch_stack | + * +---------------------+ + * | | + * | memory stack | + * | | <-- sp (lowest addr) + * +---------------------+ + * + * Observe that we copy the unat values that are in pt_regs and switch_stack. Spilling an + * integer to address X causes bit N in ar.unat to be set to the NaT bit of the register, + * with N=(X & 0x1ff)/8. Thus, copying the unat value preserves the NaT bits ONLY if the + * pt_regs structure in the parent is congruent to that of the child, modulo 512. Since + * the stack is page aligned and the page size is at least 4KB, this is always the case, + * so there is nothing to worry about. 
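+ * (The plain memcpy of pt_regs and switch_stack in copy_thread() below depends on this.)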
+ */ +int +copy_thread (int nr, unsigned long clone_flags, + unsigned long user_stack_base, unsigned long user_stack_size, + struct task_struct *p, struct pt_regs *regs) +{ + extern char ia64_ret_from_clone, ia32_ret_from_clone; + struct switch_stack *child_stack, *stack; + unsigned long rbs, child_rbs, rbs_size; + struct pt_regs *child_ptregs; + int retval = 0; + +#ifdef CONFIG_SMP + /* + * For SMP idle threads, fork_by_hand() calls do_fork with + * NULL regs. + */ + if (!regs) + return 0; +#endif + + stack = ((struct switch_stack *) regs) - 1; + + child_ptregs = (struct pt_regs *) ((unsigned long) p + IA64_STK_OFFSET) - 1; + child_stack = (struct switch_stack *) child_ptregs - 1; + + /* copy parent's switch_stack & pt_regs to child: */ + memcpy(child_stack, stack, sizeof(*child_ptregs) + sizeof(*child_stack)); + + rbs = (unsigned long) current + IA64_RBS_OFFSET; + child_rbs = (unsigned long) p + IA64_RBS_OFFSET; + rbs_size = stack->ar_bspstore - rbs; + + /* copy the parent's register backing store to the child: */ + memcpy((void *) child_rbs, (void *) rbs, rbs_size); + + if (likely(user_mode(child_ptregs))) { + if ((clone_flags & CLONE_SETTLS) && !IS_IA32_PROCESS(regs)) + child_ptregs->r13 = regs->r16; /* see sys_clone2() in entry.S */ + if (user_stack_base) { + child_ptregs->r12 = user_stack_base + user_stack_size - 16; + child_ptregs->ar_bspstore = user_stack_base; + child_ptregs->ar_rnat = 0; + child_ptregs->loadrs = 0; + } + } else { + /* + * Note: we simply preserve the relative position of + * the stack pointer here. There is no need to + * allocate a scratch area here, since that will have + * been taken care of by the caller of sys_clone() + * already. + */ + child_ptregs->r12 = (unsigned long) child_ptregs - 16; /* kernel sp */ + child_ptregs->r13 = (unsigned long) p; /* set `current' pointer */ + } + child_stack->ar_bspstore = child_rbs + rbs_size; + if (IS_IA32_PROCESS(regs)) + child_stack->b0 = (unsigned long) &ia32_ret_from_clone; + else + child_stack->b0 = (unsigned long) &ia64_ret_from_clone; + + /* copy parts of thread_struct: */ + p->thread.ksp = (unsigned long) child_stack - 16; + + /* stop some PSR bits from being inherited. + * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() + * therefore we must specify them explicitly here and not include them in + * IA64_PSR_BITS_TO_CLEAR. + */ + child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) + & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); + + /* + * NOTE: The calling convention considers all floating point + * registers in the high partition (fph) to be scratch. Since + * the only way to get to this point is through a system call, + * we know that the values in fph are all dead. Hence, there + * is no need to inherit the fph state from the parent to the + * child and all we have to do is to make sure that + * IA64_THREAD_FPH_VALID is cleared in the child. + * + * XXX We could push this optimization a bit further by + * clearing IA64_THREAD_FPH_VALID on ANY system call. + * However, it's not clear this is worth doing. Also, it + * would be a slight deviation from the normal Linux system + * call behavior where scratch registers are preserved across + * system calls (unless used by the system call itself). 
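+ * Hence IA64_THREAD_FPH_VALID is included in THREAD_FLAGS_TO_CLEAR just below.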
+ */ +# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ + | IA64_THREAD_PM_VALID) +# define THREAD_FLAGS_TO_SET 0 + p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) + | THREAD_FLAGS_TO_SET); + ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ +#ifdef CONFIG_IA32_SUPPORT + /* + * If we're cloning an IA32 task then save the IA32 extra + * state from the current task to the new task + */ + if (IS_IA32_PROCESS(ia64_task_regs(current))) { + ia32_save_state(p); + if (clone_flags & CLONE_SETTLS) + retval = ia32_clone_tls(p, child_ptregs); + + /* Copy partially mapped page list */ + if (!retval) + retval = ia32_copy_partial_page_list(p, clone_flags); + } +#endif + +#ifdef CONFIG_PERFMON + if (current->thread.pfm_context) + pfm_inherit(p, child_ptregs); +#endif + return retval; +} + +static void +do_copy_task_regs (struct task_struct *task, struct unw_frame_info *info, void *arg) +{ + unsigned long mask, sp, nat_bits = 0, ip, ar_rnat, urbs_end, cfm; + elf_greg_t *dst = arg; + struct pt_regs *pt; + char nat; + int i; + + memset(dst, 0, sizeof(elf_gregset_t)); /* don't leak any kernel bits to user-level */ + + if (unw_unwind_to_user(info) < 0) + return; + + unw_get_sp(info, &sp); + pt = (struct pt_regs *) (sp + 16); + + urbs_end = ia64_get_user_rbs_end(task, pt, &cfm); + + if (ia64_sync_user_rbs(task, info->sw, pt->ar_bspstore, urbs_end) < 0) + return; + + ia64_peek(task, info->sw, urbs_end, (long) ia64_rse_rnat_addr((long *) urbs_end), + &ar_rnat); + + /* + * coredump format: + * r0-r31 + * NaT bits (for r0-r31; bit N == 1 iff rN is a NaT) + * predicate registers (p0-p63) + * b0-b7 + * ip cfm user-mask + * ar.rsc ar.bsp ar.bspstore ar.rnat + * ar.ccv ar.unat ar.fpsr ar.pfs ar.lc ar.ec + */ + + /* r0 is zero */ + for (i = 1, mask = (1UL << i); i < 32; ++i) { + unw_get_gr(info, i, &dst[i], &nat); + if (nat) + nat_bits |= mask; + mask <<= 1; + } + dst[32] = nat_bits; + unw_get_pr(info, &dst[33]); + + for (i = 0; i < 8; ++i) + unw_get_br(info, i, &dst[34 + i]); + + unw_get_rp(info, &ip); + dst[42] = ip + ia64_psr(pt)->ri; + dst[43] = cfm; + dst[44] = pt->cr_ipsr & IA64_PSR_UM; + + unw_get_ar(info, UNW_AR_RSC, &dst[45]); + /* + * For bsp and bspstore, unw_get_ar() would return the kernel + * addresses, but we need the user-level addresses instead: + */ + dst[46] = urbs_end; /* note: by convention PT_AR_BSP points to the end of the urbs! 
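The user-level value computed by ia64_get_user_rbs_end() above is therefore stored here, not the raw kernel ar.bsp.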
*/ + dst[47] = pt->ar_bspstore; + dst[48] = ar_rnat; + unw_get_ar(info, UNW_AR_CCV, &dst[49]); + unw_get_ar(info, UNW_AR_UNAT, &dst[50]); + unw_get_ar(info, UNW_AR_FPSR, &dst[51]); + dst[52] = pt->ar_pfs; /* UNW_AR_PFS is == to pt->cr_ifs for interrupt frames */ + unw_get_ar(info, UNW_AR_LC, &dst[53]); + unw_get_ar(info, UNW_AR_EC, &dst[54]); + unw_get_ar(info, UNW_AR_CSD, &dst[55]); + unw_get_ar(info, UNW_AR_SSD, &dst[56]); +} + +void +do_dump_task_fpu (struct task_struct *task, struct unw_frame_info *info, void *arg) +{ + elf_fpreg_t *dst = arg; + int i; + + memset(dst, 0, sizeof(elf_fpregset_t)); /* don't leak any "random" bits */ + + if (unw_unwind_to_user(info) < 0) + return; + + /* f0 is 0.0, f1 is 1.0 */ + + for (i = 2; i < 32; ++i) + unw_get_fr(info, i, dst + i); + + ia64_flush_fph(task); + if ((task->thread.flags & IA64_THREAD_FPH_VALID) != 0) + memcpy(dst + 32, task->thread.fph, 96*16); +} + +void +do_copy_regs (struct unw_frame_info *info, void *arg) +{ + do_copy_task_regs(current, info, arg); +} + +void +do_dump_fpu (struct unw_frame_info *info, void *arg) +{ + do_dump_task_fpu(current, info, arg); +} + +int +dump_task_regs(struct task_struct *task, elf_gregset_t *regs) +{ + struct unw_frame_info tcore_info; + + if (current == task) { + unw_init_running(do_copy_regs, regs); + } else { + memset(&tcore_info, 0, sizeof(tcore_info)); + unw_init_from_blocked_task(&tcore_info, task); + do_copy_task_regs(task, &tcore_info, regs); + } + return 1; +} + +void +ia64_elf_core_copy_regs (struct pt_regs *pt, elf_gregset_t dst) +{ + unw_init_running(do_copy_regs, dst); +} + +int +dump_task_fpu (struct task_struct *task, elf_fpregset_t *dst) +{ + struct unw_frame_info tcore_info; + + if (current == task) { + unw_init_running(do_dump_fpu, dst); + } else { + memset(&tcore_info, 0, sizeof(tcore_info)); + unw_init_from_blocked_task(&tcore_info, task); + do_dump_task_fpu(task, &tcore_info, dst); + } + return 1; +} + +int +dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) +{ + unw_init_running(do_dump_fpu, dst); + return 1; /* f0-f31 are always valid so we always return 1 */ +} + +long +sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, + struct pt_regs *regs) +{ + char *fname; + int error; + + fname = getname(filename); + error = PTR_ERR(fname); + if (IS_ERR(fname)) + goto out; + error = do_execve(fname, argv, envp, regs); + putname(fname); +out: + return error; +} + +pid_t +kernel_thread (int (*fn)(void *), void *arg, unsigned long flags) +{ + extern void start_kernel_thread (void); + unsigned long *helper_fptr = (unsigned long *) &start_kernel_thread; + struct { + struct switch_stack sw; + struct pt_regs pt; + } regs; + + memset(&regs, 0, sizeof(regs)); + regs.pt.cr_iip = helper_fptr[0]; /* set entry point (IP) */ + regs.pt.r1 = helper_fptr[1]; /* set GP */ + regs.pt.r9 = (unsigned long) fn; /* 1st argument */ + regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ + /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ + regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; + regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ + regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); + regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; + regs.sw.pr = (1 << PRED_KERNEL_STACK); + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs.pt, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + +/* This gets called from kernel_thread() via ia64_invoke_thread_helper(). 
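It runs in the context of the new thread, undoes any IA-32 state inherited from the parent, and then simply invokes fn(arg).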
*/ +int +kernel_thread_helper (int (*fn)(void *), void *arg) +{ +#ifdef CONFIG_IA32_SUPPORT + if (IS_IA32_PROCESS(ia64_task_regs(current))) { + /* A kernel thread is always a 64-bit process. */ + current->thread.map_base = DEFAULT_MAP_BASE; + current->thread.task_size = DEFAULT_TASK_SIZE; + ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); + ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); + } +#endif + return (*fn)(arg); +} + +/* + * Flush thread state. This is called when a thread does an execve(). + */ +void +flush_thread (void) +{ + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(current); + + /* drop floating-point and debug-register state if it exists: */ + current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); + ia64_drop_fpu(current); + if (IS_IA32_PROCESS(ia64_task_regs(current))) + ia32_drop_partial_page_list(current); +} + +/* + * Clean up state associated with current thread. This is called when + * the thread calls exit(). + */ +void +exit_thread (void) +{ + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(current); + + ia64_drop_fpu(current); +#ifdef CONFIG_PERFMON + /* if needed, stop monitoring and flush state to perfmon context */ + if (current->thread.pfm_context) + pfm_exit_thread(current); + + /* free debug register resources */ + if (current->thread.flags & IA64_THREAD_DBG_VALID) + pfm_release_debug_registers(current); +#endif + if (IS_IA32_PROCESS(ia64_task_regs(current))) + ia32_drop_partial_page_list(current); +} + +unsigned long +get_wchan (struct task_struct *p) +{ + struct unw_frame_info info; + unsigned long ip; + int count = 0; + + /* + * Note: p may not be a blocked task (it could be current or + * another process running on some other CPU. Rather than + * trying to determine if p is really blocked, we just assume + * it's blocked and rely on the unwind routines to fail + * gracefully if the process wasn't really blocked after all. 
+ * --davidm 99/12/15 + */ + unw_init_from_blocked_task(&info, p); + do { + if (unw_unwind(&info) < 0) + return 0; + unw_get_ip(&info, &ip); + if (!in_sched_functions(ip)) + return ip; + } while (count++ < 16); + return 0; +} + +void +cpu_halt (void) +{ + pal_power_mgmt_info_u_t power_info[8]; + unsigned long min_power; + int i, min_power_state; + + if (ia64_pal_halt_info(power_info) != 0) + return; + + min_power_state = 0; + min_power = power_info[0].pal_power_mgmt_info_s.power_consumption; + for (i = 1; i < 8; ++i) + if (power_info[i].pal_power_mgmt_info_s.im + && power_info[i].pal_power_mgmt_info_s.power_consumption < min_power) { + min_power = power_info[i].pal_power_mgmt_info_s.power_consumption; + min_power_state = i; + } + + while (1) + ia64_pal_halt(min_power_state); +} + +void +machine_restart (char *restart_cmd) +{ + (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); +} + +void +machine_halt (void) +{ + cpu_halt(); +} + +void +machine_power_off (void) +{ + if (pm_power_off) + pm_power_off(); + machine_halt(); +} +#endif // !XEN diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/linux-xen/unwind.c --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/linux-xen/unwind.c Mon Jan 9 11:22:17 2006 @@ -0,0 +1,2332 @@ +/* + * Copyright (C) 1999-2004 Hewlett-Packard Co + * David Mosberger-Tang <davidm@xxxxxxxxxx> + * Copyright (C) 2003 Fenghua Yu <fenghua.yu@xxxxxxxxx> + * - Change pt_regs_off() to make it less dependant on pt_regs structure. + */ +/* + * This file implements call frame unwind support for the Linux + * kernel. Parsing and processing the unwind information is + * time-consuming, so this implementation translates the unwind + * descriptors into unwind scripts. These scripts are very simple + * (basically a sequence of assignments) and efficient to execute. + * They are cached for later re-use. Each script is specific for a + * given instruction pointer address and the set of predicate values + * that the script depends on (most unwind descriptors are + * unconditional and scripts often do not depend on predicates at + * all). This code is based on the unwind conventions described in + * the "IA-64 Software Conventions and Runtime Architecture" manual. + * + * SMP conventions: + * o updates to the global unwind data (in structure "unw") are serialized + * by the unw.lock spinlock + * o each unwind script has its own read-write lock; a thread must acquire + * a read lock before executing a script and must acquire a write lock + * before modifying a script + * o if both the unw.lock spinlock and a script's read-write lock must be + * acquired, then the read-write lock must be acquired first. 
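+ * Taking the two locks in the opposite order anywhere would risk an ABBA deadlock.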
+ */
+#ifdef XEN
+#include <xen/types.h>
+#include <xen/elf.h>
+#include <xen/kernel.h>
+#include <xen/sched.h>
+#include <xen/xmalloc.h>
+#include <xen/spinlock.h>
+
+// workaround: define write_trylock(), which is not otherwise available here
+#ifdef CONFIG_SMP
+#define write_trylock(lock)	_raw_write_trylock(lock)
+#else
+#define write_trylock(lock)	({1;})
+#endif
+
+#else
+#include <linux/module.h>
+#include <linux/bootmem.h>
+#include <linux/elf.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#endif
+
+#include <asm/unwind.h>
+
+#include <asm/delay.h>
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/ptrace_offsets.h>
+#include <asm/rse.h>
+#include <asm/sections.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include "entry.h"
+#include "unwind_i.h"
+
+#define UNW_LOG_CACHE_SIZE	7	/* each unw_script is ~256 bytes in size */
+#define UNW_CACHE_SIZE		(1 << UNW_LOG_CACHE_SIZE)
+
+#define UNW_LOG_HASH_SIZE	(UNW_LOG_CACHE_SIZE + 1)
+#define UNW_HASH_SIZE		(1 << UNW_LOG_HASH_SIZE)
+
+#define UNW_STATS	0	/* WARNING: this disables interrupts for long time-spans!! */
+
+#ifdef UNW_DEBUG
+  static unsigned int unw_debug_level = UNW_DEBUG;
+# define UNW_DEBUG_ON(n)	unw_debug_level >= n
+  /* Do not code a printk level, not all debug lines end in newline */
+# define UNW_DPRINT(n, ...)	if (UNW_DEBUG_ON(n)) printk(__VA_ARGS__)
+# define inline
+#else /* !UNW_DEBUG */
+# define UNW_DEBUG_ON(n)  0
+# define UNW_DPRINT(n, ...)
+#endif /* UNW_DEBUG */
+
+#if UNW_STATS
+# define STAT(x...)	x
+#else
+# define STAT(x...)
+#endif
+
+#ifdef XEN
+#define alloc_reg_state()	xmalloc(struct unw_reg_state)
+#define free_reg_state(usr)	xfree(usr)
+#define alloc_labeled_state()	xmalloc(struct unw_labeled_state)
+#define free_labeled_state(usr)	xfree(usr)
+#else
+#define alloc_reg_state()	kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC)
+#define free_reg_state(usr)	kfree(usr)
+#define alloc_labeled_state()	kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC)
+#define free_labeled_state(usr)	kfree(usr)
+#endif
+
+typedef unsigned long unw_word;
+typedef unsigned char unw_hash_index_t;
+
+static struct {
+	spinlock_t lock;			/* spinlock for unwind data */
+
+	/* list of unwind tables (one per load-module) */
+	struct unw_table *tables;
+
+	unsigned long r0;			/* constant 0 for r0 */
+
+	/* table of registers that prologues can save (and order in which they're saved): */
+	const unsigned char save_order[8];
+
+	/* maps a preserved register index (preg_index) to corresponding switch_stack offset: */
+	unsigned short sw_off[sizeof(struct unw_frame_info) / 8];
+
+	unsigned short lru_head;		/* index of least-recently used script */
+	unsigned short lru_tail;		/* index of most-recently used script */
+
+	/* index into unw_frame_info for preserved register i */
+	unsigned short preg_index[UNW_NUM_REGS];
+
+	short pt_regs_offsets[32];
+
+	/* unwind table for the kernel: */
+	struct unw_table kernel_table;
+
+	/* unwind table describing the gate page (kernel code that is mapped into user space): */
+	size_t gate_table_size;
+	unsigned long *gate_table;
+
+	/* hash table that maps instruction pointer to script index: */
+	unsigned short hash[UNW_HASH_SIZE];
+
+	/* script cache: */
+	struct unw_script cache[UNW_CACHE_SIZE];
+
+# ifdef UNW_DEBUG
+	const char *preg_name[UNW_NUM_REGS];
+# endif
+# if UNW_STATS
+	struct {
+		struct {
+			int lookups;
+			int hinted_hits;
+			int normal_hits;
+			int collision_chain_traversals;
+		} cache;
+		struct {
+			unsigned long build_time;
+			unsigned long run_time;
+			unsigned long parse_time;
+			int builds;
+			int news;
+ int collisions; + int runs; + } script; + struct { + unsigned long init_time; + unsigned long unwind_time; + int inits; + int unwinds; + } api; + } stat; +# endif +} unw = { + .tables = &unw.kernel_table, + .lock = SPIN_LOCK_UNLOCKED, + .save_order = { + UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, + UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR + }, + .preg_index = { + offsetof(struct unw_frame_info, pri_unat_loc)/8, /* PRI_UNAT_GR */ + offsetof(struct unw_frame_info, pri_unat_loc)/8, /* PRI_UNAT_MEM */ + offsetof(struct unw_frame_info, bsp_loc)/8, + offsetof(struct unw_frame_info, bspstore_loc)/8, + offsetof(struct unw_frame_info, pfs_loc)/8, + offsetof(struct unw_frame_info, rnat_loc)/8, + offsetof(struct unw_frame_info, psp)/8, + offsetof(struct unw_frame_info, rp_loc)/8, + offsetof(struct unw_frame_info, r4)/8, + offsetof(struct unw_frame_info, r5)/8, + offsetof(struct unw_frame_info, r6)/8, + offsetof(struct unw_frame_info, r7)/8, + offsetof(struct unw_frame_info, unat_loc)/8, + offsetof(struct unw_frame_info, pr_loc)/8, + offsetof(struct unw_frame_info, lc_loc)/8, + offsetof(struct unw_frame_info, fpsr_loc)/8, + offsetof(struct unw_frame_info, b1_loc)/8, + offsetof(struct unw_frame_info, b2_loc)/8, + offsetof(struct unw_frame_info, b3_loc)/8, + offsetof(struct unw_frame_info, b4_loc)/8, + offsetof(struct unw_frame_info, b5_loc)/8, + offsetof(struct unw_frame_info, f2_loc)/8, + offsetof(struct unw_frame_info, f3_loc)/8, + offsetof(struct unw_frame_info, f4_loc)/8, + offsetof(struct unw_frame_info, f5_loc)/8, + offsetof(struct unw_frame_info, fr_loc[16 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[17 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[18 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[19 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[20 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[21 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[22 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[23 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[24 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[25 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[26 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[27 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[28 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[29 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[30 - 16])/8, + offsetof(struct unw_frame_info, fr_loc[31 - 16])/8, + }, + .pt_regs_offsets = { + [0] = -1, + offsetof(struct pt_regs, r1), + offsetof(struct pt_regs, r2), + offsetof(struct pt_regs, r3), + [4] = -1, [5] = -1, [6] = -1, [7] = -1, + offsetof(struct pt_regs, r8), + offsetof(struct pt_regs, r9), + offsetof(struct pt_regs, r10), + offsetof(struct pt_regs, r11), + offsetof(struct pt_regs, r12), + offsetof(struct pt_regs, r13), + offsetof(struct pt_regs, r14), + offsetof(struct pt_regs, r15), + offsetof(struct pt_regs, r16), + offsetof(struct pt_regs, r17), + offsetof(struct pt_regs, r18), + offsetof(struct pt_regs, r19), + offsetof(struct pt_regs, r20), + offsetof(struct pt_regs, r21), + offsetof(struct pt_regs, r22), + offsetof(struct pt_regs, r23), + offsetof(struct pt_regs, r24), + offsetof(struct pt_regs, r25), + offsetof(struct pt_regs, r26), + offsetof(struct pt_regs, r27), + offsetof(struct pt_regs, r28), + offsetof(struct pt_regs, r29), + offsetof(struct pt_regs, r30), + offsetof(struct pt_regs, r31), + }, + .hash = { [0 ... 
UNW_HASH_SIZE - 1] = -1 }, +#ifdef UNW_DEBUG + .preg_name = { + "pri_unat_gr", "pri_unat_mem", "bsp", "bspstore", "ar.pfs", "ar.rnat", "psp", "rp", + "r4", "r5", "r6", "r7", + "ar.unat", "pr", "ar.lc", "ar.fpsr", + "b1", "b2", "b3", "b4", "b5", + "f2", "f3", "f4", "f5", + "f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23", + "f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31" + } +#endif +}; + +static inline int +read_only (void *addr) +{ + return (unsigned long) ((char *) addr - (char *) &unw.r0) < sizeof(unw.r0); +} + +/* + * Returns offset of rREG in struct pt_regs. + */ +static inline unsigned long +pt_regs_off (unsigned long reg) +{ + short off = -1; + + if (reg < ARRAY_SIZE(unw.pt_regs_offsets)) + off = unw.pt_regs_offsets[reg]; + + if (off < 0) { + UNW_DPRINT(0, "unwind.%s: bad scratch reg r%lu\n", __FUNCTION__, reg); + off = 0; + } + return (unsigned long) off; +} + +static inline struct pt_regs * +get_scratch_regs (struct unw_frame_info *info) +{ + if (!info->pt) { + /* This should not happen with valid unwind info. */ + UNW_DPRINT(0, "unwind.%s: bad unwind info: resetting info->pt\n", __FUNCTION__); + if (info->flags & UNW_FLAG_INTERRUPT_FRAME) + info->pt = (unsigned long) ((struct pt_regs *) info->psp - 1); + else + info->pt = info->sp - 16; + } + UNW_DPRINT(3, "unwind.%s: sp 0x%lx pt 0x%lx\n", __FUNCTION__, info->sp, info->pt); + return (struct pt_regs *) info->pt; +} + +/* Unwind accessors. */ + +int +unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char *nat, int write) +{ + unsigned long *addr, *nat_addr, nat_mask = 0, dummy_nat; + struct unw_ireg *ireg; + struct pt_regs *pt; + + if ((unsigned) regnum - 1 >= 127) { + if (regnum == 0 && !write) { + *val = 0; /* read r0 always returns 0 */ + *nat = 0; + return 0; + } + UNW_DPRINT(0, "unwind.%s: trying to access non-existent r%u\n", + __FUNCTION__, regnum); + return -1; + } + + if (regnum < 32) { + if (regnum >= 4 && regnum <= 7) { + /* access a preserved register */ + ireg = &info->r4 + (regnum - 4); + addr = ireg->loc; + if (addr) { + nat_addr = addr + ireg->nat.off; + switch (ireg->nat.type) { + case UNW_NAT_VAL: + /* simulate getf.sig/setf.sig */ + if (write) { + if (*nat) { + /* write NaTVal and be done with it */ + addr[0] = 0; + addr[1] = 0x1fffe; + return 0; + } + addr[1] = 0x1003e; + } else { + if (addr[0] == 0 && addr[1] == 0x1ffe) { + /* return NaT and be done with it */ + *val = 0; + *nat = 1; + return 0; + } + } + /* fall through */ + case UNW_NAT_NONE: + dummy_nat = 0; + nat_addr = &dummy_nat; + break; + + case UNW_NAT_MEMSTK: + nat_mask = (1UL << ((long) addr & 0x1f8)/8); + break; + + case UNW_NAT_REGSTK: + nat_addr = ia64_rse_rnat_addr(addr); + if ((unsigned long) addr < info->regstk.limit + || (unsigned long) addr >= info->regstk.top) + { + UNW_DPRINT(0, "unwind.%s: %p outside of regstk " + "[0x%lx-0x%lx)\n", + __FUNCTION__, (void *) addr, + info->regstk.limit, + info->regstk.top); + return -1; + } + if ((unsigned long) nat_addr >= info->regstk.top) + nat_addr = &info->sw->ar_rnat; + nat_mask = (1UL << ia64_rse_slot_num(addr)); + break; + } + } else { + addr = &info->sw->r4 + (regnum - 4); + nat_addr = &info->sw->ar_unat; + nat_mask = (1UL << ((long) addr & 0x1f8)/8); + } + } else { + /* access a scratch register */ + pt = get_scratch_regs(info); + addr = (unsigned long *) ((unsigned long)pt + pt_regs_off(regnum)); + if (info->pri_unat_loc) + nat_addr = info->pri_unat_loc; + else + nat_addr = &info->sw->caller_unat; + nat_mask = (1UL << ((long) addr & 0x1f8)/8); + } + } else { 
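+		/*
+		 * Example (illustrative only): for r35, the address below is
+		 * ia64_rse_skip_regs((unsigned long *) info->bsp, 3), i.e.
+		 * three slots above bsp with any intervening RNaT collection
+		 * slots skipped, and the NaT bit lives in the collection slot
+		 * returned by ia64_rse_rnat_addr().
+		 */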
+ /* access a stacked register */ + addr = ia64_rse_skip_regs((unsigned long *) info->bsp, regnum - 32); + nat_addr = ia64_rse_rnat_addr(addr); + if ((unsigned long) addr < info->regstk.limit + || (unsigned long) addr >= info->regstk.top) + { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to access register outside " + "of rbs\n", __FUNCTION__); + return -1; + } + if ((unsigned long) nat_addr >= info->regstk.top) + nat_addr = &info->sw->ar_rnat; + nat_mask = (1UL << ia64_rse_slot_num(addr)); + } + + if (write) { + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else { + *addr = *val; + if (*nat) + *nat_addr |= nat_mask; + else + *nat_addr &= ~nat_mask; + } + } else { + if ((*nat_addr & nat_mask) == 0) { + *val = *addr; + *nat = 0; + } else { + *val = 0; /* if register is a NaT, *addr may contain kernel data! */ + *nat = 1; + } + } + return 0; +} +EXPORT_SYMBOL(unw_access_gr); + +int +unw_access_br (struct unw_frame_info *info, int regnum, unsigned long *val, int write) +{ + unsigned long *addr; + struct pt_regs *pt; + + switch (regnum) { + /* scratch: */ + case 0: pt = get_scratch_regs(info); addr = &pt->b0; break; + case 6: pt = get_scratch_regs(info); addr = &pt->b6; break; + case 7: pt = get_scratch_regs(info); addr = &pt->b7; break; + + /* preserved: */ + case 1: case 2: case 3: case 4: case 5: + addr = *(&info->b1_loc + (regnum - 1)); + if (!addr) + addr = &info->sw->b1 + (regnum - 1); + break; + + default: + UNW_DPRINT(0, "unwind.%s: trying to access non-existent b%u\n", + __FUNCTION__, regnum); + return -1; + } + if (write) + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else + *addr = *val; + else + *val = *addr; + return 0; +} +EXPORT_SYMBOL(unw_access_br); + +int +unw_access_fr (struct unw_frame_info *info, int regnum, struct ia64_fpreg *val, int write) +{ + struct ia64_fpreg *addr = NULL; + struct pt_regs *pt; + + if ((unsigned) (regnum - 2) >= 126) { + UNW_DPRINT(0, "unwind.%s: trying to access non-existent f%u\n", + __FUNCTION__, regnum); + return -1; + } + + if (regnum <= 5) { + addr = *(&info->f2_loc + (regnum - 2)); + if (!addr) + addr = &info->sw->f2 + (regnum - 2); + } else if (regnum <= 15) { + if (regnum <= 11) { + pt = get_scratch_regs(info); + addr = &pt->f6 + (regnum - 6); + } + else + addr = &info->sw->f12 + (regnum - 12); + } else if (regnum <= 31) { + addr = info->fr_loc[regnum - 16]; + if (!addr) + addr = &info->sw->f16 + (regnum - 16); + } else { + struct task_struct *t = info->task; + + if (write) + ia64_sync_fph(t); + else + ia64_flush_fph(t); +#ifdef XEN + addr = t->arch._thread.fph + (regnum - 32); +#else + addr = t->thread.fph + (regnum - 32); +#endif + } + + if (write) + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else + *addr = *val; + else + *val = *addr; + return 0; +} +EXPORT_SYMBOL(unw_access_fr); + +int +unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int write) +{ + unsigned long *addr; + struct pt_regs *pt; + + switch (regnum) { + case UNW_AR_BSP: + addr = info->bsp_loc; + if (!addr) + addr = &info->sw->ar_bspstore; + break; + + case UNW_AR_BSPSTORE: + addr = info->bspstore_loc; + if (!addr) + addr = &info->sw->ar_bspstore; + break; + + case UNW_AR_PFS: + addr = info->pfs_loc; + if (!addr) + addr = &info->sw->ar_pfs; + break; + + case UNW_AR_RNAT: + addr = info->rnat_loc; + if (!addr) + addr = 
&info->sw->ar_rnat; + break; + + case UNW_AR_UNAT: + addr = info->unat_loc; + if (!addr) + addr = &info->sw->caller_unat; + break; + + case UNW_AR_LC: + addr = info->lc_loc; + if (!addr) + addr = &info->sw->ar_lc; + break; + + case UNW_AR_EC: + if (!info->cfm_loc) + return -1; + if (write) + *info->cfm_loc = + (*info->cfm_loc & ~(0x3fUL << 52)) | ((*val & 0x3f) << 52); + else + *val = (*info->cfm_loc >> 52) & 0x3f; + return 0; + + case UNW_AR_FPSR: + addr = info->fpsr_loc; + if (!addr) + addr = &info->sw->ar_fpsr; + break; + + case UNW_AR_RSC: + pt = get_scratch_regs(info); + addr = &pt->ar_rsc; + break; + + case UNW_AR_CCV: + pt = get_scratch_regs(info); + addr = &pt->ar_ccv; + break; + + case UNW_AR_CSD: + pt = get_scratch_regs(info); + addr = &pt->ar_csd; + break; + + case UNW_AR_SSD: + pt = get_scratch_regs(info); + addr = &pt->ar_ssd; + break; + + default: + UNW_DPRINT(0, "unwind.%s: trying to access non-existent ar%u\n", + __FUNCTION__, regnum); + return -1; + } + + if (write) { + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else + *addr = *val; + } else + *val = *addr; + return 0; +} +EXPORT_SYMBOL(unw_access_ar); + +int +unw_access_pr (struct unw_frame_info *info, unsigned long *val, int write) +{ + unsigned long *addr; + + addr = info->pr_loc; + if (!addr) + addr = &info->sw->pr; + + if (write) { + if (read_only(addr)) { + UNW_DPRINT(0, "unwind.%s: ignoring attempt to write read-only location\n", + __FUNCTION__); + } else + *addr = *val; + } else + *val = *addr; + return 0; +} +EXPORT_SYMBOL(unw_access_pr); + + +/* Routines to manipulate the state stack. */ + +static inline void +push (struct unw_state_record *sr) +{ + struct unw_reg_state *rs; + + rs = alloc_reg_state(); + if (!rs) { + printk(KERN_ERR "unwind: cannot stack reg state!\n"); + return; + } + memcpy(rs, &sr->curr, sizeof(*rs)); + sr->curr.next = rs; +} + +static void +pop (struct unw_state_record *sr) +{ + struct unw_reg_state *rs = sr->curr.next; + + if (!rs) { + printk(KERN_ERR "unwind: stack underflow!\n"); + return; + } + memcpy(&sr->curr, rs, sizeof(*rs)); + free_reg_state(rs); +} + +/* Make a copy of the state stack. Non-recursive to avoid stack overflows. */ +static struct unw_reg_state * +dup_state_stack (struct unw_reg_state *rs) +{ + struct unw_reg_state *copy, *prev = NULL, *first = NULL; + + while (rs) { + copy = alloc_reg_state(); + if (!copy) { + printk(KERN_ERR "unwind.dup_state_stack: out of memory\n"); + return NULL; + } + memcpy(copy, rs, sizeof(*copy)); + if (first) + prev->next = copy; + else + first = copy; + rs = rs->next; + prev = copy; + } + return first; +} + +/* Free all stacked register states (but not RS itself). */ +static void +free_state_stack (struct unw_reg_state *rs) +{ + struct unw_reg_state *p, *next; + + for (p = rs->next; p != NULL; p = next) { + next = p->next; + free_reg_state(p); + } + rs->next = NULL; +} + +/* Unwind decoder routines */ + +static enum unw_register_index __attribute_const__ +decode_abreg (unsigned char abreg, int memory) +{ + switch (abreg) { + case 0x04 ... 0x07: return UNW_REG_R4 + (abreg - 0x04); + case 0x22 ... 0x25: return UNW_REG_F2 + (abreg - 0x22); + case 0x30 ... 0x3f: return UNW_REG_F16 + (abreg - 0x30); + case 0x41 ... 0x45: return UNW_REG_B1 + (abreg - 0x41); + case 0x60: return UNW_REG_PR; + case 0x61: return UNW_REG_PSP; + case 0x62: return memory ? 
UNW_REG_PRI_UNAT_MEM : UNW_REG_PRI_UNAT_GR; + case 0x63: return UNW_REG_RP; + case 0x64: return UNW_REG_BSP; + case 0x65: return UNW_REG_BSPSTORE; + case 0x66: return UNW_REG_RNAT; + case 0x67: return UNW_REG_UNAT; + case 0x68: return UNW_REG_FPSR; + case 0x69: return UNW_REG_PFS; + case 0x6a: return UNW_REG_LC; + default: + break; + } + UNW_DPRINT(0, "unwind.%s: bad abreg=0x%x\n", __FUNCTION__, abreg); + return UNW_REG_LC; +} + +static void +set_reg (struct unw_reg_info *reg, enum unw_where where, int when, unsigned long val) +{ + reg->val = val; + reg->where = where; + if (reg->when == UNW_WHEN_NEVER) + reg->when = when; +} + +static void +alloc_spill_area (unsigned long *offp, unsigned long regsize, + struct unw_reg_info *lo, struct unw_reg_info *hi) +{ + struct unw_reg_info *reg; + + for (reg = hi; reg >= lo; --reg) { + if (reg->where == UNW_WHERE_SPILL_HOME) { + reg->where = UNW_WHERE_PSPREL; + *offp -= regsize; + reg->val = *offp; + } + } +} + +static inline void +spill_next_when (struct unw_reg_info **regp, struct unw_reg_info *lim, unw_word t) +{ + struct unw_reg_info *reg; + + for (reg = *regp; reg <= lim; ++reg) { + if (reg->where == UNW_WHERE_SPILL_HOME) { + reg->when = t; + *regp = reg + 1; + return; + } + } + UNW_DPRINT(0, "unwind.%s: excess spill!\n", __FUNCTION__); +} + +static inline void +finish_prologue (struct unw_state_record *sr) +{ + struct unw_reg_info *reg; + unsigned long off; + int i; + + /* + * First, resolve implicit register save locations (see Section "11.4.2.3 Rules + * for Using Unwind Descriptors", rule 3): + */ + for (i = 0; i < (int) ARRAY_SIZE(unw.save_order); ++i) { + reg = sr->curr.reg + unw.save_order[i]; + if (reg->where == UNW_WHERE_GR_SAVE) { + reg->where = UNW_WHERE_GR; + reg->val = sr->gr_save_loc++; + } + } + + /* + * Next, compute when the fp, general, and branch registers get + * saved. This must come before alloc_spill_area() because + * we need to know which registers are spilled to their home + * locations. + */ + if (sr->imask) { + unsigned char kind, mask = 0, *cp = sr->imask; + int t; + static const unsigned char limit[3] = { + UNW_REG_F31, UNW_REG_R7, UNW_REG_B5 + }; + struct unw_reg_info *(regs[3]); + + regs[0] = sr->curr.reg + UNW_REG_F2; + regs[1] = sr->curr.reg + UNW_REG_R4; + regs[2] = sr->curr.reg + UNW_REG_B1; + + for (t = 0; t < sr->region_len; ++t) { + if ((t & 3) == 0) + mask = *cp++; + kind = (mask >> 2*(3-(t & 3))) & 3; + if (kind > 0) + spill_next_when(®s[kind - 1], sr->curr.reg + limit[kind - 1], + sr->region_start + t); + } + } + /* + * Next, lay out the memory stack spill area: + */ + if (sr->any_spills) { + off = sr->spill_offset; + alloc_spill_area(&off, 16, sr->curr.reg + UNW_REG_F2, sr->curr.reg + UNW_REG_F31); + alloc_spill_area(&off, 8, sr->curr.reg + UNW_REG_B1, sr->curr.reg + UNW_REG_B5); + alloc_spill_area(&off, 8, sr->curr.reg + UNW_REG_R4, sr->curr.reg + UNW_REG_R7); + } +} + +/* + * Region header descriptors. 
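+ *
+ * Example (sketch): a header announcing an 8-slot prologue that saves rp
+ * to a GR reaches desc_prologue() below with body=0, rlen=8, the rp bit
+ * set in MASK, and GRSAVE naming the first save register; the save is
+ * then recorded as happening in the region's last slot.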
+ */ + +static void +desc_prologue (int body, unw_word rlen, unsigned char mask, unsigned char grsave, + struct unw_state_record *sr) +{ + int i, region_start; + + if (!(sr->in_body || sr->first_region)) + finish_prologue(sr); + sr->first_region = 0; + + /* check if we're done: */ + if (sr->when_target < sr->region_start + sr->region_len) { + sr->done = 1; + return; + } + + region_start = sr->region_start + sr->region_len; + + for (i = 0; i < sr->epilogue_count; ++i) + pop(sr); + sr->epilogue_count = 0; + sr->epilogue_start = UNW_WHEN_NEVER; + + sr->region_start = region_start; + sr->region_len = rlen; + sr->in_body = body; + + if (!body) { + push(sr); + + for (i = 0; i < 4; ++i) { + if (mask & 0x8) + set_reg(sr->curr.reg + unw.save_order[i], UNW_WHERE_GR, + sr->region_start + sr->region_len - 1, grsave++); + mask <<= 1; + } + sr->gr_save_loc = grsave; + sr->any_spills = 0; + sr->imask = NULL; + sr->spill_offset = 0x10; /* default to psp+16 */ + } +} + +/* + * Prologue descriptors. + */ + +static inline void +desc_abi (unsigned char abi, unsigned char context, struct unw_state_record *sr) +{ + if (abi == 3 && context == 'i') { + sr->flags |= UNW_FLAG_INTERRUPT_FRAME; + UNW_DPRINT(3, "unwind.%s: interrupt frame\n", __FUNCTION__); + } + else + UNW_DPRINT(0, "unwind%s: ignoring unwabi(abi=0x%x,context=0x%x)\n", + __FUNCTION__, abi, context); +} + +static inline void +desc_br_gr (unsigned char brmask, unsigned char gr, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 5; ++i) { + if (brmask & 1) + set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_GR, + sr->region_start + sr->region_len - 1, gr++); + brmask >>= 1; + } +} + +static inline void +desc_br_mem (unsigned char brmask, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 5; ++i) { + if (brmask & 1) { + set_reg(sr->curr.reg + UNW_REG_B1 + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + brmask >>= 1; + } +} + +static inline void +desc_frgr_mem (unsigned char grmask, unw_word frmask, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 4; ++i) { + if ((grmask & 1) != 0) { + set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + grmask >>= 1; + } + for (i = 0; i < 20; ++i) { + if ((frmask & 1) != 0) { + int base = (i < 4) ? 
UNW_REG_F2 : UNW_REG_F16 - 4; + set_reg(sr->curr.reg + base + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + frmask >>= 1; + } +} + +static inline void +desc_fr_mem (unsigned char frmask, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 4; ++i) { + if ((frmask & 1) != 0) { + set_reg(sr->curr.reg + UNW_REG_F2 + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + frmask >>= 1; + } +} + +static inline void +desc_gr_gr (unsigned char grmask, unsigned char gr, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 4; ++i) { + if ((grmask & 1) != 0) + set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_GR, + sr->region_start + sr->region_len - 1, gr++); + grmask >>= 1; + } +} + +static inline void +desc_gr_mem (unsigned char grmask, struct unw_state_record *sr) +{ + int i; + + for (i = 0; i < 4; ++i) { + if ((grmask & 1) != 0) { + set_reg(sr->curr.reg + UNW_REG_R4 + i, UNW_WHERE_SPILL_HOME, + sr->region_start + sr->region_len - 1, 0); + sr->any_spills = 1; + } + grmask >>= 1; + } +} + +static inline void +desc_mem_stack_f (unw_word t, unw_word size, struct unw_state_record *sr) +{ + set_reg(sr->curr.reg + UNW_REG_PSP, UNW_WHERE_NONE, + sr->region_start + min_t(int, t, sr->region_len - 1), 16*size); +} + +static inline void +desc_mem_stack_v (unw_word t, struct unw_state_record *sr) +{ + sr->curr.reg[UNW_REG_PSP].when = sr->region_start + min_t(int, t, sr->region_len - 1); +} + +static inline void +desc_reg_gr (unsigned char reg, unsigned char dst, struct unw_state_record *sr) +{ + set_reg(sr->curr.reg + reg, UNW_WHERE_GR, sr->region_start + sr->region_len - 1, dst); +} + +static inline void +desc_reg_psprel (unsigned char reg, unw_word pspoff, struct unw_state_record *sr) +{ + set_reg(sr->curr.reg + reg, UNW_WHERE_PSPREL, sr->region_start + sr->region_len - 1, + 0x10 - 4*pspoff); +} + +static inline void +desc_reg_sprel (unsigned char reg, unw_word spoff, struct unw_state_record *sr) +{ + set_reg(sr->curr.reg + reg, UNW_WHERE_SPREL, sr->region_start + sr->region_len - 1, + 4*spoff); +} + +static inline void +desc_rp_br (unsigned char dst, struct unw_state_record *sr) +{ + sr->return_link_reg = dst; +} + +static inline void +desc_reg_when (unsigned char regnum, unw_word t, struct unw_state_record *sr) +{ + struct unw_reg_info *reg = sr->curr.reg + regnum; + + if (reg->where == UNW_WHERE_NONE) + reg->where = UNW_WHERE_GR_SAVE; + reg->when = sr->region_start + min_t(int, t, sr->region_len - 1); +} + +static inline void +desc_spill_base (unw_word pspoff, struct unw_state_record *sr) +{ + sr->spill_offset = 0x10 - 4*pspoff; +} + +static inline unsigned char * +desc_spill_mask (unsigned char *imaskp, struct unw_state_record *sr) +{ + sr->imask = imaskp; + return imaskp + (2*sr->region_len + 7)/8; +} + +/* + * Body descriptors. 
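+ *
+ * Example (sketch): an epilogue descriptor with t=2 and ecount=0 makes
+ * desc_epilogue() below record epilogue_start at the region's
+ * third-to-last slot and an epilogue count of 1; build_script() uses
+ * this to treat sp- and psp-relative save locations as already restored
+ * past that point.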
+ */ +static inline void +desc_epilogue (unw_word t, unw_word ecount, struct unw_state_record *sr) +{ + sr->epilogue_start = sr->region_start + sr->region_len - 1 - t; + sr->epilogue_count = ecount + 1; +} + +static inline void +desc_copy_state (unw_word label, struct unw_state_record *sr) +{ + struct unw_labeled_state *ls; + + for (ls = sr->labeled_states; ls; ls = ls->next) { + if (ls->label == label) { + free_state_stack(&sr->curr); + memcpy(&sr->curr, &ls->saved_state, sizeof(sr->curr)); + sr->curr.next = dup_state_stack(ls->saved_state.next); + return; + } + } + printk(KERN_ERR "unwind: failed to find state labeled 0x%lx\n", label); +} + +static inline void +desc_label_state (unw_word label, struct unw_state_record *sr) +{ + struct unw_labeled_state *ls; + + ls = alloc_labeled_state(); + if (!ls) { + printk(KERN_ERR "unwind.desc_label_state(): out of memory\n"); + return; + } + ls->label = label; + memcpy(&ls->saved_state, &sr->curr, sizeof(ls->saved_state)); + ls->saved_state.next = dup_state_stack(sr->curr.next); + + /* insert into list of labeled states: */ + ls->next = sr->labeled_states; + sr->labeled_states = ls; +} + +/* + * General descriptors. + */ + +static inline int +desc_is_active (unsigned char qp, unw_word t, struct unw_state_record *sr) +{ + if (sr->when_target <= sr->region_start + min_t(int, t, sr->region_len - 1)) + return 0; + if (qp > 0) { + if ((sr->pr_val & (1UL << qp)) == 0) + return 0; + sr->pr_mask |= (1UL << qp); + } + return 1; +} + +static inline void +desc_restore_p (unsigned char qp, unw_word t, unsigned char abreg, struct unw_state_record *sr) +{ + struct unw_reg_info *r; + + if (!desc_is_active(qp, t, sr)) + return; + + r = sr->curr.reg + decode_abreg(abreg, 0); + r->where = UNW_WHERE_NONE; + r->when = UNW_WHEN_NEVER; + r->val = 0; +} + +static inline void +desc_spill_reg_p (unsigned char qp, unw_word t, unsigned char abreg, unsigned char x, + unsigned char ytreg, struct unw_state_record *sr) +{ + enum unw_where where = UNW_WHERE_GR; + struct unw_reg_info *r; + + if (!desc_is_active(qp, t, sr)) + return; + + if (x) + where = UNW_WHERE_BR; + else if (ytreg & 0x80) + where = UNW_WHERE_FR; + + r = sr->curr.reg + decode_abreg(abreg, 0); + r->where = where; + r->when = sr->region_start + min_t(int, t, sr->region_len - 1); + r->val = (ytreg & 0x7f); +} + +static inline void +desc_spill_psprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word pspoff, + struct unw_state_record *sr) +{ + struct unw_reg_info *r; + + if (!desc_is_active(qp, t, sr)) + return; + + r = sr->curr.reg + decode_abreg(abreg, 1); + r->where = UNW_WHERE_PSPREL; + r->when = sr->region_start + min_t(int, t, sr->region_len - 1); + r->val = 0x10 - 4*pspoff; +} + +static inline void +desc_spill_sprel_p (unsigned char qp, unw_word t, unsigned char abreg, unw_word spoff, + struct unw_state_record *sr) +{ + struct unw_reg_info *r; + + if (!desc_is_active(qp, t, sr)) + return; + + r = sr->curr.reg + decode_abreg(abreg, 1); + r->where = UNW_WHERE_SPREL; + r->when = sr->region_start + min_t(int, t, sr->region_len - 1); + r->val = 4*spoff; +} + +#define UNW_DEC_BAD_CODE(code) printk(KERN_ERR "unwind: unknown code 0x%02x\n", \ + code); + +/* + * region headers: + */ +#define UNW_DEC_PROLOGUE_GR(fmt,r,m,gr,arg) desc_prologue(0,r,m,gr,arg) +#define UNW_DEC_PROLOGUE(fmt,b,r,arg) desc_prologue(b,r,0,32,arg) +/* + * prologue descriptors: + */ +#define UNW_DEC_ABI(fmt,a,c,arg) desc_abi(a,c,arg) +#define UNW_DEC_BR_GR(fmt,b,g,arg) desc_br_gr(b,g,arg) +#define UNW_DEC_BR_MEM(fmt,b,arg) 
desc_br_mem(b,arg)
+#define UNW_DEC_FRGR_MEM(fmt,g,f,arg)		desc_frgr_mem(g,f,arg)
+#define UNW_DEC_FR_MEM(fmt,f,arg)		desc_fr_mem(f,arg)
+#define UNW_DEC_GR_GR(fmt,m,g,arg)		desc_gr_gr(m,g,arg)
+#define UNW_DEC_GR_MEM(fmt,m,arg)		desc_gr_mem(m,arg)
+#define UNW_DEC_MEM_STACK_F(fmt,t,s,arg)	desc_mem_stack_f(t,s,arg)
+#define UNW_DEC_MEM_STACK_V(fmt,t,arg)		desc_mem_stack_v(t,arg)
+#define UNW_DEC_REG_GR(fmt,r,d,arg)		desc_reg_gr(r,d,arg)
+#define UNW_DEC_REG_PSPREL(fmt,r,o,arg)		desc_reg_psprel(r,o,arg)
+#define UNW_DEC_REG_SPREL(fmt,r,o,arg)		desc_reg_sprel(r,o,arg)
+#define UNW_DEC_REG_WHEN(fmt,r,t,arg)		desc_reg_when(r,t,arg)
+#define UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg)	desc_reg_when(UNW_REG_PRI_UNAT_GR,t,arg)
+#define UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg)	desc_reg_when(UNW_REG_PRI_UNAT_MEM,t,arg)
+#define UNW_DEC_PRIUNAT_GR(fmt,r,arg)		desc_reg_gr(UNW_REG_PRI_UNAT_GR,r,arg)
+#define UNW_DEC_PRIUNAT_PSPREL(fmt,o,arg)	desc_reg_psprel(UNW_REG_PRI_UNAT_MEM,o,arg)
+#define UNW_DEC_PRIUNAT_SPREL(fmt,o,arg)	desc_reg_sprel(UNW_REG_PRI_UNAT_MEM,o,arg)
+#define UNW_DEC_RP_BR(fmt,d,arg)		desc_rp_br(d,arg)
+#define UNW_DEC_SPILL_BASE(fmt,o,arg)		desc_spill_base(o,arg)
+#define UNW_DEC_SPILL_MASK(fmt,m,arg)		(m = desc_spill_mask(m,arg))
+/*
+ * body descriptors:
+ */
+#define UNW_DEC_EPILOGUE(fmt,t,c,arg)		desc_epilogue(t,c,arg)
+#define UNW_DEC_COPY_STATE(fmt,l,arg)		desc_copy_state(l,arg)
+#define UNW_DEC_LABEL_STATE(fmt,l,arg)		desc_label_state(l,arg)
+/*
+ * general unwind descriptors:
+ */
+#define UNW_DEC_SPILL_REG_P(f,p,t,a,x,y,arg)	desc_spill_reg_p(p,t,a,x,y,arg)
+#define UNW_DEC_SPILL_REG(f,t,a,x,y,arg)	desc_spill_reg_p(0,t,a,x,y,arg)
+#define UNW_DEC_SPILL_PSPREL_P(f,p,t,a,o,arg)	desc_spill_psprel_p(p,t,a,o,arg)
+#define UNW_DEC_SPILL_PSPREL(f,t,a,o,arg)	desc_spill_psprel_p(0,t,a,o,arg)
+#define UNW_DEC_SPILL_SPREL_P(f,p,t,a,o,arg)	desc_spill_sprel_p(p,t,a,o,arg)
+#define UNW_DEC_SPILL_SPREL(f,t,a,o,arg)	desc_spill_sprel_p(0,t,a,o,arg)
+#define UNW_DEC_RESTORE_P(f,p,t,a,arg)		desc_restore_p(p,t,a,arg)
+#define UNW_DEC_RESTORE(f,t,a,arg)		desc_restore_p(0,t,a,arg)
+
+#include "unwind_decoder.c"
+
+
+/* Unwind scripts. */
+
+static inline unw_hash_index_t
+hash (unsigned long ip)
+{
+# define hashmagic	0x9e3779b97f4a7c16UL	/* based on ((sqrt(5)-1)/2)*2^64 */
+
+	return (ip >> 4)*hashmagic >> (64 - UNW_LOG_HASH_SIZE);
+#undef hashmagic
+}
+
+static inline long
+cache_match (struct unw_script *script, unsigned long ip, unsigned long pr)
+{
+	read_lock(&script->lock);
+	if (ip == script->ip && ((pr ^ script->pr_val) & script->pr_mask) == 0)
+		/* keep the read lock...
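+		 * across the return: the caller executes the script under
+		 * this read lock and releases it in find_save_locs() once
+		 * run_script() has finished.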
*/ + return 1; + read_unlock(&script->lock); + return 0; +} + +static inline struct unw_script * +script_lookup (struct unw_frame_info *info) +{ + struct unw_script *script = unw.cache + info->hint; + unsigned short index; + unsigned long ip, pr; + + if (UNW_DEBUG_ON(0)) + return NULL; /* Always regenerate scripts in debug mode */ + + STAT(++unw.stat.cache.lookups); + + ip = info->ip; + pr = info->pr; + + if (cache_match(script, ip, pr)) { + STAT(++unw.stat.cache.hinted_hits); + return script; + } + + index = unw.hash[hash(ip)]; + if (index >= UNW_CACHE_SIZE) + return NULL; + + script = unw.cache + index; + while (1) { + if (cache_match(script, ip, pr)) { + /* update hint; no locking required as single-word writes are atomic */ + STAT(++unw.stat.cache.normal_hits); + unw.cache[info->prev_script].hint = script - unw.cache; + return script; + } + if (script->coll_chain >= UNW_HASH_SIZE) + return NULL; + script = unw.cache + script->coll_chain; + STAT(++unw.stat.cache.collision_chain_traversals); + } +} + +/* + * On returning, a write lock for the SCRIPT is still being held. + */ +static inline struct unw_script * +script_new (unsigned long ip) +{ + struct unw_script *script, *prev, *tmp; + unw_hash_index_t index; + unsigned short head; + + STAT(++unw.stat.script.news); + + /* + * Can't (easily) use cmpxchg() here because of ABA problem + * that is intrinsic in cmpxchg()... + */ + head = unw.lru_head; + script = unw.cache + head; + unw.lru_head = script->lru_chain; + + /* + * We'd deadlock here if we interrupted a thread that is holding a read lock on + * script->lock. Thus, if the write_trylock() fails, we simply bail out. The + * alternative would be to disable interrupts whenever we hold a read-lock, but + * that seems silly. + */ + if (!write_trylock(&script->lock)) + return NULL; + + /* re-insert script at the tail of the LRU chain: */ + unw.cache[unw.lru_tail].lru_chain = head; + unw.lru_tail = head; + + /* remove the old script from the hash table (if it's there): */ + if (script->ip) { + index = hash(script->ip); + tmp = unw.cache + unw.hash[index]; + prev = NULL; + while (1) { + if (tmp == script) { + if (prev) + prev->coll_chain = tmp->coll_chain; + else + unw.hash[index] = tmp->coll_chain; + break; + } else + prev = tmp; + if (tmp->coll_chain >= UNW_CACHE_SIZE) + /* old script wasn't in the hash-table */ + break; + tmp = unw.cache + tmp->coll_chain; + } + } + + /* enter new script in the hash table */ + index = hash(ip); + script->coll_chain = unw.hash[index]; + unw.hash[index] = script - unw.cache; + + script->ip = ip; /* set new IP while we're holding the locks */ + + STAT(if (script->coll_chain < UNW_CACHE_SIZE) ++unw.stat.script.collisions); + + script->flags = 0; + script->hint = 0; + script->count = 0; + return script; +} + +static void +script_finalize (struct unw_script *script, struct unw_state_record *sr) +{ + script->pr_mask = sr->pr_mask; + script->pr_val = sr->pr_val; + /* + * We could down-grade our write-lock on script->lock here but + * the rwlock API doesn't offer atomic lock downgrading, so + * we'll just keep the write-lock and release it later when + * we're done using the script. 
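+	 * Concretely: script_new() returns with the write lock held,
+	 * build_script() fills in and finalizes the script, and
+	 * find_save_locs() drops the lock only after run_script() is done.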
+	 */
+}
+
+static inline void
+script_emit (struct unw_script *script, struct unw_insn insn)
+{
+	if (script->count >= UNW_MAX_SCRIPT_LEN) {
+		UNW_DPRINT(0, "unwind.%s: script exceeds maximum size of %u instructions!\n",
+			   __FUNCTION__, UNW_MAX_SCRIPT_LEN);
+		return;
+	}
+	script->insn[script->count++] = insn;
+}
+
+static inline void
+emit_nat_info (struct unw_state_record *sr, int i, struct unw_script *script)
+{
+	struct unw_reg_info *r = sr->curr.reg + i;
+	enum unw_insn_opcode opc;
+	struct unw_insn insn;
+	unsigned long val = 0;
+
+	switch (r->where) {
+	case UNW_WHERE_GR:
+		if (r->val >= 32) {
+			/* register got spilled to a stacked register */
+			opc = UNW_INSN_SETNAT_TYPE;
+			val = UNW_NAT_REGSTK;
+		} else
+			/* register got spilled to a scratch register */
+			opc = UNW_INSN_SETNAT_MEMSTK;
+		break;
+
+	case UNW_WHERE_FR:
+		opc = UNW_INSN_SETNAT_TYPE;
+		val = UNW_NAT_VAL;
+		break;
+
+	case UNW_WHERE_BR:
+		opc = UNW_INSN_SETNAT_TYPE;
+		val = UNW_NAT_NONE;
+		break;
+
+	case UNW_WHERE_PSPREL:
+	case UNW_WHERE_SPREL:
+		opc = UNW_INSN_SETNAT_MEMSTK;
+		break;
+
+	default:
+		UNW_DPRINT(0, "unwind.%s: don't know how to emit nat info for where = %u\n",
+			   __FUNCTION__, r->where);
+		return;
+	}
+	insn.opc = opc;
+	insn.dst = unw.preg_index[i];
+	insn.val = val;
+	script_emit(script, insn);
+}
+
+static void
+compile_reg (struct unw_state_record *sr, int i, struct unw_script *script)
+{
+	struct unw_reg_info *r = sr->curr.reg + i;
+	enum unw_insn_opcode opc;
+	unsigned long val, rval;
+	struct unw_insn insn;
+	long need_nat_info;
+
+	if (r->where == UNW_WHERE_NONE || r->when >= sr->when_target)
+		return;
+
+	opc = UNW_INSN_MOVE;
+	val = rval = r->val;
+	need_nat_info = (i >= UNW_REG_R4 && i <= UNW_REG_R7);
+
+	switch (r->where) {
+	case UNW_WHERE_GR:
+		if (rval >= 32) {
+			opc = UNW_INSN_MOVE_STACKED;
+			val = rval - 32;
+		} else if (rval >= 4 && rval <= 7) {
+			if (need_nat_info) {
+				opc = UNW_INSN_MOVE2;
+				need_nat_info = 0;
+			}
+			val = unw.preg_index[UNW_REG_R4 + (rval - 4)];
+		} else if (rval == 0) {
+			opc = UNW_INSN_MOVE_CONST;
+			val = 0;
+		} else {
+			/* register got spilled to a scratch register */
+			opc = UNW_INSN_MOVE_SCRATCH;
+			val = pt_regs_off(rval);
+		}
+		break;
+
+	case UNW_WHERE_FR:
+		if (rval <= 5)
+			val = unw.preg_index[UNW_REG_F2 + (rval - 2)];
+		else if (rval >= 16 && rval <= 31)
+			val = unw.preg_index[UNW_REG_F16 + (rval - 16)];
+		else {
+			opc = UNW_INSN_MOVE_SCRATCH;
+			if (rval <= 11)
+				val = offsetof(struct pt_regs, f6) + 16*(rval - 6);
+			else
+				UNW_DPRINT(0, "unwind.%s: kernel may not touch f%lu\n",
+					   __FUNCTION__, rval);
+		}
+		break;
+
+	case UNW_WHERE_BR:
+		if (rval >= 1 && rval <= 5)
+			val = unw.preg_index[UNW_REG_B1 + (rval - 1)];
+		else {
+			opc = UNW_INSN_MOVE_SCRATCH;
+			if (rval == 0)
+				val = offsetof(struct pt_regs, b0);
+			else if (rval == 6)
+				val = offsetof(struct pt_regs, b6);
+			else
+				val = offsetof(struct pt_regs, b7);
+		}
+		break;
+
+	case UNW_WHERE_SPREL:
+		opc = UNW_INSN_ADD_SP;
+		break;
+
+	case UNW_WHERE_PSPREL:
+		opc = UNW_INSN_ADD_PSP;
+		break;
+
+	default:
+		UNW_DPRINT(0, "unwind%s: register %u has unexpected `where' value of %u\n",
+			   __FUNCTION__, i, r->where);
+		break;
+	}
+	insn.opc = opc;
+	insn.dst = unw.preg_index[i];
+	insn.val = val;
+	script_emit(script, insn);
+	if (need_nat_info)
+		emit_nat_info(sr, i, script);
+
+	if (i == UNW_REG_PSP) {
+		/*
+		 * info->psp must contain the _value_ of the previous
+		 * sp, not its save location.
We get this by
+		 * dereferencing the value we just stored in
+		 * info->psp:
+		 */
+		insn.opc = UNW_INSN_LOAD;
+		insn.dst = insn.val = unw.preg_index[UNW_REG_PSP];
+		script_emit(script, insn);
+	}
+}
+
+static inline const struct unw_table_entry *
+lookup (struct unw_table *table, unsigned long rel_ip)
+{
+	const struct unw_table_entry *e = NULL;
+	unsigned long lo, hi, mid;
+
+	/* do a binary search for right entry: */
+	for (lo = 0, hi = table->length; lo < hi; ) {
+		mid = (lo + hi) / 2;
+		e = &table->array[mid];
+		if (rel_ip < e->start_offset)
+			hi = mid;
+		else if (rel_ip >= e->end_offset)
+			lo = mid + 1;
+		else
+			break;
+	}
+	if (rel_ip < e->start_offset || rel_ip >= e->end_offset)
+		return NULL;
+	return e;
+}
+
+/*
+ * Build an unwind script that unwinds from state OLD_STATE to the
+ * entrypoint of the function that called OLD_STATE.
+ */
+static inline struct unw_script *
+build_script (struct unw_frame_info *info)
+{
+	const struct unw_table_entry *e = NULL;
+	struct unw_script *script = NULL;
+	struct unw_labeled_state *ls, *next;
+	unsigned long ip = info->ip;
+	struct unw_state_record sr;
+	struct unw_table *table;
+	struct unw_reg_info *r;
+	struct unw_insn insn;
+	u8 *dp, *desc_end;
+	u64 hdr;
+	int i;
+	STAT(unsigned long start, parse_start;)
+
+	STAT(++unw.stat.script.builds; start = ia64_get_itc());
+
+	/* build state record */
+	memset(&sr, 0, sizeof(sr));
+	for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
+		r->when = UNW_WHEN_NEVER;
+	sr.pr_val = info->pr;
+
+	UNW_DPRINT(3, "unwind.%s: ip 0x%lx\n", __FUNCTION__, ip);
+	script = script_new(ip);
+	if (!script) {
+		UNW_DPRINT(0, "unwind.%s: failed to create unwind script\n", __FUNCTION__);
+		STAT(unw.stat.script.build_time += ia64_get_itc() - start);
+		return NULL;
+	}
+	unw.cache[info->prev_script].hint = script - unw.cache;
+
+	/* search the kernel's and the modules' unwind tables for IP: */
+
+	STAT(parse_start = ia64_get_itc());
+
+	for (table = unw.tables; table; table = table->next) {
+		if (ip >= table->start && ip < table->end) {
+			e = lookup(table, ip - table->segment_base);
+			break;
+		}
+	}
+	if (!e) {
+		/* no info, return default unwinder (leaf proc, no mem stack, no saved regs) */
+		UNW_DPRINT(1, "unwind.%s: no unwind info for ip=0x%lx (prev ip=0x%lx)\n",
+			   __FUNCTION__, ip, unw.cache[info->prev_script].ip);
+		sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
+		sr.curr.reg[UNW_REG_RP].when = -1;
+		sr.curr.reg[UNW_REG_RP].val = 0;
+		compile_reg(&sr, UNW_REG_RP, script);
+		script_finalize(script, &sr);
+		STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
+		STAT(unw.stat.script.build_time += ia64_get_itc() - start);
+		return script;
+	}
+
+	sr.when_target = (3*((ip & ~0xfUL) - (table->segment_base + e->start_offset))/16
+			  + (ip & 0xfUL));
+	hdr = *(u64 *) (table->segment_base + e->info_offset);
+	dp = (u8 *) (table->segment_base + e->info_offset + 8);
+	desc_end = dp + 8*UNW_LENGTH(hdr);
+
+	while (!sr.done && dp < desc_end)
+		dp = unw_decode(dp, sr.in_body, &sr);
+
+	if (sr.when_target > sr.epilogue_start) {
+		/*
+		 * sp has been restored and all values on the memory stack below
+		 * psp also have been restored.
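+		 * Any location still described as sp- or psp-relative is
+		 * therefore stale; the code below resets such registers
+		 * (e.g. one with where == UNW_WHERE_SPREL) to "not saved",
+		 * so the current frame's value gets used instead.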
+		 */
+		sr.curr.reg[UNW_REG_PSP].val = 0;
+		sr.curr.reg[UNW_REG_PSP].where = UNW_WHERE_NONE;
+		sr.curr.reg[UNW_REG_PSP].when = UNW_WHEN_NEVER;
+		for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r)
+			if ((r->where == UNW_WHERE_PSPREL && r->val <= 0x10)
+			    || r->where == UNW_WHERE_SPREL)
+			{
+				r->val = 0;
+				r->where = UNW_WHERE_NONE;
+				r->when = UNW_WHEN_NEVER;
+			}
+	}
+
+	script->flags = sr.flags;
+
+	/*
+	 * If RP didn't get saved, generate entry for the return link
+	 * register.
+	 */
+	if (sr.curr.reg[UNW_REG_RP].when >= sr.when_target) {
+		sr.curr.reg[UNW_REG_RP].where = UNW_WHERE_BR;
+		sr.curr.reg[UNW_REG_RP].when = -1;
+		sr.curr.reg[UNW_REG_RP].val = sr.return_link_reg;
+		UNW_DPRINT(1, "unwind.%s: using default for rp at ip=0x%lx where=%d val=0x%lx\n",
+			   __FUNCTION__, ip, sr.curr.reg[UNW_REG_RP].where,
+			   sr.curr.reg[UNW_REG_RP].val);
+	}
+
+#ifdef UNW_DEBUG
+	UNW_DPRINT(1, "unwind.%s: state record for func 0x%lx, t=%u:\n",
+		   __FUNCTION__, table->segment_base + e->start_offset, sr.when_target);
+	for (r = sr.curr.reg; r < sr.curr.reg + UNW_NUM_REGS; ++r) {
+		if (r->where != UNW_WHERE_NONE || r->when != UNW_WHEN_NEVER) {
+			UNW_DPRINT(1, "  %s <- ", unw.preg_name[r - sr.curr.reg]);
+			switch (r->where) {
+			case UNW_WHERE_GR:     UNW_DPRINT(1, "r%lu", r->val); break;
+			case UNW_WHERE_FR:     UNW_DPRINT(1, "f%lu", r->val); break;
+			case UNW_WHERE_BR:     UNW_DPRINT(1, "b%lu", r->val); break;
+			case UNW_WHERE_SPREL:  UNW_DPRINT(1, "[sp+0x%lx]", r->val); break;
+			case UNW_WHERE_PSPREL: UNW_DPRINT(1, "[psp+0x%lx]", r->val); break;
+			case UNW_WHERE_NONE:
+				UNW_DPRINT(1, "%s+0x%lx", unw.preg_name[r - sr.curr.reg], r->val);
+				break;
+
+			default:
+				UNW_DPRINT(1, "BADWHERE(%d)", r->where);
+				break;
+			}
+			UNW_DPRINT(1, "\t\t%d\n", r->when);
+		}
+	}
+#endif
+
+	STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
+
+	/* translate state record into unwinder instructions: */
+
+	/*
+	 * First, set psp if we're dealing with a fixed-size frame;
+	 * subsequent instructions may depend on this value.
+	 */
+	if (sr.when_target > sr.curr.reg[UNW_REG_PSP].when
+	    && (sr.curr.reg[UNW_REG_PSP].where == UNW_WHERE_NONE)
+	    && sr.curr.reg[UNW_REG_PSP].val != 0) {
+		/* new psp is sp plus frame size */
+		insn.opc = UNW_INSN_ADD;
+		insn.dst = offsetof(struct unw_frame_info, psp)/8;
+		insn.val = sr.curr.reg[UNW_REG_PSP].val;	/* frame size */
+		script_emit(script, insn);
+	}
+
+	/* determine where the primary UNaT is: */
+	if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
+		i = UNW_REG_PRI_UNAT_MEM;
+	else if (sr.when_target < sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when)
+		i = UNW_REG_PRI_UNAT_GR;
+	else if (sr.curr.reg[UNW_REG_PRI_UNAT_MEM].when > sr.curr.reg[UNW_REG_PRI_UNAT_GR].when)
+		i = UNW_REG_PRI_UNAT_MEM;
+	else
+		i = UNW_REG_PRI_UNAT_GR;
+
+	compile_reg(&sr, i, script);
+
+	for (i = UNW_REG_BSP; i < UNW_NUM_REGS; ++i)
+		compile_reg(&sr, i, script);
+
+	/* free labeled register states & stack: */
+
+	STAT(parse_start = ia64_get_itc());
+	for (ls = sr.labeled_states; ls; ls = next) {
+		next = ls->next;
+		free_state_stack(&ls->saved_state);
+		free_labeled_state(ls);
+	}
+	free_state_stack(&sr.curr);
+	STAT(unw.stat.script.parse_time += ia64_get_itc() - parse_start);
+
+	script_finalize(script, &sr);
+	STAT(unw.stat.script.build_time += ia64_get_itc() - start);
+	return script;
+}
+
+/*
+ * Apply the unwinding actions represented by OPS and update SR to
+ * reflect the state that existed upon entry to the function that this
+ * unwinder represents.
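+ *
+ * For example (sketch), the script for a frame with a fixed-size memory
+ * stack might run just two instructions:
+ *
+ *	UNW_INSN_ADD		psp    <- sp + frame-size
+ *	UNW_INSN_ADD_PSP	rp_loc <- psp + save-offset
+ *
+ * each updating one word of the unw_frame_info array that run_script()
+ * below interprets the script against.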
+ */ +static inline void +run_script (struct unw_script *script, struct unw_frame_info *state) +{ + struct unw_insn *ip, *limit, next_insn; + unsigned long opc, dst, val, off; + unsigned long *s = (unsigned long *) state; + STAT(unsigned long start;) + + STAT(++unw.stat.script.runs; start = ia64_get_itc()); + state->flags = script->flags; + ip = script->insn; + limit = script->insn + script->count; + next_insn = *ip; + + while (ip++ < limit) { + opc = next_insn.opc; + dst = next_insn.dst; + val = next_insn.val; + next_insn = *ip; + + redo: + switch (opc) { + case UNW_INSN_ADD: + s[dst] += val; + break; + + case UNW_INSN_MOVE2: + if (!s[val]) + goto lazy_init; + s[dst+1] = s[val+1]; + s[dst] = s[val]; + break; + + case UNW_INSN_MOVE: + if (!s[val]) + goto lazy_init; + s[dst] = s[val]; + break; + + case UNW_INSN_MOVE_SCRATCH: + if (state->pt) { + s[dst] = (unsigned long) get_scratch_regs(state) + val; + } else { + s[dst] = 0; + UNW_DPRINT(0, "unwind.%s: no state->pt, dst=%ld, val=%ld\n", + __FUNCTION__, dst, val); + } + break; + + case UNW_INSN_MOVE_CONST: + if (val == 0) + s[dst] = (unsigned long) &unw.r0; + else { + s[dst] = 0; + UNW_DPRINT(0, "unwind.%s: UNW_INSN_MOVE_CONST bad val=%ld\n", + __FUNCTION__, val); + } + break; + + + case UNW_INSN_MOVE_STACKED: + s[dst] = (unsigned long) ia64_rse_skip_regs((unsigned long *)state->bsp, + val); + break; + + case UNW_INSN_ADD_PSP: + s[dst] = state->psp + val; + break; + + case UNW_INSN_ADD_SP: + s[dst] = state->sp + val; + break; + + case UNW_INSN_SETNAT_MEMSTK: + if (!state->pri_unat_loc) + state->pri_unat_loc = &state->sw->caller_unat; + /* register off. is a multiple of 8, so the least 3 bits (type) are 0 */ + s[dst+1] = ((unsigned long) state->pri_unat_loc - s[dst]) | UNW_NAT_MEMSTK; + break; + + case UNW_INSN_SETNAT_TYPE: + s[dst+1] = val; + break; + + case UNW_INSN_LOAD: +#ifdef UNW_DEBUG + if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0 +#ifndef XEN + || s[val] < TASK_SIZE +#endif + ) + { + UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n", + __FUNCTION__, s[val]); + break; + } +#endif + s[dst] = *(unsigned long *) s[val]; + break; + } + } + STAT(unw.stat.script.run_time += ia64_get_itc() - start); + return; + + lazy_init: + off = unw.sw_off[val]; + s[val] = (unsigned long) state->sw + off; + if (off >= offsetof(struct switch_stack, r4) && off <= offsetof(struct switch_stack, r7)) + /* + * We're initializing a general register: init NaT info, too. Note that + * the offset is a multiple of 8 which gives us the 3 bits needed for + * the type field. + */ + s[val+1] = (offsetof(struct switch_stack, ar_unat) - off) | UNW_NAT_MEMSTK; + goto redo; +} + +static int +find_save_locs (struct unw_frame_info *info) +{ + int have_write_lock = 0; + struct unw_script *scr; + unsigned long flags = 0; + + if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) +#ifndef XEN + || info->ip < TASK_SIZE +#endif + ) { + /* don't let obviously bad addresses pollute the cache */ + /* FIXME: should really be level 0 but it occurs too often. 
KAO */ + UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip); + info->rp_loc = NULL; + return -1; + } + + scr = script_lookup(info); + if (!scr) { + spin_lock_irqsave(&unw.lock, flags); + scr = build_script(info); + if (!scr) { + spin_unlock_irqrestore(&unw.lock, flags); + UNW_DPRINT(0, + "unwind.%s: failed to locate/build unwind script for ip %lx\n", + __FUNCTION__, info->ip); + return -1; + } + have_write_lock = 1; + } + info->hint = scr->hint; + info->prev_script = scr - unw.cache; + + run_script(scr, info); + + if (have_write_lock) { + write_unlock(&scr->lock); + spin_unlock_irqrestore(&unw.lock, flags); + } else + read_unlock(&scr->lock); + return 0; +} + +int +unw_unwind (struct unw_frame_info *info) +{ + unsigned long prev_ip, prev_sp, prev_bsp; + unsigned long ip, pr, num_regs; + STAT(unsigned long start, flags;) + int retval; + + STAT(local_irq_save(flags); ++unw.stat.api.unwinds; start = ia64_get_itc()); + + prev_ip = info->ip; + prev_sp = info->sp; + prev_bsp = info->bsp; + + /* restore the ip */ + if (!info->rp_loc) { + /* FIXME: should really be level 0 but it occurs too often. KAO */ + UNW_DPRINT(1, "unwind.%s: failed to locate return link (ip=0x%lx)!\n", + __FUNCTION__, info->ip); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + ip = info->ip = *info->rp_loc; + if (ip < GATE_ADDR) { + UNW_DPRINT(2, "unwind.%s: reached user-space (ip=0x%lx)\n", __FUNCTION__, ip); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + + /* restore the cfm: */ + if (!info->pfs_loc) { + UNW_DPRINT(0, "unwind.%s: failed to locate ar.pfs!\n", __FUNCTION__); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + info->cfm_loc = info->pfs_loc; + + /* restore the bsp: */ + pr = info->pr; + num_regs = 0; + if ((info->flags & UNW_FLAG_INTERRUPT_FRAME)) { + info->pt = info->sp + 16; + if ((pr & (1UL << PRED_NON_SYSCALL)) != 0) + num_regs = *info->cfm_loc & 0x7f; /* size of frame */ + info->pfs_loc = + (unsigned long *) (info->pt + offsetof(struct pt_regs, ar_pfs)); + UNW_DPRINT(3, "unwind.%s: interrupt_frame pt 0x%lx\n", __FUNCTION__, info->pt); + } else + num_regs = (*info->cfm_loc >> 7) & 0x7f; /* size of locals */ + info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->bsp, -num_regs); + if (info->bsp < info->regstk.limit || info->bsp > info->regstk.top) { + UNW_DPRINT(0, "unwind.%s: bsp (0x%lx) out of range [0x%lx-0x%lx]\n", + __FUNCTION__, info->bsp, info->regstk.limit, info->regstk.top); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + + /* restore the sp: */ + info->sp = info->psp; + if (info->sp < info->memstk.top || info->sp > info->memstk.limit) { + UNW_DPRINT(0, "unwind.%s: sp (0x%lx) out of range [0x%lx-0x%lx]\n", + __FUNCTION__, info->sp, info->memstk.top, info->memstk.limit); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + + if (info->ip == prev_ip && info->sp == prev_sp && info->bsp == prev_bsp) { + UNW_DPRINT(0, "unwind.%s: ip, sp, bsp unchanged; stopping here (ip=0x%lx)\n", + __FUNCTION__, ip); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return -1; + } + + /* as we unwind, the saved ar.unat becomes the primary unat: */ + info->pri_unat_loc = info->unat_loc; + + /* finally, restore the predicates: */ + unw_get_pr(info, &info->pr); + + 
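+	/*
+	 * One step of the unwind is now complete: ip, sp, and bsp describe
+	 * the caller's frame.  Re-run the script machinery so that the save
+	 * locations (rp_loc, pfs_loc, ...) are valid for that frame too.
+	 */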
retval = find_save_locs(info); + STAT(unw.stat.api.unwind_time += ia64_get_itc() - start; local_irq_restore(flags)); + return retval; +} +EXPORT_SYMBOL(unw_unwind); + +int +unw_unwind_to_user (struct unw_frame_info *info) +{ + unsigned long ip, sp, pr = 0; + + while (unw_unwind(info) >= 0) { + unw_get_sp(info, &sp); + if ((long)((unsigned long)info->task + IA64_STK_OFFSET - sp) + < IA64_PT_REGS_SIZE) { + UNW_DPRINT(0, "unwind.%s: ran off the top of the kernel stack\n", + __FUNCTION__); + break; + } + if (unw_is_intr_frame(info) && + (pr & (1UL << PRED_USER_STACK))) + return 0; + if (unw_get_pr (info, &pr) < 0) { + unw_get_rp(info, &ip); + UNW_DPRINT(0, "unwind.%s: failed to read " + "predicate register (ip=0x%lx)\n", + __FUNCTION__, ip); + return -1; + } + } + unw_get_ip(info, &ip); + UNW_DPRINT(0, "unwind.%s: failed to unwind to user-level (ip=0x%lx)\n", + __FUNCTION__, ip); + return -1; +} +EXPORT_SYMBOL(unw_unwind_to_user); + +static void +init_frame_info (struct unw_frame_info *info, struct task_struct *t, + struct switch_stack *sw, unsigned long stktop) +{ + unsigned long rbslimit, rbstop, stklimit; + STAT(unsigned long start, flags;) + + STAT(local_irq_save(flags); ++unw.stat.api.inits; start = ia64_get_itc()); + + /* + * Subtle stuff here: we _could_ unwind through the switch_stack frame but we + * don't want to do that because it would be slow as each preserved register would + * have to be processed. Instead, what we do here is zero out the frame info and + * start the unwind process at the function that created the switch_stack frame. + * When a preserved value in switch_stack needs to be accessed, run_script() will + * initialize the appropriate pointer on demand. + */ + memset(info, 0, sizeof(*info)); + + rbslimit = (unsigned long) t + IA64_RBS_OFFSET; + rbstop = sw->ar_bspstore; + if (rbstop - (unsigned long) t >= IA64_STK_OFFSET) + rbstop = rbslimit; + + stklimit = (unsigned long) t + IA64_STK_OFFSET; + if (stktop <= rbstop) + stktop = rbstop; + + info->regstk.limit = rbslimit; + info->regstk.top = rbstop; + info->memstk.limit = stklimit; + info->memstk.top = stktop; + info->task = t; + info->sw = sw; + info->sp = info->psp = stktop; + info->pr = sw->pr; + UNW_DPRINT(3, "unwind.%s:\n" + " task 0x%lx\n" + " rbs = [0x%lx-0x%lx)\n" + " stk = [0x%lx-0x%lx)\n" + " pr 0x%lx\n" + " sw 0x%lx\n" + " sp 0x%lx\n", + __FUNCTION__, (unsigned long) t, rbslimit, rbstop, stktop, stklimit, + info->pr, (unsigned long) info->sw, info->sp); + STAT(unw.stat.api.init_time += ia64_get_itc() - start; local_irq_restore(flags)); +} + +void +unw_init_frame_info (struct unw_frame_info *info, struct task_struct *t, struct switch_stack *sw) +{ + unsigned long sol; + + init_frame_info(info, t, sw, (unsigned long) (sw + 1) - 16); + info->cfm_loc = &sw->ar_pfs; + sol = (*info->cfm_loc >> 7) & 0x7f; + info->bsp = (unsigned long) ia64_rse_skip_regs((unsigned long *) info->regstk.top, -sol); + info->ip = sw->b0; + UNW_DPRINT(3, "unwind.%s:\n" + " bsp 0x%lx\n" + " sol 0x%lx\n" + " ip 0x%lx\n", + __FUNCTION__, info->bsp, sol, info->ip); + find_save_locs(info); +} + +EXPORT_SYMBOL(unw_init_frame_info); + +void +unw_init_from_blocked_task (struct unw_frame_info *info, struct task_struct *t) +{ +#ifdef XEN + struct switch_stack *sw = (struct switch_stack *) (t->arch._thread.ksp + 16); +#else + struct switch_stack *sw = (struct switch_stack *) (t->thread.ksp + 16); +#endif + + UNW_DPRINT(1, "unwind.%s\n", __FUNCTION__); + unw_init_frame_info(info, t, sw); +} +EXPORT_SYMBOL(unw_init_from_blocked_task); + +static void 
+init_unwind_table (struct unw_table *table, const char *name, unsigned long segment_base,
+		   unsigned long gp, const void *table_start, const void *table_end)
+{
+	const struct unw_table_entry *start = table_start, *end = table_end;
+
+	table->name = name;
+	table->segment_base = segment_base;
+	table->gp = gp;
+	table->start = segment_base + start[0].start_offset;
+	table->end = segment_base + end[-1].end_offset;
+	table->array = start;
+	table->length = end - start;
+}
+
+#ifndef XEN
+void *
+unw_add_unwind_table (const char *name, unsigned long segment_base, unsigned long gp,
+		      const void *table_start, const void *table_end)
+{
+	const struct unw_table_entry *start = table_start, *end = table_end;
+	struct unw_table *table;
+	unsigned long flags;
+
+	if (end - start <= 0) {
+		UNW_DPRINT(0, "unwind.%s: ignoring attempt to insert empty unwind table\n",
+			   __FUNCTION__);
+		return NULL;
+	}
+
+	table = kmalloc(sizeof(*table), GFP_USER);
+	if (!table)
+		return NULL;
+
+	init_unwind_table(table, name, segment_base, gp, table_start, table_end);
+
+	spin_lock_irqsave(&unw.lock, flags);
+	{
+		/* keep kernel unwind table at the front (it's searched most commonly): */
+		table->next = unw.tables->next;
+		unw.tables->next = table;
+	}
+	spin_unlock_irqrestore(&unw.lock, flags);
+
+	return table;
+}
+
+void
+unw_remove_unwind_table (void *handle)
+{
+	struct unw_table *table, *prev;
+	struct unw_script *tmp;
+	unsigned long flags;
+	long index;
+
+	if (!handle) {
+		UNW_DPRINT(0, "unwind.%s: ignoring attempt to remove non-existent unwind table\n",
+			   __FUNCTION__);
+		return;
+	}
+
+	table = handle;
+	if (table == &unw.kernel_table) {
+		UNW_DPRINT(0, "unwind.%s: sorry, freeing the kernel's unwind table is a "
+			   "no-can-do!\n", __FUNCTION__);
+		return;
+	}
+
+	spin_lock_irqsave(&unw.lock, flags);
+	{
+		/* first, delete the table: */
+
+		for (prev = (struct unw_table *) &unw.tables; prev; prev = prev->next)
+			if (prev->next == table)
+				break;
+		if (!prev) {
+			UNW_DPRINT(0, "unwind.%s: failed to find unwind table %p\n",
+				   __FUNCTION__, (void *) table);
+			spin_unlock_irqrestore(&unw.lock, flags);
+			return;
+		}
+		prev->next = table->next;
+	}
+	spin_unlock_irqrestore(&unw.lock, flags);
+
+	/* next, remove hash table entries for this table */
+
+	for (index = 0; index < UNW_HASH_SIZE; ++index) {
+		tmp = unw.cache + unw.hash[index];
+		if (unw.hash[index] >= UNW_CACHE_SIZE
+		    || tmp->ip < table->start || tmp->ip >= table->end)
+			continue;
+
+		write_lock(&tmp->lock);
+		{
+			if (tmp->ip >= table->start && tmp->ip < table->end) {
+				unw.hash[index] = tmp->coll_chain;
+				tmp->ip = 0;
+			}
+		}
+		write_unlock(&tmp->lock);
+	}
+
+	kfree(table);
+}
+
+static int __init
+create_gate_table (void)
+{
+	const struct unw_table_entry *entry, *start, *end;
+	unsigned long *lp, segbase = GATE_ADDR;
+	size_t info_size, size;
+	char *info;
+	Elf64_Phdr *punw = NULL, *phdr = (Elf64_Phdr *) (GATE_ADDR + GATE_EHDR->e_phoff);
+	int i;
+
+	for (i = 0; i < GATE_EHDR->e_phnum; ++i, ++phdr)
+		if (phdr->p_type == PT_IA_64_UNWIND) {
+			punw = phdr;
+			break;
+		}
+
+	if (!punw) {
+		printk("%s: failed to find gate DSO's unwind table!\n", __FUNCTION__);
+		return 0;
+	}
+
+	start = (const struct unw_table_entry *) punw->p_vaddr;
+	end = (struct unw_table_entry *) ((char *) start + punw->p_memsz);
+	size = 0;
+
+	unw_add_unwind_table("linux-gate.so", segbase, 0, start, end);
+
+	for (entry = start; entry < end; ++entry)
+		size += 3*8 + 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset));
+	size += 8;	/* reserve space for "end of table" marker
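+			 * (the copied table is laid out as {start, end,
+			 * info-offset} triples growing up from the front of
+			 * the buffer, with the unwind-info blobs packed down
+			 * from its end)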
*/ + + unw.gate_table = kmalloc(size, GFP_KERNEL); + if (!unw.gate_table) { + unw.gate_table_size = 0; + printk(KERN_ERR "%s: unable to create unwind data for gate page!\n", __FUNCTION__); + return 0; + } + unw.gate_table_size = size; + + lp = unw.gate_table; + info = (char *) unw.gate_table + size; + + for (entry = start; entry < end; ++entry, lp += 3) { + info_size = 8 + 8*UNW_LENGTH(*(u64 *) (segbase + entry->info_offset)); + info -= info_size; + memcpy(info, (char *) segbase + entry->info_offset, info_size); + + lp[0] = segbase + entry->start_offset; /* start */ + lp[1] = segbase + entry->end_offset; /* end */ + lp[2] = info - (char *) unw.gate_table; /* info */ + } + *lp = 0; /* end-of-table marker */ + return 0; +} + +__initcall(create_gate_table); +#endif // !XEN + +void __init +unw_init (void) +{ + extern char __gp[]; + extern void unw_hash_index_t_is_too_narrow (void); + long i, off; + + if (8*sizeof(unw_hash_index_t) < UNW_LOG_HASH_SIZE) + unw_hash_index_t_is_too_narrow(); + + unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(CALLER_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_BSPSTORE]] = SW(AR_BSPSTORE); + unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_PFS); + unw.sw_off[unw.preg_index[UNW_REG_RP]] = SW(B0); + unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(CALLER_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_PR]] = SW(PR); + unw.sw_off[unw.preg_index[UNW_REG_LC]] = SW(AR_LC); + unw.sw_off[unw.preg_index[UNW_REG_FPSR]] = SW(AR_FPSR); + for (i = UNW_REG_R4, off = SW(R4); i <= UNW_REG_R7; ++i, off += 8) + unw.sw_off[unw.preg_index[i]] = off; + for (i = UNW_REG_B1, off = SW(B1); i <= UNW_REG_B5; ++i, off += 8) + unw.sw_off[unw.preg_index[i]] = off; + for (i = UNW_REG_F2, off = SW(F2); i <= UNW_REG_F5; ++i, off += 16) + unw.sw_off[unw.preg_index[i]] = off; + for (i = UNW_REG_F16, off = SW(F16); i <= UNW_REG_F31; ++i, off += 16) + unw.sw_off[unw.preg_index[i]] = off; + + for (i = 0; i < UNW_CACHE_SIZE; ++i) { + if (i > 0) + unw.cache[i].lru_chain = (i - 1); + unw.cache[i].coll_chain = -1; + rwlock_init(&unw.cache[i].lock); + } + unw.lru_head = UNW_CACHE_SIZE - 1; + unw.lru_tail = 0; + + init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp, + __start_unwind, __end_unwind); +} + +/* + * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED + * + * This system call has been deprecated. The new and improved way to get + * at the kernel's unwind info is via the gate DSO. The address of the + * ELF header for this DSO is passed to user-level via AT_SYSINFO_EHDR. + * + * DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED DEPRECATED + * + * This system call copies the unwind data into the buffer pointed to by BUF and returns + * the size of the unwind data. If BUF_SIZE is smaller than the size of the unwind data + * or if BUF is NULL, nothing is copied, but the system call still returns the size of the + * unwind data. + * + * The first portion of the unwind data contains an unwind table and rest contains the + * associated unwind info (in no particular order). The unwind table consists of a table + * of entries of the form: + * + * u64 start; (64-bit address of start of function) + * u64 end; (64-bit address of start of function) + * u64 info; (BUF-relative offset to unwind info) + * + * The end of the unwind table is indicated by an entry with a START address of zero. + * + * Please see the IA-64 Software Conventions and Runtime Architecture manual for details + * on the format of the unwind info. 
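For illustration, a minimal user-level sketch of consuming the deprecated interface described above: query the size with a NULL buffer, fetch the table, then walk the (start, end, info) triples until the zero START marker. The getunwind() wrapper is an assumption here (e.g. a thin syscall(2) shim); it is not part of this patch.

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    extern long getunwind(void *buf, size_t buf_size); /* assumed syscall wrapper */

    static int dump_unwind_table(void)
    {
        long size = getunwind(NULL, 0);   /* NULL buffer: size query only */
        uint64_t *buf, *lp;

        if (size <= 0)
            return -1;
        buf = malloc(size);
        if (buf == NULL || getunwind(buf, size) != size) {
            free(buf);
            return -1;
        }
        for (lp = buf; lp[0] != 0; lp += 3)   /* START of zero ends the table */
            printf("fn [0x%llx-0x%llx), info at buf+0x%llx\n",
                   (unsigned long long)lp[0], (unsigned long long)lp[1],
                   (unsigned long long)lp[2]);
        free(buf);
        return 0;
    }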
+ * + * ERRORS + * EFAULT BUF points outside your accessible address space. + */ +asmlinkage long +sys_getunwind (void __user *buf, size_t buf_size) +{ + if (buf && buf_size >= unw.gate_table_size) + if (copy_to_user(buf, unw.gate_table, unw.gate_table_size) != 0) + return -EFAULT; + return unw.gate_table_size; +} diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/linux-xen/unwind_decoder.c --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/linux-xen/unwind_decoder.c Mon Jan 9 11:22:17 2006 @@ -0,0 +1,459 @@ +/* + * Copyright (C) 2000 Hewlett-Packard Co + * Copyright (C) 2000 David Mosberger-Tang <davidm@xxxxxxxxxx> + * + * Generic IA-64 unwind info decoder. + * + * This file is used both by the Linux kernel and objdump. Please keep + * the two copies of this file in sync. + * + * You need to customize the decoder by defining the following + * macros/constants before including this file: + * + * Types: + * unw_word Unsigned integer type with at least 64 bits + * + * Register names: + * UNW_REG_BSP + * UNW_REG_BSPSTORE + * UNW_REG_FPSR + * UNW_REG_LC + * UNW_REG_PFS + * UNW_REG_PR + * UNW_REG_RNAT + * UNW_REG_PSP + * UNW_REG_RP + * UNW_REG_UNAT + * + * Decoder action macros: + * UNW_DEC_BAD_CODE(code) + * UNW_DEC_ABI(fmt,abi,context,arg) + * UNW_DEC_BR_GR(fmt,brmask,gr,arg) + * UNW_DEC_BR_MEM(fmt,brmask,arg) + * UNW_DEC_COPY_STATE(fmt,label,arg) + * UNW_DEC_EPILOGUE(fmt,t,ecount,arg) + * UNW_DEC_FRGR_MEM(fmt,grmask,frmask,arg) + * UNW_DEC_FR_MEM(fmt,frmask,arg) + * UNW_DEC_GR_GR(fmt,grmask,gr,arg) + * UNW_DEC_GR_MEM(fmt,grmask,arg) + * UNW_DEC_LABEL_STATE(fmt,label,arg) + * UNW_DEC_MEM_STACK_F(fmt,t,size,arg) + * UNW_DEC_MEM_STACK_V(fmt,t,arg) + * UNW_DEC_PRIUNAT_GR(fmt,r,arg) + * UNW_DEC_PRIUNAT_WHEN_GR(fmt,t,arg) + * UNW_DEC_PRIUNAT_WHEN_MEM(fmt,t,arg) + * UNW_DEC_PRIUNAT_WHEN_PSPREL(fmt,pspoff,arg) + * UNW_DEC_PRIUNAT_WHEN_SPREL(fmt,spoff,arg) + * UNW_DEC_PROLOGUE(fmt,body,rlen,arg) + * UNW_DEC_PROLOGUE_GR(fmt,rlen,mask,grsave,arg) + * UNW_DEC_REG_PSPREL(fmt,reg,pspoff,arg) + * UNW_DEC_REG_REG(fmt,src,dst,arg) + * UNW_DEC_REG_SPREL(fmt,reg,spoff,arg) + * UNW_DEC_REG_WHEN(fmt,reg,t,arg) + * UNW_DEC_RESTORE(fmt,t,abreg,arg) + * UNW_DEC_RESTORE_P(fmt,qp,t,abreg,arg) + * UNW_DEC_SPILL_BASE(fmt,pspoff,arg) + * UNW_DEC_SPILL_MASK(fmt,imaskp,arg) + * UNW_DEC_SPILL_PSPREL(fmt,t,abreg,pspoff,arg) + * UNW_DEC_SPILL_PSPREL_P(fmt,qp,t,abreg,pspoff,arg) + * UNW_DEC_SPILL_REG(fmt,t,abreg,x,ytreg,arg) + * UNW_DEC_SPILL_REG_P(fmt,qp,t,abreg,x,ytreg,arg) + * UNW_DEC_SPILL_SPREL(fmt,t,abreg,spoff,arg) + * UNW_DEC_SPILL_SPREL_P(fmt,qp,t,abreg,pspoff,arg) + */ + +static unw_word +unw_decode_uleb128 (unsigned char **dpp) +{ + unsigned shift = 0; + unw_word byte, result = 0; + unsigned char *bp = *dpp; + + while (1) + { + byte = *bp++; + result |= (byte & 0x7f) << shift; + if ((byte & 0x80) == 0) + break; + shift += 7; + } + *dpp = bp; + return result; +} + +static unsigned char * +unw_decode_x1 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, abreg; + unw_word t, off; + + byte1 = *dp++; + t = unw_decode_uleb128 (&dp); + off = unw_decode_uleb128 (&dp); + abreg = (byte1 & 0x7f); + if (byte1 & 0x80) + UNW_DEC_SPILL_SPREL(X1, t, abreg, off, arg); + else + UNW_DEC_SPILL_PSPREL(X1, t, abreg, off, arg); + return dp; +} + +static unsigned char * +unw_decode_x2 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, byte2, abreg, x, ytreg; + unw_word t; + + byte1 = *dp++; byte2 = *dp++; + t = unw_decode_uleb128 (&dp); + abreg = (byte1 & 0x7f); + ytreg = byte2; + 
x = (byte1 >> 7) & 1; + if ((byte1 & 0x80) == 0 && ytreg == 0) + UNW_DEC_RESTORE(X2, t, abreg, arg); + else + UNW_DEC_SPILL_REG(X2, t, abreg, x, ytreg, arg); + return dp; +} + +static unsigned char * +unw_decode_x3 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, byte2, abreg, qp; + unw_word t, off; + + byte1 = *dp++; byte2 = *dp++; + t = unw_decode_uleb128 (&dp); + off = unw_decode_uleb128 (&dp); + + qp = (byte1 & 0x3f); + abreg = (byte2 & 0x7f); + + if (byte1 & 0x80) + UNW_DEC_SPILL_SPREL_P(X3, qp, t, abreg, off, arg); + else + UNW_DEC_SPILL_PSPREL_P(X3, qp, t, abreg, off, arg); + return dp; +} + +static unsigned char * +unw_decode_x4 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, byte2, byte3, qp, abreg, x, ytreg; + unw_word t; + + byte1 = *dp++; byte2 = *dp++; byte3 = *dp++; + t = unw_decode_uleb128 (&dp); + + qp = (byte1 & 0x3f); + abreg = (byte2 & 0x7f); + x = (byte2 >> 7) & 1; + ytreg = byte3; + + if ((byte2 & 0x80) == 0 && byte3 == 0) + UNW_DEC_RESTORE_P(X4, qp, t, abreg, arg); + else + UNW_DEC_SPILL_REG_P(X4, qp, t, abreg, x, ytreg, arg); + return dp; +} + +static unsigned char * +unw_decode_r1 (unsigned char *dp, unsigned char code, void *arg) +{ + int body = (code & 0x20) != 0; + unw_word rlen; + + rlen = (code & 0x1f); + UNW_DEC_PROLOGUE(R1, body, rlen, arg); + return dp; +} + +static unsigned char * +unw_decode_r2 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char byte1, mask, grsave; + unw_word rlen; + + byte1 = *dp++; + + mask = ((code & 0x7) << 1) | ((byte1 >> 7) & 1); + grsave = (byte1 & 0x7f); + rlen = unw_decode_uleb128 (&dp); + UNW_DEC_PROLOGUE_GR(R2, rlen, mask, grsave, arg); + return dp; +} + +static unsigned char * +unw_decode_r3 (unsigned char *dp, unsigned char code, void *arg) +{ + unw_word rlen; + + rlen = unw_decode_uleb128 (&dp); + UNW_DEC_PROLOGUE(R3, ((code & 0x3) == 1), rlen, arg); + return dp; +} + +static unsigned char * +unw_decode_p1 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char brmask = (code & 0x1f); + + UNW_DEC_BR_MEM(P1, brmask, arg); + return dp; +} + +static unsigned char * +unw_decode_p2_p5 (unsigned char *dp, unsigned char code, void *arg) +{ + if ((code & 0x10) == 0) + { + unsigned char byte1 = *dp++; + + UNW_DEC_BR_GR(P2, ((code & 0xf) << 1) | ((byte1 >> 7) & 1), + (byte1 & 0x7f), arg); + } + else if ((code & 0x08) == 0) + { + unsigned char byte1 = *dp++, r, dst; + + r = ((code & 0x7) << 1) | ((byte1 >> 7) & 1); + dst = (byte1 & 0x7f); + switch (r) + { + case 0: UNW_DEC_REG_GR(P3, UNW_REG_PSP, dst, arg); break; + case 1: UNW_DEC_REG_GR(P3, UNW_REG_RP, dst, arg); break; + case 2: UNW_DEC_REG_GR(P3, UNW_REG_PFS, dst, arg); break; + case 3: UNW_DEC_REG_GR(P3, UNW_REG_PR, dst, arg); break; + case 4: UNW_DEC_REG_GR(P3, UNW_REG_UNAT, dst, arg); break; + case 5: UNW_DEC_REG_GR(P3, UNW_REG_LC, dst, arg); break; + case 6: UNW_DEC_RP_BR(P3, dst, arg); break; + case 7: UNW_DEC_REG_GR(P3, UNW_REG_RNAT, dst, arg); break; + case 8: UNW_DEC_REG_GR(P3, UNW_REG_BSP, dst, arg); break; + case 9: UNW_DEC_REG_GR(P3, UNW_REG_BSPSTORE, dst, arg); break; + case 10: UNW_DEC_REG_GR(P3, UNW_REG_FPSR, dst, arg); break; + case 11: UNW_DEC_PRIUNAT_GR(P3, dst, arg); break; + default: UNW_DEC_BAD_CODE(r); break; + } + } + else if ((code & 0x7) == 0) + UNW_DEC_SPILL_MASK(P4, dp, arg); + else if ((code & 0x7) == 1) + { + unw_word grmask, frmask, byte1, byte2, byte3; + + byte1 = *dp++; byte2 = *dp++; byte3 = *dp++; + grmask = ((byte1 >> 4) & 0xf); + frmask = ((byte1 & 0xf) << 16) 
| (byte2 << 8) | byte3; + UNW_DEC_FRGR_MEM(P5, grmask, frmask, arg); + } + else + UNW_DEC_BAD_CODE(code); + return dp; +} + +static unsigned char * +unw_decode_p6 (unsigned char *dp, unsigned char code, void *arg) +{ + int gregs = (code & 0x10) != 0; + unsigned char mask = (code & 0x0f); + + if (gregs) + UNW_DEC_GR_MEM(P6, mask, arg); + else + UNW_DEC_FR_MEM(P6, mask, arg); + return dp; +} + +static unsigned char * +unw_decode_p7_p10 (unsigned char *dp, unsigned char code, void *arg) +{ + unsigned char r, byte1, byte2; + unw_word t, size; + + if ((code & 0x10) == 0) + { + r = (code & 0xf); + t = unw_decode_uleb128 (&dp); + switch (r) + { + case 0: + size = unw_decode_uleb128 (&dp); + UNW_DEC_MEM_STACK_F(P7, t, size, arg); + break; + + case 1: UNW_DEC_MEM_STACK_V(P7, t, arg); break; + case 2: UNW_DEC_SPILL_BASE(P7, t, arg); break; + case 3: UNW_DEC_REG_SPREL(P7, UNW_REG_PSP, t, arg); break; + case 4: UNW_DEC_REG_WHEN(P7, UNW_REG_RP, t, arg); break; + case 5: UNW_DEC_REG_PSPREL(P7, UNW_REG_RP, t, arg); break; + case 6: UNW_DEC_REG_WHEN(P7, UNW_REG_PFS, t, arg); break; + case 7: UNW_DEC_REG_PSPREL(P7, UNW_REG_PFS, t, arg); break; + case 8: UNW_DEC_REG_WHEN(P7, UNW_REG_PR, t, arg); break; + case 9: UNW_DEC_REG_PSPREL(P7, UNW_REG_PR, t, arg); break; + case 10: UNW_DEC_REG_WHEN(P7, UNW_REG_LC, t, arg); break; + case 11: UNW_DEC_REG_PSPREL(P7, UNW_REG_LC, t, arg); break; + case 12: UNW_DEC_REG_WHEN(P7, UNW_REG_UNAT, t, arg); break; + case 13: UNW_DEC_REG_PSPREL(P7, UNW_REG_UNAT, t, arg); break; + case 14: UNW_DEC_REG_WHEN(P7, UNW_REG_FPSR, t, arg); break; + case 15: UNW_DEC_REG_PSPREL(P7, UNW_REG_FPSR, t, arg); break; + default: UNW_DEC_BAD_CODE(r); break; + } + } + else + { + switch (code & 0xf) + { + case 0x0: /* p8 */ + { + r = *dp++; + t = unw_decode_uleb128 (&dp); + switch (r) + { + case 1: UNW_DEC_REG_SPREL(P8, UNW_REG_RP, t, arg); break; + case 2: UNW_DEC_REG_SPREL(P8, UNW_REG_PFS, t, arg); break; + case 3: UNW_DEC_REG_SPREL(P8, UNW_REG_PR, t, arg); break; + case 4: UNW_DEC_REG_SPREL(P8, UNW_REG_LC, t, arg); break; + case 5: UNW_DEC_REG_SPREL(P8, UNW_REG_UNAT, t, arg); break; + case 6: UNW_DEC_REG_SPREL(P8, UNW_REG_FPSR, t, arg); break; + case 7: UNW_DEC_REG_WHEN(P8, UNW_REG_BSP, t, arg); break; + case 8: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSP, t, arg); break; + case 9: UNW_DEC_REG_SPREL(P8, UNW_REG_BSP, t, arg); break; + case 10: UNW_DEC_REG_WHEN(P8, UNW_REG_BSPSTORE, t, arg); break; + case 11: UNW_DEC_REG_PSPREL(P8, UNW_REG_BSPSTORE, t, arg); break; + case 12: UNW_DEC_REG_SPREL(P8, UNW_REG_BSPSTORE, t, arg); break; + case 13: UNW_DEC_REG_WHEN(P8, UNW_REG_RNAT, t, arg); break; + case 14: UNW_DEC_REG_PSPREL(P8, UNW_REG_RNAT, t, arg); break; + case 15: UNW_DEC_REG_SPREL(P8, UNW_REG_RNAT, t, arg); break; + case 16: UNW_DEC_PRIUNAT_WHEN_GR(P8, t, arg); break; + case 17: UNW_DEC_PRIUNAT_PSPREL(P8, t, arg); break; + case 18: UNW_DEC_PRIUNAT_SPREL(P8, t, arg); break; + case 19: UNW_DEC_PRIUNAT_WHEN_MEM(P8, t, arg); break; + default: UNW_DEC_BAD_CODE(r); break; + } + } + break; + + case 0x1: + byte1 = *dp++; byte2 = *dp++; + UNW_DEC_GR_GR(P9, (byte1 & 0xf), (byte2 & 0x7f), arg); + break; + + case 0xf: /* p10 */ + byte1 = *dp++; byte2 = *dp++; + UNW_DEC_ABI(P10, byte1, byte2, arg); + break; + + case 0x9: + return unw_decode_x1 (dp, code, arg); + + case 0xa: + return unw_decode_x2 (dp, code, arg); + + case 0xb: + return unw_decode_x3 (dp, code, arg); + + case 0xc: + return unw_decode_x4 (dp, code, arg); + + default: + UNW_DEC_BAD_CODE(code); + break; + } + } + return dp; +} + +static unsigned char * 
+unw_decode_b1 (unsigned char *dp, unsigned char code, void *arg) +{ + unw_word label = (code & 0x1f); + + if ((code & 0x20) != 0) + UNW_DEC_COPY_STATE(B1, label, arg); + else + UNW_DEC_LABEL_STATE(B1, label, arg); + return dp; +} + +static unsigned char * +unw_decode_b2 (unsigned char *dp, unsigned char code, void *arg) +{ + unw_word t; + + t = unw_decode_uleb128 (&dp); + UNW_DEC_EPILOGUE(B2, t, (code & 0x1f), arg); + return dp; +} + +static unsigned char * +unw_decode_b3_x4 (unsigned char *dp, unsigned char code, void *arg) +{ + unw_word t, ecount, label; + + if ((code & 0x10) == 0) + { + t = unw_decode_uleb128 (&dp); + ecount = unw_decode_uleb128 (&dp); + UNW_DEC_EPILOGUE(B3, t, ecount, arg); + } + else if ((code & 0x07) == 0) + { + label = unw_decode_uleb128 (&dp); + if ((code & 0x08) != 0) + UNW_DEC_COPY_STATE(B4, label, arg); + else + UNW_DEC_LABEL_STATE(B4, label, arg); + } + else + switch (code & 0x7) + { + case 1: return unw_decode_x1 (dp, code, arg); + case 2: return unw_decode_x2 (dp, code, arg); + case 3: return unw_decode_x3 (dp, code, arg); + case 4: return unw_decode_x4 (dp, code, arg); + default: UNW_DEC_BAD_CODE(code); break; + } + return dp; +} + +typedef unsigned char *(*unw_decoder) (unsigned char *, unsigned char, void *); + +static unw_decoder unw_decode_table[2][8] = +{ + /* prologue table: */ + { + unw_decode_r1, /* 0 */ + unw_decode_r1, + unw_decode_r2, + unw_decode_r3, + unw_decode_p1, /* 4 */ + unw_decode_p2_p5, + unw_decode_p6, + unw_decode_p7_p10 + }, + { + unw_decode_r1, /* 0 */ + unw_decode_r1, + unw_decode_r2, + unw_decode_r3, + unw_decode_b1, /* 4 */ + unw_decode_b1, + unw_decode_b2, + unw_decode_b3_x4 + } +}; + +/* + * Decode one descriptor and return address of next descriptor. + */ +static inline unsigned char * +unw_decode (unsigned char *dp, int inside_body, void *arg) +{ + unw_decoder decoder; + unsigned char code; + + code = *dp++; + decoder = unw_decode_table[inside_body][code >> 5]; + dp = (*decoder) (dp, code, arg); + return dp; +} diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/ia64/linux-xen/unwind_i.h --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/arch/ia64/linux-xen/unwind_i.h Mon Jan 9 11:22:17 2006 @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@xxxxxxxxxx> + * + * Kernel unwind support. 
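The descriptor formats decoded above lean heavily on ULEB128-encoded operands (unw_decode_uleb128() at the top of the file). For anyone unfamiliar with that encoding, here is a standalone round-trip sketch, independent of the kernel code:

    #include <assert.h>
    #include <stdint.h>

    /* Encode v as ULEB128; returns the number of bytes written. */
    static unsigned encode_uleb128(uint64_t v, unsigned char *out)
    {
        unsigned n = 0;
        do {
            unsigned char byte = v & 0x7f;
            v >>= 7;
            if (v != 0)
                byte |= 0x80;          /* high bit: more bytes follow */
            out[n++] = byte;
        } while (v != 0);
        return n;
    }

    /* Decode one ULEB128 value, advancing *dpp past it. */
    static uint64_t decode_uleb128(const unsigned char **dpp)
    {
        const unsigned char *bp = *dpp;
        uint64_t byte, result = 0;
        unsigned shift = 0;

        do {
            byte = *bp++;
            result |= (byte & 0x7f) << shift;
            shift += 7;
        } while (byte & 0x80);
        *dpp = bp;
        return result;
    }

    int main(void)
    {
        unsigned char buf[10];
        const unsigned char *p = buf;

        encode_uleb128(624485, buf);           /* encodes as e5 8e 26 */
        assert(decode_uleb128(&p) == 624485);
        return 0;
    }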
+ */ + +#define UNW_VER(x) ((x) >> 48) +#define UNW_FLAG_MASK 0x0000ffff00000000 +#define UNW_FLAG_OSMASK 0x0000f00000000000 +#define UNW_FLAG_EHANDLER(x) ((x) & 0x0000000100000000L) +#define UNW_FLAG_UHANDLER(x) ((x) & 0x0000000200000000L) +#define UNW_LENGTH(x) ((x) & 0x00000000ffffffffL) + +enum unw_register_index { + /* primary unat: */ + UNW_REG_PRI_UNAT_GR, + UNW_REG_PRI_UNAT_MEM, + + /* register stack */ + UNW_REG_BSP, /* register stack pointer */ + UNW_REG_BSPSTORE, + UNW_REG_PFS, /* previous function state */ + UNW_REG_RNAT, + /* memory stack */ + UNW_REG_PSP, /* previous memory stack pointer */ + /* return pointer: */ + UNW_REG_RP, + + /* preserved registers: */ + UNW_REG_R4, UNW_REG_R5, UNW_REG_R6, UNW_REG_R7, + UNW_REG_UNAT, UNW_REG_PR, UNW_REG_LC, UNW_REG_FPSR, + UNW_REG_B1, UNW_REG_B2, UNW_REG_B3, UNW_REG_B4, UNW_REG_B5, + UNW_REG_F2, UNW_REG_F3, UNW_REG_F4, UNW_REG_F5, + UNW_REG_F16, UNW_REG_F17, UNW_REG_F18, UNW_REG_F19, + UNW_REG_F20, UNW_REG_F21, UNW_REG_F22, UNW_REG_F23, + UNW_REG_F24, UNW_REG_F25, UNW_REG_F26, UNW_REG_F27, + UNW_REG_F28, UNW_REG_F29, UNW_REG_F30, UNW_REG_F31, + UNW_NUM_REGS +}; + +struct unw_info_block { + u64 header; + u64 desc[0]; /* unwind descriptors */ + /* personality routine and language-specific data follow behind descriptors */ +}; + +struct unw_table { + struct unw_table *next; /* must be first member! */ + const char *name; + unsigned long gp; /* global pointer for this load-module */ + unsigned long segment_base; /* base for offsets in the unwind table entries */ + unsigned long start; + unsigned long end; + const struct unw_table_entry *array; + unsigned long length; +}; + +enum unw_where { + UNW_WHERE_NONE, /* register isn't saved at all */ + UNW_WHERE_GR, /* register is saved in a general register */ + UNW_WHERE_FR, /* register is saved in a floating-point register */ + UNW_WHERE_BR, /* register is saved in a branch register */ + UNW_WHERE_SPREL, /* register is saved on memstack (sp-relative) */ + UNW_WHERE_PSPREL, /* register is saved on memstack (psp-relative) */ + /* + * At the end of each prologue these locations get resolved to + * UNW_WHERE_PSPREL and UNW_WHERE_GR, respectively: + */ + UNW_WHERE_SPILL_HOME, /* register is saved in its spill home */ + UNW_WHERE_GR_SAVE /* register is saved in next general register */ +}; + +#define UNW_WHEN_NEVER 0x7fffffff + +struct unw_reg_info { + unsigned long val; /* save location: register number or offset */ + enum unw_where where; /* where the register gets saved */ + int when; /* when the register gets saved */ +}; + +struct unw_reg_state { + struct unw_reg_state *next; /* next (outer) element on state stack */ + struct unw_reg_info reg[UNW_NUM_REGS]; /* register save locations */ +}; + +struct unw_labeled_state { + struct unw_labeled_state *next; /* next labeled state (or NULL) */ + unsigned long label; /* label for this state */ + struct unw_reg_state saved_state; +}; + +struct unw_state_record { + unsigned int first_region : 1; /* is this the first region? */ + unsigned int done : 1; /* are we done scanning descriptors? */ + unsigned int any_spills : 1; /* got any register spills? */ + unsigned int in_body : 1; /* are we inside a body (as opposed to a prologue)? 
*/ + unsigned long flags; /* see UNW_FLAG_* in unwind.h */ + + u8 *imask; /* imask of spill_mask record or NULL */ + unsigned long pr_val; /* predicate values */ + unsigned long pr_mask; /* predicate mask */ + long spill_offset; /* psp-relative offset for spill base */ + int region_start; + int region_len; + int epilogue_start; + int epilogue_count; + int when_target; + + u8 gr_save_loc; /* next general register to use for saving a register */ + u8 return_link_reg; /* branch register in which the return link is passed */ + + struct unw_labeled_state *labeled_states; /* list of all labeled states */ + struct unw_reg_state curr; /* current state */ +}; + +enum unw_nat_type { + UNW_NAT_NONE, /* NaT not represented */ + UNW_NAT_VAL, /* NaT represented by NaT value (fp reg) */ + UNW_NAT_MEMSTK, /* NaT value is in unat word at offset OFF */ + UNW_NAT_REGSTK /* NaT is in rnat */ +}; + +enum unw_insn_opcode { + UNW_INSN_ADD, /* s[dst] += val */ + UNW_INSN_ADD_PSP, /* s[dst] = (s.psp + val) */ + UNW_INSN_ADD_SP, /* s[dst] = (s.sp + val) */ + UNW_INSN_MOVE, /* s[dst] = s[val] */ + UNW_INSN_MOVE2, /* s[dst] = s[val]; s[dst+1] = s[val+1] */ + UNW_INSN_MOVE_STACKED, /* s[dst] = ia64_rse_skip(*s.bsp, val) */ + UNW_INSN_SETNAT_MEMSTK, /* s[dst+1].nat.type = MEMSTK; + s[dst+1].nat.off = *s.pri_unat - s[dst] */ + UNW_INSN_SETNAT_TYPE, /* s[dst+1].nat.type = val */ + UNW_INSN_LOAD, /* s[dst] = *s[val] */ + UNW_INSN_MOVE_SCRATCH, /* s[dst] = scratch reg "val" */ + UNW_INSN_MOVE_CONST, /* s[dst] = constant reg "val" */ +}; + +struct unw_insn { + unsigned int opc : 4; + unsigned int dst : 9; + signed int val : 19; +}; + +/* + * Preserved general static registers (r4-r7) give rise to two script + * instructions; everything else yields at most one instruction; at + * the end of the script, the psp gets popped, accounting for one more + * instruction. + */ +#define UNW_MAX_SCRIPT_LEN (UNW_NUM_REGS + 5) + +struct unw_script { + unsigned long ip; /* ip this script is for */ + unsigned long pr_mask; /* mask of predicates script depends on */ + unsigned long pr_val; /* predicate values this script is for */ + rwlock_t lock; + unsigned int flags; /* see UNW_FLAG_* in unwind.h */ + unsigned short lru_chain; /* used for least-recently-used chain */ + unsigned short coll_chain; /* used for hash collisions */ + unsigned short hint; /* hint for next script to try (or -1) */ + unsigned short count; /* number of instructions in script */ + struct unw_insn insn[UNW_MAX_SCRIPT_LEN]; +}; diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/x86_32/xen.lds.S --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/x86_32/xen.lds.S Mon Jan 9 11:22:17 2006 @@ -0,0 +1,85 @@ +/* ld script to make i386 Linux kernel + * Written by Martin Mares <mj@xxxxxxxxxxxxxxxxxxxxxxxx> + * Modified for i386 Xen by Keir Fraser + */ + +#include <xen/config.h> +#include <asm/page.h> +#undef ENTRY +#undef ALIGN + +OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_ARCH(i386) +ENTRY(start) +PHDRS +{ + text PT_LOAD ; +} +SECTIONS +{ + . = 0xFF000000 + 0x100000; + _text = .; /* Text and read-only data */ + .text : { + *(.text) + *(.fixup) + *(.gnu.warning) + } :text =0x9090 + .text.lock : { *(.text.lock) } :text /* out-of-line lock text */ + + _etext = .; /* End of text section */ + + .rodata : { *(.rodata) *(.rodata.*) } :text + + . = ALIGN(32); /* Exception table */ + __start___ex_table = .; + __ex_table : { *(__ex_table) } :text + __stop___ex_table = .; + + . 
= ALIGN(32); /* Pre-exception table */ + __start___pre_ex_table = .; + __pre_ex_table : { *(__pre_ex_table) } :text + __stop___pre_ex_table = .; + + .data : { /* Data */ + *(.data) + CONSTRUCTORS + } :text + + . = ALIGN(4096); /* Init code and data */ + __init_begin = .; + .text.init : { *(.text.init) } :text + .data.init : { *(.data.init) } :text + . = ALIGN(32); + __setup_start = .; + .setup.init : { *(.setup.init) } :text + __setup_end = .; + __initcall_start = .; + .initcall.init : { *(.initcall.init) } :text + __initcall_end = .; + . = ALIGN(STACK_SIZE); + __init_end = .; + + __bss_start = .; /* BSS */ + .bss : { + *(.bss.stack_aligned) + *(.bss.page_aligned) + *(.bss) + } :text + _end = . ; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.text.exit) + *(.data.exit) + *(.exitcall.exit) + } + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } +} diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/x86_64/xen.lds.S --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/arch/x86/x86_64/xen.lds.S Mon Jan 9 11:22:17 2006 @@ -0,0 +1,83 @@ +/* Excerpts written by Martin Mares <mj@xxxxxxxxxxxxxxxxxxxxxxxx> */ +/* Modified for x86-64 Xen by Keir Fraser */ + +#include <xen/config.h> +#include <asm/page.h> +#undef ENTRY +#undef ALIGN + +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(start) +PHDRS +{ + text PT_LOAD ; +} +SECTIONS +{ + . = 0xFFFF830000100000; + _text = .; /* Text and read-only data */ + .text : { + *(.text) + *(.fixup) + *(.gnu.warning) + } :text = 0x9090 + .text.lock : { *(.text.lock) } :text /* out-of-line lock text */ + + _etext = .; /* End of text section */ + + .rodata : { *(.rodata) *(.rodata.*) } :text + + . = ALIGN(32); /* Exception table */ + __start___ex_table = .; + __ex_table : { *(__ex_table) } :text + __stop___ex_table = .; + + . = ALIGN(32); /* Pre-exception table */ + __start___pre_ex_table = .; + __pre_ex_table : { *(__pre_ex_table) } :text + __stop___pre_ex_table = .; + + .data : { /* Data */ + *(.data) + CONSTRUCTORS + } :text + + . = ALIGN(4096); /* Init code and data */ + __init_begin = .; + .text.init : { *(.text.init) } :text + .data.init : { *(.data.init) } :text + . = ALIGN(32); + __setup_start = .; + .setup.init : { *(.setup.init) } :text + __setup_end = .; + __initcall_start = .; + .initcall.init : { *(.initcall.init) } :text + __initcall_end = .; + . = ALIGN(STACK_SIZE); + __init_end = .; + + __bss_start = .; /* BSS */ + .bss : { + *(.bss.stack_aligned) + *(.bss.page_aligned) + *(.bss) + } :text + _end = . ; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.text.exit) + *(.data.exit) + *(.exitcall.exit) + } + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } +} diff -r 25e3c8668f1f -r 8af1199488d3 xen/common/rangeset.c --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/common/rangeset.c Mon Jan 9 11:22:17 2006 @@ -0,0 +1,399 @@ +/****************************************************************************** + * rangeset.c + * + * Creation, maintenance and automatic destruction of per-domain sets of + * numeric ranges. 
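Before the implementation, a hypothetical self-test sketching the intended semantics of the interface this file provides: ranges are inclusive [s,e], adjacent or overlapping insertions coalesce, and removal can split an existing range. The function below is illustrative only and is not part of the patch.

    static void rangeset_selftest(struct domain *d)
    {
        struct rangeset *r = rangeset_new(d, "selftest", 0);

        BUG_ON(r == NULL);
        BUG_ON(rangeset_add_range(r, 10, 19));
        BUG_ON(rangeset_add_range(r, 20, 29));        /* adjacent: coalesces */
        BUG_ON(!rangeset_contains_range(r, 10, 29));
        BUG_ON(rangeset_remove_range(r, 15, 24));     /* splits [10,29] */
        BUG_ON(rangeset_contains_singleton(r, 20));   /* now in the hole */
        BUG_ON(!rangeset_contains_singleton(r, 25));
        rangeset_destroy(r);
    }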
+ * + * Copyright (c) 2005, K A Fraser + */ + +#include <xen/sched.h> +#include <xen/rangeset.h> + +/* An inclusive range [s,e] and pointer to next range in ascending order. */ +struct range { + struct list_head list; + unsigned long s, e; +}; + +struct rangeset { + /* Owning domain and threaded list of rangesets. */ + struct list_head rangeset_list; + struct domain *domain; + + /* Ordered list of ranges contained in this set, and protecting lock. */ + struct list_head range_list; + spinlock_t lock; + + /* Pretty-printing name. */ + char name[32]; + + /* RANGESETF flags. */ + unsigned int flags; +}; + +/***************************** + * Private range functions hide the underlying linked-list implemnetation. + */ + +/* Find highest range lower than or containing s. NULL if no such range. */ +static struct range *find_range( + struct rangeset *r, unsigned long s) +{ + struct range *x = NULL, *y; + + list_for_each_entry ( y, &r->range_list, list ) + { + if ( y->s > s ) + break; + x = y; + } + + return x; +} + +/* Return the lowest range in the set r, or NULL if r is empty. */ +static struct range *first_range( + struct rangeset *r) +{ + if ( list_empty(&r->range_list) ) + return NULL; + return list_entry(r->range_list.next, struct range, list); +} + +/* Return range following x in ascending order, or NULL if x is the highest. */ +static struct range *next_range( + struct rangeset *r, struct range *x) +{ + if ( x->list.next == &r->range_list ) + return NULL; + return list_entry(x->list.next, struct range, list); +} + +/* Insert range y after range x in r. Insert as first range if x is NULL. */ +static void insert_range( + struct rangeset *r, struct range *x, struct range *y) +{ + list_add(&y->list, (x != NULL) ? &x->list : &r->range_list); +} + +/* Remove a range from its list and free it. */ +static void destroy_range( + struct range *x) +{ + list_del(&x->list); + xfree(x); +} + +/***************************** + * Core public functions + */ + +int rangeset_add_range( + struct rangeset *r, unsigned long s, unsigned long e) +{ + struct range *x, *y; + int rc = 0; + + spin_lock(&r->lock); + + x = find_range(r, s); + y = find_range(r, e); + + if ( x == y ) + { + if ( (x == NULL) || ((x->e < s) && ((x->e + 1) != s)) ) + { + x = xmalloc(struct range); + if ( x == NULL ) + { + rc = -ENOMEM; + goto out; + } + + x->s = s; + x->e = e; + + insert_range(r, y, x); + } + else if ( x->e < e ) + x->e = e; + } + else + { + if ( x == NULL ) + { + x = first_range(r); + x->s = s; + } + else if ( (x->e < s) && ((x->e + 1) != s) ) + { + x = next_range(r, x); + x->s = s; + } + + x->e = (y->e > e) ? 
y->e : e; + + for ( ; ; ) + { + y = next_range(r, x); + if ( (y == NULL) || (y->e > x->e) ) + break; + destroy_range(y); + } + } + + y = next_range(r, x); + if ( (y != NULL) && ((x->e + 1) == y->s) ) + { + x->e = y->e; + destroy_range(y); + } + + out: + spin_unlock(&r->lock); + return rc; +} + +int rangeset_remove_range( + struct rangeset *r, unsigned long s, unsigned long e) +{ + struct range *x, *y, *t; + int rc = 0; + + spin_lock(&r->lock); + + x = find_range(r, s); + y = find_range(r, e); + + if ( x == y ) + { + if ( (x == NULL) || (x->e < s) ) + goto out; + + if ( (x->s < s) && (x->e > e) ) + { + y = xmalloc(struct range); + if ( y == NULL ) + { + rc = -ENOMEM; + goto out; + } + + y->s = e + 1; + y->e = x->e; + x->e = s - 1; + + insert_range(r, x, y); + } + else if ( (x->s == s) && (x->e <= e) ) + destroy_range(x); + else if ( x->s == s ) + x->s = e + 1; + else if ( x->e <= e ) + x->e = s - 1; + } + else + { + if ( x == NULL ) + x = first_range(r); + + if ( x->s < s ) + { + x->e = s - 1; + x = next_range(r, x); + } + + while ( x != y ) + { + t = x; + x = next_range(r, x); + destroy_range(t); + } + + x->s = e + 1; + if ( x->s > x->e ) + destroy_range(x); + } + + out: + spin_unlock(&r->lock); + return rc; +} + +int rangeset_contains_range( + struct rangeset *r, unsigned long s, unsigned long e) +{ + struct range *x; + int contains; + + spin_lock(&r->lock); + x = find_range(r, s); + contains = (x && (x->e >= e)); + spin_unlock(&r->lock); + + return contains; +} + +int rangeset_add_singleton( + struct rangeset *r, unsigned long s) +{ + return rangeset_add_range(r, s, s); +} + +int rangeset_remove_singleton( + struct rangeset *r, unsigned long s) +{ + return rangeset_remove_range(r, s, s); +} + +int rangeset_contains_singleton( + struct rangeset *r, unsigned long s) +{ + return rangeset_contains_range(r, s, s); +} + +int rangeset_is_empty( + struct rangeset *r) +{ + return list_empty(&r->range_list); +} + +struct rangeset *rangeset_new( + struct domain *d, char *name, unsigned int flags) +{ + struct rangeset *r; + + r = xmalloc(struct rangeset); + if ( r == NULL ) + return NULL; + + spin_lock_init(&r->lock); + INIT_LIST_HEAD(&r->range_list); + + BUG_ON(flags & ~RANGESETF_prettyprint_hex); + r->flags = flags; + + if ( name != NULL ) + { + strncpy(r->name, name, sizeof(r->name)); + r->name[sizeof(r->name)-1] = '\0'; + } + else + { + sprintf(r->name, "(no name)"); + } + + if ( (r->domain = d) != NULL ) + { + spin_lock(&d->rangesets_lock); + list_add(&r->rangeset_list, &d->rangesets); + spin_unlock(&d->rangesets_lock); + } + + return r; +} + +void rangeset_destroy( + struct rangeset *r) +{ + struct range *x; + + if ( r == NULL ) + return; + + if ( r->domain != NULL ) + { + spin_lock(&r->domain->rangesets_lock); + list_del(&r->rangeset_list); + spin_unlock(&r->domain->rangesets_lock); + } + + while ( (x = first_range(r)) != NULL ) + destroy_range(x); + + xfree(r); +} + +void rangeset_domain_initialise( + struct domain *d) +{ + INIT_LIST_HEAD(&d->rangesets); + spin_lock_init(&d->rangesets_lock); +} + +void rangeset_domain_destroy( + struct domain *d) +{ + struct rangeset *r; + + while ( !list_empty(&d->rangesets) ) + { + r = list_entry(d->rangesets.next, struct rangeset, rangeset_list); + + BUG_ON(r->domain != d); + r->domain = NULL; + list_del(&r->rangeset_list); + + rangeset_destroy(r); + } +} + +/***************************** + * Pretty-printing functions + */ + +static void print_limit(struct rangeset *r, unsigned long s) +{ + printk((r->flags & RANGESETF_prettyprint_hex) ? 
"%lx" : "%lu", s); +} + +void rangeset_printk( + struct rangeset *r) +{ + int nr_printed = 0; + struct range *x; + + spin_lock(&r->lock); + + printk("%-10s {", r->name); + + for ( x = first_range(r); x != NULL; x = next_range(r, x) ) + { + if ( nr_printed++ ) + printk(","); + printk(" "); + print_limit(r, x->s); + if ( x->s != x->e ) + { + printk("-"); + print_limit(r, x->e); + } + } + + printk(" }"); + + spin_unlock(&r->lock); +} + +void rangeset_domain_printk( + struct domain *d) +{ + struct rangeset *r; + + printk("Rangesets belonging to domain %u:\n", d->domain_id); + + spin_lock(&d->rangesets_lock); + + if ( list_empty(&d->rangesets) ) + printk(" None\n"); + + list_for_each_entry ( r, &d->rangesets, rangeset_list ) + { + printk(" "); + rangeset_printk(r); + printk("\n"); + } + + spin_unlock(&d->rangesets_lock); +} diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-ia64/iocap.h --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-ia64/iocap.h Mon Jan 9 11:22:17 2006 @@ -0,0 +1,10 @@ +/****************************************************************************** + * iocap.h + * + * Architecture-specific per-domain I/O capabilities. + */ + +#ifndef __IA64_IOCAP_H__ +#define __IA64_IOCAP_H__ + +#endif /* __IA64_IOCAP_H__ */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/iocap.h --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/include/asm-x86/iocap.h Mon Jan 9 11:22:17 2006 @@ -0,0 +1,20 @@ +/****************************************************************************** + * iocap.h + * + * Architecture-specific per-domain I/O capabilities. + */ + +#ifndef __X86_IOCAP_H__ +#define __X86_IOCAP_H__ + +#define ioports_permit_access(d, s, e) \ + rangeset_add_range((d)->arch.ioport_caps, s, e) +#define ioports_deny_access(d, s, e) \ + rangeset_remove_range((d)->arch.ioport_caps, s, e) +#define ioports_access_permitted(d, s, e) \ + rangeset_contains_range((d)->arch.ioport_caps, s, e) + +#define cache_flush_permitted(d) \ + (!rangeset_is_empty((d)->iomem_caps)) + +#endif /* __X86_IOCAP_H__ */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/public/hvm/hvm_info_table.h --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/include/public/hvm/hvm_info_table.h Mon Jan 9 11:22:17 2006 @@ -0,0 +1,24 @@ +/****************************************************************************** + * hvm/hvm_info_table.h + * + * HVM parameter and information table, written into guest memory map. + */ + +#ifndef __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ +#define __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ + +#define HVM_INFO_PFN 0x09F +#define HVM_INFO_OFFSET 0x800 +#define HVM_INFO_PADDR ((HVM_INFO_PFN << 12) + HVM_INFO_OFFSET) + +struct hvm_info_table { + char signature[8]; /* "HVM INFO" */ + uint32_t length; + uint8_t checksum; + uint8_t acpi_enabled; + uint8_t apic_enabled; + uint8_t pad[1]; + uint32_t nr_vcpus; +}; + +#endif /* __XEN_PUBLIC_HVM_HVM_INFO_TABLE_H__ */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/public/hvm/ioreq.h --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/include/public/hvm/ioreq.h Mon Jan 9 11:22:17 2006 @@ -0,0 +1,90 @@ +/* + * ioreq.h: I/O request definitions for device models + * Copyright (c) 2004, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#ifndef _IOREQ_H_ +#define _IOREQ_H_ + +#define IOREQ_READ 1 +#define IOREQ_WRITE 0 + +#define STATE_INVALID 0 +#define STATE_IOREQ_READY 1 +#define STATE_IOREQ_INPROCESS 2 +#define STATE_IORESP_READY 3 +#define STATE_IORESP_HOOK 4 + +#define IOREQ_TYPE_PIO 0 /* pio */ +#define IOREQ_TYPE_COPY 1 /* mmio ops */ +#define IOREQ_TYPE_AND 2 +#define IOREQ_TYPE_OR 3 +#define IOREQ_TYPE_XOR 4 + +/* + * VMExit dispatcher should cooperate with instruction decoder to + * prepare this structure and notify service OS and DM by sending + * virq + */ +typedef struct { + uint64_t addr; /* physical address */ + uint64_t size; /* size in bytes */ + uint64_t count; /* for rep prefixes */ + union { + uint64_t data; /* data */ + void *pdata; /* pointer to data */ + } u; + uint8_t state:4; + uint8_t pdata_valid:1; /* if 1, use pdata above */ + uint8_t dir:1; /* 1=read, 0=write */ + uint8_t df:1; + uint8_t type; /* I/O type */ +} ioreq_t; + +#define MAX_VECTOR 256 +#define BITS_PER_BYTE 8 +#define INTR_LEN (MAX_VECTOR/(BITS_PER_BYTE * sizeof(uint64_t))) +#define INTR_LEN_32 (MAX_VECTOR/(BITS_PER_BYTE * sizeof(uint32_t))) + +typedef struct { + uint16_t pic_elcr; + uint16_t pic_irr; + uint16_t pic_last_irr; + uint16_t pic_clear_irr; + int eport; /* Event channel port */ +} global_iodata_t; + +typedef struct { + ioreq_t vp_ioreq; +} vcpu_iodata_t; + +typedef struct { + global_iodata_t sp_global; + vcpu_iodata_t vcpu_iodata[1]; +} shared_iopage_t; + +#endif /* _IOREQ_H_ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/public/hvm/vmx_assist.h --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/include/public/hvm/vmx_assist.h Mon Jan 9 11:22:17 2006 @@ -0,0 +1,97 @@ +/* + * vmx_assist.h: Context definitions for the VMXASSIST world switch. + * + * Leendert van Doorn, leendert@xxxxxxxxxxxxxx + * Copyright (c) 2005, International Business Machines Corporation. 
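To make the ioreq_t flow above concrete: a device model notices a request in STATE_IOREQ_READY (typically kicked via the event channel port in global_iodata_t), emulates it, and flips the state to STATE_IORESP_READY for the requester to collect. A minimal sketch against the definitions above; the port-read behaviour is a placeholder, not actual device-model code.

    static void service_ioreq(shared_iopage_t *sp, int vcpu)
    {
        ioreq_t *req = &sp->vcpu_iodata[vcpu].vp_ioreq;

        if (req->state != STATE_IOREQ_READY)
            return;
        req->state = STATE_IOREQ_INPROCESS;

        if (req->type == IOREQ_TYPE_PIO && req->dir == IOREQ_READ)
            req->u.data = 0xff;            /* unclaimed port: float the bus */

        req->state = STATE_IORESP_READY;   /* requester collects the result */
    }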
+ */ + +#ifndef _VMX_ASSIST_H_ +#define _VMX_ASSIST_H_ + +#define VMXASSIST_BASE 0xD0000 +#define VMXASSIST_MAGIC 0x17101966 +#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8) + +#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12) +#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4) + +#ifndef __ASSEMBLY__ + +union vmcs_arbytes { + struct arbyte_fields { + unsigned int seg_type : 4, + s : 1, + dpl : 2, + p : 1, + reserved0 : 4, + avl : 1, + reserved1 : 1, + default_ops_size: 1, + g : 1, + null_bit : 1, + reserved2 : 15; + } fields; + unsigned int bytes; +}; + +/* + * World switch state + */ +typedef struct vmx_assist_context { + uint32_t eip; /* execution pointer */ + uint32_t esp; /* stack pointer */ + uint32_t eflags; /* flags register */ + uint32_t cr0; + uint32_t cr3; /* page table directory */ + uint32_t cr4; + uint32_t idtr_limit; /* idt */ + uint32_t idtr_base; + uint32_t gdtr_limit; /* gdt */ + uint32_t gdtr_base; + uint32_t cs_sel; /* cs selector */ + uint32_t cs_limit; + uint32_t cs_base; + union vmcs_arbytes cs_arbytes; + uint32_t ds_sel; /* ds selector */ + uint32_t ds_limit; + uint32_t ds_base; + union vmcs_arbytes ds_arbytes; + uint32_t es_sel; /* es selector */ + uint32_t es_limit; + uint32_t es_base; + union vmcs_arbytes es_arbytes; + uint32_t ss_sel; /* ss selector */ + uint32_t ss_limit; + uint32_t ss_base; + union vmcs_arbytes ss_arbytes; + uint32_t fs_sel; /* fs selector */ + uint32_t fs_limit; + uint32_t fs_base; + union vmcs_arbytes fs_arbytes; + uint32_t gs_sel; /* gs selector */ + uint32_t gs_limit; + uint32_t gs_base; + union vmcs_arbytes gs_arbytes; + uint32_t tr_sel; /* task selector */ + uint32_t tr_limit; + uint32_t tr_base; + union vmcs_arbytes tr_arbytes; + uint32_t ldtr_sel; /* ldtr selector */ + uint32_t ldtr_limit; + uint32_t ldtr_base; + union vmcs_arbytes ldtr_arbytes; +} vmx_assist_context_t; + +#endif /* __ASSEMBLY__ */ + +#endif /* _VMX_ASSIST_H_ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/iocap.h --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/iocap.h Mon Jan 9 11:22:17 2006 @@ -0,0 +1,34 @@ +/****************************************************************************** + * iocap.h + * + * Per-domain I/O capabilities. 
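As a usage note for the vmcs_arbytes union above: the bitfields mirror the x86 segment-descriptor access-rights encoding, so a flat 32-bit ring-0 code segment would be built roughly as follows (the field values are standard x86 encodings, not taken from this patch):

    static unsigned int flat_cs_arbytes(void)
    {
        union vmcs_arbytes ar = { .bytes = 0 };

        ar.fields.seg_type = 0xb;           /* execute/read, accessed */
        ar.fields.s = 1;                    /* code/data segment */
        ar.fields.dpl = 0;                  /* ring 0 */
        ar.fields.p = 1;                    /* present */
        ar.fields.default_ops_size = 1;     /* 32-bit operands */
        ar.fields.g = 1;                    /* 4KB granularity */
        return ar.bytes;
    }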
+ */ + +#ifndef __XEN_IOCAP_H__ +#define __XEN_IOCAP_H__ + +#include <xen/rangeset.h> +#include <asm/iocap.h> + +#define iomem_permit_access(d, s, e) \ + rangeset_add_range((d)->iomem_caps, s, e) +#define iomem_deny_access(d, s, e) \ + rangeset_remove_range((d)->iomem_caps, s, e) +#define iomem_access_permitted(d, s, e) \ + rangeset_contains_range((d)->iomem_caps, s, e) + +#define irq_permit_access(d, i) \ + rangeset_add_singleton((d)->irq_caps, i) +#define irq_deny_access(d, i) \ + rangeset_remove_singleton((d)->irq_caps, i) +#define irqs_permit_access(d, s, e) \ + rangeset_add_range((d)->irq_caps, s, e) +#define irqs_deny_access(d, s, e) \ + rangeset_remove_range((d)->irq_caps, s, e) +#define irq_access_permitted(d, i) \ + rangeset_contains_singleton((d)->irq_caps, i) + +#define multipage_allocation_permitted(d) \ + (!rangeset_is_empty((d)->iomem_caps)) + +#endif /* __XEN_IOCAP_H__ */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/xen/rangeset.h --- /dev/null Mon Jan 9 11:19:55 2006 +++ b/xen/include/xen/rangeset.h Mon Jan 9 11:22:17 2006 @@ -0,0 +1,71 @@ +/****************************************************************************** + * rangeset.h + * + * Creation, maintenance and automatic destruction of per-domain sets of + * numeric ranges. + * + * Copyright (c) 2005, K A Fraser + */ + +#ifndef __XEN_RANGESET_H__ +#define __XEN_RANGESET_H__ + +struct domain; +struct rangeset; + +/* + * Initialise/destroy per-domain rangeset information. + * + * It is invalid to create or destroy a rangeset belonging to a domain @d + * before rangeset_domain_initialise(d) returns or after calling + * rangeset_domain_destroy(d). + */ +void rangeset_domain_initialise( + struct domain *d); +void rangeset_domain_destroy( + struct domain *d); + +/* + * Create/destroy a rangeset. Optionally attach to specified domain @d for + * auto-destruction when the domain dies. A name may be specified, for use + * in debug pretty-printing, and various RANGESETF flags (defined below). + * + * It is invalid to perform any operation on a rangeset @r after calling + * rangeset_destroy(r). + */ +struct rangeset *rangeset_new( + struct domain *d, char *name, unsigned int flags); +void rangeset_destroy( + struct rangeset *r); + +/* Flags for passing to rangeset_new(). */ + /* Pretty-print range limits in hexadecimal. */ +#define _RANGESETF_prettyprint_hex 0 +#define RANGESETF_prettyprint_hex (1U << _RANGESETF_prettyprint_hex) + +int __must_check rangeset_is_empty( + struct rangeset *r); + +/* Add/remove/query a numeric range. */ +int __must_check rangeset_add_range( + struct rangeset *r, unsigned long s, unsigned long e); +int __must_check rangeset_remove_range( + struct rangeset *r, unsigned long s, unsigned long e); +int __must_check rangeset_contains_range( + struct rangeset *r, unsigned long s, unsigned long e); + +/* Add/remove/query a single number. */ +int __must_check rangeset_add_singleton( + struct rangeset *r, unsigned long s); +int __must_check rangeset_remove_singleton( + struct rangeset *r, unsigned long s); +int __must_check rangeset_contains_singleton( + struct rangeset *r, unsigned long s); + +/* Rangeset pretty printing. 
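A short usage sketch tying the iocap wrappers to the rangeset API declared here: grant a domain the legacy VGA I/O ports and verify one of them. It assumes the domain's ioport_caps rangeset has already been created with rangeset_new() during domain construction, which happens outside this patch.

    #include <xen/iocap.h>

    static int grant_vga_ports(struct domain *d)
    {
        int rc = ioports_permit_access(d, 0x3c0, 0x3df);  /* inclusive range */

        if (rc != 0)
            return rc;                                    /* e.g. -ENOMEM */
        BUG_ON(!ioports_access_permitted(d, 0x3d4, 0x3d4));
        return 0;
    }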
*/ +void rangeset_printk( + struct rangeset *r); +void rangeset_domain_printk( + struct domain *d); + +#endif /* __XEN_RANGESET_H__ */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/x86_32/xen.lds --- a/xen/arch/x86/x86_32/xen.lds Mon Jan 9 11:19:55 2006 +++ /dev/null Mon Jan 9 11:22:17 2006 @@ -1,79 +0,0 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares <mj@xxxxxxxxxxxxxxxxxxxxxxxx> - * Modified for i386 Xen by Keir Fraser - */ -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(start) -PHDRS -{ - text PT_LOAD ; -} -SECTIONS -{ - . = 0xFF000000 + 0x100000; - _text = .; /* Text and read-only data */ - .text : { - *(.text) - *(.fixup) - *(.gnu.warning) - } :text =0x9090 - .text.lock : { *(.text.lock) } :text /* out-of-line lock text */ - - _etext = .; /* End of text section */ - - .rodata : { *(.rodata) *(.rodata.*) } :text - - . = ALIGN(32); /* Exception table */ - __start___ex_table = .; - __ex_table : { *(__ex_table) } :text - __stop___ex_table = .; - - . = ALIGN(32); /* Pre-exception table */ - __start___pre_ex_table = .; - __pre_ex_table : { *(__pre_ex_table) } :text - __stop___pre_ex_table = .; - - .data : { /* Data */ - *(.data) - CONSTRUCTORS - } :text - - . = ALIGN(4096); /* Init code and data */ - __init_begin = .; - .text.init : { *(.text.init) } :text - .data.init : { *(.data.init) } :text - . = ALIGN(32); - __setup_start = .; - .setup.init : { *(.setup.init) } :text - __setup_end = .; - __initcall_start = .; - .initcall.init : { *(.initcall.init) } :text - __initcall_end = .; - . = ALIGN(8192); - __init_end = .; - - __bss_start = .; /* BSS */ - .bss : { - *(.bss.twopage_aligned) - *(.bss.page_aligned) - *(.bss) - } :text - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.text.exit) - *(.data.exit) - *(.exitcall.exit) - } - - /* Stabs debugging sections. */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - .comment 0 : { *(.comment) } -} diff -r 25e3c8668f1f -r 8af1199488d3 xen/arch/x86/x86_64/xen.lds --- a/xen/arch/x86/x86_64/xen.lds Mon Jan 9 11:19:55 2006 +++ /dev/null Mon Jan 9 11:22:17 2006 @@ -1,77 +0,0 @@ -/* Excerpts written by Martin Mares <mj@xxxxxxxxxxxxxxxxxxxxxxxx> */ -/* Modified for x86-64 Xen by Keir Fraser */ -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(start) -PHDRS -{ - text PT_LOAD ; -} -SECTIONS -{ - . = 0xFFFF830000100000; - _text = .; /* Text and read-only data */ - .text : { - *(.text) - *(.fixup) - *(.gnu.warning) - } :text = 0x9090 - .text.lock : { *(.text.lock) } :text /* out-of-line lock text */ - - _etext = .; /* End of text section */ - - .rodata : { *(.rodata) *(.rodata.*) } :text - - . = ALIGN(32); /* Exception table */ - __start___ex_table = .; - __ex_table : { *(__ex_table) } :text - __stop___ex_table = .; - - . = ALIGN(32); /* Pre-exception table */ - __start___pre_ex_table = .; - __pre_ex_table : { *(__pre_ex_table) } :text - __stop___pre_ex_table = .; - - .data : { /* Data */ - *(.data) - CONSTRUCTORS - } :text - - . = ALIGN(4096); /* Init code and data */ - __init_begin = .; - .text.init : { *(.text.init) } :text - .data.init : { *(.data.init) } :text - . = ALIGN(32); - __setup_start = .; - .setup.init : { *(.setup.init) } :text - __setup_end = .; - __initcall_start = .; - .initcall.init : { *(.initcall.init) } :text - __initcall_end = .; - . 
= ALIGN(8192); - __init_end = .; - - __bss_start = .; /* BSS */ - .bss : { - *(.bss.twopage_aligned) - *(.bss.page_aligned) - *(.bss) - } :text - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.text.exit) - *(.data.exit) - *(.exitcall.exit) - } - - /* Stabs debugging sections. */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - .comment 0 : { *(.comment) } -} diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/asm-x86/physdev.h --- a/xen/include/asm-x86/physdev.h Mon Jan 9 11:19:55 2006 +++ /dev/null Mon Jan 9 11:22:17 2006 @@ -1,17 +0,0 @@ -/****************************************************************************** - * physdev.h - */ - -#ifndef __XEN_PHYSDEV_H__ -#define __XEN_PHYSDEV_H__ - -#include <public/physdev.h> - -void physdev_modify_ioport_access_range( - struct domain *d, int enable, int port, int num ); -void physdev_destroy_state(struct domain *d); -int domain_iomem_in_pfn(struct domain *p, unsigned long pfn); -long do_physdev_op(physdev_op_t *uop); -void physdev_init_dom0(struct domain *d); - -#endif /* __XEN_PHYSDEV_H__ */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/public/io/ioreq.h --- a/xen/include/public/io/ioreq.h Mon Jan 9 11:19:55 2006 +++ /dev/null Mon Jan 9 11:22:17 2006 @@ -1,91 +0,0 @@ -/* - * ioreq.h: I/O request definitions for device models - * Copyright (c) 2004, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. 
- * - */ - -#ifndef _IOREQ_H_ -#define _IOREQ_H_ - -#define IOREQ_READ 1 -#define IOREQ_WRITE 0 - -#define STATE_INVALID 0 -#define STATE_IOREQ_READY 1 -#define STATE_IOREQ_INPROCESS 2 -#define STATE_IORESP_READY 3 -#define STATE_IORESP_HOOK 4 - -#define IOREQ_TYPE_PIO 0 /* pio */ -#define IOREQ_TYPE_COPY 1 /* mmio ops */ -#define IOREQ_TYPE_AND 2 -#define IOREQ_TYPE_OR 3 -#define IOREQ_TYPE_XOR 4 - -/* - * VMExit dispatcher should cooperate with instruction decoder to - * prepare this structure and notify service OS and DM by sending - * virq - */ -typedef struct { - uint64_t addr; /* physical address */ - uint64_t size; /* size in bytes */ - uint64_t count; /* for rep prefixes */ - union { - uint64_t data; /* data */ - void *pdata; /* pointer to data */ - } u; - uint8_t state:4; - uint8_t pdata_valid:1; /* if 1, use pdata above */ - uint8_t dir:1; /* 1=read, 0=write */ - uint8_t df:1; - uint8_t type; /* I/O type */ -} ioreq_t; - -#define MAX_VECTOR 256 -#define BITS_PER_BYTE 8 -#define INTR_LEN (MAX_VECTOR/(BITS_PER_BYTE * sizeof(uint64_t))) -#define INTR_LEN_32 (MAX_VECTOR/(BITS_PER_BYTE * sizeof(uint32_t))) - -typedef struct { - uint16_t pic_elcr; - uint16_t pic_irr; - uint16_t pic_last_irr; - uint16_t pic_clear_irr; - int eport; /* Event channel port */ -} global_iodata_t; - -typedef struct { - ioreq_t vp_ioreq; - unsigned long vp_intr[INTR_LEN]; -} vcpu_iodata_t; - -typedef struct { - global_iodata_t sp_global; - vcpu_iodata_t vcpu_iodata[1]; -} shared_iopage_t; - -#endif /* _IOREQ_H_ */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff -r 25e3c8668f1f -r 8af1199488d3 xen/include/public/vmx_assist.h --- a/xen/include/public/vmx_assist.h Mon Jan 9 11:19:55 2006 +++ /dev/null Mon Jan 9 11:22:17 2006 @@ -1,97 +0,0 @@ -/* - * vmx_assist.h: Context definitions for the VMXASSIST world switch. - * - * Leendert van Doorn, leendert@xxxxxxxxxxxxxx - * Copyright (c) 2005, International Business Machines Corporation. 
- */ - -#ifndef _VMX_ASSIST_H_ -#define _VMX_ASSIST_H_ - -#define VMXASSIST_BASE 0xD0000 -#define VMXASSIST_MAGIC 0x17101966 -#define VMXASSIST_MAGIC_OFFSET (VMXASSIST_BASE+8) - -#define VMXASSIST_NEW_CONTEXT (VMXASSIST_BASE + 12) -#define VMXASSIST_OLD_CONTEXT (VMXASSIST_NEW_CONTEXT + 4) - -#ifndef __ASSEMBLY__ - -union vmcs_arbytes { - struct arbyte_fields { - unsigned int seg_type : 4, - s : 1, - dpl : 2, - p : 1, - reserved0 : 4, - avl : 1, - reserved1 : 1, - default_ops_size: 1, - g : 1, - null_bit : 1, - reserved2 : 15; - } fields; - unsigned int bytes; -}; - -/* - * World switch state - */ -typedef struct vmx_assist_context { - uint32_t eip; /* execution pointer */ - uint32_t esp; /* stack pointer */ - uint32_t eflags; /* flags register */ - uint32_t cr0; - uint32_t cr3; /* page table directory */ - uint32_t cr4; - uint32_t idtr_limit; /* idt */ - uint32_t idtr_base; - uint32_t gdtr_limit; /* gdt */ - uint32_t gdtr_base; - uint32_t cs_sel; /* cs selector */ - uint32_t cs_limit; - uint32_t cs_base; - union vmcs_arbytes cs_arbytes; - uint32_t ds_sel; /* ds selector */ - uint32_t ds_limit; - uint32_t ds_base; - union vmcs_arbytes ds_arbytes; - uint32_t es_sel; /* es selector */ - uint32_t es_limit; - uint32_t es_base; - union vmcs_arbytes es_arbytes; - uint32_t ss_sel; /* ss selector */ - uint32_t ss_limit; - uint32_t ss_base; - union vmcs_arbytes ss_arbytes; - uint32_t fs_sel; /* fs selector */ - uint32_t fs_limit; - uint32_t fs_base; - union vmcs_arbytes fs_arbytes; - uint32_t gs_sel; /* gs selector */ - uint32_t gs_limit; - uint32_t gs_base; - union vmcs_arbytes gs_arbytes; - uint32_t tr_sel; /* task selector */ - uint32_t tr_limit; - uint32_t tr_base; - union vmcs_arbytes tr_arbytes; - uint32_t ldtr_sel; /* ldtr selector */ - uint32_t ldtr_limit; - uint32_t ldtr_base; - union vmcs_arbytes ldtr_arbytes; -} vmx_assist_context_t; - -#endif /* __ASSEMBLY__ */ - -#endif /* _VMX_ASSIST_H_ */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog