[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[XenPPC] [xenppc-unstable] merge



# HG changeset patch
# User Hollis Blanchard <hollisb@xxxxxxxxxx>
# Node ID 156a0963a1aed529e5c5517e7153b0ad64d99276
# Parent  d3e181fa238b93c616bd010edd45f707c359cf99
# Parent  c191c649cdb387e7ec573d218c9581c639c87700
merge
---
 linux-2.6-xen-sparse/arch/i386/mm/init-xen.c                     |   14 
 linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c              |    7 
 linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c                   |   27 
 linux-2.6-xen-sparse/drivers/xen/netback/netback.c               |  288 ++++++++--
 linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c                |   26 
 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c             |  173 ++++--
 linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c               |   14 
 linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h |    7 
 linux-2.6-xen-sparse/include/xen/public/privcmd.h                |   16 
 tools/debugger/libxendebug/xendebug.c                            |    2 
 tools/firmware/vmxassist/vm86.c                                  |   65 ++
 tools/ioemu/hw/cirrus_vga.c                                      |   12 
 tools/ioemu/vl.c                                                 |   15 
 tools/libxc/xc_core.c                                            |    8 
 tools/libxc/xc_domain.c                                          |   10 
 tools/libxc/xc_hvm_build.c                                       |    6 
 tools/libxc/xc_ia64_stubs.c                                      |   12 
 tools/libxc/xc_linux.c                                           |    2 
 tools/libxc/xc_linux_build.c                                     |   58 +-
 tools/libxc/xc_linux_restore.c                                   |  210 ++++++-
 tools/libxc/xc_linux_save.c                                      |   51 +
 tools/libxc/xc_load_aout9.c                                      |    4 
 tools/libxc/xc_load_bin.c                                        |    4 
 tools/libxc/xc_load_elf.c                                        |   19 
 tools/libxc/xc_private.c                                         |   62 +-
 tools/libxc/xenctrl.h                                            |   19 
 tools/libxc/xg_private.h                                         |    7 
 tools/libxc/xg_save_restore.h                                    |   12 
 tools/tests/test_x86_emulator.c                                  |  131 ++--
 xen/arch/x86/domain.c                                            |   21 
 xen/arch/x86/domain_build.c                                      |    3 
 xen/arch/x86/hvm/vmx/vmx.c                                       |   22 
 xen/arch/x86/hvm/vmx/x86_32/exits.S                              |   35 -
 xen/arch/x86/hvm/vmx/x86_64/exits.S                              |   71 +-
 xen/arch/x86/mm.c                                                |   15 
 xen/arch/x86/x86_32/asm-offsets.c                                |    2 
 xen/arch/x86/x86_32/entry.S                                      |    5 
 xen/arch/x86/x86_32/traps.c                                      |    6 
 xen/arch/x86/x86_64/asm-offsets.c                                |    3 
 xen/arch/x86/x86_64/entry.S                                      |   10 
 xen/arch/x86/x86_64/traps.c                                      |   12 
 xen/arch/x86/x86_emulate.c                                       |    4 
 xen/common/kernel.c                                              |    5 
 xen/common/keyhandler.c                                          |    5 
 xen/common/memory.c                                              |   20 
 xen/include/public/arch-ia64.h                                   |    3 
 xen/include/public/arch-x86_32.h                                 |   19 
 xen/include/public/arch-x86_64.h                                 |   21 
 xen/include/public/callback.h                                    |   15 
 xen/include/public/dom0_ops.h                                    |   56 -
 xen/include/public/grant_table.h                                 |    2 
 xen/include/public/io/netif.h                                    |    4 
 xen/include/public/io/ring.h                                     |   16 
 xen/include/public/memory.h                                      |   10 
 xen/include/public/xen.h                                         |   22 
 55 files changed, 1228 insertions(+), 460 deletions(-)

diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/arch/i386/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Tue Jun 06 13:25:31 2006 -0500
@@ -558,15 +558,11 @@ void __init paging_init(void)
 
        kmap_init();
 
-       if (!xen_feature(XENFEAT_auto_translated_physmap) ||
-           xen_start_info->shared_info >= xen_start_info->nr_pages) {
-               /* Switch to the real shared_info page, and clear the
-                * dummy page. */
-               set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
-               HYPERVISOR_shared_info =
-                       (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
-               memset(empty_zero_page, 0, sizeof(empty_zero_page));
-       }
+       /* Switch to the real shared_info page, and clear the
+        * dummy page. */
+       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+       memset(empty_zero_page, 0, sizeof(empty_zero_page));
 
        /* Setup mapping of lower 1st MB */
        for (i = 0; i < NR_FIX_ISAMAPS; i++)
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c       Tue Jun 06 13:25:31 2006 -0500
@@ -665,13 +665,6 @@ void __init setup_arch(char **cmdline_p)
 
        setup_xen_features();
 
-       if (xen_feature(XENFEAT_auto_translated_physmap) &&
-           xen_start_info->shared_info < xen_start_info->nr_pages) {
-               HYPERVISOR_shared_info =
-                       (shared_info_t *)__va(xen_start_info->shared_info);
-               memset(empty_zero_page, 0, sizeof(empty_zero_page));
-       }
-
        HYPERVISOR_vm_assist(VMASST_CMD_enable,
                             VMASST_TYPE_writable_pagetables);
 
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c    Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c    Tue Jun 06 13:25:31 2006 -0500
@@ -666,7 +666,18 @@ void __meminit init_memory_mapping(unsig
                        set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
        }
 
-       BUG_ON(!after_bootmem && start_pfn != table_end);
+       if (!after_bootmem) {
+               BUG_ON(start_pfn != table_end);
+               /*
+                * Destroy the temporary mappings created above. Prevents
+                * overlap with modules area (if init mapping is very big).
+                */
+               start = __START_KERNEL_map + (table_start << PAGE_SHIFT);
+               end   = __START_KERNEL_map + (table_end   << PAGE_SHIFT);
+               for (; start < end; start += PAGE_SIZE)
+                       WARN_ON(HYPERVISOR_update_va_mapping(
+                               start, __pte_ma(0), 0));
+       }
 
        __flush_tlb_all();
 }
@@ -752,15 +763,11 @@ void __init paging_init(void)
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
 
-       if (!xen_feature(XENFEAT_auto_translated_physmap) ||
-           xen_start_info->shared_info >= xen_start_info->nr_pages) {
-               /* Switch to the real shared_info page, and clear the
-                * dummy page. */
-               set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
-               HYPERVISOR_shared_info =
-                       (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
-               memset(empty_zero_page, 0, sizeof(empty_zero_page));
-       }
+       /* Switch to the real shared_info page, and clear the
+        * dummy page. */
+       set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
+       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+       memset(empty_zero_page, 0, sizeof(empty_zero_page));
 
        init_mm.context.pinned = 1;
 
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/drivers/xen/netback/netback.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c        Tue Jun 06 13:25:31 2006 -0500
@@ -458,6 +458,9 @@ inline static void net_tx_action_dealloc
        dc = dealloc_cons;
        dp = dealloc_prod;
 
+       /* Ensure we see all indexes enqueued by netif_idx_release(). */
+       smp_rmb();
+
        /*
         * Free up any grants we have finished using
         */
@@ -487,6 +490,177 @@ inline static void net_tx_action_dealloc
        }
 }
 
+static void netbk_tx_err(netif_t *netif, RING_IDX end)
+{
+       RING_IDX idx = netif->tx.req_cons; /* first request to fail */
+
+       do { /* always runs once; then fails requests up to (excluding) end */
+               make_tx_response(netif, (RING_GET_REQUEST(&netif->tx, idx))->id,
+                                NETIF_RSP_ERROR);
+       } while (++idx < end);
+       netif->tx.req_cons = idx; /* the failed requests count as consumed */
+       netif_schedule_work(netif);
+       netif_put(netif);
+}
+
+/* Count the ring requests of one packet; returns the negated count on error. */
+static int netbk_count_requests(netif_t *netif, netif_tx_request_t *txp,
+                               int work_to_do)
+{
+       netif_tx_request_t *head = txp;
+       RING_IDX base = netif->tx.req_cons;
+       int nr;
+
+       for (nr = 1; txp->flags & NETTXF_more_data; ) {
+               if (nr >= work_to_do) {
+                       DPRINTK("Need more frags\n");
+                       return -nr;
+               }
+
+               txp = RING_GET_REQUEST(&netif->tx, base + nr);
+               if (txp->size > head->size) {
+                       DPRINTK("Frags galore\n");
+                       return -nr;
+               }
+
+               /* Take this request's bytes off the size kept in the head. */
+               head->size -= txp->size;
+               nr++;
+               if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+                       DPRINTK("txp->offset: %x, size: %u\n",
+                               txp->offset, txp->size);
+                       return -nr;
+               }
+       }
+       return nr;
+}
+
+/* Queue one grant-map op per fragment request that follows the header. */
+static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif,
+                                                 struct sk_buff *skb,
+                                                 gnttab_map_grant_ref_t *mop)
+{
+       struct skb_shared_info *si = skb_shinfo(skb);
+       unsigned long idx = *((u16 *)skb->data); /* index stored in skb->data */
+       RING_IDX next = netif->tx.req_cons + 1;
+       skb_frag_t *frag, *last = si->frags + si->nr_frags;
+
+       /* frags[0] carrying the header's index shares the header's page and
+        * is skipped: it needs neither a ring request nor a map op. */
+       frag = si->frags + ((unsigned long)si->frags[0].page == idx);
+
+       for (; frag < last; frag++) {
+               netif_tx_request_t *txp = RING_GET_REQUEST(&netif->tx, next++);
+
+               idx = pending_ring[MASK_PEND_IDX(pending_cons++)];
+               gnttab_set_map_op(mop++, MMAP_VADDR(idx),
+                                 GNTMAP_host_map | GNTMAP_readonly,
+                                 txp->gref, netif->domid);
+
+               memcpy(&pending_tx_info[idx].req, txp, sizeof(*txp));
+               netif_get(netif);
+               pending_tx_info[idx].netif = netif;
+               frag->page = (void *)idx; /* page field holds the index for now */
+       }
+
+       return mop;
+}
+
+/* Check grant-map results for skb's header and frags; returns first error. */
+static int netbk_tx_check_mop(struct sk_buff *skb,
+                              gnttab_map_grant_ref_t **mopp)
+{
+       gnttab_map_grant_ref_t *mop = *mopp;
+       int pending_idx = *((u16 *)skb->data);
+       netif_t *netif = pending_tx_info[pending_idx].netif;
+       netif_tx_request_t *txp;
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       int i, err, start, nr_frags = shinfo->nr_frags;
+
+       /* Check status of header. */
+       err = mop->status;
+       if (unlikely(err)) {
+               txp = &pending_tx_info[pending_idx].req;
+               make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
+               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+               netif_put(netif);
+       } else {
+               set_phys_to_machine(
+                       __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
+                       FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
+               grant_tx_handle[pending_idx] = mop->handle;
+       }
+
+       /* Skip first skb fragment if it is on same page as header fragment. */
+       start = ((unsigned long)shinfo->frags[0].page == pending_idx);
+
+       for (i = start; i < nr_frags; i++) {
+               int j, newerr;
+
+               pending_idx = (unsigned long)shinfo->frags[i].page;
+
+               /* Check error status: if okay then remember grant handle. */
+               newerr = (++mop)->status;
+               if (likely(!newerr)) {
+                       set_phys_to_machine(
+                               __pa(MMAP_VADDR(pending_idx))>>PAGE_SHIFT,
+                               FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
+                       grant_tx_handle[pending_idx] = mop->handle;
+                       /* Had a previous error? Invalidate this fragment. */
+                       if (unlikely(err))
+                               netif_idx_release(pending_idx);
+                       continue;
+               }
+
+               /* Error on this fragment: respond to client with an error. */
+               txp = &pending_tx_info[pending_idx].req;
+               make_tx_response(netif, txp->id, NETIF_RSP_ERROR);
+               pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
+               netif_put(netif);
+
+               /* Not the first error? Preceding frags already invalidated. */
+               if (err)
+                       continue;
+
+               /* First error: release the header, then each earlier frag j. */
+               pending_idx = *((u16 *)skb->data);
+               netif_idx_release(pending_idx);
+               for (j = start; j < i; j++) {
+                       pending_idx = (unsigned long)shinfo->frags[j].page;
+                       netif_idx_release(pending_idx);
+               }
+
+               /* Remember the error: invalidate all subsequent fragments. */
+               err = newerr;
+       }
+
+       *mopp = mop + 1; /* step past the last map op examined */
+       return err;
+}
+
+/* Turn the pending index stashed in each frag into page, offset and size. */
+static void netbk_fill_frags(struct sk_buff *skb)
+{
+       struct skb_shared_info *si = skb_shinfo(skb);
+       skb_frag_t *frag = si->frags;
+       skb_frag_t *end = frag + si->nr_frags;
+
+       for (; frag < end; frag++) {
+               /* On entry ->page holds a pending_tx_info index, not a page. */
+               unsigned long idx = (unsigned long)frag->page;
+               netif_tx_request_t *req = &pending_tx_info[idx].req;
+
+               frag->page = virt_to_page(MMAP_VADDR(idx));
+               frag->size = req->size;
+               frag->page_offset = req->offset;
+
+               /* Add the fragment's bytes to the skb's length accounting. */
+               skb->len += req->size;
+               skb->data_len += req->size;
+               skb->truesize += req->size;
+       }
+}
+
 /* Called after netfront has transmitted */
 static void net_tx_action(unsigned long unused)
 {
@@ -504,7 +678,7 @@ static void net_tx_action(unsigned long 
                net_tx_action_dealloc();
 
        mop = tx_map_ops;
-       while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
+       while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) &&
                !list_empty(&net_schedule_list)) {
                /* Get a netif from the list with work to do. */
                ent = net_schedule_list.next;
@@ -552,38 +726,44 @@ static void net_tx_action(unsigned long 
                }
                netif->remaining_credit -= txreq.size;
 
-               netif->tx.req_cons++;
-
-               netif_schedule_work(netif);
-
-               if (unlikely(txreq.size < ETH_HLEN) || 
-                   unlikely(txreq.size > ETH_FRAME_LEN)) {
+               ret = netbk_count_requests(netif, &txreq, work_to_do);
+               if (unlikely(ret < 0)) {
+                       netbk_tx_err(netif, i - ret);
+                       continue;
+               }
+               i += ret;
+
+               if (unlikely(ret > MAX_SKB_FRAGS + 1)) {
+                       DPRINTK("Too many frags\n");
+                       netbk_tx_err(netif, i);
+                       continue;
+               }
+
+               if (unlikely(txreq.size < ETH_HLEN)) {
                        DPRINTK("Bad packet size: %d\n", txreq.size);
-                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
-                       netif_put(netif);
+                       netbk_tx_err(netif, i);
                        continue; 
                }
 
                /* No crossing a page as the payload mustn't fragment. */
-               if (unlikely((txreq.offset + txreq.size) >= PAGE_SIZE)) {
+               if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
                        DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", 
                                txreq.offset, txreq.size, 
                                (txreq.offset &~PAGE_MASK) + txreq.size);
-                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
-                       netif_put(netif);
+                       netbk_tx_err(netif, i);
                        continue;
                }
 
                pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
 
-               data_len = (txreq.size > PKT_PROT_LEN) ?
+               data_len = (txreq.size > PKT_PROT_LEN &&
+                           ret < MAX_SKB_FRAGS + 1) ?
                        PKT_PROT_LEN : txreq.size;
 
                skb = alloc_skb(data_len+16, GFP_ATOMIC);
                if (unlikely(skb == NULL)) {
                        DPRINTK("Can't allocate a skb in start_xmit.\n");
-                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
-                       netif_put(netif);
+                       netbk_tx_err(netif, i);
                        break;
                }
 
@@ -600,9 +780,23 @@ static void net_tx_action(unsigned long 
                pending_tx_info[pending_idx].netif = netif;
                *((u16 *)skb->data) = pending_idx;
 
+               __skb_put(skb, data_len);
+
+               skb_shinfo(skb)->nr_frags = ret - 1;
+               if (data_len < txreq.size) {
+                       skb_shinfo(skb)->nr_frags++;
+                       skb_shinfo(skb)->frags[0].page =
+                               (void *)(unsigned long)pending_idx;
+               }
+
                __skb_queue_tail(&tx_queue, skb);
 
                pending_cons++;
+
+               mop = netbk_get_requests(netif, skb, mop);
+
+               netif->tx.req_cons = i;
+               netif_schedule_work(netif);
 
                if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
                        break;
@@ -617,75 +811,56 @@ static void net_tx_action(unsigned long 
 
        mop = tx_map_ops;
        while ((skb = __skb_dequeue(&tx_queue)) != NULL) {
+               netif_tx_request_t *txp;
+
                pending_idx = *((u16 *)skb->data);
                netif       = pending_tx_info[pending_idx].netif;
-               memcpy(&txreq, &pending_tx_info[pending_idx].req,
-                      sizeof(txreq));
+               txp         = &pending_tx_info[pending_idx].req;
 
                /* Check the remap error code. */
-               if (unlikely(mop->status)) {
+               if (unlikely(netbk_tx_check_mop(skb, &mop))) {
                        printk(KERN_ALERT "#### netback grant fails\n");
-                       make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
-                       netif_put(netif);
+                       skb_shinfo(skb)->nr_frags = 0;
                        kfree_skb(skb);
-                       mop++;
-                       pending_ring[MASK_PEND_IDX(pending_prod++)] =
-                               pending_idx;
                        continue;
                }
-               set_phys_to_machine(
-                       __pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT,
-                       FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
-               grant_tx_handle[pending_idx] = mop->handle;
-
-               data_len = (txreq.size > PKT_PROT_LEN) ?
-                       PKT_PROT_LEN : txreq.size;
-
-               __skb_put(skb, data_len);
+
+               data_len = skb->len;
                memcpy(skb->data, 
-                      (void *)(MMAP_VADDR(pending_idx)|txreq.offset),
+                      (void *)(MMAP_VADDR(pending_idx)|txp->offset),
                       data_len);
-               if (data_len < txreq.size) {
+               if (data_len < txp->size) {
                        /* Append the packet payload as a fragment. */
-                       skb_shinfo(skb)->frags[0].page        = 
-                               virt_to_page(MMAP_VADDR(pending_idx));
-                       skb_shinfo(skb)->frags[0].size        =
-                               txreq.size - data_len;
-                       skb_shinfo(skb)->frags[0].page_offset = 
-                               txreq.offset + data_len;
-                       skb_shinfo(skb)->nr_frags = 1;
+                       txp->offset += data_len;
+                       txp->size -= data_len;
                } else {
                        /* Schedule a response immediately. */
                        netif_idx_release(pending_idx);
                }
-
-               skb->data_len  = txreq.size - data_len;
-               skb->len      += skb->data_len;
-               skb->truesize += skb->data_len;
-
-               skb->dev      = netif->dev;
-               skb->protocol = eth_type_trans(skb, skb->dev);
 
                /*
                 * Old frontends do not assert data_validated but we
                 * can infer it from csum_blank so test both flags.
                 */
-               if (txreq.flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
+               if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) {
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
                        skb->proto_data_valid = 1;
                } else {
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->proto_data_valid = 0;
                }
-               skb->proto_csum_blank = !!(txreq.flags & NETTXF_csum_blank);
-
-               netif->stats.rx_bytes += txreq.size;
+               skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank);
+
+               netbk_fill_frags(skb);
+
+               skb->dev      = netif->dev;
+               skb->protocol = eth_type_trans(skb, skb->dev);
+
+               netif->stats.rx_bytes += skb->len;
                netif->stats.rx_packets++;
 
                netif_rx(skb);
                netif->dev->last_rx = jiffies;
-
-               mop++;
        }
 }
 
@@ -695,7 +870,10 @@ static void netif_idx_release(u16 pendin
        unsigned long flags;
 
        spin_lock_irqsave(&_lock, flags);
-       dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
+       dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
+       /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. */
+       smp_wmb();
+       dealloc_prod++;
        spin_unlock_irqrestore(&_lock, flags);
 
        tasklet_schedule(&net_tx_tasklet);
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Tue Jun 06 13:25:31 2006 -0500
@@ -69,6 +69,8 @@ static int netback_probe(struct xenbus_d
 static int netback_probe(struct xenbus_device *dev,
                         const struct xenbus_device_id *id)
 {
+       const char *message;
+       xenbus_transaction_t xbt;
        int err;
        struct backend_info *be = kzalloc(sizeof(struct backend_info),
                                          GFP_KERNEL);
@@ -86,6 +88,27 @@ static int netback_probe(struct xenbus_d
        if (err)
                goto fail;
 
+       do {
+               err = xenbus_transaction_start(&xbt);
+               if (err) {
+                       xenbus_dev_fatal(dev, err, "starting transaction");
+                       goto fail;
+               }
+
+               err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1);
+               if (err) {
+                       message = "writing feature-sg";
+                       goto abort_transaction;
+               }
+
+               err = xenbus_transaction_end(xbt, 0);
+       } while (err == -EAGAIN);
+
+       if (err) {
+               xenbus_dev_fatal(dev, err, "completing transaction");
+               goto fail;
+       }
+
        err = xenbus_switch_state(dev, XenbusStateInitWait);
        if (err) {
                goto fail;
@@ -93,6 +116,9 @@ static int netback_probe(struct xenbus_d
 
        return 0;
 
+abort_transaction:
+       xenbus_transaction_end(xbt, 1);
+       xenbus_dev_fatal(dev, err, "%s", message);
 fail:
        DPRINTK("failed");
        netback_remove(dev);
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Tue Jun 06 13:25:31 2006 -0500
@@ -45,6 +45,7 @@
 #include <linux/bitops.h>
 #include <linux/ethtool.h>
 #include <linux/in.h>
+#include <linux/if_ether.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 #include <net/arp.h>
@@ -173,6 +174,11 @@ static void xennet_sysfs_delif(struct ne
 #define xennet_sysfs_delif(dev) do { } while(0)
 #endif
 
+static inline int xennet_can_sg(struct net_device *dev)
+{
+       return NETIF_F_SG & dev->features; /* masks out the NETIF_F_SG bit */
+}
+
 /**
  * Entry point to this code when a new device is created.  Allocate the basic
  * structures and the ring buffers for communication with the backend, and
@@ -307,8 +313,6 @@ again:
                goto destroy_ring;
        }
 
-       xenbus_switch_state(dev, XenbusStateConnected);
-
        return 0;
 
  abort_transaction:
@@ -370,12 +374,9 @@ static int setup_device(struct xenbus_de
                goto fail;
 
        memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
-       network_connect(netdev);
        info->irq = bind_evtchn_to_irqhandler(
                info->evtchn, netif_int, SA_SAMPLE_RANDOM, netdev->name,
                netdev);
-       (void)send_fake_arp(netdev);
-       show_device(info);
 
        return 0;
 
@@ -391,15 +392,24 @@ static void backend_changed(struct xenbu
 static void backend_changed(struct xenbus_device *dev,
                            enum xenbus_state backend_state)
 {
+       struct netfront_info *np = dev->data;
+       struct net_device *netdev = np->netdev;
+
        DPRINTK("\n");
 
        switch (backend_state) {
        case XenbusStateInitialising:
-       case XenbusStateInitWait:
        case XenbusStateInitialised:
        case XenbusStateConnected:
        case XenbusStateUnknown:
        case XenbusStateClosed:
+               break;
+
+       case XenbusStateInitWait:
+               network_connect(netdev);
+               xenbus_switch_state(dev, XenbusStateConnected);
+               (void)send_fake_arp(netdev);
+               show_device(np);
                break;
 
        case XenbusStateClosing:
@@ -452,13 +462,17 @@ static int network_open(struct net_devic
        return 0;
 }
 
+static inline int netfront_tx_slot_available(struct netfront_info *np)
+{
+       return RING_FREE_REQUESTS(&np->tx) > MAX_SKB_FRAGS; /* >= FRAGS + 1 */
+}
+
 static inline void network_maybe_wake_tx(struct net_device *dev)
 {
        struct netfront_info *np = netdev_priv(dev);
 
        if (unlikely(netif_queue_stopped(dev)) &&
-           !RING_FULL(&np->tx) &&
-           !gnttab_empty_grant_references(&np->gref_tx_head) &&
+           netfront_tx_slot_available(np) &&
            likely(netif_running(dev)))
                netif_wake_queue(dev);
 }
@@ -485,7 +499,7 @@ static void network_tx_buf_gc(struct net
                                printk(KERN_ALERT "network_tx_buf_gc: warning "
                                       "-- grant still in use by backend "
                                       "domain.\n");
-                               break; /* bail immediately */
+                               BUG();
                        }
                        gnttab_end_foreign_access_ref(
                                np->grant_tx_ref[id], GNTMAP_readonly);
@@ -638,36 +652,95 @@ static void network_alloc_rx_buffers(str
        RING_PUSH_REQUESTS(&np->rx);
 }
 
+static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev,
+                             struct netif_tx_request *tx)
+{
+       struct netfront_info *np = netdev_priv(dev);
+       char *data = skb->data;
+       unsigned long mfn;
+       RING_IDX prod = np->tx.req_prod_pvt;
+       int frags = skb_shinfo(skb)->nr_frags;
+       unsigned int offset = offset_in_page(data);
+       unsigned int len = skb_headlen(skb);
+       unsigned int id;
+       grant_ref_t ref;
+       int i;
+       /* Linear area: chain one more request per page boundary crossed. */
+       while (len > PAGE_SIZE - offset) {
+               tx->size = PAGE_SIZE - offset; /* cut current request at page end */
+               tx->flags |= NETTXF_more_data;
+               len -= tx->size;
+               data += tx->size;
+               offset = 0;
+               /* Take a free id and store skb_get(skb) under it. */
+               id = get_id_from_freelist(np->tx_skbs);
+               np->tx_skbs[id] = skb_get(skb);
+               tx = RING_GET_REQUEST(&np->tx, prod++);
+               tx->id = id;
+               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+               BUG_ON((signed short)ref < 0);
+               /* Grant the other end read-only access to data's frame. */
+               mfn = virt_to_mfn(data);
+               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+                                               mfn, GNTMAP_readonly);
+               /* The new request describes what is left of the linear area. */
+               tx->gref = np->grant_tx_ref[id] = ref;
+               tx->offset = offset;
+               tx->size = len;
+               tx->flags = 0;
+       }
+       /* Page fragments: chain one further request per skb fragment. */
+       for (i = 0; i < frags; i++) {
+               skb_frag_t *frag = skb_shinfo(skb)->frags + i;
+               /* Flag the previous request as being followed by another. */
+               tx->flags |= NETTXF_more_data;
+               /* Take a free id and store skb_get(skb) under it. */
+               id = get_id_from_freelist(np->tx_skbs);
+               np->tx_skbs[id] = skb_get(skb);
+               tx = RING_GET_REQUEST(&np->tx, prod++);
+               tx->id = id;
+               ref = gnttab_claim_grant_reference(&np->gref_tx_head);
+               BUG_ON((signed short)ref < 0);
+               /* Grant the other end read-only access to the fragment's frame. */
+               mfn = pfn_to_mfn(page_to_pfn(frag->page));
+               gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id,
+                                               mfn, GNTMAP_readonly);
+               /* The new request covers exactly this fragment. */
+               tx->gref = np->grant_tx_ref[id] = ref;
+               tx->offset = frag->page_offset;
+               tx->size = frag->size;
+               tx->flags = 0;
+       }
+       /* Store back the producer index, advanced past the new requests. */
+       np->tx.req_prod_pvt = prod;
+}
 
 static int network_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        unsigned short id;
        struct netfront_info *np = netdev_priv(dev);
        struct netif_tx_request *tx;
+       char *data = skb->data;
        RING_IDX i;
        grant_ref_t ref;
        unsigned long mfn;
        int notify;
-
-       if (unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >=
-                    PAGE_SIZE)) {
-               struct sk_buff *nskb;
-               nskb = __dev_alloc_skb(skb->len, GFP_ATOMIC|__GFP_NOWARN);
-               if (unlikely(nskb == NULL))
-                       goto drop;
-               skb_put(nskb, skb->len);
-               memcpy(nskb->data, skb->data, skb->len);
-               /* Copy only the header fields we use in this driver. */
-               nskb->dev = skb->dev;
-               nskb->ip_summed = skb->ip_summed;
-               nskb->proto_data_valid = skb->proto_data_valid;
-               dev_kfree_skb(skb);
-               skb = nskb;
+       int frags = skb_shinfo(skb)->nr_frags;
+       unsigned int offset = offset_in_page(data);
+       unsigned int len = skb_headlen(skb);
+
+       frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE;
+       if (unlikely(frags > MAX_SKB_FRAGS + 1)) {
+               printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n",
+                      frags);
+               dump_stack();
+               goto drop;
        }
 
        spin_lock_irq(&np->tx_lock);
 
-       if (unlikely(!netif_carrier_ok(dev))) {
+       if (unlikely(!netif_carrier_ok(dev) ||
+                    (frags > 1 && !xennet_can_sg(dev)))) {
                spin_unlock_irq(&np->tx_lock);
                goto drop;
        }
@@ -682,12 +755,12 @@ static int network_start_xmit(struct sk_
        tx->id   = id;
        ref = gnttab_claim_grant_reference(&np->gref_tx_head);
        BUG_ON((signed short)ref < 0);
-       mfn = virt_to_mfn(skb->data);
+       mfn = virt_to_mfn(data);
        gnttab_grant_foreign_access_ref(
                ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly);
        tx->gref = np->grant_tx_ref[id] = ref;
-       tx->offset = (unsigned long)skb->data & ~PAGE_MASK;
-       tx->size = skb->len;
+       tx->offset = offset;
+       tx->size = len;
 
        tx->flags = 0;
        if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
@@ -696,14 +769,17 @@ static int network_start_xmit(struct sk_
                tx->flags |= NETTXF_data_validated;
 
        np->tx.req_prod_pvt = i + 1;
+
+       xennet_make_frags(skb, dev, tx);
+       tx->size = skb->len;
+
        RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify);
        if (notify)
                notify_remote_via_irq(np->irq);
 
        network_tx_buf_gc(dev);
 
-       if (RING_FULL(&np->tx) ||
-           gnttab_empty_grant_references(&np->gref_tx_head))
+       if (!netfront_tx_slot_available(np))
                netif_stop_queue(dev);
 
        spin_unlock_irq(&np->tx_lock);
@@ -963,12 +1039,46 @@ static struct net_device_stats *network_
        return &np->stats;
 }
 
+static int xennet_change_mtu(struct net_device *dev, int mtu)
+{
+       int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN;
+
+       if (mtu > max)
+               return -EINVAL;
+       dev->mtu = mtu;
+       return 0;
+}
+
+static int xennet_set_sg(struct net_device *dev, u32 data)
+{
+       if (data) {
+               struct netfront_info *np = netdev_priv(dev);
+               int val;
+
+               if (xenbus_scanf(XBT_NULL, np->xbdev->otherend, "feature-sg",
+                                "%d", &val) < 0)
+                       val = 0;
+               if (!val)
+                       return -ENOSYS;
+       } else if (dev->mtu > ETH_DATA_LEN)
+               dev->mtu = ETH_DATA_LEN;
+
+       return ethtool_op_set_sg(dev, data);
+}
+
+static void xennet_set_features(struct net_device *dev)
+{
+       xennet_set_sg(dev, 1);
+}
+
 static void network_connect(struct net_device *dev)
 {
        struct netfront_info *np;
        int i, requeue_idx;
        struct netif_tx_request *tx;
        struct sk_buff *skb;
+
+       xennet_set_features(dev);
 
        np = netdev_priv(dev);
        spin_lock_irq(&np->tx_lock);
@@ -1081,6 +1191,8 @@ static struct ethtool_ops network_ethtoo
 {
        .get_tx_csum = ethtool_op_get_tx_csum,
        .set_tx_csum = ethtool_op_set_tx_csum,
+       .get_sg = ethtool_op_get_sg,
+       .set_sg = xennet_set_sg,
 };
 
 #ifdef CONFIG_SYSFS
@@ -1297,6 +1409,7 @@ static struct net_device * __devinit cre
        netdev->poll            = netif_poll;
        netdev->set_multicast_list = network_set_multicast_list;
        netdev->uninit          = netif_uninit;
+       netdev->change_mtu      = xennet_change_mtu;
        netdev->weight          = 64;
        netdev->features        = NETIF_F_IP_CSUM;
 
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c
--- a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c        Tue Jun 06 13:25:31 2006 -0500
@@ -61,11 +61,11 @@ static int privcmd_ioctl(struct inode *i
                __asm__ __volatile__ (
                        "pushl %%ebx; pushl %%ecx; pushl %%edx; "
                        "pushl %%esi; pushl %%edi; "
-                       "movl  4(%%eax),%%ebx ;"
-                       "movl  8(%%eax),%%ecx ;"
-                       "movl 12(%%eax),%%edx ;"
-                       "movl 16(%%eax),%%esi ;"
-                       "movl 20(%%eax),%%edi ;"
+                       "movl  8(%%eax),%%ebx ;"
+                       "movl 16(%%eax),%%ecx ;"
+                       "movl 24(%%eax),%%edx ;"
+                       "movl 32(%%eax),%%esi ;"
+                       "movl 40(%%eax),%%edi ;"
                        "movl   (%%eax),%%eax ;"
                        "shll $5,%%eax ;"
                        "addl $hypercall_page,%%eax ;"
@@ -161,7 +161,7 @@ static int privcmd_ioctl(struct inode *i
        case IOCTL_PRIVCMD_MMAPBATCH: {
                privcmd_mmapbatch_t m;
                struct vm_area_struct *vma = NULL;
-               unsigned long __user *p;
+               xen_pfn_t __user *p;
                unsigned long addr, mfn; 
                int i;
 
@@ -210,7 +210,7 @@ static int privcmd_ioctl(struct inode *i
        batch_err:
                printk("batch_err ret=%d vma=%p addr=%lx "
                       "num=%d arr=%p %lx-%lx\n", 
-                      ret, vma, m.addr, m.num, m.arr,
+                      ret, vma, (unsigned long)m.addr, m.num, m.arr,
                       vma ? vma->vm_start : 0, vma ? vma->vm_end : 0);
                break;
        }
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h
--- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/setup_arch_post.h  Tue Jun 06 13:25:31 2006 -0500
@@ -61,13 +61,6 @@ static void __init machine_specific_arch
                .address = { __KERNEL_CS, (unsigned long)nmi },
        };
 
-       if (xen_feature(XENFEAT_auto_translated_physmap) &&
-           xen_start_info->shared_info < xen_start_info->nr_pages) {
-               HYPERVISOR_shared_info =
-                       (shared_info_t *)__va(xen_start_info->shared_info);
-               memset(empty_zero_page, 0, sizeof(empty_zero_page));
-       }
-
        ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
        if (ret == 0)
                ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
diff -r d3e181fa238b -r 156a0963a1ae linux-2.6-xen-sparse/include/xen/public/privcmd.h
--- a/linux-2.6-xen-sparse/include/xen/public/privcmd.h Fri Jun 02 12:54:22 2006 -0500
+++ b/linux-2.6-xen-sparse/include/xen/public/privcmd.h Tue Jun 06 13:25:31 2006 -0500
@@ -33,20 +33,22 @@
 #ifndef __LINUX_PUBLIC_PRIVCMD_H__
 #define __LINUX_PUBLIC_PRIVCMD_H__
 
+#include <linux/types.h>
+
 #ifndef __user
 #define __user
 #endif
 
 typedef struct privcmd_hypercall
 {
-       unsigned long op;
-       unsigned long arg[5];
+       __u64 op;
+       __u64 arg[5];
 } privcmd_hypercall_t;
 
 typedef struct privcmd_mmap_entry {
-       unsigned long va;
-       unsigned long mfn;
-       unsigned long npages;
+       __u64 va;
+       __u64 mfn;
+       __u64 npages;
 } privcmd_mmap_entry_t; 
 
 typedef struct privcmd_mmap {
@@ -58,8 +60,8 @@ typedef struct privcmd_mmapbatch {
 typedef struct privcmd_mmapbatch {
        int num;     /* number of pages to populate */
        domid_t dom; /* target domain */
-       unsigned long addr;  /* virtual address */
-       unsigned long __user *arr; /* array of mfns - top nibble set on err */
+       __u64 addr;  /* virtual address */
+       xen_pfn_t __user *arr; /* array of mfns - top nibble set on err */
 } privcmd_mmapbatch_t; 
 
 /*
diff -r d3e181fa238b -r 156a0963a1ae tools/debugger/libxendebug/xendebug.c
--- a/tools/debugger/libxendebug/xendebug.c     Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/debugger/libxendebug/xendebug.c     Tue Jun 06 13:25:31 2006 -0500
@@ -57,7 +57,7 @@ typedef struct domain_context           
     vcpu_guest_context_t context[MAX_VIRT_CPUS];
 
     long            total_pages;
-    unsigned long  *page_array;
+    xen_pfn_t      *page_array;
 
     unsigned long   cr3_phys[MAX_VIRT_CPUS];
     unsigned long  *cr3_virt[MAX_VIRT_CPUS];
diff -r d3e181fa238b -r 156a0963a1ae tools/firmware/vmxassist/vm86.c
--- a/tools/firmware/vmxassist/vm86.c   Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/firmware/vmxassist/vm86.c   Tue Jun 06 13:25:31 2006 -0500
@@ -36,6 +36,8 @@
 
 static unsigned prev_eip = 0;
 enum vm86_mode mode = 0;
+
+static struct regs saved_rm_regs;
 
 #ifdef DEBUG
 int traceset = 0;
@@ -795,6 +797,8 @@ protected_mode(struct regs *regs)
        oldctx.esp = regs->uesp;
        oldctx.eflags = regs->eflags;
 
+       memset(&saved_rm_regs, 0, sizeof(struct regs));
+
        /* reload all segment registers */
        if (!load_seg(regs->cs, &oldctx.cs_base,
                                &oldctx.cs_limit, &oldctx.cs_arbytes))
@@ -808,6 +812,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.es_base,
                            &oldctx.es_limit, &oldctx.es_arbytes);
                oldctx.es_sel = 0;
+               saved_rm_regs.ves = regs->ves;
        }
 
        if (load_seg(regs->uss, &oldctx.ss_base,
@@ -817,6 +822,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.ss_base,
                            &oldctx.ss_limit, &oldctx.ss_arbytes);
                oldctx.ss_sel = 0;
+               saved_rm_regs.uss = regs->uss;
        }
 
        if (load_seg(regs->vds, &oldctx.ds_base,
@@ -826,6 +832,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.ds_base,
                            &oldctx.ds_limit, &oldctx.ds_arbytes);
                oldctx.ds_sel = 0;
+               saved_rm_regs.vds = regs->vds;
        }
 
        if (load_seg(regs->vfs, &oldctx.fs_base,
@@ -835,6 +842,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.fs_base,
                            &oldctx.fs_limit, &oldctx.fs_arbytes);
                oldctx.fs_sel = 0;
+               saved_rm_regs.vfs = regs->vfs;
        }
 
        if (load_seg(regs->vgs, &oldctx.gs_base,
@@ -844,6 +852,7 @@ protected_mode(struct regs *regs)
                load_seg(0, &oldctx.gs_base,
                            &oldctx.gs_limit, &oldctx.gs_arbytes);
                oldctx.gs_sel = 0;
+               saved_rm_regs.vgs = regs->vgs;
        }
 
        /* initialize jump environment to warp back to protected mode */
@@ -880,16 +889,22 @@ real_mode(struct regs *regs)
                if (regs->uss >= HIGHMEM)
                        panic("%%ss 0x%lx higher than 1MB", regs->uss);
                regs->uss = address(regs, regs->uss, 0) >> 4;
+       } else {
+         regs->uss = saved_rm_regs.uss;
        }
        if (regs->vds != 0) {
                if (regs->vds >= HIGHMEM)
                        panic("%%ds 0x%lx higher than 1MB", regs->vds);
                regs->vds = address(regs, regs->vds, 0) >> 4;
+       } else {
+         regs->vds = saved_rm_regs.vds;
        }
        if (regs->ves != 0) {
                if (regs->ves >= HIGHMEM)
                        panic("%%es 0x%lx higher than 1MB", regs->ves);
                regs->ves = address(regs, regs->ves, 0) >> 4;
+       } else {
+         regs->ves = saved_rm_regs.ves;
        }
 
        /* this should get us into 16-bit mode */
@@ -971,6 +986,39 @@ jmpl(struct regs *regs, int prefix)
        } else if (mode == VM86_PROTECTED_TO_REAL) { /* jump to real mode */
                eip = (prefix & DATA32) ? fetch32(regs) : fetch16(regs);
                cs = fetch16(regs);
+
+               TRACE((regs, (regs->eip - n) + 1, "jmpl 0x%x:0x%x", cs, eip));
+
+                regs->cs = cs;
+                regs->eip = eip;
+               set_mode(regs, VM86_REAL);
+       } else
+               panic("jmpl");
+}
+
+static void
+jmpl_indirect(struct regs *regs, int prefix, unsigned modrm)
+{
+       unsigned n = regs->eip;
+       unsigned cs, eip;
+       unsigned addr;
+
+       addr  = operand(prefix, regs, modrm);
+
+       if (mode == VM86_REAL_TO_PROTECTED) { /* jump to protected mode */
+               eip = (prefix & DATA32) ? read32(addr) : read16(addr);
+               addr += (prefix & DATA32) ? 4 : 2;
+               cs = read16(addr);
+
+               TRACE((regs, (regs->eip - n) + 1, "jmpl 0x%x:0x%x", cs, eip));
+
+                regs->cs = cs;
+                regs->eip = eip;
+               set_mode(regs, VM86_PROTECTED);
+       } else if (mode == VM86_PROTECTED_TO_REAL) { /* jump to real mode */
+               eip = (prefix & DATA32) ? read32(addr) : read16(addr);
+               addr += (prefix & DATA32) ? 4 : 2;
+               cs = read16(addr);
 
                TRACE((regs, (regs->eip - n) + 1, "jmpl 0x%x:0x%x", cs, eip));
 
@@ -1306,6 +1354,23 @@ opcode(struct regs *regs)
                        }
                        goto invalid;
 
+               case 0xFF: /* jmpl (indirect) */
+                       if ((mode == VM86_REAL_TO_PROTECTED) ||
+                           (mode == VM86_PROTECTED_TO_REAL)) {
+                               unsigned modrm = fetch8(regs);
+                               
+                               switch((modrm >> 3) & 7) {
+                               case 5:
+                                 jmpl_indirect(regs, prefix, modrm);
+                                 return OPC_INVALID;
+
+                               default:
+                                 break;
+                               }
+
+                       }
+                       goto invalid;
+
                case 0xEB: /* short jump */
                        if ((mode == VM86_REAL_TO_PROTECTED) ||
                            (mode == VM86_PROTECTED_TO_REAL)) {
diff -r d3e181fa238b -r 156a0963a1ae tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/ioemu/hw/cirrus_vga.c       Tue Jun 06 13:25:31 2006 -0500
@@ -2462,7 +2462,7 @@ extern FILE *logfile;
 extern FILE *logfile;
 static void * set_vram_mapping(unsigned long begin, unsigned long end)
 {
-    unsigned long * extent_start = NULL;
+    xen_pfn_t *extent_start = NULL;
     unsigned long nr_extents;
     void *vram_pointer = NULL;
     int i;
@@ -2473,14 +2473,14 @@ static void * set_vram_mapping(unsigned 
     end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK;
     nr_extents = (end - begin) >> TARGET_PAGE_BITS;
 
-    extent_start = malloc(sizeof(unsigned long) * nr_extents );
+    extent_start = malloc(sizeof(xen_pfn_t) * nr_extents );
     if (extent_start == NULL)
     {
         fprintf(stderr, "Failed malloc on set_vram_mapping\n");
         return NULL;
     }
 
-    memset(extent_start, 0, sizeof(unsigned long) * nr_extents);
+    memset(extent_start, 0, sizeof(xen_pfn_t) * nr_extents);
 
     for (i = 0; i < nr_extents; i++)
     {
@@ -2508,7 +2508,7 @@ static void * set_vram_mapping(unsigned 
 
 static int unset_vram_mapping(unsigned long begin, unsigned long end)
 {
-    unsigned long * extent_start = NULL;
+    xen_pfn_t *extent_start = NULL;
     unsigned long nr_extents;
     int i;
 
@@ -2519,7 +2519,7 @@ static int unset_vram_mapping(unsigned l
     end = (end + TARGET_PAGE_SIZE -1 ) & TARGET_PAGE_MASK;
     nr_extents = (end - begin) >> TARGET_PAGE_BITS;
 
-    extent_start = malloc(sizeof(unsigned long) * nr_extents );
+    extent_start = malloc(sizeof(xen_pfn_t) * nr_extents );
 
     if (extent_start == NULL)
     {
@@ -2527,7 +2527,7 @@ static int unset_vram_mapping(unsigned l
         return -1;
     }
 
-    memset(extent_start, 0, sizeof(unsigned long) * nr_extents);
+    memset(extent_start, 0, sizeof(xen_pfn_t) * nr_extents);
 
     for (i = 0; i < nr_extents; i++)
         extent_start[i] = (begin + (i * TARGET_PAGE_SIZE)) >> TARGET_PAGE_BITS;
diff -r d3e181fa238b -r 156a0963a1ae tools/ioemu/vl.c
--- a/tools/ioemu/vl.c  Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/ioemu/vl.c  Tue Jun 06 13:25:31 2006 -0500
@@ -2458,7 +2458,7 @@ int unset_mm_mapping(int xc_handle,
                      uint32_t domid,
                      unsigned long nr_pages,
                      unsigned int address_bits,
-                     unsigned long *extent_start)
+                     xen_pfn_t *extent_start)
 {
     int err = 0;
     xc_dominfo_t info;
@@ -2491,7 +2491,7 @@ int set_mm_mapping(int xc_handle,
                     uint32_t domid,
                     unsigned long nr_pages,
                     unsigned int address_bits,
-                    unsigned long *extent_start)
+                    xen_pfn_t *extent_start)
 {
     xc_dominfo_t info;
     int err = 0;
@@ -2557,7 +2557,8 @@ int main(int argc, char **argv)
     int serial_device_index;
     char qemu_dm_logfilename[64];
     const char *loadvm = NULL;
-    unsigned long nr_pages, *page_array;
+    unsigned long nr_pages;
+    xen_pfn_t *page_array;
     extern void *shared_page;
 
 #if !defined(CONFIG_SOFTMMU)
@@ -3023,8 +3024,8 @@ int main(int argc, char **argv)
 
     xc_handle = xc_interface_open();
 
-    if ( (page_array = (unsigned long *)
-                        malloc(nr_pages * sizeof(unsigned long))) == NULL)
+    if ( (page_array = (xen_pfn_t *)
+                        malloc(nr_pages * sizeof(xen_pfn_t))) == NULL)
     {
         fprintf(logfile, "malloc returned error %d\n", errno);
         exit(-1);
@@ -3079,8 +3080,8 @@ int main(int argc, char **argv)
                                        page_array[0]);
 #endif
 
-    fprintf(logfile, "shared page at pfn:%lx, mfn: %lx\n", (nr_pages-1),
-           (page_array[nr_pages - 1]));
+    fprintf(logfile, "shared page at pfn:%lx, mfn: %"PRIx64"\n", (nr_pages-1),
+           (uint64_t)(page_array[nr_pages - 1]));
 
     /* we always create the cdrom drive, even if no disk is there */
     bdrv_init();
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_core.c
--- a/tools/libxc/xc_core.c     Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_core.c     Tue Jun 06 13:25:31 2006 -0500
@@ -28,7 +28,7 @@ xc_domain_dumpcore_via_callback(int xc_h
                                 dumpcore_rtn_t dump_rtn)
 {
     unsigned long nr_pages;
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     xc_dominfo_t info;
     int i, nr_vcpus = 0;
     char *dump_mem, *dump_mem_start = NULL;
@@ -70,7 +70,7 @@ xc_domain_dumpcore_via_callback(int xc_h
         sizeof(vcpu_guest_context_t)*nr_vcpus;
     dummy_len = (sizeof(struct xc_core_header) +
                  (sizeof(vcpu_guest_context_t) * nr_vcpus) +
-                 (nr_pages * sizeof(unsigned long)));
+                 (nr_pages * sizeof(xen_pfn_t)));
     header.xch_pages_offset = round_pgup(dummy_len);
 
     sts = dump_rtn(args, (char *)&header, sizeof(struct xc_core_header));
@@ -81,7 +81,7 @@ xc_domain_dumpcore_via_callback(int xc_h
     if ( sts != 0 )
         goto error_out;
 
-    if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL )
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
     {
         printf("Could not allocate memory\n");
         goto error_out;
@@ -91,7 +91,7 @@ xc_domain_dumpcore_via_callback(int xc_h
         printf("Could not get the page frame list\n");
         goto error_out;
     }
-    sts = dump_rtn(args, (char *)page_array, nr_pages * sizeof(unsigned long));
+    sts = dump_rtn(args, (char *)page_array, nr_pages * sizeof(xen_pfn_t));
     if ( sts != 0 )
         goto error_out;
 
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_domain.c   Tue Jun 06 13:25:31 2006 -0500
@@ -291,7 +291,7 @@ int xc_domain_memory_increase_reservatio
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
                                           unsigned int address_bits,
-                                          unsigned long *extent_start)
+                                          xen_pfn_t *extent_start)
 {
     int err;
     struct xen_memory_reservation reservation = {
@@ -324,7 +324,7 @@ int xc_domain_memory_decrease_reservatio
                                           uint32_t domid,
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
-                                          unsigned long *extent_start)
+                                          xen_pfn_t *extent_start)
 {
     int err;
     struct xen_memory_reservation reservation = {
@@ -363,7 +363,7 @@ int xc_domain_memory_populate_physmap(in
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
                                           unsigned int address_bits,
-                                          unsigned long *extent_start)
+                                          xen_pfn_t *extent_start)
 {
     int err;
     struct xen_memory_reservation reservation = {
@@ -392,8 +392,8 @@ int xc_domain_translate_gpfn_list(int xc
 int xc_domain_translate_gpfn_list(int xc_handle,
                                   uint32_t domid,
                                   unsigned long nr_gpfns,
-                                  unsigned long *gpfn_list,
-                                  unsigned long *mfn_list)
+                                  xen_pfn_t *gpfn_list,
+                                  xen_pfn_t *mfn_list)
 {
     struct xen_translate_gpfn_list op = {
         .domid        = domid,
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_hvm_build.c        Tue Jun 06 13:25:31 2006 -0500
@@ -135,7 +135,7 @@ static void set_hvm_info_checksum(struct
  * hvmloader will use this info to set BIOS accordingly
  */
 static int set_hvm_info(int xc_handle, uint32_t dom,
-                        unsigned long *pfn_list, unsigned int vcpus,
+                        xen_pfn_t *pfn_list, unsigned int vcpus,
                         unsigned int pae, unsigned int acpi, unsigned int apic)
 {
     char *va_map;
@@ -178,7 +178,7 @@ static int setup_guest(int xc_handle,
                        unsigned int store_evtchn,
                        unsigned long *store_mfn)
 {
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     unsigned long count, i;
     unsigned long long ptr;
     xc_mmu_t *mmu = NULL;
@@ -223,7 +223,7 @@ static int setup_guest(int xc_handle,
         goto error_out;
     }
 
-    if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL )
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
     {
         PERROR("Could not allocate memory.\n");
         goto error_out;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_ia64_stubs.c
--- a/tools/libxc/xc_ia64_stubs.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_ia64_stubs.c       Tue Jun 06 13:25:31 2006 -0500
@@ -57,7 +57,7 @@ xc_plan9_build(int xc_handle,
 
 int xc_ia64_get_pfn_list(int xc_handle,
                          uint32_t domid,
-                         unsigned long *pfn_buf,
+                         xen_pfn_t *pfn_buf,
                          unsigned int start_page,
                          unsigned int nr_pages)
 {
@@ -65,7 +65,7 @@ int xc_ia64_get_pfn_list(int xc_handle,
     int num_pfns,ret;
     unsigned int __start_page, __nr_pages;
     unsigned long max_pfns;
-    unsigned long *__pfn_buf;
+    xen_pfn_t *__pfn_buf;
 
     __start_page = start_page;
     __nr_pages = nr_pages;
@@ -80,7 +80,7 @@ int xc_ia64_get_pfn_list(int xc_handle,
         set_xen_guest_handle(op.u.getmemlist.buffer, __pfn_buf);
 
         if ( (max_pfns != -1UL)
-            && mlock(__pfn_buf, __nr_pages * sizeof(unsigned long)) != 0 )
+            && mlock(__pfn_buf, __nr_pages * sizeof(xen_pfn_t)) != 0 )
         {
             PERROR("Could not lock pfn list buffer");
             return -1;
@@ -89,7 +89,7 @@ int xc_ia64_get_pfn_list(int xc_handle,
         ret = do_dom0_op(xc_handle, &op);
 
         if (max_pfns != -1UL)
-            (void)munlock(__pfn_buf, __nr_pages * sizeof(unsigned long));
+            (void)munlock(__pfn_buf, __nr_pages * sizeof(xen_pfn_t));
 
         if (max_pfns == -1UL)
             return 0;
@@ -122,10 +122,10 @@ int xc_ia64_copy_to_domain_pages(int xc_
 {
     // N.B. gva should be page aligned
 
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     int i;
 
-    if ( (page_array = malloc(nr_pages * sizeof(unsigned long))) == NULL ){
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL ){
         PERROR("Could not allocate memory");
         goto error_out;
     }
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_linux.c
--- a/tools/libxc/xc_linux.c    Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_linux.c    Tue Jun 06 13:25:31 2006 -0500
@@ -28,7 +28,7 @@ int xc_interface_close(int xc_handle)
 }
 
 void *xc_map_foreign_batch(int xc_handle, uint32_t dom, int prot,
-                           unsigned long *arr, int num)
+                           xen_pfn_t *arr, int num)
 {
     privcmd_mmapbatch_t ioctlx;
     void *addr;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c      Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_linux_build.c      Tue Jun 06 13:25:31 2006 -0500
@@ -10,6 +10,7 @@
 #include "xc_aout9.h"
 #include <stdlib.h>
 #include <unistd.h>
+#include <inttypes.h>
 #include <zlib.h>
 
 #if defined(__i386__)
@@ -136,7 +137,7 @@ int load_initrd(int xc_handle, domid_t d
 int load_initrd(int xc_handle, domid_t dom,
                 struct initrd_info *initrd,
                 unsigned long physbase,
-                unsigned long *phys_to_mach)
+                xen_pfn_t *phys_to_mach)
 {
     char page[PAGE_SIZE];
     unsigned long pfn_start, pfn, nr_pages;
@@ -189,7 +190,7 @@ static int setup_pg_tables(int xc_handle
                            vcpu_guest_context_t *ctxt,
                            unsigned long dsi_v_start,
                            unsigned long v_end,
-                           unsigned long *page_array,
+                           xen_pfn_t *page_array,
                            unsigned long vpt_start,
                            unsigned long vpt_end,
                            unsigned shadow_mode_enabled)
@@ -251,19 +252,35 @@ static int setup_pg_tables_pae(int xc_ha
                                vcpu_guest_context_t *ctxt,
                                unsigned long dsi_v_start,
                                unsigned long v_end,
-                               unsigned long *page_array,
+                               xen_pfn_t *page_array,
                                unsigned long vpt_start,
                                unsigned long vpt_end,
-                               unsigned shadow_mode_enabled)
+                               unsigned shadow_mode_enabled,
+                               unsigned pae_mode)
 {
     l1_pgentry_64_t *vl1tab = NULL, *vl1e = NULL;
     l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL;
     l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL;
     uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab;
-    unsigned long ppt_alloc, count;
+    unsigned long ppt_alloc, count, nmfn;
 
     /* First allocate page for page dir. */
     ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT;
+
+    if ( pae_mode == PAEKERN_extended_cr3 )
+    {
+        ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3);
+    }
+    else if ( page_array[ppt_alloc] > 0xfffff )
+    {
+        nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]);
+        if ( nmfn == 0 )
+        {
+            fprintf(stderr, "Couldn't get a page below 4GB :-(\n");
+            goto error_out;
+        }
+        page_array[ppt_alloc] = nmfn;
+    }
 
     alloc_pt(l3tab, vl3tab, pl3tab);
     vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)];
@@ -340,7 +357,7 @@ static int setup_pg_tables_64(int xc_han
                               vcpu_guest_context_t *ctxt,
                               unsigned long dsi_v_start,
                               unsigned long v_end,
-                              unsigned long *page_array,
+                              xen_pfn_t *page_array,
                               unsigned long vpt_start,
                               unsigned long vpt_end,
                               int shadow_mode_enabled)
@@ -451,7 +468,7 @@ static int setup_guest(int xc_handle,
                        unsigned int console_evtchn, unsigned long *console_mfn,
                        uint32_t required_features[XENFEAT_NR_SUBMAPS])
 {
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     struct load_funcs load_funcs;
     struct domain_setup_info dsi;
     unsigned long vinitrd_start;
@@ -478,7 +495,7 @@ static int setup_guest(int xc_handle,
 
     start_page = dsi.v_start >> PAGE_SHIFT;
     pgnr = (v_end - dsi.v_start) >> PAGE_SHIFT;
-    if ( (page_array = malloc(pgnr * sizeof(unsigned long))) == NULL )
+    if ( (page_array = malloc(pgnr * sizeof(xen_pfn_t))) == NULL )
     {
         PERROR("Could not allocate memory");
         goto error_out;
@@ -579,11 +596,11 @@ static int compat_check(int xc_handle, s
     }
 
     if (strstr(xen_caps, "xen-3.0-x86_32p")) {
-        if (!dsi->pae_kernel) {
+        if (dsi->pae_kernel == PAEKERN_no) {
             ERROR("Non PAE-kernel on PAE host.");
             return 0;
         }
-    } else if (dsi->pae_kernel) {
+    } else if (dsi->pae_kernel != PAEKERN_no) {
         ERROR("PAE-kernel on non-PAE host.");
         return 0;
     }
@@ -606,7 +623,7 @@ static int setup_guest(int xc_handle,
                        unsigned int console_evtchn, unsigned long *console_mfn,
                        uint32_t required_features[XENFEAT_NR_SUBMAPS])
 {
-    unsigned long *page_array = NULL;
+    xen_pfn_t *page_array = NULL;
     unsigned long count, i, hypercall_pfn;
     start_info_t *start_info;
     shared_info_t *shared_info;
@@ -617,7 +634,7 @@ static int setup_guest(int xc_handle,
 
     unsigned long nr_pt_pages;
     unsigned long physmap_pfn;
-    unsigned long *physmap, *physmap_e;
+    xen_pfn_t *physmap, *physmap_e;
 
     struct load_funcs load_funcs;
     struct domain_setup_info dsi;
@@ -673,7 +690,8 @@ static int setup_guest(int xc_handle,
 
     for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ )
     {
-        if ( (supported_features[i]&required_features[i]) != required_features[i] )
+        if ( (supported_features[i] & required_features[i]) !=
+             required_features[i] )
         {
             ERROR("Guest kernel does not support a required feature.");
             goto error_out;
@@ -719,7 +737,7 @@ static int setup_guest(int xc_handle,
     (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
     ((_l) & ~((1UL<<(_s))-1))) >> (_s))
 #if defined(__i386__)
-        if ( dsi.pae_kernel )
+        if ( dsi.pae_kernel != PAEKERN_no )
         {
             if ( (1 + /* # L3 */
                   NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT_PAE) + /* # L2 */
@@ -797,11 +815,11 @@ static int setup_guest(int xc_handle,
 
     /* setup page tables */
 #if defined(__i386__)
-    if (dsi.pae_kernel)
+    if (dsi.pae_kernel != PAEKERN_no)
         rc = setup_pg_tables_pae(xc_handle, dom, ctxt,
                                  dsi.v_start, v_end,
                                  page_array, vpt_start, vpt_end,
-                                 shadow_mode_enabled);
+                                 shadow_mode_enabled, dsi.pae_kernel);
     else
         rc = setup_pg_tables(xc_handle, dom, ctxt,
                              dsi.v_start, v_end,
@@ -824,7 +842,7 @@ static int setup_guest(int xc_handle,
      */
     if ( !shadow_mode_enabled )
     {
-        if ( dsi.pae_kernel )
+        if ( dsi.pae_kernel != PAEKERN_no )
         {
             if ( pin_table(xc_handle, MMUEXT_PIN_L3_TABLE,
                            xen_cr3_to_pfn(ctxt->ctrlreg[3]), dom) )
@@ -865,8 +883,8 @@ static int setup_guest(int xc_handle,
             ((uint64_t)page_array[count] << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
             count) )
         {
-            fprintf(stderr,"m2p update failure p=%lx m=%lx\n",
-                    count, page_array[count]);
+            fprintf(stderr,"m2p update failure p=%lx m=%"PRIx64"\n",
+                    count, (uint64_t)page_array[count]);
             munmap(physmap, PAGE_SIZE);
             goto error_out;
         }
@@ -958,7 +976,7 @@ static int setup_guest(int xc_handle,
     rc = xc_version(xc_handle, XENVER_version, NULL);
     sprintf(start_info->magic, "xen-%i.%i-x86_%d%s",
             rc >> 16, rc & (0xFFFF), (unsigned int)sizeof(long)*8,
-            dsi.pae_kernel ? "p" : "");
+            (dsi.pae_kernel != PAEKERN_no) ? "p" : "");
     start_info->nr_pages     = nr_pages;
     start_info->shared_info  = guest_shared_info_mfn << PAGE_SHIFT;
     start_info->flags        = flags;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_linux_restore.c
--- a/tools/libxc/xc_linux_restore.c    Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_linux_restore.c    Tue Jun 06 13:25:31 2006 -0500
@@ -25,10 +25,10 @@ static unsigned long max_pfn;
 static unsigned long max_pfn;
 
 /* Live mapping of the table mapping each PFN to its current MFN. */
-static unsigned long *live_p2m = NULL;
+static xen_pfn_t *live_p2m = NULL;
 
 /* A table mapping each PFN to its new MFN. */
-static unsigned long *p2m = NULL;
+static xen_pfn_t *p2m = NULL;
 
 
 static ssize_t
@@ -108,7 +108,7 @@ int xc_linux_restore(int xc_handle, int 
                      unsigned int console_evtchn, unsigned long *console_mfn)
 {
     DECLARE_DOM0_OP;
-    int rc = 1, i, n;
+    int rc = 1, i, n, pae_extended_cr3 = 0;
     unsigned long mfn, pfn;
     unsigned int prev_pc, this_pc;
     int verify = 0;
@@ -126,7 +126,7 @@ int xc_linux_restore(int xc_handle, int 
     unsigned long *pfn_type = NULL;
 
     /* A table of MFNs to map in the current region */
-    unsigned long *region_mfn = NULL;
+    xen_pfn_t *region_mfn = NULL;
 
     /* Types of the pfns in the current region */
     unsigned long region_pfn_type[MAX_BATCH_SIZE];
@@ -135,7 +135,7 @@ int xc_linux_restore(int xc_handle, int 
     unsigned long *page = NULL;
 
     /* A copy of the pfn-to-mfn table frame list. */
-    unsigned long *p2m_frame_list = NULL;
+    xen_pfn_t *p2m_frame_list = NULL;
 
     /* A temporary mapping of the guest's start_info page. */
     start_info_t *start_info;
@@ -162,30 +162,88 @@ int xc_linux_restore(int xc_handle, int 
         return 1;
     }
 
-
     if (mlock(&ctxt, sizeof(ctxt))) {
         /* needed for build dom0 op, but might as well do early */
         ERR("Unable to mlock ctxt");
         return 1;
     }
 
-
-    /* Read the saved P2M frame list */
-    if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
+    if (!(p2m_frame_list = malloc(P2M_FL_SIZE))) {
         ERR("Couldn't allocate p2m_frame_list array");
         goto out;
     }
 
-    if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
+    /* Read first entry of P2M list, or extended-info signature (~0UL). */
+    if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
+        ERR("read extended-info signature failed");
+        goto out;
+    }
+
+    if (p2m_frame_list[0] == ~0UL) {
+        uint32_t tot_bytes;
+
+        /* Next 4 bytes: total size of following extended info. */
+        if (!read_exact(io_fd, &tot_bytes, sizeof(tot_bytes))) {
+            ERR("read extended-info size failed");
+            goto out;
+        }
+
+        while (tot_bytes) {
+            uint32_t chunk_bytes;
+            char     chunk_sig[4];
+
+            /* 4-character chunk signature + 4-byte remaining chunk size. */
+            if (!read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) ||
+                !read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes))) {
+                ERR("read extended-info chunk signature failed");
+                goto out;
+            }
+            tot_bytes -= 8;
+
+            /* VCPU context structure? */
+            if (!strncmp(chunk_sig, "vcpu", 4)) {
+                if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) {
+                    ERR("read extended-info vcpu context failed");
+                    goto out;
+                }
+                tot_bytes   -= sizeof(struct vcpu_guest_context);
+                chunk_bytes -= sizeof(struct vcpu_guest_context);
+
+                if (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))
+                    pae_extended_cr3 = 1;
+            }
+
+            /* Any remaining bytes of this chunk: read and discard. */
+            while (chunk_bytes) {
+                unsigned long sz = chunk_bytes;
+                if ( sz > P2M_FL_SIZE )
+                    sz = P2M_FL_SIZE;
+                if (!read_exact(io_fd, p2m_frame_list, sz)) {
+                    ERR("read-and-discard extended-info chunk bytes failed");
+                    goto out;
+                }
+                chunk_bytes -= sz;
+                tot_bytes   -= sz;
+            }
+        }
+
+        /* Now read the real first entry of P2M list. */
+        if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) {
+            ERR("read first entry of p2m_frame_list failed");
+            goto out;
+        }
+    }
+
+    /* First entry is already read into the p2m array. */
+    if (!read_exact(io_fd, &p2m_frame_list[1], P2M_FL_SIZE - sizeof(long))) {
         ERR("read p2m_frame_list failed");
         goto out;
     }
 
-
     /* We want zeroed memory so use calloc rather than malloc. */
-    p2m        = calloc(max_pfn, sizeof(unsigned long));
+    p2m        = calloc(max_pfn, sizeof(xen_pfn_t));
     pfn_type   = calloc(max_pfn, sizeof(unsigned long));
-    region_mfn = calloc(MAX_BATCH_SIZE, sizeof(unsigned long));
+    region_mfn = calloc(MAX_BATCH_SIZE, sizeof(xen_pfn_t));
 
     if ((p2m == NULL) || (pfn_type == NULL) || (region_mfn == NULL)) {
         ERR("memory alloc failed");
@@ -193,7 +251,7 @@ int xc_linux_restore(int xc_handle, int 
         goto out;
     }
 
-    if (mlock(region_mfn, sizeof(unsigned long) * MAX_BATCH_SIZE)) {
+    if (mlock(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE)) {
         ERR("Could not mlock region_mfn");
         goto out;
     }
@@ -331,17 +389,27 @@ int xc_linux_restore(int xc_handle, int 
                 ** A page table page - need to 'uncanonicalize' it, i.e.
                 ** replace all the references to pfns with the corresponding
                 ** mfns for the new domain.
+                **
+                ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
+                ** so we may need to update the p2m after the main loop.
+                ** Hence we defer canonicalization of L1s until then.
                 */
-                if(!uncanonicalize_pagetable(pagetype, page)) {
-                    /*
-                    ** Failing to uncanonicalize a page table can be ok
-                    ** under live migration since the pages type may have
-                    ** changed by now (and we'll get an update later).
-                    */
-                    DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
-                            pagetype >> 28, pfn, mfn);
-                    nraces++;
-                    continue;
+                if ((pt_levels != 3) ||
+                    pae_extended_cr3 ||
+                    (pagetype != L1TAB)) {
+
+                    if (!uncanonicalize_pagetable(pagetype, page)) {
+                        /*
+                        ** Failing to uncanonicalize a page table can be ok
+                        ** under live migration since the pages type may have
+                        ** changed by now (and we'll get an update later).
+                        */
+                        DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+                                pagetype >> 28, pfn, mfn);
+                        nraces++;
+                        continue;
+                    }
+
                 }
 
             } else if(pagetype != NOTAB) {
@@ -389,6 +457,100 @@ int xc_linux_restore(int xc_handle, int 
     }
 
     DPRINTF("Received all pages (%d races)\n", nraces);
+
+    if ((pt_levels == 3) && !pae_extended_cr3) {
+
+        /*
+        ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This
+        ** is a little awkward and involves (a) finding all such PGDs and
+        ** replacing them with 'lowmem' versions; (b) updating the p2m[]
+        ** with the new info; and (c) canonicalizing all the L1s using the
+        ** (potentially updated) p2m[].
+        **
+        ** This is relatively slow (and currently involves two passes through
+        ** the pfn_type[] array), but at least seems to be correct. May wish
+        ** to consider more complex approaches to optimize this later.
+        */
+
+        int j, k;
+
+        /* First pass: find all L3TABs currently in > 4G mfns and get new mfns */
+        for (i = 0; i < max_pfn; i++) {
+
+            if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) {
+
+                unsigned long new_mfn;
+                uint64_t l3ptes[4];
+                uint64_t *l3tab;
+
+                l3tab = (uint64_t *)
+                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                         PROT_READ, p2m[i]);
+
+                for(j = 0; j < 4; j++)
+                    l3ptes[j] = l3tab[j];
+
+                munmap(l3tab, PAGE_SIZE);
+
+                if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) {
+                    ERR("Couldn't get a page below 4GB :-(");
+                    goto out;
+                }
+
+                p2m[i] = new_mfn;
+                if (xc_add_mmu_update(xc_handle, mmu,
+                                      (((unsigned long long)new_mfn)
+                                       << PAGE_SHIFT) |
+                                      MMU_MACHPHYS_UPDATE, i)) {
+                    ERR("Couldn't m2p on PAE root pgdir");
+                    goto out;
+                }
+
+                l3tab = (uint64_t *)
+                    xc_map_foreign_range(xc_handle, dom, PAGE_SIZE,
+                                         PROT_READ | PROT_WRITE, p2m[i]);
+
+                for(j = 0; j < 4; j++)
+                    l3tab[j] = l3ptes[j];
+
+                munmap(l3tab, PAGE_SIZE);
+
+            }
+        }
+
+        /* Second pass: find all L1TABs and uncanonicalize them */
+        j = 0;
+
+        for(i = 0; i < max_pfn; i++) {
+
+            if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) {
+                region_mfn[j] = p2m[i];
+                j++;
+            }
+
+            if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) {
+
+                if (!(region_base = xc_map_foreign_batch(
+                          xc_handle, dom, PROT_READ | PROT_WRITE,
+                          region_mfn, j))) {
+                    ERR("map batch failed");
+                    goto out;
+                }
+
+                for(k = 0; k < j; k++) {
+                    if(!uncanonicalize_pagetable(L1TAB,
+                                                 region_base + k*PAGE_SIZE)) {
+                        ERR("failed uncanonicalize pt!");
+                        goto out;
+                    }
+                }
+
+                munmap(region_base, j*PAGE_SIZE);
+                j = 0;
+            }
+        }
+
+    }
 
 
     if (xc_finish_mmu_updates(xc_handle, mmu)) {
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_linux_save.c
--- a/tools/libxc/xc_linux_save.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_linux_save.c       Tue Jun 06 13:25:31 2006 -0500
@@ -40,10 +40,10 @@ static unsigned long max_pfn;
 static unsigned long max_pfn;
 
 /* Live mapping of the table mapping each PFN to its current MFN. */
-static unsigned long *live_p2m = NULL;
+static xen_pfn_t *live_p2m = NULL;
 
 /* Live mapping of system MFN to PFN table. */
-static unsigned long *live_m2p = NULL;
+static xen_pfn_t *live_m2p = NULL;
 
 /* grep fodder: machine_to_phys */
 
@@ -501,22 +501,22 @@ void canonicalize_pagetable(unsigned lon
 
 
 
-static unsigned long *xc_map_m2p(int xc_handle,
+static xen_pfn_t *xc_map_m2p(int xc_handle,
                                  unsigned long max_mfn,
                                  int prot)
 {
     struct xen_machphys_mfn_list xmml;
     privcmd_mmap_entry_t *entries;
     unsigned long m2p_chunks, m2p_size;
-    unsigned long *m2p;
-    unsigned long *extent_start;
+    xen_pfn_t *m2p;
+    xen_pfn_t *extent_start;
     int i, rc;
 
     m2p_size   = M2P_SIZE(max_mfn);
     m2p_chunks = M2P_CHUNKS(max_mfn);
 
     xmml.max_extents = m2p_chunks;
-    if (!(extent_start = malloc(m2p_chunks * sizeof(unsigned long)))) {
+    if (!(extent_start = malloc(m2p_chunks * sizeof(xen_pfn_t)))) {
         ERR("failed to allocate space for m2p mfns");
         return NULL;
     }
@@ -583,11 +583,11 @@ int xc_linux_save(int xc_handle, int io_
     char page[PAGE_SIZE];
 
     /* Double and single indirect references to the live P2M table */
-    unsigned long *live_p2m_frame_list_list = NULL;
-    unsigned long *live_p2m_frame_list = NULL;
+    xen_pfn_t *live_p2m_frame_list_list = NULL;
+    xen_pfn_t *live_p2m_frame_list = NULL;
 
     /* A copy of the pfn-to-mfn table frame list. */
-    unsigned long *p2m_frame_list = NULL;
+    xen_pfn_t *p2m_frame_list = NULL;
 
     /* Live mapping of shared info structure */
     shared_info_t *live_shinfo = NULL;
@@ -712,11 +712,11 @@ int xc_linux_save(int xc_handle, int io_
     memcpy(p2m_frame_list, live_p2m_frame_list, P2M_FL_SIZE);
 
     /* Canonicalise the pfn-to-mfn table frame-number list. */
-    for (i = 0; i < max_pfn; i += ulpp) {
-        if (!translate_mfn_to_pfn(&p2m_frame_list[i/ulpp])) {
+    for (i = 0; i < max_pfn; i += fpp) {
+        if (!translate_mfn_to_pfn(&p2m_frame_list[i/fpp])) {
             ERR("Frame# in pfn-to-mfn frame list is not in pseudophys");
-            ERR("entry %d: p2m_frame_list[%ld] is 0x%lx", i, i/ulpp,
-                p2m_frame_list[i/ulpp]);
+            ERR("entry %d: p2m_frame_list[%ld] is 0x%"PRIx64, i, i/fpp,
+                (uint64_t)p2m_frame_list[i/fpp]);
             goto out;
         }
     }
@@ -818,12 +818,33 @@ int xc_linux_save(int xc_handle, int io_
 
     /* Start writing out the saved-domain record. */
 
-    if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
+    if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) {
         ERR("write: max_pfn");
         goto out;
     }
 
-    if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
+    /*
+     * Write an extended-info structure to inform the restore code that
+     * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off
+     * slow paths in the restore code.
+     */
+    if ((pt_levels == 3) &&
+        (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))) {
+        unsigned long signature = ~0UL;
+        uint32_t tot_sz   = sizeof(struct vcpu_guest_context) + 8;
+        uint32_t chunk_sz = sizeof(struct vcpu_guest_context);
+        char chunk_sig[]  = "vcpu";
+        if (!write_exact(io_fd, &signature, sizeof(signature)) ||
+            !write_exact(io_fd, &tot_sz,    sizeof(tot_sz)) ||
+            !write_exact(io_fd, &chunk_sig, 4) ||
+            !write_exact(io_fd, &chunk_sz,  sizeof(chunk_sz)) ||
+            !write_exact(io_fd, &ctxt,      sizeof(ctxt))) {
+            ERR("write: extended info");
+            goto out;
+        }
+    }
+
+    if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) {
         ERR("write: p2m_frame_list");
         goto out;
     }
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_load_aout9.c
--- a/tools/libxc/xc_load_aout9.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_load_aout9.c       Tue Jun 06 13:25:31 2006 -0500
@@ -17,7 +17,7 @@
 #define KOFFSET(_p)       ((_p)&~KZERO)
 
 static int parseaout9image(const char *, unsigned long, struct domain_setup_info *);
-static int loadaout9image(const char *, unsigned long, int, uint32_t, unsigned long *, struct domain_setup_info *);
+static int loadaout9image(const char *, unsigned long, int, uint32_t, xen_pfn_t *, struct domain_setup_info *);
 static void copyout(int, uint32_t, unsigned long *, unsigned long, const char *, int);
 struct Exec *get_header(const char *, unsigned long, struct Exec *);
 
@@ -79,7 +79,7 @@ loadaout9image(
     const char *image,
     unsigned long image_size,
     int xch, uint32_t dom,
-    unsigned long *parray,
+    xen_pfn_t *parray,
     struct domain_setup_info *dsi)
 {
     struct Exec ehdr;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_load_bin.c
--- a/tools/libxc/xc_load_bin.c Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_load_bin.c Tue Jun 06 13:25:31 2006 -0500
@@ -107,7 +107,7 @@ static int
 static int
 loadbinimage(
     const char *image, unsigned long image_size, int xch, uint32_t dom,
-    unsigned long *parray, struct domain_setup_info *dsi);
+    xen_pfn_t *parray, struct domain_setup_info *dsi);
 
 int probe_bin(const char *image,
               unsigned long image_size,
@@ -235,7 +235,7 @@ static int
 static int
 loadbinimage(
     const char *image, unsigned long image_size, int xch, uint32_t dom,
-    unsigned long *parray, struct domain_setup_info *dsi)
+    xen_pfn_t *parray, struct domain_setup_info *dsi)
 {
     unsigned long size;
     char         *va;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_load_elf.c
--- a/tools/libxc/xc_load_elf.c Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_load_elf.c Tue Jun 06 13:25:31 2006 -0500
@@ -17,10 +17,10 @@ static int
 static int
 loadelfimage(
     const char *image, unsigned long image_size, int xch, uint32_t dom,
-    unsigned long *parray, struct domain_setup_info *dsi);
+    xen_pfn_t *parray, struct domain_setup_info *dsi);
 static int
 loadelfsymtab(
-    const char *image, int xch, uint32_t dom, unsigned long *parray,
+    const char *image, int xch, uint32_t dom, xen_pfn_t *parray,
     struct domain_setup_info *dsi);
 
 int probe_elf(const char *image,
@@ -138,8 +138,15 @@ static int parseelfimage(const char *ima
             ERROR("Actually saw: '%s'", guestinfo);
             return -EINVAL;
         }
-        if ( (strstr(guestinfo, "PAE=yes") != NULL) )
-            dsi->pae_kernel = 1;
+
+        dsi->pae_kernel = PAEKERN_no;
+        p = strstr(guestinfo, "PAE=yes");
+        if ( p != NULL )
+        {
+            dsi->pae_kernel = PAEKERN_yes;
+            if ( !strncmp(p+7, "[extended-cr3]", 14) )
+                dsi->pae_kernel = PAEKERN_extended_cr3;
+        }
 
         break;
     }
@@ -220,7 +227,7 @@ static int
 static int
 loadelfimage(
     const char *image, unsigned long elfsize, int xch, uint32_t dom,
-    unsigned long *parray, struct domain_setup_info *dsi)
+    xen_pfn_t *parray, struct domain_setup_info *dsi)
 {
     Elf_Ehdr *ehdr = (Elf_Ehdr *)image;
     Elf_Phdr *phdr;
@@ -274,7 +281,7 @@ loadelfimage(
 
 static int
 loadelfsymtab(
-    const char *image, int xch, uint32_t dom, unsigned long *parray,
+    const char *image, int xch, uint32_t dom, xen_pfn_t *parray,
     struct domain_setup_info *dsi)
 {
     Elf_Ehdr *ehdr = (Elf_Ehdr *)image, *sym_ehdr;
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xc_private.c
--- a/tools/libxc/xc_private.c  Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xc_private.c  Tue Jun 06 13:25:31 2006 -0500
@@ -4,6 +4,7 @@
  * Helper functions for the rest of the library.
  */
 
+#include <inttypes.h>
 #include "xc_private.h"
 
 /* NB: arr must be mlock'ed */
@@ -134,9 +135,9 @@ int xc_memory_op(int xc_handle,
     struct xen_memory_reservation *reservation = arg;
     struct xen_machphys_mfn_list *xmml = arg;
     struct xen_translate_gpfn_list *trans = arg;
-    unsigned long *extent_start;
-    unsigned long *gpfn_list;
-    unsigned long *mfn_list;
+    xen_pfn_t *extent_start;
+    xen_pfn_t *gpfn_list;
+    xen_pfn_t *mfn_list;
     long ret = -EINVAL;
 
     hypercall.op     = __HYPERVISOR_memory_op;
@@ -156,7 +157,7 @@ int xc_memory_op(int xc_handle,
         get_xen_guest_handle(extent_start, reservation->extent_start);
         if ( (extent_start != NULL) &&
              (mlock(extent_start,
-                    reservation->nr_extents * sizeof(unsigned long)) != 0) )
+                    reservation->nr_extents * sizeof(xen_pfn_t)) != 0) )
         {
             PERROR("Could not mlock");
             safe_munlock(reservation, sizeof(*reservation));
@@ -171,7 +172,7 @@ int xc_memory_op(int xc_handle,
         }
         get_xen_guest_handle(extent_start, xmml->extent_start);
         if ( mlock(extent_start,
-                   xmml->max_extents * sizeof(unsigned long)) != 0 )
+                   xmml->max_extents * sizeof(xen_pfn_t)) != 0 )
         {
             PERROR("Could not mlock");
             safe_munlock(xmml, sizeof(*xmml));
@@ -192,17 +193,17 @@ int xc_memory_op(int xc_handle,
             goto out1;
         }
         get_xen_guest_handle(gpfn_list, trans->gpfn_list);
-        if ( mlock(gpfn_list, trans->nr_gpfns * sizeof(long)) != 0 )
+        if ( mlock(gpfn_list, trans->nr_gpfns * sizeof(xen_pfn_t)) != 0 )
         {
             PERROR("Could not mlock");
             safe_munlock(trans, sizeof(*trans));
             goto out1;
         }
         get_xen_guest_handle(mfn_list, trans->mfn_list);
-        if ( mlock(mfn_list, trans->nr_gpfns * sizeof(long)) != 0 )
-        {
-            PERROR("Could not mlock");
-            safe_munlock(gpfn_list, trans->nr_gpfns * sizeof(long));
+        if ( mlock(mfn_list, trans->nr_gpfns * sizeof(xen_pfn_t)) != 0 )
+        {
+            PERROR("Could not mlock");
+            safe_munlock(gpfn_list, trans->nr_gpfns * sizeof(xen_pfn_t));
             safe_munlock(trans, sizeof(*trans));
             goto out1;
         }
@@ -220,22 +221,22 @@ int xc_memory_op(int xc_handle,
         get_xen_guest_handle(extent_start, reservation->extent_start);
         if ( extent_start != NULL )
             safe_munlock(extent_start,
-                         reservation->nr_extents * sizeof(unsigned long));
+                         reservation->nr_extents * sizeof(xen_pfn_t));
         break;
     case XENMEM_machphys_mfn_list:
         safe_munlock(xmml, sizeof(*xmml));
         get_xen_guest_handle(extent_start, xmml->extent_start);
         safe_munlock(extent_start,
-                     xmml->max_extents * sizeof(unsigned long));
+                     xmml->max_extents * sizeof(xen_pfn_t));
         break;
     case XENMEM_add_to_physmap:
         safe_munlock(arg, sizeof(struct xen_add_to_physmap));
         break;
     case XENMEM_translate_gpfn_list:
             get_xen_guest_handle(mfn_list, trans->mfn_list);
-            safe_munlock(mfn_list, trans->nr_gpfns * sizeof(long));
+            safe_munlock(mfn_list, trans->nr_gpfns * sizeof(xen_pfn_t));
             get_xen_guest_handle(gpfn_list, trans->gpfn_list);
-            safe_munlock(gpfn_list, trans->nr_gpfns * sizeof(long));
+            safe_munlock(gpfn_list, trans->nr_gpfns * sizeof(xen_pfn_t));
             safe_munlock(trans, sizeof(*trans));
         break;
     }
@@ -263,7 +264,7 @@ long long xc_domain_get_cpu_usage( int x
 
 int xc_get_pfn_list(int xc_handle,
                     uint32_t domid,
-                    unsigned long *pfn_buf,
+                    xen_pfn_t *pfn_buf,
                     unsigned long max_pfns)
 {
     DECLARE_DOM0_OP;
@@ -274,10 +275,10 @@ int xc_get_pfn_list(int xc_handle,
     set_xen_guest_handle(op.u.getmemlist.buffer, pfn_buf);
 
 #ifdef VALGRIND
-    memset(pfn_buf, 0, max_pfns * sizeof(unsigned long));
+    memset(pfn_buf, 0, max_pfns * sizeof(xen_pfn_t));
 #endif
 
-    if ( mlock(pfn_buf, max_pfns * sizeof(unsigned long)) != 0 )
+    if ( mlock(pfn_buf, max_pfns * sizeof(xen_pfn_t)) != 0 )
     {
         PERROR("xc_get_pfn_list: pfn_buf mlock failed");
         return -1;
@@ -285,7 +286,7 @@ int xc_get_pfn_list(int xc_handle,
 
     ret = do_dom0_op(xc_handle, &op);
 
-    safe_munlock(pfn_buf, max_pfns * sizeof(unsigned long));
+    safe_munlock(pfn_buf, max_pfns * sizeof(xen_pfn_t));
 
 #if 0
 #ifdef DEBUG
@@ -364,7 +365,7 @@ unsigned long xc_get_filesz(int fd)
 }
 
 void xc_map_memcpy(unsigned long dst, const char *src, unsigned long size,
-                   int xch, uint32_t dom, unsigned long *parray,
+                   int xch, uint32_t dom, xen_pfn_t *parray,
                    unsigned long vstart)
 {
     char *va;
@@ -428,6 +429,29 @@ int xc_version(int xc_handle, int cmd, v
         safe_munlock(arg, argsize);
 
     return rc;
+}
+
+unsigned long xc_make_page_below_4G(
+    int xc_handle, uint32_t domid, unsigned long mfn)
+{
+    xen_pfn_t old_mfn = mfn;
+    xen_pfn_t new_mfn;
+
+    if ( xc_domain_memory_decrease_reservation(
+        xc_handle, domid, 1, 0, &old_mfn) != 0 )
+    {
+        fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn);
+        return 0;
+    }
+
+    if ( xc_domain_memory_increase_reservation(
+        xc_handle, domid, 1, 0, 32, &new_mfn) != 0 )
+    {
+        fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn);
+        return 0;
+    }
+
+    return new_mfn;
 }
 
 /*
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xenctrl.h     Tue Jun 06 13:25:31 2006 -0500
@@ -420,26 +420,26 @@ int xc_domain_memory_increase_reservatio
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
                                           unsigned int address_bits,
-                                          unsigned long *extent_start);
+                                          xen_pfn_t *extent_start);
 
 int xc_domain_memory_decrease_reservation(int xc_handle,
                                           uint32_t domid,
                                           unsigned long nr_extents,
                                           unsigned int extent_order,
-                                          unsigned long *extent_start);
+                                          xen_pfn_t *extent_start);
 
 int xc_domain_memory_populate_physmap(int xc_handle,
                                       uint32_t domid,
                                       unsigned long nr_extents,
                                       unsigned int extent_order,
                                       unsigned int address_bits,
-                                      unsigned long *extent_start);
+                                      xen_pfn_t *extent_start);
 
 int xc_domain_translate_gpfn_list(int xc_handle,
                                   uint32_t domid,
                                   unsigned long nr_gpfns,
-                                  unsigned long *gpfn_list,
-                                  unsigned long *mfn_list);
+                                  xen_pfn_t *gpfn_list,
+                                  xen_pfn_t *mfn_list);
 
 int xc_domain_ioport_permission(int xc_handle,
                                 uint32_t domid,
@@ -458,6 +458,9 @@ int xc_domain_iomem_permission(int xc_ha
                                unsigned long nr_mfns,
                                uint8_t allow_access);
 
+unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid,
+                                    unsigned long mfn);
+
 typedef dom0_perfc_desc_t xc_perfc_desc_t;
 /* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */
 int xc_perfc_control(int xc_handle,
@@ -489,7 +492,7 @@ void *xc_map_foreign_range(int xc_handle
                             unsigned long mfn );
 
 void *xc_map_foreign_batch(int xc_handle, uint32_t dom, int prot,
-                           unsigned long *arr, int num );
+                           xen_pfn_t *arr, int num );
 
 /**
  * Translates a virtual address in the context of a given domain and
@@ -504,11 +507,11 @@ unsigned long xc_translate_foreign_addre
 unsigned long xc_translate_foreign_address(int xc_handle, uint32_t dom,
                                            int vcpu, unsigned long long virt);
 
-int xc_get_pfn_list(int xc_handle, uint32_t domid, unsigned long *pfn_buf,
+int xc_get_pfn_list(int xc_handle, uint32_t domid, xen_pfn_t *pfn_buf,
                     unsigned long max_pfns);
 
 int xc_ia64_get_pfn_list(int xc_handle, uint32_t domid,
-                         unsigned long *pfn_buf,
+                         xen_pfn_t *pfn_buf,
                          unsigned int start_page, unsigned int nr_pages);
 
 int xc_copy_to_domain_page(int xc_handle, uint32_t domid,
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xg_private.h
--- a/tools/libxc/xg_private.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xg_private.h  Tue Jun 06 13:25:31 2006 -0500
@@ -156,6 +156,9 @@ struct domain_setup_info
 
     unsigned long elf_paddr_offset;
 
+#define PAEKERN_no           0
+#define PAEKERN_yes          1
+#define PAEKERN_extended_cr3 2
     unsigned int  pae_kernel;
 
     unsigned int  load_symtab;
@@ -170,7 +173,7 @@ typedef int (*parseimagefunc)(const char
                               struct domain_setup_info *dsi);
 typedef int (*loadimagefunc)(const char *image, unsigned long image_size,
                              int xch,
-                             uint32_t dom, unsigned long *parray,
+                             uint32_t dom, xen_pfn_t *parray,
                              struct domain_setup_info *dsi);
 
 struct load_funcs
@@ -198,7 +201,7 @@ unsigned long xc_get_filesz(int fd);
 unsigned long xc_get_filesz(int fd);
 
 void xc_map_memcpy(unsigned long dst, const char *src, unsigned long size,
-                   int xch, uint32_t dom, unsigned long *parray,
+                   int xch, uint32_t dom, xen_pfn_t *parray,
                    unsigned long vstart);
 
 int pin_table(int xc_handle, unsigned int type, unsigned long mfn,
diff -r d3e181fa238b -r 156a0963a1ae tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/libxc/xg_save_restore.h     Tue Jun 06 13:25:31 2006 -0500
@@ -105,23 +105,23 @@ static int get_platform_info(int xc_hand
 */
 #define M2P_SHIFT       L2_PAGETABLE_SHIFT_PAE
 #define M2P_CHUNK_SIZE  (1 << M2P_SHIFT)
-#define M2P_SIZE(_m)    ROUNDUP(((_m) * sizeof(unsigned long)), M2P_SHIFT)
+#define M2P_SIZE(_m)    ROUNDUP(((_m) * sizeof(xen_pfn_t)), M2P_SHIFT)
 #define M2P_CHUNKS(_m)  (M2P_SIZE((_m)) >> M2P_SHIFT)
 
 /* Size in bytes of the P2M (rounded up to the nearest PAGE_SIZE bytes) */
-#define P2M_SIZE        ROUNDUP((max_pfn * sizeof(unsigned long)), PAGE_SHIFT)
+#define P2M_SIZE        ROUNDUP((max_pfn * sizeof(xen_pfn_t)), PAGE_SHIFT)
 
-/* Number of unsigned longs in a page */
-#define ulpp            (PAGE_SIZE/sizeof(unsigned long))
+/* Number of xen_pfn_t in a page */
+#define fpp             (PAGE_SIZE/sizeof(xen_pfn_t))
 
 /* Number of entries in the pfn_to_mfn_frame_list */
-#define P2M_FL_ENTRIES  (((max_pfn)+ulpp-1)/ulpp)
+#define P2M_FL_ENTRIES  (((max_pfn)+fpp-1)/fpp)
 
 /* Size in bytes of the pfn_to_mfn_frame_list     */
 #define P2M_FL_SIZE     ((P2M_FL_ENTRIES)*sizeof(unsigned long))
 
 /* Number of entries in the pfn_to_mfn_frame_list_list */
-#define P2M_FLL_ENTRIES (((max_pfn)+(ulpp*ulpp)-1)/(ulpp*ulpp))
+#define P2M_FLL_ENTRIES (((max_pfn)+(fpp*fpp)-1)/(fpp*fpp))
 
 /* Current guests allow 8MB 'slack' in their P2M */
 #define NR_SLACK_ENTRIES   ((8 * 1024 * 1024) / PAGE_SIZE)
diff -r d3e181fa238b -r 156a0963a1ae tools/tests/test_x86_emulator.c
--- a/tools/tests/test_x86_emulator.c   Fri Jun 02 12:54:22 2006 -0500
+++ b/tools/tests/test_x86_emulator.c   Tue Jun 06 13:25:31 2006 -0500
@@ -13,6 +13,7 @@ typedef int64_t            s64;
 typedef int64_t            s64;
 #include <public/xen.h>
 #include <asm-x86/x86_emulate.h>
+#include <sys/mman.h>
 
 static int read_any(
     unsigned long addr,
@@ -85,23 +86,30 @@ int main(int argc, char **argv)
     struct x86_emulate_ctxt ctxt;
     struct cpu_user_regs regs;
     char instr[20] = { 0x01, 0x08 }; /* add %ecx,(%eax) */
-    unsigned int res = 0x7FFFFFFF;
-    u32 cmpxchg8b_res[2] = { 0x12345678, 0x87654321 };
+    unsigned int *res;
     int rc;
 
     ctxt.regs = &regs;
     ctxt.mode = X86EMUL_MODE_PROT32;
 
+    res = mmap((void *)0x100000, 0x1000, PROT_READ|PROT_WRITE,
+               MAP_FIXED|MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+    if ( res == MAP_FAILED )
+    {
+        fprintf(stderr, "mmap to low address failed\n");
+        exit(1);
+    }
+
     printf("%-40s", "Testing addl %%ecx,(%%eax)...");
     instr[0] = 0x01; instr[1] = 0x08;
     regs.eflags = 0x200;
     regs.eip    = (unsigned long)&instr[0];
     regs.ecx    = 0x12345678;
-    ctxt.cr2    = (unsigned long)&res;
-    res         = 0x7FFFFFFF;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x92345677) || 
+    ctxt.cr2    = (unsigned long)res;
+    *res        = 0x7FFFFFFF;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x92345677) || 
          (regs.eflags != 0xa94) ||
          (regs.eip != (unsigned long)&instr[2]) )
         goto fail;
@@ -116,11 +124,25 @@ int main(int argc, char **argv)
 #else
     regs.ecx    = 0x12345678UL;
 #endif
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x92345677) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x92345677) || 
          (regs.ecx != 0x8000000FUL) ||
+         (regs.eip != (unsigned long)&instr[2]) )
+        goto fail;
+    printf("okay\n");
+
+    printf("%-40s", "Testing movl (%%eax),%%ecx...");
+    instr[0] = 0x8b; instr[1] = 0x08;
+    regs.eflags = 0x200;
+    regs.eip    = (unsigned long)&instr[0];
+    regs.ecx    = ~0UL;
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x92345677) || 
+         (regs.ecx != 0x92345677UL) ||
          (regs.eip != (unsigned long)&instr[2]) )
         goto fail;
     printf("okay\n");
@@ -131,10 +153,10 @@ int main(int argc, char **argv)
     regs.eip    = (unsigned long)&instr[0];
     regs.eax    = 0x92345677UL;
     regs.ecx    = 0xAA;
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x923456AA) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x923456AA) || 
          (regs.eflags != 0x244) ||
          (regs.eax != 0x92345677UL) ||
          (regs.eip != (unsigned long)&instr[4]) )
@@ -147,10 +169,10 @@ int main(int argc, char **argv)
     regs.eip    = (unsigned long)&instr[0];
     regs.eax    = 0xAABBCC77UL;
     regs.ecx    = 0xFF;
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x923456AA) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x923456AA) || 
          ((regs.eflags&0x240) != 0x200) ||
          (regs.eax != 0xAABBCCAA) ||
          (regs.ecx != 0xFF) ||
@@ -163,10 +185,10 @@ int main(int argc, char **argv)
     regs.eflags = 0x200;
     regs.eip    = (unsigned long)&instr[0];
     regs.ecx    = 0x12345678;
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0x12345678) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0x12345678) || 
          (regs.eflags != 0x200) ||
          (regs.ecx != 0x923456AA) ||
          (regs.eip != (unsigned long)&instr[2]) )
@@ -176,14 +198,14 @@ int main(int argc, char **argv)
     printf("%-40s", "Testing lock cmpxchgl %%ecx,(%%eax)...");
     instr[0] = 0xf0; instr[1] = 0x0f; instr[2] = 0xb1; instr[3] = 0x08;
     regs.eflags = 0x200;
-    res         = 0x923456AA;
+    *res        = 0x923456AA;
     regs.eip    = (unsigned long)&instr[0];
     regs.eax    = 0x923456AAUL;
     regs.ecx    = 0xDDEEFF00L;
-    ctxt.cr2    = (unsigned long)&res;
-    rc = x86_emulate_memop(&ctxt, &emulops);
-    if ( (rc != 0) || 
-         (res != 0xDDEEFF00) || 
+    ctxt.cr2    = (unsigned long)res;
+    rc = x86_emulate_memop(&ctxt, &emulops);
+    if ( (rc != 0) || 
+         (*res != 0xDDEEFF00) || 
          (regs.eflags != 0x244) ||
          (regs.eax != 0x923456AAUL) ||
          (regs.eip != (unsigned long)&instr[4]) )
@@ -192,54 +214,57 @@ int main(int argc, char **argv)
 
     printf("%-40s", "Testing rep movsw...");
     instr[0] = 0xf3; instr[1] = 0x66; instr[2] = 0xa5;
-    res         = 0x22334455;
+    *res        = 0x22334455;
     regs.eflags = 0x200;
     regs.ecx    = 23;
     regs.eip    = (unsigned long)&instr[0];
-    regs.esi    = (unsigned long)&res + 0;
-    regs.edi    = (unsigned long)&res + 2;
+    regs.esi    = (unsigned long)res + 0;
+    regs.edi    = (unsigned long)res + 2;
     regs.error_code = 0; /* read fault */
     ctxt.cr2    = regs.esi;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) || 
-         (res != 0x44554455) ||
+         (*res != 0x44554455) ||
          (regs.eflags != 0x200) ||
          (regs.ecx != 22) || 
-         (regs.esi != ((unsigned long)&res + 2)) ||
-         (regs.edi != ((unsigned long)&res + 4)) ||
+         (regs.esi != ((unsigned long)res + 2)) ||
+         (regs.edi != ((unsigned long)res + 4)) ||
          (regs.eip != (unsigned long)&instr[0]) )
         goto fail;
     printf("okay\n");
 
     printf("%-40s", "Testing btrl $0x1,(%edi)...");
     instr[0] = 0x0f; instr[1] = 0xba; instr[2] = 0x37; instr[3] = 0x01;
-    res         = 0x2233445F;
-    regs.eflags = 0x200;
-    regs.eip    = (unsigned long)&instr[0];
-    regs.edi    = (unsigned long)&res;
+    *res        = 0x2233445F;
+    regs.eflags = 0x200;
+    regs.eip    = (unsigned long)&instr[0];
+    regs.edi    = (unsigned long)res;
     ctxt.cr2    = regs.edi;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) || 
-         (res != 0x2233445D) ||
+         (*res != 0x2233445D) ||
          ((regs.eflags&0x201) != 0x201) ||
          (regs.eip != (unsigned long)&instr[4]) )
         goto fail;
     printf("okay\n");
+
+    res[0] = 0x12345678;
+    res[1] = 0x87654321;
 
     printf("%-40s", "Testing cmpxchg8b (%edi) [succeeding]...");
     instr[0] = 0x0f; instr[1] = 0xc7; instr[2] = 0x0f;
     regs.eflags = 0x200;
-    regs.eax    = cmpxchg8b_res[0];
-    regs.edx    = cmpxchg8b_res[1];
+    regs.eax    = res[0];
+    regs.edx    = res[1];
     regs.ebx    = 0x9999AAAA;
     regs.ecx    = 0xCCCCFFFF;
     regs.eip    = (unsigned long)&instr[0];
-    regs.edi    = (unsigned long)cmpxchg8b_res;
+    regs.edi    = (unsigned long)res;
     ctxt.cr2    = regs.edi;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) || 
-         (cmpxchg8b_res[0] != 0x9999AAAA) ||
-         (cmpxchg8b_res[1] != 0xCCCCFFFF) ||
+         (res[0] != 0x9999AAAA) ||
+         (res[1] != 0xCCCCFFFF) ||
          ((regs.eflags&0x240) != 0x240) ||
          (regs.eip != (unsigned long)&instr[3]) )
         goto fail;
@@ -248,12 +273,12 @@ int main(int argc, char **argv)
     printf("%-40s", "Testing cmpxchg8b (%edi) [failing]...");
     instr[0] = 0x0f; instr[1] = 0xc7; instr[2] = 0x0f;
     regs.eip    = (unsigned long)&instr[0];
-    regs.edi    = (unsigned long)cmpxchg8b_res;
+    regs.edi    = (unsigned long)res;
     ctxt.cr2    = regs.edi;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) || 
-         (cmpxchg8b_res[0] != 0x9999AAAA) ||
-         (cmpxchg8b_res[1] != 0xCCCCFFFF) ||
+         (res[0] != 0x9999AAAA) ||
+         (res[1] != 0xCCCCFFFF) ||
          (regs.eax != 0x9999AAAA) ||
          (regs.edx != 0xCCCCFFFF) ||
          ((regs.eflags&0x240) != 0x200) ||
@@ -265,11 +290,11 @@ int main(int argc, char **argv)
     instr[0] = 0x0f; instr[1] = 0xbe; instr[2] = 0x08;
     regs.eip    = (unsigned long)&instr[0];
     regs.ecx    = 0x12345678;
-    ctxt.cr2    = (unsigned long)&res;
-    res         = 0x82;
+    ctxt.cr2    = (unsigned long)res;
+    *res        = 0x82;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) ||
-         (res != 0x82) ||
+         (*res != 0x82) ||
          (regs.ecx != 0xFFFFFF82) ||
          ((regs.eflags&0x240) != 0x200) ||
          (regs.eip != (unsigned long)&instr[3]) )
@@ -280,11 +305,11 @@ int main(int argc, char **argv)
     instr[0] = 0x0f; instr[1] = 0xb7; instr[2] = 0x08;
     regs.eip    = (unsigned long)&instr[0];
     regs.ecx    = 0x12345678;
-    ctxt.cr2    = (unsigned long)&res;
-    res         = 0x1234aa82;
+    ctxt.cr2    = (unsigned long)res;
+    *res        = 0x1234aa82;
     rc = x86_emulate_memop(&ctxt, &emulops);
     if ( (rc != 0) ||
-         (res != 0x1234aa82) ||
+         (*res != 0x1234aa82) ||
          (regs.ecx != 0xaa82) ||
          ((regs.eflags&0x240) != 0x200) ||
          (regs.eip != (unsigned long)&instr[3]) )
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/domain.c     Tue Jun 06 13:25:31 2006 -0500
@@ -259,7 +259,7 @@ int arch_set_info_guest(
     struct vcpu *v, struct vcpu_guest_context *c)
 {
     struct domain *d = v->domain;
-    unsigned long cr3_pfn;
+    unsigned long cr3_pfn = INVALID_MFN;
     int i, rc;
 
     if ( !(c->flags & VGCF_HVM_GUEST) )
@@ -524,20 +524,29 @@ static void load_segments(struct vcpu *n
     if ( unlikely(!all_segs_okay) )
     {
         struct cpu_user_regs *regs = guest_cpu_user_regs();
-        unsigned long   *rsp =
+        unsigned long *rsp =
             (n->arch.flags & TF_kernel_mode) ?
             (unsigned long *)regs->rsp :
             (unsigned long *)nctxt->kernel_sp;
+        unsigned long cs_and_mask, rflags;
 
         if ( !(n->arch.flags & TF_kernel_mode) )
             toggle_guest_mode(n);
         else
             regs->cs &= ~3;
 
+        /* CS longword also contains full evtchn_upcall_mask. */
+        cs_and_mask = (unsigned long)regs->cs |
+            ((unsigned long)n->vcpu_info->evtchn_upcall_mask << 32);
+
+        /* Fold upcall mask into RFLAGS.IF. */
+        rflags  = regs->rflags & ~X86_EFLAGS_IF;
+        rflags |= !n->vcpu_info->evtchn_upcall_mask << 9;
+
         if ( put_user(regs->ss,            rsp- 1) |
              put_user(regs->rsp,           rsp- 2) |
-             put_user(regs->rflags,        rsp- 3) |
-             put_user(regs->cs,            rsp- 4) |
+             put_user(rflags,              rsp- 3) |
+             put_user(cs_and_mask,         rsp- 4) |
              put_user(regs->rip,           rsp- 5) |
              put_user(nctxt->user_regs.gs, rsp- 6) |
              put_user(nctxt->user_regs.fs, rsp- 7) |
@@ -549,6 +558,10 @@ static void load_segments(struct vcpu *n
             DPRINTK("Error while creating failsafe callback frame.\n");
             domain_crash(n->domain);
         }
+
+        if ( test_bit(_VGCF_failsafe_disables_events,
+                      &n->arch.guest_context.flags) )
+            n->vcpu_info->evtchn_upcall_mask = 1;
 
         regs->entry_vector  = TRAP_syscall;
         regs->rflags       &= 0xFFFCBEFFUL;
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/domain_build.c       Tue Jun 06 13:25:31 2006 -0500
@@ -301,6 +301,9 @@ int construct_dom0(struct domain *d,
                xen_pae ? "yes" : "no", dom0_pae ? "yes" : "no");
         return -EINVAL;
     }
+
+    if ( xen_pae && !!strstr(dsi.xen_section_string, "PAE=yes[extended-cr3]") )
+        set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist);
 
     if ( (p = strstr(dsi.xen_section_string, "FEATURES=")) != NULL )
     {
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Tue Jun 06 13:25:31 2006 -0500
@@ -1970,7 +1970,6 @@ static inline void vmx_vmexit_do_extint(
         __hvm_bug(regs);
 
     vector &= INTR_INFO_VECTOR_MASK;
-    local_irq_disable();
     TRACE_VMEXIT(1,vector);
 
     switch(vector) {
@@ -2065,30 +2064,33 @@ asmlinkage void vmx_vmexit_handler(struc
     struct vcpu *v = current;
     int error;
 
-    if ((error = __vmread(VM_EXIT_REASON, &exit_reason)))
-        __hvm_bug(&regs);
+    error = __vmread(VM_EXIT_REASON, &exit_reason);
+    BUG_ON(error);
 
     perfc_incra(vmexits, exit_reason);
 
-    /* don't bother H/W interrutps */
-    if (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT &&
-        exit_reason != EXIT_REASON_VMCALL &&
-        exit_reason != EXIT_REASON_IO_INSTRUCTION) 
+    if ( (exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT) &&
+         (exit_reason != EXIT_REASON_VMCALL) &&
+         (exit_reason != EXIT_REASON_IO_INSTRUCTION) )
         HVM_DBG_LOG(DBG_LEVEL_0, "exit reason = %x", exit_reason);
 
-    if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+    if ( exit_reason != EXIT_REASON_EXTERNAL_INTERRUPT )
+        local_irq_enable();
+
+    if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
+    {
         printk("Failed vm entry (reason 0x%x)\n", exit_reason);
         printk("*********** VMCS Area **************\n");
         vmcs_dump_vcpu();
         printk("**************************************\n");
         domain_crash_synchronous();
-        return;
     }
 
     __vmread(GUEST_RIP, &eip);
     TRACE_VMEXIT(0,exit_reason);
 
-    switch (exit_reason) {
+    switch ( exit_reason )
+    {
     case EXIT_REASON_EXCEPTION_NMI:
     {
         /*
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/hvm/vmx/x86_32/exits.S
--- a/xen/arch/x86/hvm/vmx/x86_32/exits.S       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/hvm/vmx/x86_32/exits.S       Tue Jun 06 13:25:31 2006 -0500
@@ -55,29 +55,26 @@
  * domain pointer, DS, ES, FS, GS. Therefore, we effectively skip 6 registers.
  */
 
-#define HVM_MONITOR_EFLAGS     0x202 /* IF on */
 #define NR_SKIPPED_REGS        6       /* See the above explanation */
-#define HVM_SAVE_ALL_NOSEGREGS \
-        pushl $HVM_MONITOR_EFLAGS; \
-        popf; \
-        subl $(NR_SKIPPED_REGS*4), %esp; \
+#define HVM_SAVE_ALL_NOSEGREGS                                              \
+        subl $(NR_SKIPPED_REGS*4), %esp;                                    \
         movl $0, 0xc(%esp);  /* XXX why do we need to force eflags==0 ?? */ \
-        pushl %eax; \
-        pushl %ebp; \
-        pushl %edi; \
-        pushl %esi; \
-        pushl %edx; \
-        pushl %ecx; \
+        pushl %eax;                                                         \
+        pushl %ebp;                                                         \
+        pushl %edi;                                                         \
+        pushl %esi;                                                         \
+        pushl %edx;                                                         \
+        pushl %ecx;                                                         \
         pushl %ebx;
 
-#define HVM_RESTORE_ALL_NOSEGREGS   \
-        popl %ebx;  \
-        popl %ecx;  \
-        popl %edx;  \
-        popl %esi;  \
-        popl %edi;  \
-        popl %ebp;  \
-        popl %eax;  \
+#define HVM_RESTORE_ALL_NOSEGREGS               \
+        popl %ebx;                              \
+        popl %ecx;                              \
+        popl %edx;                              \
+        popl %esi;                              \
+        popl %edi;                              \
+        popl %ebp;                              \
+        popl %eax;                              \
         addl $(NR_SKIPPED_REGS*4), %esp
 
         ALIGN
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/hvm/vmx/x86_64/exits.S
--- a/xen/arch/x86/hvm/vmx/x86_64/exits.S       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/hvm/vmx/x86_64/exits.S       Tue Jun 06 13:25:31 2006 -0500
@@ -51,45 +51,42 @@
  * (2/1)  u32 entry_vector;
  * (1/1)  u32 error_code;
  */
-#define HVM_MONITOR_RFLAGS     0x202 /* IF on */
 #define NR_SKIPPED_REGS        6       /* See the above explanation */
-#define HVM_SAVE_ALL_NOSEGREGS \
-        pushq $HVM_MONITOR_RFLAGS; \
-        popfq; \
-        subq $(NR_SKIPPED_REGS*8), %rsp; \
-        pushq %rdi; \
-        pushq %rsi; \
-        pushq %rdx; \
-        pushq %rcx; \
-        pushq %rax; \
-        pushq %r8;  \
-        pushq %r9;  \
-        pushq %r10; \
-        pushq %r11; \
-        pushq %rbx; \
-        pushq %rbp; \
-        pushq %r12; \
-        pushq %r13; \
-        pushq %r14; \
-        pushq %r15; \
+#define HVM_SAVE_ALL_NOSEGREGS                  \
+        subq $(NR_SKIPPED_REGS*8), %rsp;        \
+        pushq %rdi;                             \
+        pushq %rsi;                             \
+        pushq %rdx;                             \
+        pushq %rcx;                             \
+        pushq %rax;                             \
+        pushq %r8;                              \
+        pushq %r9;                              \
+        pushq %r10;                             \
+        pushq %r11;                             \
+        pushq %rbx;                             \
+        pushq %rbp;                             \
+        pushq %r12;                             \
+        pushq %r13;                             \
+        pushq %r14;                             \
+        pushq %r15;
 
-#define HVM_RESTORE_ALL_NOSEGREGS \
-        popq %r15; \
-        popq %r14; \
-        popq %r13; \
-        popq %r12; \
-        popq %rbp; \
-        popq %rbx; \
-        popq %r11; \
-        popq %r10; \
-        popq %r9;  \
-        popq %r8;  \
-        popq %rax; \
-        popq %rcx; \
-        popq %rdx; \
-        popq %rsi; \
-        popq %rdi; \
-        addq $(NR_SKIPPED_REGS*8), %rsp; \
+#define HVM_RESTORE_ALL_NOSEGREGS               \
+        popq %r15;                              \
+        popq %r14;                              \
+        popq %r13;                              \
+        popq %r12;                              \
+        popq %rbp;                              \
+        popq %rbx;                              \
+        popq %r11;                              \
+        popq %r10;                              \
+        popq %r9;                               \
+        popq %r8;                               \
+        popq %rax;                              \
+        popq %rcx;                              \
+        popq %rdx;                              \
+        popq %rsi;                              \
+        popq %rdi;                              \
+        addq $(NR_SKIPPED_REGS*8), %rsp;
 
 ENTRY(vmx_asm_vmexit_handler)
         /* selectors are restored/saved by VMX */
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/mm.c Tue Jun 06 13:25:31 2006 -0500
@@ -996,6 +996,21 @@ static int alloc_l3_table(struct page_in
     int            i;
 
     ASSERT(!shadow_mode_refcounts(d));
+
+#ifdef CONFIG_X86_PAE
+    /*
+     * PAE pgdirs above 4GB are unacceptable if the guest does not understand
+     * the weird 'extended cr3' format for dealing with high-order address
+     * bits. We cut some slack for control tools (before vcpu0 is initialised).
+     */
+    if ( (pfn >= 0x100000) &&
+         unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) &&
+         d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) )
+    {
+        MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
+        return 0;
+    }
+#endif
 
     pl3e = map_domain_page(pfn);
     for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_32/asm-offsets.c Tue Jun 06 13:25:31 2006 -0500
@@ -64,11 +64,13 @@ void __dummy__(void)
            arch.guest_context.kernel_ss);
     OFFSET(VCPU_kernel_sp, struct vcpu,
            arch.guest_context.kernel_sp);
+    OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
     OFFSET(VCPU_arch_guest_fpu_ctxt, struct vcpu, arch.guest_context.fpu_ctxt);
     OFFSET(VCPU_flags, struct vcpu, vcpu_flags);
     OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr);
     DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending);
     DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
+    DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
     BLANK();
 
     OFFSET(TSS_ss0, struct tss_struct, ss0);
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_32/entry.S       Tue Jun 06 13:25:31 2006 -0500
@@ -130,7 +130,10 @@ failsafe_callback:
         movl  VCPU_failsafe_sel(%ebx),%eax
         movw  %ax,TRAPBOUNCE_cs(%edx)
         movw  $TBF_FAILSAFE,TRAPBOUNCE_flags(%edx)
-        call  create_bounce_frame
+        bt    $_VGCF_failsafe_disables_events,VCPU_guest_context_flags(%ebx)
+        jnc   1f
+        orw   $TBF_INTERRUPT,TRAPBOUNCE_flags(%edx)
+1:      call  create_bounce_frame
         xorl  %eax,%eax
         movl  %eax,UREGS_ds(%esp)
         movl  %eax,UREGS_es(%esp)
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_32/traps.c       Tue Jun 06 13:25:31 2006 -0500
@@ -346,6 +346,12 @@ static long register_guest_callback(stru
     case CALLBACKTYPE_failsafe:
         v->arch.guest_context.failsafe_callback_cs  = reg->address.cs;
         v->arch.guest_context.failsafe_callback_eip = reg->address.eip;
+        if ( reg->flags & CALLBACKF_mask_events )
+            set_bit(_VGCF_failsafe_disables_events,
+                    &v->arch.guest_context.flags);
+        else
+            clear_bit(_VGCF_failsafe_disables_events,
+                      &v->arch.guest_context.flags);
         break;
 
 #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_64/asm-offsets.c
--- a/xen/arch/x86/x86_64/asm-offsets.c Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_64/asm-offsets.c Tue Jun 06 13:25:31 2006 -0500
@@ -64,11 +64,14 @@ void __dummy__(void)
            arch.guest_context.syscall_callback_eip);
     OFFSET(VCPU_kernel_sp, struct vcpu,
            arch.guest_context.kernel_sp);
+    OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
     OFFSET(VCPU_arch_guest_fpu_ctxt, struct vcpu, arch.guest_context.fpu_ctxt);
     OFFSET(VCPU_flags, struct vcpu, vcpu_flags);
     OFFSET(VCPU_nmi_addr, struct vcpu, nmi_addr);
     DEFINE(_VCPUF_nmi_pending, _VCPUF_nmi_pending);
     DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
+    DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events);
+    DEFINE(_VGCF_syscall_disables_events,  _VGCF_syscall_disables_events);
     BLANK();
 
     OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa);
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_64/entry.S
--- a/xen/arch/x86/x86_64/entry.S       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_64/entry.S       Tue Jun 06 13:25:31 2006 -0500
@@ -30,7 +30,10 @@ switch_to_kernel:
         movq  VCPU_syscall_addr(%rbx),%rax
         movq  %rax,TRAPBOUNCE_eip(%rdx)
         movw  $0,TRAPBOUNCE_flags(%rdx)
-        call  create_bounce_frame
+        bt    $_VGCF_syscall_disables_events,VCPU_guest_context_flags(%rbx)
+        jnc   1f
+        orw   $TBF_INTERRUPT,TRAPBOUNCE_flags(%rdx)
+1:      call  create_bounce_frame
         jmp   test_all_events
 
 /* %rbx: struct vcpu, interrupts disabled */
@@ -77,7 +80,10 @@ failsafe_callback:
         movq  VCPU_failsafe_addr(%rbx),%rax
         movq  %rax,TRAPBOUNCE_eip(%rdx)
         movw  $TBF_FAILSAFE,TRAPBOUNCE_flags(%rdx)
-        call  create_bounce_frame
+        bt    $_VGCF_failsafe_disables_events,VCPU_guest_context_flags(%rbx)
+        jnc   1f
+        orw   $TBF_INTERRUPT,TRAPBOUNCE_flags(%rdx)
+1:      call  create_bounce_frame
         jmp   test_all_events
 .previous
 .section __pre_ex_table,"a"
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_64/traps.c       Tue Jun 06 13:25:31 2006 -0500
@@ -334,10 +334,22 @@ static long register_guest_callback(stru
 
     case CALLBACKTYPE_failsafe:
         v->arch.guest_context.failsafe_callback_eip = reg->address;
+        if ( reg->flags & CALLBACKF_mask_events )
+            set_bit(_VGCF_failsafe_disables_events,
+                    &v->arch.guest_context.flags);
+        else
+            clear_bit(_VGCF_failsafe_disables_events,
+                      &v->arch.guest_context.flags);
         break;
 
     case CALLBACKTYPE_syscall:
         v->arch.guest_context.syscall_callback_eip  = reg->address;
+        if ( reg->flags & CALLBACKF_mask_events )
+            set_bit(_VGCF_syscall_disables_events,
+                    &v->arch.guest_context.flags);
+        else
+            clear_bit(_VGCF_syscall_disables_events,
+                      &v->arch.guest_context.flags);
         break;
 
     case CALLBACKTYPE_nmi:
diff -r d3e181fa238b -r 156a0963a1ae xen/arch/x86/x86_emulate.c
--- a/xen/arch/x86/x86_emulate.c        Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/arch/x86/x86_emulate.c        Tue Jun 06 13:25:31 2006 -0500
@@ -100,8 +100,8 @@ static uint8_t opcode_table[256] = {
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
     /* 0x88 - 0x8F */
-    ByteOp|DstMem|SrcReg|ModRM, DstMem|SrcReg|ModRM,
-    ByteOp|DstReg|SrcMem|ModRM, DstReg|SrcMem|ModRM,
+    ByteOp|DstMem|SrcReg|ModRM|Mov, DstMem|SrcReg|ModRM|Mov,
+    ByteOp|DstReg|SrcMem|ModRM|Mov, DstReg|SrcMem|ModRM|Mov,
     0, 0, 0, DstMem|SrcNone|ModRM|Mov,
     /* 0x90 - 0x9F */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
diff -r d3e181fa238b -r 156a0963a1ae xen/common/kernel.c
--- a/xen/common/kernel.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/common/kernel.c       Tue Jun 06 13:25:31 2006 -0500
@@ -184,6 +184,7 @@ long do_xen_version(int cmd, XEN_GUEST_H
     case XENVER_get_features:
     {
         xen_feature_info_t fi;
+        struct domain *d = current->domain;
 
         if ( copy_from_guest(&fi, arg, 1) )
             return -EFAULT;
@@ -191,7 +192,9 @@ long do_xen_version(int cmd, XEN_GUEST_H
         switch ( fi.submap_idx )
         {
         case 0:
-            fi.submap = (1U << XENFEAT_pae_pgdir_above_4gb);
+            fi.submap = 0;
+            if ( VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3) )
+                fi.submap |= (1U << XENFEAT_pae_pgdir_above_4gb);
             if ( shadow_mode_translate(current->domain) )
                 fi.submap |= 
                     (1U << XENFEAT_writable_page_tables) |
diff -r d3e181fa238b -r 156a0963a1ae xen/common/keyhandler.c
--- a/xen/common/keyhandler.c   Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/common/keyhandler.c   Tue Jun 06 13:25:31 2006 -0500
@@ -128,11 +128,12 @@ static void dump_domains(unsigned char k
                d->domain_flags, atomic_read(&d->refcnt),
                d->tot_pages, d->xenheap_pages, cpuset);
         printk("    handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-"
-               "%02x%02x-%02x%02x%02x%02x%02x%02x\n",
+               "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n",
                d->handle[ 0], d->handle[ 1], d->handle[ 2], d->handle[ 3],
                d->handle[ 4], d->handle[ 5], d->handle[ 6], d->handle[ 7],
                d->handle[ 8], d->handle[ 9], d->handle[10], d->handle[11],
-               d->handle[12], d->handle[13], d->handle[14], d->handle[15]);
+               d->handle[12], d->handle[13], d->handle[14], d->handle[15],
+               d->vm_assist);
 
         arch_dump_domain_info(d);
 
diff -r d3e181fa238b -r 156a0963a1ae xen/common/memory.c
--- a/xen/common/memory.c       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/common/memory.c       Tue Jun 06 13:25:31 2006 -0500
@@ -31,14 +31,15 @@ static long
 static long
 increase_reservation(
     struct domain *d, 
-    XEN_GUEST_HANDLE(ulong) extent_list,
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_list,
     unsigned int   nr_extents,
     unsigned int   extent_order,
     unsigned int   flags,
     int           *preempted)
 {
     struct page_info *page;
-    unsigned long     i, mfn;
+    unsigned long i;
+    xen_pfn_t mfn;
 
     if ( !guest_handle_is_null(extent_list) &&
          !guest_handle_okay(extent_list, nr_extents) )
@@ -80,14 +81,16 @@ static long
 static long
 populate_physmap(
     struct domain *d, 
-    XEN_GUEST_HANDLE(ulong) extent_list,
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_list,
     unsigned int  nr_extents,
     unsigned int  extent_order,
     unsigned int  flags,
     int          *preempted)
 {
     struct page_info *page;
-    unsigned long    i, j, gpfn, mfn;
+    unsigned long i, j;
+    xen_pfn_t gpfn;
+    xen_pfn_t mfn;
 
     if ( !guest_handle_okay(extent_list, nr_extents) )
         return 0;
@@ -177,13 +180,14 @@ static long
 static long
 decrease_reservation(
     struct domain *d,
-    XEN_GUEST_HANDLE(ulong) extent_list,
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_list,
     unsigned int   nr_extents,
     unsigned int   extent_order,
     unsigned int   flags,
     int           *preempted)
 {
-    unsigned long    i, j, gmfn;
+    unsigned long i, j;
+    xen_pfn_t gmfn;
 
     if ( !guest_handle_okay(extent_list, nr_extents) )
         return 0;
@@ -214,7 +218,9 @@ translate_gpfn_list(
     XEN_GUEST_HANDLE(xen_translate_gpfn_list_t) uop, unsigned long *progress)
 {
     struct xen_translate_gpfn_list op;
-    unsigned long i, gpfn, mfn;
+    unsigned long i;
+    xen_pfn_t gpfn;
+    xen_pfn_t mfn;
     struct domain *d;
 
     if ( copy_from_guest(&op, uop, 1) )
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/arch-ia64.h
--- a/xen/include/public/arch-ia64.h    Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/arch-ia64.h    Tue Jun 06 13:25:31 2006 -0500
@@ -26,6 +26,9 @@ DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
 DEFINE_XEN_GUEST_HANDLE(void);
+
+typedef unsigned long xen_pfn_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #endif
 
 /* Arch specific VIRQs definition */
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/arch-x86_32.h
--- a/xen/include/public/arch-x86_32.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/arch-x86_32.h  Tue Jun 06 13:25:31 2006 -0500
@@ -28,6 +28,9 @@ DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
 DEFINE_XEN_GUEST_HANDLE(void);
+
+typedef unsigned long xen_pfn_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #endif
 
 /*
@@ -138,9 +141,17 @@ struct vcpu_guest_context {
 struct vcpu_guest_context {
     /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
     struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST  (1<<1)
-#define VGCF_IN_KERNEL  (1<<2)
+#define VGCF_I387_VALID                (1<<0)
+#define VGCF_HVM_GUEST                 (1<<1)
+#define VGCF_IN_KERNEL                 (1<<2)
+#define _VGCF_i387_valid               0
+#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
+#define _VGCF_hvm_guest                1
+#define VGCF_hvm_guest                 (1<<_VGCF_hvm_guest)
+#define _VGCF_in_kernel                2
+#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
     unsigned long flags;                    /* VGCF_* flags                 */
     struct cpu_user_regs user_regs;         /* User-level CPU registers     */
     struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
@@ -169,7 +180,7 @@ struct arch_shared_info {
 struct arch_shared_info {
     unsigned long max_pfn;                  /* max pfn that appears in table */
     /* Frame containing list of mfns containing list of mfns containing p2m. */
-    unsigned long pfn_to_mfn_frame_list_list;
+    xen_pfn_t     pfn_to_mfn_frame_list_list;
     unsigned long nmi_reason;
 };
 typedef struct arch_shared_info arch_shared_info_t;
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/arch-x86_64.h
--- a/xen/include/public/arch-x86_64.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/arch-x86_64.h  Tue Jun 06 13:25:31 2006 -0500
@@ -28,6 +28,9 @@ DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(int);
 DEFINE_XEN_GUEST_HANDLE(long);
 DEFINE_XEN_GUEST_HANDLE(void);
+
+typedef unsigned long xen_pfn_t;
+DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #endif
 
 /*
@@ -211,9 +214,19 @@ struct vcpu_guest_context {
 struct vcpu_guest_context {
     /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
     struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST  (1<<1)
-#define VGCF_IN_KERNEL  (1<<2)
+#define VGCF_I387_VALID                (1<<0)
+#define VGCF_HVM_GUEST                 (1<<1)
+#define VGCF_IN_KERNEL                 (1<<2)
+#define _VGCF_i387_valid               0
+#define VGCF_i387_valid                (1<<_VGCF_i387_valid)
+#define _VGCF_hvm_guest                1
+#define VGCF_hvm_guest                 (1<<_VGCF_hvm_guest)
+#define _VGCF_in_kernel                2
+#define VGCF_in_kernel                 (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events  (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events  4
+#define VGCF_syscall_disables_events   (1<<_VGCF_syscall_disables_events)
     unsigned long flags;                    /* VGCF_* flags                 */
     struct cpu_user_regs user_regs;         /* User-level CPU registers     */
     struct trap_info trap_ctxt[256];        /* Virtual IDT                  */
@@ -240,7 +253,7 @@ struct arch_shared_info {
 struct arch_shared_info {
     unsigned long max_pfn;                  /* max pfn that appears in table */
     /* Frame containing list of mfns containing list of mfns containing p2m. */
-    unsigned long pfn_to_mfn_frame_list_list;
+    xen_pfn_t     pfn_to_mfn_frame_list_list;
     unsigned long nmi_reason;
 };
 typedef struct arch_shared_info arch_shared_info_t;
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/callback.h
--- a/xen/include/public/callback.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/callback.h     Tue Jun 06 13:25:31 2006 -0500
@@ -29,12 +29,20 @@
 #define CALLBACKTYPE_nmi                   4
 
 /*
+ * Disable event delivery during callback? This flag is ignored for event and
+ * NMI callbacks: event delivery is unconditionally disabled.
+ */
+#define _CALLBACKF_mask_events             0
+#define CALLBACKF_mask_events              (1U << _CALLBACKF_mask_events)
+
+/*
  * Register a callback.
  */
 #define CALLBACKOP_register                0
 struct callback_register {
-     int type;
-     xen_callback_t address;
+    uint16_t type;
+    uint16_t flags;
+    xen_callback_t address;
 };
 typedef struct callback_register callback_register_t;
 DEFINE_XEN_GUEST_HANDLE(callback_register_t);
@@ -47,7 +55,8 @@ DEFINE_XEN_GUEST_HANDLE(callback_registe
  */
 #define CALLBACKOP_unregister              1
 struct callback_unregister {
-     int type;
+    uint16_t type;
+    uint16_t _unused;
 };
 typedef struct callback_unregister callback_unregister_t;
 DEFINE_XEN_GUEST_HANDLE(callback_unregister_t);
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/dom0_ops.h
--- a/xen/include/public/dom0_ops.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/dom0_ops.h     Tue Jun 06 13:25:31 2006 -0500
@@ -19,7 +19,7 @@
  * This makes sure that old versions of dom0 tools will stop working in a
  * well-defined way (rather than crashing the machine, for instance).
  */
-#define DOM0_INTERFACE_VERSION   0x03000000
+#define DOM0_INTERFACE_VERSION   0x03000001
 
 /************************************************************************/
 
@@ -27,10 +27,10 @@ struct dom0_getmemlist {
 struct dom0_getmemlist {
     /* IN variables. */
     domid_t       domain;
-    unsigned long max_pfns;
-    XEN_GUEST_HANDLE(ulong) buffer;
-    /* OUT variables. */
-    unsigned long num_pfns;
+    uint64_t max_pfns;
+    XEN_GUEST_HANDLE(xen_pfn_t) buffer;
+    /* OUT variables. */
+    uint64_t num_pfns;
 };
 typedef struct dom0_getmemlist dom0_getmemlist_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_getmemlist_t);
@@ -96,9 +96,9 @@ struct dom0_getdomaininfo {
 #define DOMFLAGS_SHUTDOWNMASK 255 /* DOMFLAGS_SHUTDOWN guest-supplied code.  */
 #define DOMFLAGS_SHUTDOWNSHIFT 16
     uint32_t flags;
-    unsigned long tot_pages;
-    unsigned long max_pages;
-    unsigned long shared_info_frame;       /* MFN of shared_info struct */
+    uint64_t tot_pages;
+    uint64_t max_pages;
+    xen_pfn_t shared_info_frame;  /* MFN of shared_info struct */
     uint64_t cpu_time;
     uint32_t nr_online_vcpus;     /* Number of VCPUs currently online. */
     uint32_t max_vcpu_id;         /* Maximum VCPUID in use by this domain. */
@@ -162,7 +162,7 @@ DEFINE_XEN_GUEST_HANDLE(dom0_settime_t);
 
 struct dom0_getpageframeinfo {
     /* IN variables. */
-    unsigned long mfn;     /* Machine page frame number to query.       */
+    xen_pfn_t mfn;         /* Machine page frame number to query.       */
     domid_t domain;        /* To which domain does the frame belong?    */
     /* OUT variables. */
     /* Is the page PINNED to a type? */
@@ -213,7 +213,7 @@ struct dom0_tbufcontrol {
     cpumap_t      cpu_mask;
     uint32_t      evt_mask;
     /* OUT variables */
-    unsigned long buffer_mfn;
+    xen_pfn_t buffer_mfn;
     uint32_t size;
 };
 typedef struct dom0_tbufcontrol dom0_tbufcontrol_t;
@@ -229,8 +229,8 @@ struct dom0_physinfo {
     uint32_t sockets_per_node;
     uint32_t nr_nodes;
     uint32_t cpu_khz;
-    unsigned long total_pages;
-    unsigned long free_pages;
+    uint64_t total_pages;
+    uint64_t free_pages;
     uint32_t hw_cap[8];
 };
 typedef struct dom0_physinfo dom0_physinfo_t;
@@ -276,7 +276,7 @@ struct dom0_shadow_control {
     uint32_t       op;
     XEN_GUEST_HANDLE(ulong) dirty_bitmap;
     /* IN/OUT variables. */
-    unsigned long  pages;        /* size of buffer, updated with actual size */
+    uint64_t       pages;        /* size of buffer, updated with actual size */
     /* OUT variables. */
     struct dom0_shadow_control_stats stats;
 };
@@ -286,8 +286,8 @@ DEFINE_XEN_GUEST_HANDLE(dom0_shadow_cont
 #define DOM0_SETDOMAINMAXMEM   28
 struct dom0_setdomainmaxmem {
     /* IN variables. */
-    domid_t       domain;
-    unsigned long max_memkb;
+    domid_t  domain;
+    uint64_t max_memkb;
 };
 typedef struct dom0_setdomainmaxmem dom0_setdomainmaxmem_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_setdomainmaxmem_t);
@@ -295,8 +295,8 @@ DEFINE_XEN_GUEST_HANDLE(dom0_setdomainma
 #define DOM0_GETPAGEFRAMEINFO2 29   /* batched interface */
 struct dom0_getpageframeinfo2 {
     /* IN variables. */
-    domid_t        domain;
-    unsigned long  num;
+    domid_t  domain;
+    uint64_t num;
     /* IN/OUT variables. */
     XEN_GUEST_HANDLE(ulong) array;
 };
@@ -313,12 +313,12 @@ DEFINE_XEN_GUEST_HANDLE(dom0_getpagefram
 #define DOM0_ADD_MEMTYPE         31
 struct dom0_add_memtype {
     /* IN variables. */
-    unsigned long mfn;
-    unsigned long nr_mfns;
-    uint32_t      type;
-    /* OUT variables. */
-    uint32_t      handle;
-    uint32_t      reg;
+    xen_pfn_t mfn;
+    uint64_t nr_mfns;
+    uint32_t type;
+    /* OUT variables. */
+    uint32_t handle;
+    uint32_t reg;
 };
 typedef struct dom0_add_memtype dom0_add_memtype_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_add_memtype_t);
@@ -345,8 +345,8 @@ struct dom0_read_memtype {
     /* IN variables. */
     uint32_t reg;
     /* OUT variables. */
-    unsigned long mfn;
-    unsigned long nr_mfns;
+    xen_pfn_t mfn;
+    uint64_t nr_mfns;
     uint32_t type;
 };
 typedef struct dom0_read_memtype dom0_read_memtype_t;
@@ -499,8 +499,8 @@ DEFINE_XEN_GUEST_HANDLE(dom0_irq_permiss
 #define DOM0_IOMEM_PERMISSION 47
 struct dom0_iomem_permission {
     domid_t  domain;          /* domain to be affected */
-    unsigned long first_mfn;  /* first page (physical page number) in range */
-    unsigned long nr_mfns;    /* number of pages in range (>0) */
+    xen_pfn_t first_mfn;      /* first page (physical page number) in range */
+    uint64_t nr_mfns;         /* number of pages in range (>0) */
     uint8_t allow_access;     /* allow (!0) or deny (0) access to range? */
 };
 typedef struct dom0_iomem_permission dom0_iomem_permission_t;
@@ -509,7 +509,7 @@ DEFINE_XEN_GUEST_HANDLE(dom0_iomem_permi
 #define DOM0_HYPERCALL_INIT   48
 struct dom0_hypercall_init {
     domid_t  domain;          /* domain to be affected */
-    unsigned long mfn;        /* machine frame to be initialised */
+    xen_pfn_t mfn;            /* machine frame to be initialised */
 };
 typedef struct dom0_hypercall_init dom0_hypercall_init_t;
 DEFINE_XEN_GUEST_HANDLE(dom0_hypercall_init_t);
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/grant_table.h
--- a/xen/include/public/grant_table.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/grant_table.h  Tue Jun 06 13:25:31 2006 -0500
@@ -244,7 +244,7 @@ DEFINE_XEN_GUEST_HANDLE(gnttab_dump_tabl
 #define GNTTABOP_transfer                4
 struct gnttab_transfer {
     /* IN parameters. */
-    unsigned long mfn;
+    xen_pfn_t     mfn;
     domid_t       domid;
     grant_ref_t   ref;
     /* OUT parameters. */
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/io/netif.h
--- a/xen/include/public/io/netif.h     Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/io/netif.h     Tue Jun 06 13:25:31 2006 -0500
@@ -26,6 +26,10 @@
 /* Packet data has been validated against protocol checksum. */
 #define _NETTXF_data_validated (1)
 #define  NETTXF_data_validated (1U<<_NETTXF_data_validated)
+
+/* Packet continues in the next request. */
+#define _NETTXF_more_data      (2)
+#define  NETTXF_more_data      (1U<<_NETTXF_more_data)
 
 struct netif_tx_request {
     grant_ref_t gref;      /* Reference to buffer page */
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/io/ring.h
--- a/xen/include/public/io/ring.h      Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/io/ring.h      Tue Jun 06 13:25:31 2006 -0500
@@ -151,19 +151,27 @@ typedef struct __name##_back_ring __name
 #define RING_SIZE(_r)                                                   \
     ((_r)->nr_ents)
 
+/* Number of free requests (for use on front side only). */
+#define RING_FREE_REQUESTS(_r)                                         \
+    (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons))
+
 /* Test if there is an empty slot available on the front ring.
  * (This is only meaningful from the front. )
  */
 #define RING_FULL(_r)                                                   \
-    (((_r)->req_prod_pvt - (_r)->rsp_cons) == RING_SIZE(_r))
+    (RING_FREE_REQUESTS(_r) == 0)
 
 /* Test if there are outstanding messages to be processed on a ring. */
 #define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
-    ((_r)->rsp_cons != (_r)->sring->rsp_prod)
+    ((_r)->sring->rsp_prod - (_r)->rsp_cons)
 
 #define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
-    (((_r)->req_cons != (_r)->sring->req_prod) &&                       \
-     (((_r)->req_cons - (_r)->rsp_prod_pvt) != RING_SIZE(_r)))
+    ({                                                                 \
+       unsigned int req = (_r)->sring->req_prod - (_r)->req_cons;      \
+       unsigned int rsp = RING_SIZE(_r) -                              \
+                          ((_r)->req_cons - (_r)->rsp_prod_pvt);       \
+       req < rsp ? req : rsp;                                          \
+    })
 
 /* Direct access to individual ring elements, by index. */
 #define RING_GET_REQUEST(_r, _idx)                                      \
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/memory.h
--- a/xen/include/public/memory.h       Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/memory.h       Tue Jun 06 13:25:31 2006 -0500
@@ -29,7 +29,7 @@ struct xen_memory_reservation {
      *   OUT: GMFN bases of extents that were allocated
      *   (NB. This command also updates the mach_to_phys translation table)
      */
-    XEN_GUEST_HANDLE(ulong) extent_start;
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
 
     /* Number of extents, and size/alignment of each (2^extent_order pages). */
     unsigned long  nr_extents;
@@ -87,7 +87,7 @@ struct xen_machphys_mfn_list {
      * any large discontiguities in the machine address space, 2MB gaps in
      * the machphys table will be represented by an MFN base of zero.
      */
-    XEN_GUEST_HANDLE(ulong) extent_start;
+    XEN_GUEST_HANDLE(xen_pfn_t) extent_start;
 
     /*
      * Number of extents written to the above array. This will be smaller
@@ -117,7 +117,7 @@ struct xen_add_to_physmap {
     unsigned long idx;
 
     /* GPFN where the source mapping page should appear. */
-    unsigned long gpfn;
+    xen_pfn_t     gpfn;
 };
 typedef struct xen_add_to_physmap xen_add_to_physmap_t;
 DEFINE_XEN_GUEST_HANDLE(xen_add_to_physmap_t);
@@ -135,13 +135,13 @@ struct xen_translate_gpfn_list {
     unsigned long nr_gpfns;
 
     /* List of GPFNs to translate. */
-    XEN_GUEST_HANDLE(ulong) gpfn_list;
+    XEN_GUEST_HANDLE(xen_pfn_t) gpfn_list;
 
     /*
      * Output list to contain MFN translations. May be the same as the input
      * list (in which case each input GPFN is overwritten with the output MFN).
      */
-    XEN_GUEST_HANDLE(ulong) mfn_list;
+    XEN_GUEST_HANDLE(xen_pfn_t) mfn_list;
 };
 typedef struct xen_translate_gpfn_list xen_translate_gpfn_list_t;
 DEFINE_XEN_GUEST_HANDLE(xen_translate_gpfn_list_t);
diff -r d3e181fa238b -r 156a0963a1ae xen/include/public/xen.h
--- a/xen/include/public/xen.h  Fri Jun 02 12:54:22 2006 -0500
+++ b/xen/include/public/xen.h  Tue Jun 06 13:25:31 2006 -0500
@@ -199,7 +199,7 @@ struct mmuext_op {
     unsigned int cmd;
     union {
         /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
-        unsigned long mfn;
+        xen_pfn_t     mfn;
         /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
         unsigned long linear_addr;
     } arg1;
@@ -236,10 +236,24 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
  */
 #define VMASST_CMD_enable                0
 #define VMASST_CMD_disable               1
+
+/* x86/32 guests: simulate full 4GB segment limits. */
 #define VMASST_TYPE_4gb_segments         0
+
+/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
 #define VMASST_TYPE_4gb_segments_notify  1
+
+/*
+ * x86 guests: support writes to bottom-level PTEs.
+ * NB1. Page-directory entries cannot be written.
+ * NB2. Guest must continue to remove all writable mappings of PTEs.
+ */
 #define VMASST_TYPE_writable_pagetables  2
-#define MAX_VMASST_TYPE 2
+
+/* x86/PAE guests: support PDPTs above 4GB. */
+#define VMASST_TYPE_pae_extended_cr3     3
+
+#define MAX_VMASST_TYPE                  3
 
 #ifndef __ASSEMBLY__
 
@@ -449,9 +463,9 @@ struct start_info {
     unsigned long nr_pages;     /* Total pages allocated to this domain.  */
     unsigned long shared_info;  /* MACHINE address of shared info struct. */
     uint32_t flags;             /* SIF_xxx flags.                         */
-    unsigned long store_mfn;    /* MACHINE page number of shared page.    */
+    xen_pfn_t store_mfn;        /* MACHINE page number of shared page.    */
     uint32_t store_evtchn;      /* Event channel for store communication. */
-    unsigned long console_mfn;  /* MACHINE address of console page.       */
+    xen_pfn_t console_mfn;      /* MACHINE page number of console page.   */
     uint32_t console_evtchn;    /* Event channel for console messages.    */
     /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
     unsigned long pt_base;      /* VIRTUAL address of page directory.     */

_______________________________________________
Xen-ppc-devel mailing list
Xen-ppc-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ppc-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.