[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-unstable] merge with xen-unstable.hg



# HG changeset patch
# User Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
# Date 1210239607 -32400
# Node ID 611787b6ca35fb43a811533316c799adae9cccdb
# Parent  f2457c7aff8d45949bc2c83876a7d26d0588663f
# Parent  9a6ad687ec20dd84753e334b104a154738eae6ec
merge with xen-unstable.hg
---
 xen/drivers/passthrough/pci_regs.h                |  530 --------
 xen/drivers/passthrough/vtd/msi.h                 |  127 -
 README                                            |   25 
 docs/ChangeLog                                    |    9 
 docs/misc/vtd.txt                                 |   35 
 docs/src/user.tex                                 |    4 
 extras/mini-os/blkfront.c                         |   18 
 extras/mini-os/fbfront.c                          |   35 
 extras/mini-os/fs-front.c                         |    6 
 extras/mini-os/include/lib.h                      |    6 
 extras/mini-os/include/xenbus.h                   |   11 
 extras/mini-os/netfront.c                         |   18 
 extras/mini-os/xenbus/xenbus.c                    |   37 
 tools/examples/xend-config-xenapi.sxp             |    6 
 tools/examples/xend-config.sxp                    |   15 
 tools/examples/xmexample.hvm                      |   25 
 tools/ioemu/Makefile.target                       |    2 
 tools/ioemu/hw/cirrus_vga.c                       |   50 
 tools/ioemu/hw/pass-through.c                     |   54 
 tools/ioemu/hw/pass-through.h                     |    9 
 tools/ioemu/hw/pt-msi.c                           |  488 +++++++
 tools/ioemu/hw/pt-msi.h                           |   65 
 tools/ioemu/hw/vga.c                              |   90 +
 tools/ioemu/hw/vga_int.h                          |    2 
 tools/ioemu/sdl.c                                 |    3 
 tools/ioemu/vl.h                                  |    1 
 tools/ioemu/vnc.c                                 |    5 
 tools/libfsimage/Makefile                         |    2 
 tools/libfsimage/common/fsimage.c                 |   29 
 tools/libfsimage/common/fsimage.h                 |    6 
 tools/libfsimage/common/fsimage_grub.c            |    4 
 tools/libfsimage/common/fsimage_grub.h            |    8 
 tools/libfsimage/common/fsimage_priv.h            |    3 
 tools/libfsimage/common/mapfile-GNU               |    3 
 tools/libfsimage/common/mapfile-SunOS             |    3 
 tools/libfsimage/zfs/Makefile                     |   37 
 tools/libfsimage/zfs/fsys_zfs.c                   | 1457 ++++++++++++++++++++++
 tools/libfsimage/zfs/fsys_zfs.h                   |  203 +++
 tools/libfsimage/zfs/mb_info.h                    |  217 +++
 tools/libfsimage/zfs/zfs-include/dmu.h            |  105 +
 tools/libfsimage/zfs/zfs-include/dmu_objset.h     |   35 
 tools/libfsimage/zfs/zfs-include/dnode.h          |   76 +
 tools/libfsimage/zfs/zfs-include/dsl_dataset.h    |   53 
 tools/libfsimage/zfs/zfs-include/dsl_dir.h        |   49 
 tools/libfsimage/zfs/zfs-include/spa.h            |  283 ++++
 tools/libfsimage/zfs/zfs-include/uberblock_impl.h |   49 
 tools/libfsimage/zfs/zfs-include/vdev_impl.h      |   70 +
 tools/libfsimage/zfs/zfs-include/zap_impl.h       |  110 +
 tools/libfsimage/zfs/zfs-include/zap_leaf.h       |  100 +
 tools/libfsimage/zfs/zfs-include/zfs.h            |  112 +
 tools/libfsimage/zfs/zfs-include/zfs_acl.h        |   55 
 tools/libfsimage/zfs/zfs-include/zfs_znode.h      |   68 +
 tools/libfsimage/zfs/zfs-include/zil.h            |   51 
 tools/libfsimage/zfs/zfs-include/zio.h            |   81 +
 tools/libfsimage/zfs/zfs-include/zio_checksum.h   |   42 
 tools/libfsimage/zfs/zfs_fletcher.c               |   93 +
 tools/libfsimage/zfs/zfs_lzjb.c                   |   60 
 tools/libfsimage/zfs/zfs_sha256.c                 |  124 +
 tools/libxc/Makefile                              |    1 
 tools/libxc/xc_cpufeature.h                       |  115 +
 tools/libxc/xc_cpuid_x86.c                        |  433 ++++++
 tools/libxc/xc_domain.c                           |   26 
 tools/libxc/xc_minios.c                           |    4 
 tools/libxc/xc_misc.c                             |   31 
 tools/libxc/xc_pagetab.c                          |    2 
 tools/libxc/xc_physdev.c                          |   72 +
 tools/libxc/xc_private.h                          |   30 
 tools/libxc/xenctrl.h                             |   59 
 tools/pygrub/src/fsimage/fsimage.c                |   24 
 tools/pygrub/src/pygrub                           |   18 
 tools/python/xen/lowlevel/xc/xc.c                 |  164 ++
 tools/python/xen/util/acmpolicy.py                |  165 ++
 tools/python/xen/util/blkif.py                    |    8 
 tools/python/xen/util/bootloader.py               |   25 
 tools/python/xen/util/pci.py                      |   87 +
 tools/python/xen/util/xsm/acm/acm.py              |    4 
 tools/python/xen/util/xsm/flask/flask.py          |    2 
 tools/python/xen/web/tcp.py                       |   40 
 tools/python/xen/xend/XendCheckpoint.py           |    1 
 tools/python/xen/xend/XendConfig.py               |   53 
 tools/python/xen/xend/XendDomain.py               |   15 
 tools/python/xen/xend/XendDomainInfo.py           |   13 
 tools/python/xen/xend/XendOptions.py              |   11 
 tools/python/xen/xend/XendXSPolicyAdmin.py        |   11 
 tools/python/xen/xend/image.py                    |   33 
 tools/python/xen/xend/server/SrvDomain.py         |    1 
 tools/python/xen/xend/server/blkif.py             |    3 
 tools/python/xen/xend/server/irqif.py             |    7 
 tools/python/xen/xend/server/netif.py             |    3 
 tools/python/xen/xend/server/pciif.py             |   19 
 tools/python/xen/xend/server/relocate.py          |   13 
 tools/python/xen/xm/addlabel.py                   |   10 
 tools/python/xen/xm/create.py                     |   35 
 tools/python/xen/xm/dry-run.py                    |    5 
 tools/python/xen/xm/main.py                       |   28 
 tools/python/xen/xm/migrate.py                    |    6 
 tools/python/xen/xm/xenapi_create.py              |    4 
 tools/xenstore/xenstore_client.c                  |    5 
 tools/xenstore/xenstored_core.c                   |   30 
 xen/Makefile                                      |    7 
 xen/arch/ia64/vmx/vmx_hypercall.c                 |    4 
 xen/arch/x86/Makefile                             |    1 
 xen/arch/x86/acpi/Makefile                        |    2 
 xen/arch/x86/acpi/boot.c                          |   24 
 xen/arch/x86/acpi/cpu_idle.c                      |  957 ++++++++++++++
 xen/arch/x86/apic.c                               |   50 
 xen/arch/x86/domain.c                             |   48 
 xen/arch/x86/domctl.c                             |   44 
 xen/arch/x86/genapic/Makefile                     |    1 
 xen/arch/x86/genapic/delivery.c                   |    2 
 xen/arch/x86/genapic/probe.c                      |    2 
 xen/arch/x86/genapic/x2apic.c                     |   79 +
 xen/arch/x86/hvm/Makefile                         |    1 
 xen/arch/x86/hvm/hvm.c                            |  165 +-
 xen/arch/x86/hvm/i8254.c                          |   28 
 xen/arch/x86/hvm/stdvga.c                         |   55 
 xen/arch/x86/hvm/svm/emulate.c                    |  115 -
 xen/arch/x86/hvm/svm/svm.c                        |  131 -
 xen/arch/x86/hvm/vlapic.c                         |    7 
 xen/arch/x86/hvm/vmsi.c                           |  189 ++
 xen/arch/x86/hvm/vmx/intr.c                       |   16 
 xen/arch/x86/hvm/vmx/vmx.c                        |   54 
 xen/arch/x86/hvm/vpt.c                            |    6 
 xen/arch/x86/i8259.c                              |    7 
 xen/arch/x86/io_apic.c                            |   79 +
 xen/arch/x86/irq.c                                |   33 
 xen/arch/x86/mm/p2m.c                             |   21 
 xen/arch/x86/mm/shadow/common.c                   |  171 ++
 xen/arch/x86/mm/shadow/multi.c                    |   83 +
 xen/arch/x86/mm/shadow/private.h                  |    9 
 xen/arch/x86/mpparse.c                            |   13 
 xen/arch/x86/msi.c                                |  787 +++++++++++
 xen/arch/x86/nmi.c                                |    5 
 xen/arch/x86/numa.c                               |    2 
 xen/arch/x86/pci.c                                |   58 
 xen/arch/x86/physdev.c                            |  354 +++++
 xen/arch/x86/platform_hypercall.c                 |   23 
 xen/arch/x86/setup.c                              |   12 
 xen/arch/x86/shutdown.c                           |    2 
 xen/arch/x86/smp.c                                |    5 
 xen/arch/x86/smpboot.c                            |   97 -
 xen/arch/x86/time.c                               |   53 
 xen/arch/x86/traps.c                              |    3 
 xen/arch/x86/x86_64/Makefile                      |    2 
 xen/arch/x86/x86_64/cpu_idle.c                    |  128 +
 xen/arch/x86/x86_64/platform_hypercall.c          |    4 
 xen/drivers/acpi/Makefile                         |    1 
 xen/drivers/acpi/hwregs.c                         |  339 ++++-
 xen/drivers/acpi/utglobal.c                       |  136 ++
 xen/drivers/passthrough/amd/iommu_detect.c        |    2 
 xen/drivers/passthrough/amd/iommu_init.c          |    5 
 xen/drivers/passthrough/amd/pci_amd_iommu.c       |    2 
 xen/drivers/passthrough/io.c                      |  126 +
 xen/drivers/passthrough/iommu.c                   |    2 
 xen/drivers/passthrough/vtd/dmar.c                |    2 
 xen/drivers/passthrough/vtd/intremap.c            |    4 
 xen/drivers/passthrough/vtd/iommu.c               |  131 -
 xen/drivers/passthrough/vtd/iommu.h               |    1 
 xen/drivers/passthrough/vtd/qinval.c              |    4 
 xen/drivers/passthrough/vtd/utils.c               |   39 
 xen/drivers/passthrough/vtd/x86/vtd.c             |    4 
 xen/include/asm-ia64/config.h                     |    1 
 xen/include/asm-powerpc/types.h                   |    1 
 xen/include/asm-x86/apic.h                        |  105 +
 xen/include/asm-x86/apicdef.h                     |   19 
 xen/include/asm-x86/cpufeature.h                  |    2 
 xen/include/asm-x86/domain.h                      |   18 
 xen/include/asm-x86/fixmap.h                      |    3 
 xen/include/asm-x86/genapic.h                     |   14 
 xen/include/asm-x86/hvm/io.h                      |    1 
 xen/include/asm-x86/hvm/irq.h                     |   13 
 xen/include/asm-x86/hvm/svm/amd-iommu-defs.h      |   29 
 xen/include/asm-x86/hvm/svm/emulate.h             |   11 
 xen/include/asm-x86/hvm/vlapic.h                  |    2 
 xen/include/asm-x86/hvm/vmx/vmx.h                 |   20 
 xen/include/asm-x86/hvm/vpt.h                     |    1 
 xen/include/asm-x86/irq.h                         |    6 
 xen/include/asm-x86/mach-generic/mach_apic.h      |    6 
 xen/include/asm-x86/msi.h                         |  210 +++
 xen/include/asm-x86/msr-index.h                   |    1 
 xen/include/asm-x86/pirq.h                        |   11 
 xen/include/asm-x86/processor.h                   |    6 
 xen/include/asm-x86/shadow.h                      |    6 
 xen/include/asm-x86/smp.h                         |    8 
 xen/include/asm-x86/types.h                       |    3 
 xen/include/public/domctl.h                       |   23 
 xen/include/public/hvm/hvm_op.h                   |   16 
 xen/include/public/io/xs_wire.h                   |    1 
 xen/include/public/physdev.h                      |   32 
 xen/include/public/platform.h                     |   65 
 xen/include/xen/iommu.h                           |   22 
 xen/include/xen/irq.h                             |    2 
 xen/include/xen/pci.h                             |   24 
 xen/include/xen/pci_regs.h                        |  530 ++++++++
 xen/include/xen/sched.h                           |    3 
 xen/include/xen/time.h                            |    2 
 xen/include/xlat.lst                              |    5 
 197 files changed, 11293 insertions(+), 1546 deletions(-)

diff -r f2457c7aff8d -r 611787b6ca35 README
--- a/README    Fri Apr 25 20:13:52 2008 +0900
+++ b/README    Thu May 08 18:40:07 2008 +0900
@@ -1,10 +1,10 @@
 #################################
- __  __            _____  ____  
- \ \/ /___ _ __   |___ / |___ \ 
-  \  // _ \ '_ \    |_ \   __) |
-  /  \  __/ | | |  ___) | / __/ 
- /_/\_\___|_| |_| |____(_)_____|
-                                
+ __  __            _____  _____  
+ \ \/ /___ _ __   |___ / |___ /  
+  \  // _ \ '_ \    |_ \   |_ \  
+  /  \  __/ | | |  ___) | ___) | 
+ /_/\_\___|_| |_| |____(_)____/  
+                                 
 #################################
 
 http://www.xen.org/
@@ -21,11 +21,10 @@ by the original Xen development team to 
 by the original Xen development team to build enterprise products
 around Xen.
 
-The 3.2 release offers excellent performance, hardware support and
+The 3.3 release offers excellent performance, hardware support and
 enterprise-grade features such as x86_32-PAE, x86_64, SMP guests and
-live relocation of VMs. This install tree contains source for a Linux
-2.6 guest; ports to Linux 2.4, NetBSD, FreeBSD and Solaris are
-available from the community.
+live relocation of VMs. Ports to Linux 2.6, Linux 2.4, NetBSD, FreeBSD
+and Solaris are available from the community.
 
 This file contains some quick-start instructions to install Xen on
 your system. For full documentation, see the Xen User Manual. If this
@@ -55,8 +54,8 @@ 2. Configure your bootloader to boot Xen
    /boot/grub/menu.lst: edit this file to include an entry like the
    following:
 
-    title Xen 3.2 / XenLinux 2.6
-       kernel /boot/xen-3.2.gz console=vga
+    title Xen 3.3 / XenLinux 2.6
+       kernel /boot/xen-3.3.gz console=vga
        module /boot/vmlinuz-2.6-xen root=<root-dev> ro console=tty0
        module /boot/initrd-2.6-xen.img
 
@@ -75,7 +74,7 @@ 2. Configure your bootloader to boot Xen
    32MB memory for internal use, which is not available for allocation
    to virtual machines.
 
-3. Reboot your system and select the "Xen 3.2 / XenLinux 2.6" menu
+3. Reboot your system and select the "Xen 3.3 / XenLinux 2.6" menu
    option. After booting Xen, Linux will start and your initialisation
    scripts should execute in the usual way.
 
diff -r f2457c7aff8d -r 611787b6ca35 docs/ChangeLog
--- a/docs/ChangeLog    Fri Apr 25 20:13:52 2008 +0900
+++ b/docs/ChangeLog    Thu May 08 18:40:07 2008 +0900
@@ -15,6 +15,15 @@ http://lists.xensource.com/archives/html
 
 Xen 3.3 release
 ---------------
+
+17538: Add XENPF_set_processor_pminfo
+http://xenbits.xensource.com/xen-unstable.hg?rev/5bb9093eb0e9
+
+17537: Add MSI support
+http://xenbits.xensource.com/xen-unstable.hg?rev/ad55c06c9bbc
+
+17524: Add DOMCTL_set_cpuid to configure guest CPUID on x86 systems.
+http://xenbits.xensource.com/xen-unstable.hg?rev/18727843db60
 
 17336: Add platform capabilities field to XEN_SYSCTL_physinfo
 http://xenbits.xensource.com/xen-unstable.hg?rev/250606290439
diff -r f2457c7aff8d -r 611787b6ca35 docs/misc/vtd.txt
--- a/docs/misc/vtd.txt Fri Apr 25 20:13:52 2008 +0900
+++ b/docs/misc/vtd.txt Thu May 08 18:40:07 2008 +0900
@@ -2,7 +2,7 @@ Authors : Allen Kay    <allen.m.kay@inte
 Authors : Allen Kay    <allen.m.kay@xxxxxxxxx>
           Weidong Han  <weidong.han@xxxxxxxxx>
 Created : October-24-2007
-Updated : December-13-2007
+Updated : May-07-2008
 
 How to turn on VT-d in Xen
 --------------------------
@@ -22,7 +22,7 @@ title Xen-Fedora Core (2.6.18-xen)
 title Xen-Fedora Core (2.6.18-xen)
         root (hd0,0)
         kernel /boot/xen.gz com1=115200,8n1 console=com1
-        module /boot/vmlinuz-2.6.18.8-xen root=LABEL=/ ro console=tty0 console=ttyS0,115200,8n1 pciback.hide=(01:00.0)(03:00.0) pciback.verbose_request=1 apic=debug
+        module /boot/vmlinuz-2.6.18.8-xen root=LABEL=/ ro xencons=ttyS console=tty0 console=ttyS0, pciback.hide=(01:00.0)(03:00.0)
         module /boot/initrd-2.6.18-xen.img
 
 12) reboot system
@@ -47,14 +47,39 @@ 1) Host OS: PAE, 64-bit
 1) Host OS: PAE, 64-bit
 2) Guest OS: 32-bit, PAE, 64-bit
 
-Because current Xen doesn't support MSI, for guest OS which uses MSI by default, need to add "pci=nomsi" option on its grub, e.g. RHEL5, FC6.
-
 
 Combinations Tested:
 --------------------
 
 1) 64-bit host: 32/PAE/64 Linux/XP/Win2003/Vista guests
 2) PAE host: 32/PAE Linux/XP/Win2003/Vista guests
+
+
+VTd device hotplug:
+-------------------
+ 
+2 virtual PCI slots (6~7) are reserved in HVM guest to support VTd hotplug. If you have more VTd devices, only 2 of them can support hotplug. Usage is simple:
+
+ 1. List the VTd device by dom. You can see a VTd device 0:2:0.0 is inserted in the HVM domain's PCI slot 6. '''lspci''' inside the guest should see the same.
+
+       [root@vt-vtd ~]# xm pci-list HVMDomainVtd
+       VSlt domain   bus   slot   func
+       0x6    0x0  0x02   0x00    0x0
+
+ 2. Detach the device from the guest by the physical BDF. Then HVM guest will receive a virtual PCI hot removal event to detach the physical device
+
+       [root@vt-vtd ~]# xm pci-detach HVMDomainVtd 0:2:0.0
+
+ 3. Attach a PCI device to the guest by the physical BDF and desired virtual slot(optional). Following command would insert the physical device into guest's virtual slot 7
+
+       [root@vt-vtd ~]# xm pci-attach HVMDomainVtd 0:2:0.0 7
+
+VTd hotplug usage model:
+------------------------
+
+ * For live migration: As you know, VTd device would break the live migration as physical device can't be save/restored like virtual device. With hotplug, live migration is back again. Just hot remove all the VTd devices before live migration and hot add new VTd devices on target machine after live migration.
+
+ * VTd hotplug for device switch: VTd hotplug can be used to dynamically switch physical device between different HVM guest without shutdown.
 
 
 VT-d Enabled Systems
@@ -74,3 +99,5 @@ http://www.dell.com/content/products/cat
 - HP Compaq:  DC7800
 
http://h10010.www1.hp.com/wwpc/us/en/en/WF04a/12454-12454-64287-321860-3328898.html
 
+For more information, pls refer to http://wiki.xensource.com/xenwiki/VTdHowTo.
+
diff -r f2457c7aff8d -r 611787b6ca35 docs/src/user.tex
--- a/docs/src/user.tex Fri Apr 25 20:13:52 2008 +0900
+++ b/docs/src/user.tex Thu May 08 18:40:07 2008 +0900
@@ -2459,9 +2459,7 @@ file. Please refer to Section~\ref{subse
 file. Please refer to Section~\ref{subsection:acmlabelmanageddomains}
 if you are using managed domains.
 
-The following configuration file defines \verb|domain1|
-(Note: www.jailtime.org or www.xen-get.org might be good
-places to look for example domU images):
+The following configuration file defines \verb|domain1|:
 
 \begin{scriptsize}
 \begin{verbatim}
diff -r f2457c7aff8d -r 611787b6ca35 extras/mini-os/blkfront.c
--- a/extras/mini-os/blkfront.c Fri Apr 25 20:13:52 2008 +0900
+++ b/extras/mini-os/blkfront.c Thu May 08 18:40:07 2008 +0900
@@ -50,6 +50,8 @@ struct blkfront_dev {
     char *backend;
     struct blkfront_info info;
 
+    xenbus_event_queue events;
+
 #ifdef HAVE_LIBC
     int fd;
 #endif
@@ -100,6 +102,8 @@ struct blkfront_dev *init_blkfront(char 
     FRONT_RING_INIT(&dev->ring, s, PAGE_SIZE);
 
     dev->ring_ref = gnttab_grant_access(dev->dom,virt_to_mfn(s),0);
+
+    dev->events = NULL;
 
     // FIXME: proper frees on failures
 again:
@@ -166,11 +170,9 @@ done:
 
         snprintf(path, sizeof(path), "%s/state", dev->backend);
 
-        xenbus_watch_path(XBT_NIL, path);
-
-        xenbus_wait_for_value(path,"4");
-
-        xenbus_unwatch_path(XBT_NIL, path);
+        xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
+
+        xenbus_wait_for_value(path, "4", &dev->events);
 
         snprintf(path, sizeof(path), "%s/info", dev->backend);
         dev->info.info = xenbus_read_integer(path);
@@ -211,10 +213,12 @@ void shutdown_blkfront(struct blkfront_d
 
     snprintf(path, sizeof(path), "%s/state", dev->backend);
     err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 5); /* closing */
-    xenbus_wait_for_value(path,"5");
+    xenbus_wait_for_value(path, "5", &dev->events);
 
     err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 6);
-    xenbus_wait_for_value(path,"6");
+    xenbus_wait_for_value(path, "6", &dev->events);
+
+    xenbus_unwatch_path(XBT_NIL, path);
 
     unbind_evtchn(dev->evtchn);
 
diff -r f2457c7aff8d -r 611787b6ca35 extras/mini-os/fbfront.c
--- a/extras/mini-os/fbfront.c  Fri Apr 25 20:13:52 2008 +0900
+++ b/extras/mini-os/fbfront.c  Thu May 08 18:40:07 2008 +0900
@@ -31,6 +31,8 @@ struct kbdfront_dev {
     char *nodename;
     char *backend;
 
+    xenbus_event_queue events;
+
 #ifdef HAVE_LIBC
     int fd;
 #endif
@@ -75,6 +77,8 @@ struct kbdfront_dev *init_kbdfront(char 
     dev->page = s = (struct xenkbd_page*) alloc_page();
     memset(s,0,PAGE_SIZE);
 
+    dev->events = NULL;
+
     s->in_cons = s->in_prod = 0;
     s->out_cons = s->out_prod = 0;
 
@@ -136,11 +140,9 @@ done:
 
         snprintf(path, sizeof(path), "%s/state", dev->backend);
 
-        xenbus_watch_path(XBT_NIL, path);
-
-        xenbus_wait_for_value(path,"4");
-
-        xenbus_unwatch_path(XBT_NIL, path);
+        xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
+
+        xenbus_wait_for_value(path, "4", &dev->events);
 
         printk("%s connected\n", dev->backend);
 
@@ -199,10 +201,12 @@ void shutdown_kbdfront(struct kbdfront_d
 
     snprintf(path, sizeof(path), "%s/state", dev->backend);
     err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 5); /* closing */
-    xenbus_wait_for_value(path,"5");
+    xenbus_wait_for_value(path, "5", &dev->events);
 
     err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 6);
-    xenbus_wait_for_value(path,"6");
+    xenbus_wait_for_value(path, "6", &dev->events);
+
+    xenbus_unwatch_path(XBT_NIL, path);
 
     unbind_evtchn(dev->evtchn);
 
@@ -249,6 +253,8 @@ struct fbfront_dev {
     int stride;
     int mem_length;
     int offset;
+
+    xenbus_event_queue events;
 };
 
 void fbfront_handler(evtchn_port_t port, struct pt_regs *regs, void *data)
@@ -292,6 +298,7 @@ struct fbfront_dev *init_fbfront(char *n
     dev->stride = s->line_length = stride;
     dev->mem_length = s->mem_length = n * PAGE_SIZE;
     dev->offset = 0;
+    dev->events = NULL;
 
     const int max_pd = sizeof(s->pd) / sizeof(s->pd[0]);
     unsigned long mapped = 0;
@@ -368,13 +375,11 @@ done:
 
         snprintf(path, sizeof(path), "%s/state", dev->backend);
 
-        xenbus_watch_path(XBT_NIL, path);
-
-        xenbus_wait_for_value(path,"4");
+        xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
+
+        xenbus_wait_for_value(path, "4", &dev->events);
 
         printk("%s connected\n", dev->backend);
-
-        xenbus_unwatch_path(XBT_NIL, path);
 
         snprintf(path, sizeof(path), "%s/request-update", dev->backend);
         dev->request_update = xenbus_read_integer(path);
@@ -463,10 +468,12 @@ void shutdown_fbfront(struct fbfront_dev
 
     snprintf(path, sizeof(path), "%s/state", dev->backend);
     err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 5); /* closing */
-    xenbus_wait_for_value(path,"5");
+    xenbus_wait_for_value(path, "5", &dev->events);
 
     err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 6);
-    xenbus_wait_for_value(path,"6");
+    xenbus_wait_for_value(path, "6", &dev->events);
+
+    xenbus_unwatch_path(XBT_NIL, path);
 
     unbind_evtchn(dev->evtchn);
 
diff -r f2457c7aff8d -r 611787b6ca35 extras/mini-os/fs-front.c
--- a/extras/mini-os/fs-front.c Fri Apr 25 20:13:52 2008 +0900
+++ b/extras/mini-os/fs-front.c Thu May 08 18:40:07 2008 +0900
@@ -917,6 +917,7 @@ static int init_fs_import(struct fs_impo
     struct fsif_sring *sring;
     int retry = 0;
     domid_t self_id;
+    xenbus_event_queue events = NULL;
 
     printk("Initialising FS fortend to backend dom %d\n", import->dom_id);
     /* Allocate page for the shared ring */
@@ -1026,8 +1027,9 @@ done:
     sprintf(r_nodename, "%s/state", import->backend);
     sprintf(token, "fs-front-%d", import->import_id);
     /* The token will not be unique if multiple imports are inited */
-    xenbus_watch_path(XBT_NIL, r_nodename/*, token*/);
-    xenbus_wait_for_value(/*token,*/ r_nodename, STATE_READY);
+    xenbus_watch_path_token(XBT_NIL, r_nodename, r_nodename, &events);
+    xenbus_wait_for_value(r_nodename, STATE_READY, &events);
+    xenbus_unwatch_path(XBT_NIL, r_nodename);
     printk("Backend ready.\n");
    
     //create_thread("fs-tester", test_fs_import, import); 
diff -r f2457c7aff8d -r 611787b6ca35 extras/mini-os/include/lib.h
--- a/extras/mini-os/include/lib.h      Fri Apr 25 20:13:52 2008 +0900
+++ b/extras/mini-os/include/lib.h      Thu May 08 18:40:07 2008 +0900
@@ -162,7 +162,7 @@ extern struct file {
              * wakes select for this FD. */
             struct {
                 evtchn_port_t port;
-                volatile unsigned long pending;
+                unsigned long pending;
                 int bound;
             } ports[MAX_EVTCHN_PORTS];
        } evtchn;
@@ -178,10 +178,10 @@ extern struct file {
         struct {
             /* To each xenbus FD is associated a queue of watch events for this
              * FD.  */
-            struct xenbus_event *volatile events;
+            xenbus_event_queue events;
         } xenbus;
     };
-    volatile int read; /* maybe available for read */
+    int read;  /* maybe available for read */
 } files[];
 
 int alloc_fd(enum fd_type type);
diff -r f2457c7aff8d -r 611787b6ca35 extras/mini-os/include/xenbus.h
--- a/extras/mini-os/include/xenbus.h   Fri Apr 25 20:13:52 2008 +0900
+++ b/extras/mini-os/include/xenbus.h   Thu May 08 18:40:07 2008 +0900
@@ -19,17 +19,18 @@ struct xenbus_event {
     char *token;
     struct xenbus_event *next;
 };
+typedef struct xenbus_event *xenbus_event_queue;
 
-char *xenbus_watch_path_token(xenbus_transaction_t xbt, const char *path, 
const char *token, struct xenbus_event *volatile *events);
+char *xenbus_watch_path_token(xenbus_transaction_t xbt, const char *path, 
const char *token, xenbus_event_queue *events);
 char *xenbus_unwatch_path_token(xenbus_transaction_t xbt, const char *path, 
const char *token);
 extern struct wait_queue_head xenbus_watch_queue;
-void xenbus_wait_for_watch(void);
-char **xenbus_wait_for_watch_return(void);
-char* xenbus_wait_for_value(const char *path, const char *value);
+void xenbus_wait_for_watch(xenbus_event_queue *queue);
+char **xenbus_wait_for_watch_return(xenbus_event_queue *queue);
+char* xenbus_wait_for_value(const char *path, const char *value, 
xenbus_event_queue *queue);
 
 /* When no token is provided, use a global queue. */
 #define XENBUS_WATCH_PATH_TOKEN "xenbus_watch_path"
-extern struct xenbus_event * volatile xenbus_events;
+extern xenbus_event_queue xenbus_events;
 #define xenbus_watch_path(xbt, path) xenbus_watch_path_token(xbt, path, 
XENBUS_WATCH_PATH_TOKEN, NULL)
 #define xenbus_unwatch_path(xbt, path) xenbus_unwatch_path_token(xbt, path, 
XENBUS_WATCH_PATH_TOKEN)
 
diff -r f2457c7aff8d -r 611787b6ca35 extras/mini-os/netfront.c
--- a/extras/mini-os/netfront.c Fri Apr 25 20:13:52 2008 +0900
+++ b/extras/mini-os/netfront.c Thu May 08 18:40:07 2008 +0900
@@ -52,6 +52,8 @@ struct netfront_dev {
 
     char *nodename;
     char *backend;
+
+    xenbus_event_queue events;
 
 #ifdef HAVE_LIBC
     int fd;
@@ -328,6 +330,8 @@ struct netfront_dev *init_netfront(char 
 
     dev->netif_rx = thenetif_rx;
 
+    dev->events = NULL;
+
     // FIXME: proper frees on failures
 again:
     err = xenbus_transaction_start(&xbt);
@@ -399,11 +403,9 @@ done:
         char path[strlen(dev->backend) + 1 + 5 + 1];
         snprintf(path, sizeof(path), "%s/state", dev->backend);
 
-        xenbus_watch_path(XBT_NIL, path);
-
-        xenbus_wait_for_value(path,"4");
-
-        xenbus_unwatch_path(XBT_NIL, path);
+        xenbus_watch_path_token(XBT_NIL, path, path, &dev->events);
+
+        xenbus_wait_for_value(path, "4", &dev->events);
 
         if (ip) {
             snprintf(path, sizeof(path), "%s/ip", dev->backend);
@@ -458,10 +460,12 @@ void shutdown_netfront(struct netfront_d
 
     snprintf(path, sizeof(path), "%s/state", dev->backend);
     err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 5); /* closing */
-    xenbus_wait_for_value(path,"5");
+    xenbus_wait_for_value(path, "5", &dev->events);
 
     err = xenbus_printf(XBT_NIL, nodename, "state", "%u", 6);
-    xenbus_wait_for_value(path,"6");
+    xenbus_wait_for_value(path, "6", &dev->events);
+
+    xenbus_unwatch_path(XBT_NIL, path);
 
     unbind_evtchn(dev->evtchn);
 
diff -r f2457c7aff8d -r 611787b6ca35 extras/mini-os/xenbus/xenbus.c
--- a/extras/mini-os/xenbus/xenbus.c    Fri Apr 25 20:13:52 2008 +0900
+++ b/extras/mini-os/xenbus/xenbus.c    Thu May 08 18:40:07 2008 +0900
@@ -45,10 +45,10 @@ static DECLARE_WAIT_QUEUE_HEAD(xb_waitq)
 static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
 DECLARE_WAIT_QUEUE_HEAD(xenbus_watch_queue);
 
-struct xenbus_event *volatile xenbus_events;
+xenbus_event_queue xenbus_events;
 static struct watch {
     char *token;
-    struct xenbus_event *volatile *events;
+    xenbus_event_queue *events;
     struct watch *next;
 } *watches;
 struct xenbus_req_info 
@@ -75,28 +75,34 @@ static void memcpy_from_ring(const void 
     memcpy(dest + c1, ring, c2);
 }
 
-char **xenbus_wait_for_watch_return()
+char **xenbus_wait_for_watch_return(xenbus_event_queue *queue)
 {
     struct xenbus_event *event;
+    if (!queue)
+        queue = &xenbus_events;
     DEFINE_WAIT(w);
-    while (!(event = xenbus_events)) {
+    while (!(event = *queue)) {
         add_waiter(w, xenbus_watch_queue);
         schedule();
     }
     remove_waiter(w);
-    xenbus_events = event->next;
+    *queue = event->next;
     return &event->path;
 }
 
-void xenbus_wait_for_watch(void)
+void xenbus_wait_for_watch(xenbus_event_queue *queue)
 {
     char **ret;
-    ret = xenbus_wait_for_watch_return();
+    if (!queue)
+        queue = &xenbus_events;
+    ret = xenbus_wait_for_watch_return(queue);
     free(ret);
 }
 
-char* xenbus_wait_for_value(const char* path, const char* value)
-{
+char* xenbus_wait_for_value(const char* path, const char* value, 
xenbus_event_queue *queue)
+{
+    if (!queue)
+        queue = &xenbus_events;
     for(;;)
     {
         char *res, *msg;
@@ -109,7 +115,7 @@ char* xenbus_wait_for_value(const char* 
         free(res);
 
         if(r==0) break;
-        else xenbus_wait_for_watch();
+        else xenbus_wait_for_watch(queue);
     }
     return NULL;
 }
@@ -147,8 +153,8 @@ static void xenbus_thread_func(void *ign
 
             if(msg.type == XS_WATCH_EVENT)
             {
-               struct xenbus_event *event = malloc(sizeof(*event) + msg.len),
-                                    *volatile *events = NULL;
+               struct xenbus_event *event = malloc(sizeof(*event) + msg.len);
+                xenbus_event_queue *events = NULL;
                char *data = (char*)event + sizeof(*event);
                 struct watch *watch;
 
@@ -167,8 +173,6 @@ static void xenbus_thread_func(void *ign
                         events = watch->events;
                         break;
                     }
-                if (!events)
-                    events = &xenbus_events;
 
                event->next = *events;
                *events = event;
@@ -463,7 +467,7 @@ char *xenbus_write(xenbus_transaction_t 
     return NULL;
 }
 
-char* xenbus_watch_path_token( xenbus_transaction_t xbt, const char *path, 
const char *token, struct xenbus_event *volatile *events)
+char* xenbus_watch_path_token( xenbus_transaction_t xbt, const char *path, 
const char *token, xenbus_event_queue *events)
 {
     struct xsd_sockmsg *rep;
 
@@ -473,6 +477,9 @@ char* xenbus_watch_path_token( xenbus_tr
     };
 
     struct watch *watch = malloc(sizeof(*watch));
+
+    if (!events)
+        events = &xenbus_events;
 
     watch->token = strdup(token);
     watch->events = events;
diff -r f2457c7aff8d -r 611787b6ca35 tools/examples/xend-config-xenapi.sxp
--- a/tools/examples/xend-config-xenapi.sxp     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/examples/xend-config-xenapi.sxp     Thu May 08 18:40:07 2008 +0900
@@ -66,9 +66,9 @@
 
 
 # Address and port xend should use for the legacy TCP XMLRPC interface, 
-# if xen-tcp-xmlrpc-server is set.
-#(xen-tcp-xmlrpc-server-address 'localhost')
-#(xen-tcp-xmlrpc-server-port 8006)
+# if xend-tcp-xmlrpc-server is set.
+#(xend-tcp-xmlrpc-server-address 'localhost')
+#(xend-tcp-xmlrpc-server-port 8006)
 
 # SSL key and certificate to use for the legacy TCP XMLRPC interface.
 # Setting these will mean that this port serves only SSL connections as
diff -r f2457c7aff8d -r 611787b6ca35 tools/examples/xend-config.sxp
--- a/tools/examples/xend-config.sxp    Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/examples/xend-config.sxp    Thu May 08 18:40:07 2008 +0900
@@ -64,9 +64,9 @@
 
 
 # Address and port xend should use for the legacy TCP XMLRPC interface, 
-# if xen-tcp-xmlrpc-server is set.
-#(xen-tcp-xmlrpc-server-address 'localhost')
-#(xen-tcp-xmlrpc-server-port 8006)
+# if xend-tcp-xmlrpc-server is set.
+#(xend-tcp-xmlrpc-server-address 'localhost')
+#(xend-tcp-xmlrpc-server-port 8006)
 
 # SSL key and certificate to use for the legacy TCP XMLRPC interface.
 # Setting these will mean that this port serves only SSL connections as
@@ -81,6 +81,15 @@
 # Port xend should use for the relocation interface, if xend-relocation-server
 # is set.
 #(xend-relocation-port 8002)
+
+# Whether to use TLS when relocating.
+#(xend-relocation-tls no)
+
+# SSL key and certificate to use for the relocation interface.
+# Setting these will mean that this port serves only SSL connections as
+# opposed to plaintext ones.
+#(xend-relocation-server-ssl-key-file  /etc/xen/xmlrpc.key)
+#(xend-relocation-server-ssl-cert-file  /etc/xen/xmlrpc.crt)
 
 # Address xend should listen on for HTTP connections, if xend-http-server is
 # set.
diff -r f2457c7aff8d -r 611787b6ca35 tools/examples/xmexample.hvm
--- a/tools/examples/xmexample.hvm      Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/examples/xmexample.hvm      Thu May 08 18:40:07 2008 +0900
@@ -219,3 +219,28 @@ serial='pty'
 #-----------------------------------------------------------------------------
 #   Set keyboard layout, default is en-us keyboard. 
 #keymap='ja'
+
+#-----------------------------------------------------------------------------
+#   Configure guest CPUID responses:
+#cpuid=[ '1:ecx=xxxxxxxxxxxxxxxxxxxxxxxxxx1xxxxx,
+#           eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ]
+# - Set the VMX feature flag in the guest (CPUID_1:ECX:5)
+# - Default behaviour for all other bits in the ECX and EAX registers.
+# 
+# Each successive character represents a less significant bit:
+#  '1' -> force the corresponding bit to 1
+#  '0' -> force to 0
+#  'x' -> we don't care (default behaviour)
+#  'k' -> pass through the host bit value
+#  's' -> as 'k' but preserve across save/restore and migration
+#
+#   Configure host CPUID consistency checks, which must be satisfied for this
+#   VM to be allowed to run on this host's processor type:
+#cpuid_check=[ '1:ecx=xxxxxxxxxxxxxxxxxxxxxxxxxx1xxxxx' ]
+# - Host must have VMX feature flag set
+#
+# The format is similar to the above for 'cpuid':
+#  '1' -> the bit must be '1'
+#  '0' -> the bit must be '0'
+#  'x' -> we don't care (do not check)
+#  's' -> the bit must be the same as on the host that started this VM
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/Makefile.target
--- a/tools/ioemu/Makefile.target       Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/ioemu/Makefile.target       Thu May 08 18:40:07 2008 +0900
@@ -370,7 +370,7 @@ endif
 
 ifdef CONFIG_PASSTHROUGH
 LIBS+=-lpci
-VL_OBJS+= pass-through.o
+VL_OBJS+= pass-through.o pt-msi.o
 CFLAGS += -DCONFIG_PASSTHROUGH
 $(info *** PCI passthrough capability has been enabled ***)
 endif
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c       Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/ioemu/hw/cirrus_vga.c       Thu May 08 18:40:07 2008 +0900
@@ -234,8 +234,6 @@ typedef struct CirrusVGAState {
     int cirrus_linear_io_addr;
     int cirrus_linear_bitblt_io_addr;
     int cirrus_mmio_io_addr;
-    unsigned long cirrus_lfb_addr;
-    unsigned long cirrus_lfb_end;
     uint32_t cirrus_addr_mask;
     uint32_t linear_mmio_mask;
     uint8_t cirrus_shadow_gr0;
@@ -2657,11 +2655,11 @@ static void cirrus_update_memory_access(
         
        mode = s->gr[0x05] & 0x7;
        if (mode < 4 || mode > 5 || ((s->gr[0x0B] & 0x4) == 0)) {
-            if (s->cirrus_lfb_addr && s->cirrus_lfb_end && !s->map_addr) {
+            if (s->lfb_addr && s->lfb_end && !s->map_addr) {
                 void *vram_pointer, *old_vram;
 
-                vram_pointer = set_vram_mapping(s->cirrus_lfb_addr,
-                                                s->cirrus_lfb_end);
+                vram_pointer = set_vram_mapping(s->lfb_addr,
+                                                s->lfb_end);
                 if (!vram_pointer)
                     fprintf(stderr, "NULL vram_pointer\n");
                 else {
@@ -2669,21 +2667,21 @@ static void cirrus_update_memory_access(
                                                VGA_RAM_SIZE);
                     qemu_free(old_vram);
                 }
-                s->map_addr = s->cirrus_lfb_addr;
-                s->map_end = s->cirrus_lfb_end;
+                s->map_addr = s->lfb_addr;
+                s->map_end = s->lfb_end;
             }
             s->cirrus_linear_write[0] = cirrus_linear_mem_writeb;
             s->cirrus_linear_write[1] = cirrus_linear_mem_writew;
             s->cirrus_linear_write[2] = cirrus_linear_mem_writel;
         } else {
         generic_io:
-            if (s->cirrus_lfb_addr && s->cirrus_lfb_end && s->map_addr) {
+            if (s->lfb_addr && s->lfb_end && s->map_addr) {
                 void *old_vram;
 
                 old_vram = vga_update_vram((VGAState *)s, NULL, VGA_RAM_SIZE);
 
-                unset_vram_mapping(s->cirrus_lfb_addr,
-                                   s->cirrus_lfb_end, 
+                unset_vram_mapping(s->lfb_addr,
+                                   s->lfb_end, 
                                    old_vram);
 
                 s->map_addr = s->map_end = 0;
@@ -3049,27 +3047,27 @@ void cirrus_stop_acc(CirrusVGAState *s)
     if (s->map_addr){
         int error;
         s->map_addr = 0;
-        error = unset_vram_mapping(s->cirrus_lfb_addr,
-                s->cirrus_lfb_end, s->vram_ptr);
+        error = unset_vram_mapping(s->lfb_addr,
+                s->lfb_end, s->vram_ptr);
         fprintf(stderr, "cirrus_stop_acc:unset_vram_mapping.\n");
     }
 }
 
 void cirrus_restart_acc(CirrusVGAState *s)
 {
-    if (s->cirrus_lfb_addr && s->cirrus_lfb_end) {
+    if (s->lfb_addr && s->lfb_end) {
         void *vram_pointer, *old_vram;
         fprintf(stderr, "cirrus_vga_load:re-enable vga acc.lfb_addr=0x%lx, 
lfb_end=0x%lx.\n",
-                s->cirrus_lfb_addr, s->cirrus_lfb_end);
-        vram_pointer = set_vram_mapping(s->cirrus_lfb_addr ,s->cirrus_lfb_end);
+                s->lfb_addr, s->lfb_end);
+        vram_pointer = set_vram_mapping(s->lfb_addr ,s->lfb_end);
         if (!vram_pointer){
             fprintf(stderr, "cirrus_vga_load:NULL vram_pointer\n");
         } else {
             old_vram = vga_update_vram((VGAState *)s, vram_pointer,
                     VGA_RAM_SIZE);
             qemu_free(old_vram);
-            s->map_addr = s->cirrus_lfb_addr;
-            s->map_end = s->cirrus_lfb_end;
+            s->map_addr = s->lfb_addr;
+            s->map_end = s->lfb_end;
         }
     }
 }
@@ -3120,8 +3118,8 @@ static void cirrus_vga_save(QEMUFile *f,
 
     vga_acc = (!!s->map_addr);
     qemu_put_8s(f, &vga_acc);
-    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
-    qemu_put_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_put_be64s(f, (uint64_t*)&s->lfb_addr);
+    qemu_put_be64s(f, (uint64_t*)&s->lfb_end);
     qemu_put_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
 }
 
@@ -3175,8 +3173,8 @@ static int cirrus_vga_load(QEMUFile *f, 
     qemu_get_be32s(f, &s->hw_cursor_y);
 
     qemu_get_8s(f, &vga_acc);
-    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_addr);
-    qemu_get_be64s(f, (uint64_t*)&s->cirrus_lfb_end);
+    qemu_get_be64s(f, (uint64_t*)&s->lfb_addr);
+    qemu_get_be64s(f, (uint64_t*)&s->lfb_end);
     qemu_get_buffer(f, s->vram_ptr, VGA_RAM_SIZE); 
     if (vga_acc){
         cirrus_restart_acc(s);
@@ -3337,11 +3335,11 @@ static void cirrus_pci_lfb_map(PCIDevice
     /* XXX: add byte swapping apertures */
     cpu_register_physical_memory(addr, s->vram_size,
                                 s->cirrus_linear_io_addr);
-    s->cirrus_lfb_addr = addr;
-    s->cirrus_lfb_end = addr + VGA_RAM_SIZE;
-
-    if (s->map_addr && (s->cirrus_lfb_addr != s->map_addr) &&
-        (s->cirrus_lfb_end != s->map_end))
+    s->lfb_addr = addr;
+    s->lfb_end = addr + VGA_RAM_SIZE;
+
+    if (s->map_addr && (s->lfb_addr != s->map_addr) &&
+        (s->lfb_end != s->map_end))
         fprintf(logfile, "cirrus vga map change while on lfb mode\n");
 
     cpu_register_physical_memory(addr + 0x1000000, 0x400000,
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/hw/pass-through.c
--- a/tools/ioemu/hw/pass-through.c     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/ioemu/hw/pass-through.c     Thu May 08 18:40:07 2008 +0900
@@ -26,6 +26,7 @@
 #include "pass-through.h"
 #include "pci/header.h"
 #include "pci/pci.h"
+#include "pt-msi.h"
 
 extern FILE *logfile;
 
@@ -286,6 +287,9 @@ static void pt_pci_write_config(PCIDevic
         pci_default_write_config(d, address, val, len);
         return;
     }
+
+    if ( pt_msi_write(assigned_device, address, val, len) )
+        return;
 
     /* PCI config pass-through */
     if (address == 0x4) {
@@ -333,6 +337,7 @@ static uint32_t pt_pci_read_config(PCIDe
         break;
     }
 
+    pt_msi_read(assigned_device, address, len, &val);
 exit:
 
 #ifdef PT_DEBUG_PCI_CONFIG_ACCESS
@@ -445,11 +450,41 @@ static int pt_unregister_regions(struct 
 
 }
 
+/* Return the config-space offset of capability 'cap', or 0 if not found. */
+uint8_t find_cap_offset(struct pci_dev *pci_dev, uint8_t cap)
+{
+    int next_ptr = PCI_CAPABILITY_LIST; /* register holding the next pointer */
+    int budget;                         /* bounds the walk to 48 entries */
+
+    if ( !(pci_read_byte(pci_dev, PCI_STATUS) & PCI_STATUS_CAP_LIST) )
+        return 0;
+
+    for ( budget = 48; budget != 0; budget-- )
+    {
+        int cap_id, cap_pos = pci_read_byte(pci_dev, next_ptr);
+
+        /* Pointers below 0x40 end the walk. */
+        if ( cap_pos < 0x40 )
+            return 0;
+
+        cap_pos &= ~3;
+        cap_id = pci_read_byte(pci_dev, cap_pos + PCI_CAP_LIST_ID);
+        if ( cap_id == 0xff )
+            return 0;
+        if ( cap_id == cap )
+            return cap_pos;
+
+        next_ptr = cap_pos + PCI_CAP_LIST_NEXT;
+    }
+
+    return 0;
+}
+
 struct pt_dev * register_real_device(PCIBus *e_bus,
         const char *e_dev_name, int e_devfn, uint8_t r_bus, uint8_t r_dev,
         uint8_t r_func, uint32_t machine_irq, struct pci_access *pci_access)
 {
-    int rc = -1, i;
+    int rc = -1, i, pos;
     struct pt_dev *assigned_device = NULL;
     struct pci_dev *pci_dev;
     uint8_t e_device, e_intx;
@@ -511,6 +546,9 @@ struct pt_dev * register_real_device(PCI
     for ( i = 0; i < PCI_CONFIG_SIZE; i++ )
         assigned_device->dev.config[i] = pci_read_byte(pci_dev, i);
 
+    if ( (pos = find_cap_offset(pci_dev, PCI_CAP_ID_MSI)) )
+        pt_msi_init(assigned_device, pos);
+
     /* Handle real device's MMIO/PIO BARs */
     pt_register_regions(assigned_device);
 
@@ -519,7 +557,21 @@ struct pt_dev * register_real_device(PCI
     e_intx = assigned_device->dev.config[0x3d]-1;
 
     if ( PT_MACHINE_IRQ_AUTO == machine_irq )
+    {
+        int pirq = pci_dev->irq;
+
         machine_irq = pci_dev->irq;
+        rc = xc_physdev_map_pirq(xc_handle, domid, MAP_PIRQ_TYPE_GSI,
+                                machine_irq, &pirq);
+
+        if ( rc )
+        {
+            /* TBD: unregister device in case of an error */
+            PT_LOG("Error: Mapping irq failed, rc = %d\n", rc);
+        }
+        else
+            machine_irq = pirq;
+    }
 
     /* bind machine_irq to device */
     if ( 0 != machine_irq )
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/hw/pass-through.h
--- a/tools/ioemu/hw/pass-through.h     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/ioemu/hw/pass-through.h     Thu May 08 18:40:07 2008 +0900
@@ -57,6 +57,14 @@ struct pt_region {
     } access;
 };
 
+struct pt_msi_info {
+    uint32_t flags; /* MSI control bits | MSI_FLAG_UNINIT | PT_MSI_MAPPED */
+    int offset;     /* config-space offset of the MSI capability */
+    int size;       /* number of capability bytes that are emulated */
+    int pvec;   /* physical vector used */
+    int pirq;   /* pirq obtained when the physical MSI is mapped */
+};
+
 /*
     This structure holds the context of the mapping functions
     and data that is relevant for qemu device management.
@@ -65,6 +73,7 @@ struct pt_dev {
     PCIDevice dev;
     struct pci_dev *pci_dev;                     /* libpci struct */
     struct pt_region bases[PCI_NUM_REGIONS];    /* Access regions */
+    struct pt_msi_info *msi;                    /* MSI virtualization */
 };
 
 /* Used for formatting PCI BDF into cf8 format */
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/hw/pt-msi.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ioemu/hw/pt-msi.c   Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Jiang Yunhong <yunhong.jiang@xxxxxxxxx>
+ *
+ * This file implements direct PCI assignment to a HVM guest
+ */
+
+#include "pt-msi.h"
+
+/* Control bits (high/low byte) taken from the device, never from the guest */
+#define PT_MSI_CTRL_WR_MASK_HI      (0x1)
+#define PT_MSI_CTRL_WR_MASK_LO      (0x8E)
+/* Bits of the second message-data byte that guest writes cannot set */
+#define PT_MSI_DATA_WR_MASK         (0x38)
+/*
+ * Allocate dev->msi for the MSI capability at config offset 'pos', turn MSI
+ * off on the physical device if it is enabled and seed the emulated config
+ * bytes.  Returns 0, or -1 on a wrong capability id or failed allocation.
+ * NOTE(review): flags come from a byte read, so PCI_MSI_FLAGS_PVMASK (0x100)
+ * can never be observed here -- confirm intent.
+ */
+int pt_msi_init(struct pt_dev *dev, int pos)
+{
+    struct pci_dev *pd = dev->pci_dev;
+    PCIDevice *d = (struct PCIDevice *)dev;
+    struct pt_msi_info *info;
+    uint16_t flags;
+    uint8_t id = pci_read_byte(pd, pos + PCI_CAP_LIST_ID);
+
+    if ( id != PCI_CAP_ID_MSI )
+    {
+        PT_LOG("pt_msi_init: error id %x pos %x\n", id, pos);
+        return -1;
+    }
+
+    dev->msi = info = calloc(1, sizeof(struct pt_msi_info));
+    if ( !info )
+    {
+        PT_LOG("pt_msi_init: error allocation pt_msi_info\n");
+        return -1;
+    }
+    info->offset = pos;
+    flags = pci_read_byte(pd, pos + PCI_MSI_FLAGS);
+    if ( flags & PCI_MSI_FLAGS_ENABLE )
+    {
+        PT_LOG("pt_msi_init: MSI enabled already, disable first\n");
+        pci_write_byte(pd, pos + PCI_MSI_FLAGS, flags & ~PCI_MSI_FLAGS_ENABLE);
+    }
+    info->flags |= (flags | MSI_FLAG_UNINIT);
+    /* 0xa bytes, +4 with a 64-bit address, +10 with PCI_MSI_FLAGS_PVMASK */
+    info->size = 0xa + ((flags & PCI_MSI_FLAGS_64BIT) ? 4 : 0)
+                     + ((flags & PCI_MSI_FLAGS_PVMASK) ? 10 : 0);
+
+    /* All registers are 0 after reset, except the first 4 bytes */
+    *(uint32_t *)(&d->config[pos]) = pci_read_long(pd, pos);
+    d->config[pos + 2] &= PT_MSI_CTRL_WR_MASK_LO;
+    d->config[pos + 3] &= PT_MSI_CTRL_WR_MASK_HI;
+    return 0;
+}
+
+/*
+ * Map a physical MSI for the device (without enabling it) and store the pirq
+ * reported by xc_physdev_map_pirq_msi().  Returns 0 on success, else -1.
+ */
+static int pt_msi_setup(struct pt_dev *dev)
+{
+    struct pci_dev *pd = dev->pci_dev;
+    int vector = -1, pirq = -1, devfn;
+
+    if ( (dev->msi->flags & MSI_FLAG_UNINIT) == 0 )
+    {
+        PT_LOG("setup physical after initialized?? \n");
+        return -1;
+    }
+
+    devfn = pd->dev << 3 | pd->func;
+    if ( xc_physdev_map_pirq_msi(xc_handle, domid, MAP_PIRQ_TYPE_MSI,
+                                 vector, &pirq, devfn, pd->bus, 1) != 0 )
+    {
+        PT_LOG("error map vector %x\n", vector);
+        return -1;
+    }
+    dev->msi->pirq = pirq;
+    PT_LOG("vector %x pirq %x\n", vector, pirq);
+    return 0;
+}
+
+/*
+ * Mask bits held in the emulated config space; the caller should make sure
+ * mask is supported.  The mask dword is at capability offset 0x10 when the
+ * address is 64-bit and at 0xc otherwise, which is how pt_msi_write()
+ * decodes those offsets.
+ */
+static uint32_t get_msi_gmask(struct pt_dev *d)
+{
+    struct PCIDevice *pd = (struct PCIDevice *)d;
+    int mask_off = (d->msi->flags & PCI_MSI_FLAGS_64BIT) ? 0x10 : 0xc;
+
+    return *(uint32_t *)(pd->config + d->msi->offset + mask_off);
+}
+
+/* Message data word held in the emulated config space */
+static uint16_t get_msi_gdata(struct pt_dev *d)
+{
+    struct PCIDevice *pd = (struct PCIDevice *)d;
+    int data_off = PCI_MSI_DATA_32;
+    if ( d->msi->flags & PCI_MSI_FLAGS_64BIT )
+        data_off = PCI_MSI_DATA_64;
+    return *(uint16_t *)(pd->config + d->msi->offset + data_off);
+}
+
+/*
+ * Message address held in the emulated config space; the upper dword is
+ * only included when the capability is flagged 64-bit.
+ */
+static uint64_t get_msi_gaddr(struct pt_dev *d)
+{
+    struct PCIDevice *pd = (struct PCIDevice *)d;
+    uint8_t *cap = pd->config + d->msi->offset;
+    uint64_t lo;
+    uint64_t hi = 0;
+
+    lo = *(uint32_t *)(cap + PCI_MSI_ADDRESS_LO);
+    if ( d->msi->flags & PCI_MSI_FLAGS_64BIT )
+        hi = *(uint32_t *)(cap + PCI_MSI_ADDRESS_HI);
+
+    return lo | (hi << 32);
+}
+
+/* First byte of the emulated MSI control register */
+static uint8_t get_msi_gctrl(struct pt_dev *d)
+{
+    struct PCIDevice *pd = (struct PCIDevice *)d;
+    return pd->config[d->msi->offset + PCI_MSI_FLAGS];
+}
+
+/*
+ * Pack the destination id, redirection and destination-mode bits of the
+ * emulated address plus the delivery and trigger mode of the emulated data
+ * into the gflags word that pt_msi_update() passes on.
+ */
+static uint32_t get_msi_gflags(struct pt_dev *d)
+{
+    uint16_t data = get_msi_gdata(d);
+    uint64_t addr = get_msi_gaddr(d);
+    int dest_id = (addr >> MSI_TARGET_CPU_SHIFT) & 0xff;
+    int rh = (addr >> MSI_ADDR_REDIRECTION_SHIFT) & 0x1;
+    int dm = (addr >> MSI_ADDR_DESTMODE_SHIFT) & 0x1;
+    int deliv_mode = (data >> MSI_DATA_DELIVERY_SHIFT) & 0x7;
+    int trig_mode = (data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
+    uint32_t gflags = dest_id;
+
+    gflags |= rh << GFLAGS_SHIFT_RH;
+    gflags |= dm << GFLAGS_SHIFT_DM;
+    gflags |= deliv_mode << GLFAGS_SHIFT_DELIV_MODE;
+    gflags |= trig_mode << GLFAGS_SHIFT_TRG_MODE;
+    return gflags;
+}
+
+/*
+ * Vector = low byte of the emulated message data (may be arch-specific).
+ */
+static inline uint8_t get_msi_gvec(struct pt_dev *d)
+{
+    return (uint8_t)get_msi_gdata(d);
+}
+
+/*
+ * Vector currently programmed in the physical device's message data
+ * register.  The PCI_MSI_DATA_* constants are capability-relative, so the
+ * capability offset has to be added before reading the device.
+ */
+static inline uint8_t get_msi_hvec(struct pt_dev *d)
+{
+    int data_reg = d->msi->offset + ((d->msi->flags & PCI_MSI_FLAGS_64BIT) ?
+                                     PCI_MSI_DATA_64 : PCI_MSI_DATA_32);
+
+    return pci_read_word(d->pci_dev, data_reg) & 0xff;
+}
+
+/*
+ * Hand the emulated vector and gflags for our pirq to
+ * xc_domain_update_msi_irq() and return its result.
+ */
+static int pt_msi_update(struct pt_dev *d)
+{
+    uint8_t gvec = get_msi_gvec(d);
+    PT_LOG("now update msi with pirq %x gvec %x\n", d->msi->pirq, gvec);
+    return xc_domain_update_msi_irq(xc_handle, domid, gvec,
+                                     d->msi->pirq, get_msi_gflags(d));
+}
+
+/* Set or clear the MSI enable bit on the physical device; -1 if no pci_dev */
+static int pt_msi_enable(struct pt_dev *d, int enable)
+{
+    struct pci_dev *pd = d->pci_dev;
+    int ctrl_reg;
+    uint16_t ctrl;
+
+    if ( pd == NULL )
+        return -1;
+
+    ctrl_reg = d->msi->offset + PCI_MSI_FLAGS;
+    ctrl = pci_read_word(pd, ctrl_reg);
+    ctrl = enable ? (ctrl | PCI_MSI_FLAGS_ENABLE)
+                  : (ctrl & ~PCI_MSI_FLAGS_ENABLE);
+    pci_write_word(pd, ctrl_reg, ctrl);
+
+    return 0;
+}
+
+/*
+ * React to a guest write of the MSI control register.  The first enable
+ * maps and binds the physical MSI before switching it on; afterwards only
+ * enable/disable transitions are mirrored to the device.  Returns 0, or -1
+ * if pt_msi_setup() fails.
+ */
+static int pt_msi_control_update(struct pt_dev *d, uint16_t old_ctrl)
+{
+    PCIDevice *pd = (PCIDevice *)d;
+    uint16_t new_ctrl = get_msi_gctrl(d);
+    int was_on = old_ctrl & PCI_MSI_FLAGS_ENABLE;
+    int now_on = new_ctrl & PCI_MSI_FLAGS_ENABLE;
+
+    PT_LOG("old_ctrl %x new_Ctrl %x\n", old_ctrl, new_ctrl);
+
+    if ( now_on && (d->msi->flags & MSI_FLAG_UNINIT) )
+    {
+        /* Init physical one */
+        PT_LOG("setup msi for dev %x\n", pd->devfn);
+        if ( pt_msi_setup(d) )
+        {
+            PT_LOG("pt_msi_setup error!!!\n");
+            return -1;
+        }
+        pt_msi_update(d);
+        d->msi->flags &= ~MSI_FLAG_UNINIT;
+        d->msi->flags |= PT_MSI_MAPPED;
+
+        /* Enable physical MSI only after bind */
+        pt_msi_enable(d, 1);
+    }
+    else if ( now_on != was_on )
+        pt_msi_enable(d, now_on ? 1 : 0);
+
+    /* Currently no support for multi-vector */
+    if ( (new_ctrl & PCI_MSI_FLAGS_QSIZE) != 0x0 )
+        PT_LOG("try to set more than 1 vector ctrl %x\n", new_ctrl);
+
+    return 0;
+}
+
+/*
+ * After a guest write to the MSI address/data registers: if either value
+ * changed while MSI is enabled, refresh the binding.  Always returns 0.
+ * Addresses are logged as unsigned long long since uint64_t need not be long.
+ */
+static int
+pt_msi_map_update(struct pt_dev *d, uint32_t old_data, uint64_t old_addr)
+{
+    uint32_t data = get_msi_gdata(d);
+    uint64_t addr = get_msi_gaddr(d);
+
+    PT_LOG("old_data %x old_addr %llx data %x addr %llx\n", old_data,
+           (unsigned long long)old_addr, data, (unsigned long long)addr);
+
+    if ( (data != old_data || addr != old_addr) &&
+         (get_msi_gctrl(d) & PCI_MSI_FLAGS_ENABLE) )
+        pt_msi_update(d);
+    return 0;
+}
+
+/*
+ * Mirror a guest change of the mask bits to the physical device.  The mask
+ * dword is at capability offset 0x10 for a 64-bit address, else at 0xc.
+ * Returns 0, or -1 when PCI_MSI_FLAGS_PVMASK is not set for the device.
+ */
+static int pt_msi_mask_update(struct pt_dev *d, uint32_t old_mask)
+{
+    PCIDevice *dev = (PCIDevice *)d;
+    int offset = d->msi->offset +
+                 ((d->msi->flags & PCI_MSI_FLAGS_64BIT) ? 0x10 : 0xc);
+    uint32_t mask;
+
+    if ( !(d->msi->flags & PCI_MSI_FLAGS_PVMASK) )
+        return -1;
+    mask = *(uint32_t *)(dev->config + offset);
+    if ( old_mask != mask )
+        pci_write_long(d->pci_dev, offset, mask);
+    return 0;
+}
+
+#define ACCESSED_DATA 0x2
+#define ACCESSED_MASK 0x4
+#define ACCESSED_ADDR 0x8
+#define ACCESSED_CTRL 0x10
+/*
+ * Emulate a guest config write of 'len' bytes of 'val' at 'addr' that may
+ * overlap the MSI capability.  Writable bytes are stored in the emulated
+ * config space one at a time; afterwards address/data, mask and control
+ * changes are propagated, control (which may enable the physical MSI) last.
+ * Returns 1 if the write overlaps the capability -- the caller then skips
+ * its pass-through -- and 0 otherwise or if the device has no MSI state.
+ */
+int pt_msi_write(struct pt_dev *d, uint32_t addr, uint32_t val, uint32_t len)
+{
+    struct pci_dev *pd;
+    int i, cur = addr;
+    uint8_t value, flags = 0;
+    uint16_t old_ctrl = 0, old_data = 0;
+    uint32_t old_mask = 0;
+    uint64_t old_addr = 0;
+    PCIDevice *dev = (PCIDevice *)d;
+
+    if ( !d || !d->msi )
+        return 0;
+
+    /* [addr, addr + len) must share at least one byte with the capability */
+    if ( (addr >= (d->msi->offset + d->msi->size) ) ||
+         (addr + len) <= d->msi->offset)
+        return 0;
+
+    PT_LOG("addr %x val %x len %x offset %x size %x\n",
+            addr, val, len, d->msi->offset, d->msi->size);
+
+    pd = d->pci_dev;
+    old_ctrl = get_msi_gctrl(d);
+    old_addr = get_msi_gaddr(d);
+    old_data = get_msi_gdata(d);
+    if ( d->msi->flags & PCI_MSI_FLAGS_PVMASK )
+        old_mask = get_msi_gmask(d);
+
+    for ( i = 0; i < len; i++, cur++ )
+    {
+        /* Writability is decided per byte and must not carry over */
+        int off, can_write = 1;
+        uint8_t ro_bits;
+
+        if ( cur < d->msi->offset )
+            continue;
+        else if ( cur >= (d->msi->offset + d->msi->size) )
+            break;
+
+        off = cur - d->msi->offset;
+        value = (val >> (i * 8)) & 0xff;
+
+        switch ( off )
+        {
+            case 0x0 ... 0x1:
+                can_write = 0;
+                break;
+            case 0x2:
+            case 0x3:
+                flags |= ACCESSED_CTRL;
+                /* masked bits follow the hardware, the rest the guest */
+                ro_bits = (off == 2) ? PT_MSI_CTRL_WR_MASK_LO :
+                                       PT_MSI_CTRL_WR_MASK_HI;
+                value = (pci_read_byte(pd, cur) & ro_bits) |
+                        (value & ~ro_bits);
+                break;
+            case 0x4 ... 0x7:
+                flags |= ACCESSED_ADDR;
+                /* bit 4 ~ 11 is reserved for MSI in x86 */
+                if ( off == 0x4 )
+                    value &= 0x0f;
+                if ( off == 0x5 )
+                    value &= 0xf0;
+                break;
+            case 0x8 ... 0xb:
+                if ( d->msi->flags & PCI_MSI_FLAGS_64BIT )
+                {
+                    /* Up 32bit is reserved in x86 */
+                    flags |= ACCESSED_ADDR;
+                    if ( value )
+                        PT_LOG("Write up32 addr with %x \n", value);
+                }
+                else
+                {
+                    if ( off == 0xa || off == 0xb )
+                        can_write = 0;
+                    else
+                        flags |= ACCESSED_DATA;
+                    if ( off == 0x9 )
+                        value &= ~PT_MSI_DATA_WR_MASK;
+                }
+                break;
+            case 0xc ... 0xf:
+                if ( d->msi->flags & PCI_MSI_FLAGS_64BIT )
+                {
+                    if ( off == 0xe || off == 0xf )
+                        can_write = 0;
+                    else
+                    {
+                        flags |= ACCESSED_DATA;
+                        if (off == 0xd)
+                            value &= ~PT_MSI_DATA_WR_MASK;
+                    }
+                }
+                else
+                {
+                    if ( d->msi->flags & PCI_MSI_FLAGS_PVMASK )
+                        flags |= ACCESSED_MASK;
+                    else
+                        PT_LOG("why comes to MASK without mask support??\n");
+                }
+                break;
+            case 0x10 ... 0x13:
+                if ( d->msi->flags & PCI_MSI_FLAGS_64BIT )
+                {
+                    if ( d->msi->flags & PCI_MSI_FLAGS_PVMASK )
+                        flags |= ACCESSED_MASK;
+                    else
+                        PT_LOG("why comes to MASK without mask support??\n");
+                }
+                else
+                    can_write = 0;
+                break;
+            case 0x14 ... 0x18:
+                can_write = 0;
+                break;
+            default:
+                PT_LOG("Non MSI register!!!\n");
+                break;
+        }
+        if ( can_write )
+            dev->config[cur] = value;
+    }
+
+    if ( flags & ACCESSED_DATA || flags & ACCESSED_ADDR )
+        pt_msi_map_update(d, old_data, old_addr);
+    if ( flags & ACCESSED_MASK )
+        pt_msi_mask_update(d, old_mask);
+
+    /* This will enable physical one, do it in last step */
+    if ( flags & ACCESSED_CTRL )
+        pt_msi_control_update(d, old_ctrl);
+    return 1;
+}
+
+/*
+ * Overlay the emulated MSI capability onto the value of a config read of
+ * 'len' bytes at 'addr': every byte of '*val' that falls inside the
+ * capability is replaced.  Returns the number of bytes replaced, 0 if the
+ * read misses the capability or the device has no MSI state.
+ */
+int pt_msi_read(struct pt_dev *d, int addr, int len, uint32_t *val)
+{
+    PCIDevice *pd = (PCIDevice *)d;
+    int cap_start, cap_end, first, last, pos;
+
+    if ( !d || !d->msi )
+        return 0;
+
+    cap_start = d->msi->offset;
+    cap_end = cap_start + d->msi->size;
+    if ( addr >= cap_end || (addr + len) <= cap_start )
+        return 0;
+
+    PT_LOG("pt_msi_read addr %x len %x val %x offset %x size %x\n",
+            addr, len, *val, d->msi->offset, d->msi->size);
+
+    first = (addr < cap_start) ? cap_start : addr;
+    last = (addr + len > cap_end) ? cap_end : addr + len;
+
+    for ( pos = first; pos < last; pos++ )
+    {
+        int shift = (pos - addr) * 8;
+        /* Shift as uint32_t: byte 3 would overflow a signed int */
+        *val &= ~((uint32_t)0xff << shift);
+        *val |= (uint32_t)pd->config[pos] << shift;
+    }
+
+    return last - first;
+}
+
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/hw/pt-msi.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ioemu/hw/pt-msi.h   Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,65 @@
+#ifndef _PT_MSI_H
+#define _PT_MSI_H
+
+#include "vl.h"
+#include "pci/header.h"
+#include "pci/pci.h"
+#include "pass-through.h"
+
+/* State bits kept in pt_msi_info.flags next to the MSI control bits */
+#define MSI_FLAG_UNINIT 0x1000
+#define PT_MSI_MAPPED   0x2000
+
+#define MSI_DATA_VECTOR_SHIFT          0
+#define     MSI_DATA_VECTOR(v)         (((u8)(v)) << MSI_DATA_VECTOR_SHIFT)
+
+#define MSI_DATA_DELIVERY_SHIFT        8
+#define     MSI_DATA_DELIVERY_FIXED    (0 << MSI_DATA_DELIVERY_SHIFT)
+#define     MSI_DATA_DELIVERY_LOWPRI   (1 << MSI_DATA_DELIVERY_SHIFT)
+
+#define MSI_DATA_LEVEL_SHIFT           14
+#define     MSI_DATA_LEVEL_DEASSERT    (0 << MSI_DATA_LEVEL_SHIFT)
+#define     MSI_DATA_LEVEL_ASSERT      (1 << MSI_DATA_LEVEL_SHIFT)
+
+#define MSI_DATA_TRIGGER_SHIFT         15
+#define     MSI_DATA_TRIGGER_EDGE      (0 << MSI_DATA_TRIGGER_SHIFT)
+#define     MSI_DATA_TRIGGER_LEVEL     (1 << MSI_DATA_TRIGGER_SHIFT)
+
+/*
+ * Shift/mask fields for APIC-based bus address
+ */
+#define MSI_ADDR_HEADER                0xfee00000
+#define MSI_TARGET_CPU_SHIFT                  12
+
+#define MSI_ADDR_DESTID_MASK           0xfff0000f
+#define     MSI_ADDR_DESTID_CPU(cpu)   ((cpu) << MSI_TARGET_CPU_SHIFT)
+
+#define MSI_ADDR_DESTMODE_SHIFT        2
+#define     MSI_ADDR_DESTMODE_PHYS     (0 << MSI_ADDR_DESTMODE_SHIFT)
+#define     MSI_ADDR_DESTMODE_LOGIC    (1 << MSI_ADDR_DESTMODE_SHIFT)
+
+#define MSI_ADDR_REDIRECTION_SHIFT     3
+#define     MSI_ADDR_REDIRECTION_CPU   (0 << MSI_ADDR_REDIRECTION_SHIFT)
+#define     MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
+
+#define PCI_MSI_FLAGS_PVMASK           0x100
+
+#define AUTO_ASSIGN (-1)
+
+/* shift count for gflags (the GLFAGS spelling is what pt-msi.c uses) */
+#define GFLAGS_SHIFT_DEST_ID        0
+#define GFLAGS_SHIFT_RH             8
+#define GFLAGS_SHIFT_DM             9
+#define GLFAGS_SHIFT_DELIV_MODE     12
+#define GLFAGS_SHIFT_TRG_MODE       15
+
+/* Set up MSI emulation for the capability at config offset 'pos' */
+int pt_msi_init(struct pt_dev *dev, int pos);
+
+/* Emulate a config write; non-zero means the MSI code consumed it */
+int pt_msi_write(struct pt_dev *d, uint32_t addr, uint32_t val, uint32_t len);
+
+/* Patch emulated MSI bytes into '*val'; returns how many were patched */
+int pt_msi_read(struct pt_dev *d, int addr, int len, uint32_t *val);
+
+#endif
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/hw/vga.c
--- a/tools/ioemu/hw/vga.c      Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/ioemu/hw/vga.c      Thu May 08 18:40:07 2008 +0900
@@ -1075,7 +1075,7 @@ static rgb_to_pixel_dup_func *rgb_to_pix
  */
 static void vga_draw_text(VGAState *s, int full_update)
 {
-    int cx, cy, cheight, cw, ch, cattr, height, width, ch_attr, depth;
+    int cx, cy, cheight, cw, ch, cattr, height, width, ch_attr;
     int cx_min, cx_max, linesize, x_incr;
     uint32_t offset, fgcol, bgcol, v, cursor_offset;
     uint8_t *d1, *d, *src, *s1, *dest, *cursor_ptr;
@@ -1086,9 +1086,11 @@ static void vga_draw_text(VGAState *s, i
     vga_draw_glyph8_func *vga_draw_glyph8;
     vga_draw_glyph9_func *vga_draw_glyph9;
 
-    depth = s->get_bpp(s);
-    if (s->ds->dpy_colourdepth != NULL && s->ds->depth != depth)
-        s->ds->dpy_colourdepth(s->ds, depth);
+    /* Disable dirty bit tracking */
+    xc_hvm_track_dirty_vram(xc_handle, domid, 0, 0, NULL);
+
+    if (s->ds->dpy_colourdepth != NULL && s->ds->depth != 0)
+        s->ds->dpy_colourdepth(s->ds, 0);
     s->rgb_to_pixel = 
         rgb_to_pixel_dup_table[get_depth_index(s->ds)];
 
@@ -1486,7 +1488,7 @@ static void vga_draw_graphic(VGAState *s
 static void vga_draw_graphic(VGAState *s, int full_update)
 {
     int y1, y, update, linesize, y_start, double_scan, mask, depth;
-    int width, height, shift_control, line_offset, bwidth, ds_depth;
+    int width, height, shift_control, line_offset, bwidth, ds_depth, bits;
     ram_addr_t page0, page1;
     int disp_width, multi_scan, multi_run;
     uint8_t *d;
@@ -1534,6 +1536,7 @@ static void vga_draw_graphic(VGAState *s
         } else {
             v = VGA_DRAW_LINE4;
         }
+        bits = 4;
     } else if (shift_control == 1) {
         full_update |= update_palette16(s);
         if (s->sr[0x01] & 8) {
@@ -1542,28 +1545,35 @@ static void vga_draw_graphic(VGAState *s
         } else {
             v = VGA_DRAW_LINE2;
         }
+        bits = 4;
     } else {
         switch(s->get_bpp(s)) {
         default:
         case 0:
             full_update |= update_palette256(s);
             v = VGA_DRAW_LINE8D2;
+            bits = 4;
             break;
         case 8:
             full_update |= update_palette256(s);
             v = VGA_DRAW_LINE8;
+            bits = 8;
             break;
         case 15:
             v = VGA_DRAW_LINE15;
+            bits = 16;
             break;
         case 16:
             v = VGA_DRAW_LINE16;
+            bits = 16;
             break;
         case 24:
             v = VGA_DRAW_LINE24;
+            bits = 24;
             break;
         case 32:
             v = VGA_DRAW_LINE32;
+            bits = 32;
             break;
         }
     }
@@ -1591,12 +1601,72 @@ static void vga_draw_graphic(VGAState *s
            width, height, v, line_offset, s->cr[9], s->cr[0x17], s->line_compare, s->sr[0x01]);
 #endif
 
-    for (y = 0; y < s->vram_size; y += TARGET_PAGE_SIZE)
-        if (vram_dirty(s, y, TARGET_PAGE_SIZE))
+    y = 0;
+
+    if (height - 1 > s->line_compare || multi_run || (s->cr[0x17] & 3) != 3
+            || !s->lfb_addr) {
+        /* Tricky things happen, disable dirty bit tracking */
+        xc_hvm_track_dirty_vram(xc_handle, domid, 0, 0, NULL);
+
+        for ( ; y < s->vram_size; y += TARGET_PAGE_SIZE)
+            if (vram_dirty(s, y, TARGET_PAGE_SIZE))
+                cpu_physical_memory_set_dirty(s->vram_offset + y);
+    } else {
+        /* Tricky things won't have any effect, i.e. we are in the very simple
+         * (and very usual) case of a linear buffer. */
+        unsigned long end;
+
+        for ( ; y < ((s->start_addr * 4) & TARGET_PAGE_MASK); y += TARGET_PAGE_SIZE)
+            /* We will not read that anyway. */
             cpu_physical_memory_set_dirty(s->vram_offset + y);
 
+        if (y < (s->start_addr * 4)) {
+            /* start address not aligned on a page, track dirtiness by hand. */
+            if (vram_dirty(s, y, TARGET_PAGE_SIZE))
+                cpu_physical_memory_set_dirty(s->vram_offset + y);
+            y += TARGET_PAGE_SIZE;
+        }
+
+        /* use page table dirty bit tracking for the interior of the LFB */
+        end = s->start_addr * 4 + height * line_offset;
+        {
+            unsigned long npages = ((end & TARGET_PAGE_MASK) - y) / TARGET_PAGE_SIZE;
+            const int width = sizeof(unsigned long) * 8;
+            unsigned long bitmap[(npages + width - 1) / width];
+            int err;
+
+            if (!(err = xc_hvm_track_dirty_vram(xc_handle, domid,
+                        (s->lfb_addr + y) / TARGET_PAGE_SIZE, npages, bitmap))) {
+                int i, j;
+                for (i = 0; i < sizeof(bitmap) / sizeof(*bitmap); i++) {
+                    unsigned long map = bitmap[i];
+                    for (j = i * width; map && j < npages; map >>= 1, j++)
+                        if (map & 1)
+                            cpu_physical_memory_set_dirty(s->vram_offset + y
+                                + j * TARGET_PAGE_SIZE);
+                }
+                y += npages * TARGET_PAGE_SIZE;
+            } else {
+                /* ENODATA just means we have changed mode and will succeed
+                 * next time */
+                if (err != -ENODATA)
+                    fprintf(stderr, "track_dirty_vram(%lx, %lx) failed (%d)\n", s->lfb_addr + y, npages, err);
+            }
+        }
+
+        for ( ; y < s->vram_size && y < end; y += TARGET_PAGE_SIZE)
+            /* failed or end address not aligned on a page, track dirtiness by
+             * hand. */
+            if (vram_dirty(s, y, TARGET_PAGE_SIZE))
+                cpu_physical_memory_set_dirty(s->vram_offset + y);
+
+        for ( ; y < s->vram_size; y += TARGET_PAGE_SIZE)
+            /* We will not read that anyway. */
+            cpu_physical_memory_set_dirty(s->vram_offset + y);
+    }
+
     addr1 = (s->start_addr * 4);
-    bwidth = width * 4;
+    bwidth = (width * bits + 7) / 8;
     y_start = -1;
     page_min = 0;
     page_max = 0;
@@ -1682,6 +1752,10 @@ static void vga_draw_blank(VGAState *s, 
         return;
     if (s->last_scr_width <= 0 || s->last_scr_height <= 0)
         return;
+
+    /* Disable dirty bit tracking */
+    xc_hvm_track_dirty_vram(xc_handle, domid, 0, 0, NULL);
+
     s->rgb_to_pixel = 
         rgb_to_pixel_dup_table[get_depth_index(s->ds)];
     if (s->ds->depth == 8) 
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/hw/vga_int.h
--- a/tools/ioemu/hw/vga_int.h  Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/ioemu/hw/vga_int.h  Thu May 08 18:40:07 2008 +0900
@@ -87,6 +87,8 @@
     unsigned int vram_size;                                             \
     unsigned long bios_offset;                                          \
     unsigned int bios_size;                                             \
+    unsigned long lfb_addr;                                             \
+    unsigned long lfb_end;                                              \
     PCIDevice *pci_dev;                                                 \
     uint32_t latch;                                                     \
     uint8_t sr_index;                                                   \
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/sdl.c
--- a/tools/ioemu/sdl.c Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/ioemu/sdl.c Thu May 08 18:40:07 2008 +0900
@@ -234,6 +234,9 @@ static void sdl_resize(DisplayState *ds,
 
  again:
     screen = SDL_SetVideoMode(w, h, 0, flags);
+
+    /* Process any WM-generated resize event */
+    SDL_PumpEvents();
 
     if (!screen) {
         fprintf(stderr, "Could not open SDL display: %s\n", SDL_GetError());
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/vl.h
--- a/tools/ioemu/vl.h  Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/ioemu/vl.h  Thu May 08 18:40:07 2008 +0900
@@ -940,7 +940,6 @@ struct DisplayState {
     uint32_t *palette;
     uint64_t gui_timer_interval;
 
-    int switchbpp;
     int shared_buf;
     
     void (*dpy_update)(struct DisplayState *s, int x, int y, int w, int h);
diff -r f2457c7aff8d -r 611787b6ca35 tools/ioemu/vnc.c
--- a/tools/ioemu/vnc.c Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/ioemu/vnc.c Thu May 08 18:40:07 2008 +0900
@@ -198,6 +198,7 @@ struct VncState
     char *x509key;
 #endif
     char challenge[VNC_AUTH_CHALLENGE_SIZE];
+    int switchbpp;
 
 #if CONFIG_VNC_TLS
     int wiremode;
@@ -1686,7 +1687,7 @@ static void vnc_dpy_colourdepth(DisplayS
         default:
             return;
     }
-    if (ds->switchbpp) {
+    if (vs->switchbpp) {
         vnc_client_error(vs);
     } else if (vs->csock != -1 && vs->has_WMVi) {
         /* Sending a WMVi message to notify the client*/
@@ -2647,7 +2648,7 @@ int vnc_display_open(DisplayState *ds, c
        if (strncmp(options, "password", 8) == 0) {
            password = 1; /* Require password auth */
         } else if (strncmp(options, "switchbpp", 9) == 0) {
-            ds->switchbpp = 1;
+            vs->switchbpp = 1;
 #if CONFIG_VNC_TLS
        } else if (strncmp(options, "tls", 3) == 0) {
            tls = 1; /* Require TLS */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/Makefile
--- a/tools/libfsimage/Makefile Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libfsimage/Makefile Thu May 08 18:40:07 2008 +0900
@@ -1,7 +1,7 @@ XEN_ROOT = ../..
 XEN_ROOT = ../..
 include $(XEN_ROOT)/tools/Rules.mk
 
-SUBDIRS-y = common ufs reiserfs iso9660 fat
+SUBDIRS-y = common ufs reiserfs iso9660 fat zfs
 SUBDIRS-y += $(shell env CC="$(CC)" ./check-libext2fs)
 
 .PHONY: all clean install
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/common/fsimage.c
--- a/tools/libfsimage/common/fsimage.c Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libfsimage/common/fsimage.c Thu May 08 18:40:07 2008 +0900
@@ -17,7 +17,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -51,6 +51,7 @@ fsi_t *fsi_open_fsimage(const char *path
        fsi->f_fd = fd;
        fsi->f_off = off;
        fsi->f_data = NULL;
+       fsi->f_bootstring = NULL;
 
        pthread_mutex_lock(&fsi_lock);
        err = find_plugin(fsi, path, options);
@@ -140,3 +141,29 @@ ssize_t fsi_pread_file(fsi_file_t *ffi, 
 
        return (ret);
 }
+
+char *
+fsi_bootstring_alloc(fsi_t *fsi, size_t len)
+{
+       fsi->f_bootstring = malloc(len);  /* NOTE(review): old value not freed */
+       if (fsi->f_bootstring == NULL)
+               return (NULL);          /* allocation failed */
+
+       bzero(fsi->f_bootstring, len);  /* hand back a zero-filled buffer */
+       return (fsi->f_bootstring);
+}
+
+void
+fsi_bootstring_free(fsi_t *fsi)
+{
+       if (fsi->f_bootstring != NULL) {
+               free(fsi->f_bootstring);
+               fsi->f_bootstring = NULL;       /* a second call is a no-op */
+       }
+}
+
+char *
+fsi_fs_bootstring(fsi_t *fsi)
+{
+       return (fsi->f_bootstring);     /* NULL if never allocated or freed */
+}
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/common/fsimage.h
--- a/tools/libfsimage/common/fsimage.h Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libfsimage/common/fsimage.h Thu May 08 18:40:07 2008 +0900
@@ -17,7 +17,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -45,6 +45,10 @@ ssize_t fsi_read_file(fsi_file_t *, void
 ssize_t fsi_read_file(fsi_file_t *, void *, size_t);
 ssize_t fsi_pread_file(fsi_file_t *, void *, size_t, uint64_t);
 
+char *fsi_bootstring_alloc(fsi_t *, size_t);
+void fsi_bootstring_free(fsi_t *);
+char *fsi_fs_bootstring(fsi_t *);
+
 #ifdef __cplusplus
 };
 #endif
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/common/fsimage_grub.c
--- a/tools/libfsimage/common/fsimage_grub.c    Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libfsimage/common/fsimage_grub.c    Thu May 08 18:40:07 2008 +0900
@@ -17,7 +17,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -286,6 +286,7 @@ fsig_mount(fsi_t *fsi, const char *path,
 
        if (!ops->fpo_mount(ffi, options)) {
                fsip_file_free(ffi);
+               fsi_bootstring_free(fsi);
                free(fsi->f_data);
                fsi->f_data = NULL;
                return (-1);
@@ -299,6 +300,7 @@ static int
 static int
 fsig_umount(fsi_t *fsi)
 {
+       fsi_bootstring_free(fsi);
        free(fsi->f_data);
        return (0);
 }
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/common/fsimage_grub.h
--- a/tools/libfsimage/common/fsimage_grub.h    Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libfsimage/common/fsimage_grub.h    Thu May 08 18:40:07 2008 +0900
@@ -17,7 +17,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -72,6 +72,12 @@ unsigned long fsig_log2(unsigned long);
 #define        ERR_FILELENGTH 1
 #define        ERR_BAD_FILETYPE 1
 #define        ERR_FILE_NOT_FOUND 1
+#define        ERR_BAD_ARGUMENT 1
+#define        ERR_FILESYSTEM_NOT_FOUND 1
+#define        ERR_NO_BOOTPATH 1
+#define        ERR_DEV_VALUES 1
+#define        ERR_WONT_FIT 1
+#define        ERR_READ 1
 
 fsi_plugin_ops_t *fsig_init(fsi_plugin_t *, fsig_plugin_ops_t *);
 
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/common/fsimage_priv.h
--- a/tools/libfsimage/common/fsimage_priv.h    Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libfsimage/common/fsimage_priv.h    Thu May 08 18:40:07 2008 +0900
@@ -17,7 +17,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -46,6 +46,7 @@ struct fsi {
        uint64_t f_off;
        void *f_data;
        fsi_plugin_t *f_plugin;
+       char *f_bootstring;
 };
 
 struct fsi_file {
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/common/mapfile-GNU
--- a/tools/libfsimage/common/mapfile-GNU       Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libfsimage/common/mapfile-GNU       Thu May 08 18:40:07 2008 +0900
@@ -8,6 +8,9 @@ VERSION {
                        fsi_close_file;
                        fsi_read_file;
                        fsi_pread_file;
+                       fsi_bootstring_alloc;
+                       fsi_bootstring_free;
+                       fsi_fs_bootstring;
        
                        fsip_fs_set_data;
                        fsip_file_alloc;
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/common/mapfile-SunOS
--- a/tools/libfsimage/common/mapfile-SunOS     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libfsimage/common/mapfile-SunOS     Thu May 08 18:40:07 2008 +0900
@@ -7,6 +7,9 @@ libfsimage.so.1.0 {
                fsi_close_file;
                fsi_read_file;
                fsi_pread_file;
+               fsi_bootstring_alloc;
+               fsi_bootstring_free;
+               fsi_fs_bootstring;
 
                fsip_fs_set_data;
                fsip_file_alloc;
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/Makefile     Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,37 @@
+#
+#  GRUB  --  GRand Unified Bootloader
+#  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+# 
+#  Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+#  Use is subject to license terms.
+#
+
+XEN_ROOT = ../../..
+
+LIB_SRCS-y = fsys_zfs.c zfs_lzjb.c zfs_sha256.c zfs_fletcher.c
+
+FS = zfs
+
+.PHONY: all
+all: fs-all
+
+.PHONY: install
+install: fs-install
+
+include $(XEN_ROOT)/tools/libfsimage/Rules.mk
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/fsys_zfs.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/fsys_zfs.c   Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,1457 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * All files in the zfs directory are derived from the OpenSolaris
+ * zfs grub files.  All files in the zfs-include directory were
+ * included without changes.
+ */
+
+/*
+ * The zfs plug-in routines for GRUB are:
+ *
+ * zfs_mount() - locates a valid uberblock of the root pool and reads
+ *             in its MOS at the memory address MOS.
+ *
+ * zfs_open() - locates a plain file object by following the MOS
+ *             and places its dnode at the memory address DNODE.
+ *
+ * zfs_read() - read in the data blocks pointed to by the DNODE.
+ *
+ * ZFS_SCRATCH is used as a working area.
+ *
+ * (memory addr)   MOS      DNODE      ZFS_SCRATCH
+ *                 |         |          |
+ *         +-------V---------V----------V---------------+
+ *   memory |       | dnode   | dnode    |  scratch      |
+ *         |       | 512B    | 512B     |  area         |
+ *         +--------------------------------------------+
+ */
+
+#include <stdio.h>
+#include <strings.h>
+
+/* From "shared.h" */
+#include "mb_info.h"
+
+/* Boot signature related defines for the findroot command */
+#define        BOOTSIGN_DIR    "/boot/grub/bootsign"
+#define        BOOTSIGN_BACKUP "/etc/bootsign"
+
+/* Maybe redirect memory requests through grub_scratch_mem. */
+#define        RAW_ADDR(x) (x)
+#define        RAW_SEG(x) (x)
+
+/* ZFS will use the top 4 Meg of physical memory (below 4Gig) for scratch */
+#define        ZFS_SCRATCH_SIZE 0x400000
+
+#define        MIN(x, y) ((x) < (y) ? (x) : (y))
+/* End from shared.h */
+
+#include "fsys_zfs.h"
+
+/* cache for a file block of the currently zfs_open()-ed file */
+#define        file_buf zfs_ba->zfs_file_buf
+#define        file_start zfs_ba->zfs_file_start
+#define        file_end zfs_ba->zfs_file_end
+
+/* cache for a dnode block */
+#define        dnode_buf zfs_ba->zfs_dnode_buf
+#define        dnode_mdn zfs_ba->zfs_dnode_mdn
+#define        dnode_start zfs_ba->zfs_dnode_start
+#define        dnode_end zfs_ba->zfs_dnode_end
+
+#define        stackbase zfs_ba->zfs_stackbase
+
+decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] =
+{
+       {"noop", 0},
+       {"on", lzjb_decompress},        /* ZIO_COMPRESS_ON */
+       {"off", 0},
+       {"lzjb", lzjb_decompress}       /* ZIO_COMPRESS_LZJB */
+};
+
+/* From disk_io.c */
+/* ZFS root filesystem for booting */
+#define        current_bootpath zfs_ba->zfs_current_bootpath
+#define        current_rootpool zfs_ba->zfs_current_rootpool
+#define        current_bootfs zfs_ba->zfs_current_bootfs
+#define        current_bootfs_obj zfs_ba->zfs_current_bootfs_obj
+#define        is_zfs_mount (*fsig_int1(ffi))
+/* End from disk_io.c */
+
+#define        is_zfs_open zfs_ba->zfs_open
+
+/*
+ * Our own version of bcmp(): 0 if the first n bytes match, 1 otherwise.
+ */
+static int
+zfs_bcmp(const void *s1, const void *s2, size_t n)
+{
+       const unsigned char *ps1 = s1;
+       const unsigned char *ps2 = s2;
+
+       if (s1 != s2 && n != 0) {       /* same buffer or n == 0: equal */
+               do {
+                       if (*ps1++ != *ps2++)
+                               return (1);
+               } while (--n != 0);
+       }
+
+       return (0);
+}
+
+/*
+ * Our own log2(), rounded down; 0 for num <= 1.  Same as highbit()-1.
+ */
+static int
+zfs_log2(uint64_t num)
+{
+       int i = 0;
+
+       while (num > 1) {       /* count how many halvings reach 1 */
+               i++;
+               num = num >> 1;
+       }
+
+       return (i);
+}
+
+/* Checksum Functions.  "off": buf and size are ignored, zcp is set to zeros. */
+static void
+zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+       ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+/* Checksum Table and Values */
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+       {{NULL,                 NULL},                  0, 0,   "inherit"},
+       {{NULL,                 NULL},                  0, 0,   "on"},
+       {{zio_checksum_off,     zio_checksum_off},      0, 0,   "off"},
+       {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1,   "label"},
+       {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 1,   "gang_header"},
+       {{fletcher_2_native,    fletcher_2_byteswap},   0, 1,   "zilog"},
+       {{fletcher_2_native,    fletcher_2_byteswap},   0, 0,   "fletcher2"},
+       {{fletcher_4_native,    fletcher_4_byteswap},   1, 0,   "fletcher4"},
+       {{zio_checksum_SHA256,  zio_checksum_SHA256},   1, 0,   "SHA256"}
+};
+
+/*
+ * zio_checksum_verify: Provides support for checksum verification.
+ *
+ * Fletcher2, Fletcher4, and SHA256 only; no byteswapped or gang blocks.
+ *
+ * Return:
+ *     -1 = Failure
+ *      0 = Success
+ */
+static int
+zio_checksum_verify(blkptr_t *bp, char *data, int size)
+{
+       zio_cksum_t zc = bp->blk_cksum;
+       uint32_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
+           BP_GET_CHECKSUM(bp);
+       int byteswap = BP_SHOULD_BYTESWAP(bp);
+       zio_block_tail_t *zbt = (zio_block_tail_t *)(data + size) - 1;
+       zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+       zio_cksum_t actual_cksum, expected_cksum;
+
+       /* byteswap is not supported */
+       if (byteswap)
+               return (-1);
+
+       if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+               return (-1);    /* ci is only dereferenced when in range */
+
+       if (ci->ci_zbt) {
+               if (checksum == ZIO_CHECKSUM_GANG_HEADER) {
+                       /*
+                        * Gang blocks are not supported.
+                        */
+                       return (-1);
+               }
+
+               if (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)) {
+                       /* byte swapping is not supported */
+                       return (-1);
+               } else {
+                       expected_cksum = zbt->zbt_cksum; /* value in tail */
+                       zbt->zbt_cksum = zc;    /* hash with bp's value in place */
+                       ci->ci_func[0](data, size, &actual_cksum);
+                       zbt->zbt_cksum = expected_cksum; /* restore buffer */
+               }
+               zc = expected_cksum;    /* compare against the tail's value */
+
+       } else {
+               if (BP_IS_GANG(bp))
+                       return (-1);
+               ci->ci_func[byteswap](data, size, &actual_cksum);
+       }
+
+       if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
+           (actual_cksum.zc_word[1] - zc.zc_word[1]) |
+           (actual_cksum.zc_word[2] - zc.zc_word[2]) |
+           (actual_cksum.zc_word[3] - zc.zc_word[3]))
+               return (-1);    /* at least one checksum word differs */
+
+       return (0);
+}
+
+/*
+ * vdev_label_offset takes "offset" (the offset within a vdev_label) and
+ * returns its physical disk offset (starting from the beginning of the vdev).
+ *
+ * Input:
+ *     psize   : Physical size of this vdev
+ *      l      : Label Number (0-3)
+ *     offset  : The offset within a vdev_label whose physical address
+ *               we want
+ * Return:
+ *     Success : physical disk offset
+ *     Failure : errnum = ERR_BAD_ARGUMENT, return value is meaningless
+ */
+static uint64_t
+vdev_label_offset(fsi_file_t *ffi, uint64_t psize, int l, uint64_t offset)
+{
+       /* XXX Need to add back label support! (only l < VDEV_LABELS/2 passes) */
+       if (l >= VDEV_LABELS/2 || offset > sizeof (vdev_label_t)) {
+               errnum = ERR_BAD_ARGUMENT;
+               return (0);
+       }
+
+       return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+           0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
+
+}
+
+/*
+ * vdev_uberblock_compare takes two uberblock structures and returns an integer
+ * indicating the more recent of the two.
+ *     Return Value = 1 if ub1 is more recent
+ *     Return Value = -1 if ub2 is more recent (0 if they tie)
+ * The most recent uberblock is determined using its transaction number and
+ * timestamp.  The uberblock with the highest transaction number is
+ * considered "newer".  If the transaction numbers of the two blocks match, the
+ * timestamps are compared to determine the "newer" of the two.
+ */
+static int
+vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
+{
+       if (ub1->ub_txg < ub2->ub_txg)
+               return (-1);
+       if (ub1->ub_txg > ub2->ub_txg)
+               return (1);
+
+       if (ub1->ub_timestamp < ub2->ub_timestamp)
+               return (-1);
+       if (ub1->ub_timestamp > ub2->ub_timestamp)
+               return (1);
+
+       return (0);
+}
+
+/*
+ * Three pieces of information are needed to verify an uberblock: the magic
+ * number, the version number, and the checksum.
+ *
+ * All three are checked here; the checksum is verified first, with "offset"
+ * as the first word of the checksum passed to zio_checksum_verify().
+ *
+ * Return:
+ *     0 - Success
+ *    -1 - Failure
+ */
+static int
+uberblock_verify(uberblock_phys_t *ub, int offset)
+{
+
+       uberblock_t *uber = &ub->ubp_uberblock;
+       blkptr_t bp;
+
+       BP_ZERO(&bp);
+       BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+       BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+       ZIO_SET_CHECKSUM(&bp.blk_cksum, offset, 0, 0, 0);
+
+       if (zio_checksum_verify(&bp, (char *)ub, UBERBLOCK_SIZE) != 0)
+               return (-1);
+
+       if (uber->ub_magic == UBERBLOCK_MAGIC &&
+           uber->ub_version >= SPA_VERSION_1 &&
+           uber->ub_version <= SPA_VERSION)
+               return (0);
+
+       return (-1);
+}
+
+/*
+ * Find the best (most recent verified) uberblock in ub_array.
+ * Return:
+ *    Success - Pointer to the best uberblock.
+ *    Failure - NULL
+ */
+static uberblock_phys_t *
+find_bestub(fsi_file_t *ffi, uberblock_phys_t *ub_array, int label)
+{
+       uberblock_phys_t *ubbest = NULL;
+       int i, offset;
+
+       for (i = 0; i < (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT); i++) {
+               offset = vdev_label_offset(ffi, 0, label,
+                   VDEV_UBERBLOCK_OFFSET(i));
+               if (errnum == ERR_BAD_ARGUMENT)
+                       return (NULL);  /* NOTE(review): errnum not reset above */
+               if (uberblock_verify(&ub_array[i], offset) == 0) {
+                       if (ubbest == NULL) {
+                               ubbest = &ub_array[i];
+                       } else if (vdev_uberblock_compare(
+                           &(ub_array[i].ubp_uberblock),
+                           &(ubbest->ubp_uberblock)) > 0) {
+                               ubbest = &ub_array[i];  /* strictly newer wins */
+                       }
+               }
+       }
+
+       return (ubbest);
+}
+
+/*
+ * Read in a block and put its uncompressed data in buf; compressed blocks
+ * are first read into "stack".  Each DVA is tried until one verifies.
+ * Return:
+ *     0 - success
+ *     ERR_FSYS_CORRUPT - failure
+ */
+static int
+zio_read(fsi_file_t *ffi, blkptr_t *bp, void *buf, char *stack)
+{
+       uint64_t offset, sector;
+       int psize, lsize;
+       int i, comp, cksum;
+
+       psize = BP_GET_PSIZE(bp);
+       lsize = BP_GET_LSIZE(bp);
+       comp = BP_GET_COMPRESS(bp);
+       cksum = BP_GET_CHECKSUM(bp);    /* NOTE(review): never read below */
+
+       if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
+           (comp != ZIO_COMPRESS_OFF &&
+           decomp_table[comp].decomp_func == NULL))
+               return (ERR_FSYS_CORRUPT);
+
+       /* pick a good dva from the block pointer */
+       for (i = 0; i < SPA_DVAS_PER_BP; i++) {
+
+               if (bp->blk_dva[i].dva_word[0] == 0 &&
+                   bp->blk_dva[i].dva_word[1] == 0)
+                       continue;       /* all-zero DVA: skip it */
+
+               /* read in a block */
+               offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+               sector =  DVA_OFFSET_TO_PHYS_SECTOR(offset);
+
+               if (comp != ZIO_COMPRESS_OFF) {
+
+                       if (devread(ffi, sector, 0, psize, stack) == 0)
+                               continue;       /* try the next DVA */
+                       if (zio_checksum_verify(bp, stack, psize) != 0)
+                               continue;       /* try the next DVA */
+                       decomp_table[comp].decomp_func(stack, buf, psize,
+                           lsize);     /* NOTE(review): result not checked */
+               } else {
+                       if (devread(ffi, sector, 0, psize, buf) == 0)
+                               continue;       /* try the next DVA */
+                       if (zio_checksum_verify(bp, buf, psize) != 0)
+                               continue;       /* try the next DVA */
+               }
+               return (0);
+       }
+
+       return (ERR_FSYS_CORRUPT);
+}
+
+/*
+ * Read block "blkid" of dnode "dn" into buf, walking the indirect levels.
+ * "stack" is scratch space for a block pointer and one indirect block.
+ *
+ * Return:
+ *     0 - success
+ *     errnum - failure
+ */
+static int
+dmu_read(fsi_file_t *ffi, dnode_phys_t *dn, uint64_t blkid, void *buf,
+    char *stack)
+{
+       int idx, level;
+       blkptr_t *bp_array = dn->dn_blkptr;
+       int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+       blkptr_t *bp, *tmpbuf;
+
+       bp = (blkptr_t *)stack;         /* copy of the current block pointer */
+       stack += sizeof (blkptr_t);
+
+       tmpbuf = (blkptr_t *)stack;     /* holds one indirect block */
+       stack += 1<<dn->dn_indblkshift;
+
+       for (level = dn->dn_nlevels - 1; level >= 0; level--) {
+               idx = (blkid >> (epbs * level)) & ((1<<epbs)-1);
+               *bp = bp_array[idx];
+               if (level == 0)
+                       tmpbuf = buf;   /* last level is read straight into buf */
+               if (BP_IS_HOLE(bp)) {
+                       grub_memset(buf, 0,
+                           dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+                       break;          /* a hole yields a zero-filled buf */
+               } else if ((errnum = zio_read(ffi, bp, tmpbuf, stack))) {
+                       return (errnum);
+               }
+               bp_array = tmpbuf;      /* descend into the block just read */
+       }
+
+       return (0);
+}
+
+/*
+ * mzap_lookup: Looks up property described by "name" and returns the value
+ * in "value".
+ *
+ * Return:
+ *     0 - success
+ *     ERR_FSYS_CORRUPT - no entry named "name" was found
+ */
+static int
+mzap_lookup(mzap_phys_t *zapobj, int objsize, char *name,
+       uint64_t *value)
+{
+       int i, chunks;
+       mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
+
+       chunks = objsize/MZAP_ENT_LEN - 1;  /* presumably excludes the header */
+       for (i = 0; i < chunks; i++) {
+               if (strcmp(mzap_ent[i].mze_name, name) == 0) {
+                       *value = mzap_ent[i].mze_value;
+                       return (0);
+               }
+       }
+
+       return (ERR_FSYS_CORRUPT);
+}
+
+static uint64_t
+zap_hash(fsi_file_t *ffi, uint64_t salt, const char *name)
+{
+       static uint64_t table[256];
+       const uint8_t *cp;
+       uint8_t c;
+       uint64_t crc = salt;
+
+       if (table[128] == 0) {          /* fill the CRC table on first use */
+               uint64_t *ct;
+               int i, j;
+               for (i = 0; i < 256; i++) {
+                       for (ct = table + i, *ct = i, j = 8; j > 0; j--)
+                               *ct = (*ct >> 1) ^ (-(*ct & 1) &
+                                   ZFS_CRC64_POLY);
+               }
+       }
+
+       if (crc == 0 || table[128] != ZFS_CRC64_POLY) {
+               errnum = ERR_FSYS_CORRUPT;      /* zero salt or bad table */
+               return (0);
+       }
+
+       for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
+               crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF];
+
+       /*
+        * Only use 28 bits, since we need 4 bits in the cookie for the
+        * collision differentiator.  We MUST use the high bits, since
+        * those are the ones that we first pay attention to when
+        * choosing the bucket.
+        */
+       crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
+
+       return (crc);
+}
+
+/*
+ * Only to be used on 8-bit arrays.
+ * array_len is actual len in bytes (not encoded le_value_length).
+ * buf is null-terminated.  Returns nonzero iff the whole array matches buf.
+ */
+static int
+zap_leaf_array_equal(zap_leaf_phys_t *l, int blksft, int chunk,
+    int array_len, const char *buf)
+{
+       int bseen = 0;
+
+       while (bseen < array_len) {
+               struct zap_leaf_array *la =
+                   &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
+               int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
+
+               if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
+                       return (0);     /* chunk index out of range */
+
+               if (zfs_bcmp(la->la_array, buf + bseen, toread) != 0)
+                       break;
+               chunk = la->la_next;    /* continue with the next chunk */
+               bseen += toread;
+       }
+       return (bseen == array_len);
+}
+
+/*
+ * Given a zap_leaf_phys_t, walk through the zap leaf chunks to get the
+ * value for the property "name".
+ *
+ * Return:
+ *     0 - success
+ *     ERR_FSYS_CORRUPT - invalid leaf or entry, or "name" not found
+ */
+static int
+zap_leaf_lookup(zap_leaf_phys_t *l, int blksft, uint64_t h,
+    const char *name, uint64_t *value)
+{
+       uint16_t chunk;
+       struct zap_leaf_entry *le;
+
+       /* Verify if this is a valid leaf block */
+       if (l->l_hdr.lh_block_type != ZBT_LEAF)
+               return (ERR_FSYS_CORRUPT);
+       if (l->l_hdr.lh_magic != ZAP_LEAF_MAGIC)
+               return (ERR_FSYS_CORRUPT);
+
+       for (chunk = l->l_hash[LEAF_HASH(blksft, h)];
+           chunk != CHAIN_END; chunk = le->le_next) {
+
+               if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
+                       return (ERR_FSYS_CORRUPT);
+
+               le = ZAP_LEAF_ENTRY(l, blksft, chunk);
+
+               /* Verify the chunk entry */
+               if (le->le_type != ZAP_CHUNK_ENTRY)
+                       return (ERR_FSYS_CORRUPT);
+
+               if (le->le_hash != h)
+                       continue;       /* different hash: next in chain */
+
+               if (zap_leaf_array_equal(l, blksft, le->le_name_chunk,
+                   le->le_name_length, name)) {
+
+                       struct zap_leaf_array *la;
+                       uint8_t *ip;
+
+                       if (le->le_int_size != 8 || le->le_value_length != 1)
+                               return (ERR_FSYS_CORRUPT);
+
+                       /* get the uint64_t property value */
+                       la = &ZAP_LEAF_CHUNK(l, blksft,
+                           le->le_value_chunk).l_array;
+                       ip = la->la_array;      /* bytes, most significant first */
+
+                       *value = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
+                           (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
+                           (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
+                           (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
+
+                       return (0);
+               }
+       }
+
+       return (ERR_FSYS_CORRUPT);
+}
+
+/*
+ * Fat ZAP lookup
+ *
+ * Input
+ *     zap_dnode - dnode of the zap object
+ *     zap - first block of the zap object (the fat zap header)
+ *     name - null-terminated property name to look up
+ *     value - receives the property value on success
+ *     stack - scratch area; the leaf block is read to its start
+ *
+ * Only the pointer table embedded in the header block is supported.
+ *
+ * Return:
+ *     0 - success
+ *     errnum - failure
+ */
+static int
+fzap_lookup(fsi_file_t *ffi, dnode_phys_t *zap_dnode, zap_phys_t *zap,
+    char *name, uint64_t *value, char *stack)
+{
+       zap_leaf_phys_t *l;
+       uint64_t hash, idx, blkid;
+       int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT);
+
+       /* Verify if this is a fat zap header block */
+       if (zap->zap_magic != (uint64_t)ZAP_MAGIC)
+               return (ERR_FSYS_CORRUPT);
+
+       hash = zap_hash(ffi, zap->zap_salt, name);
+       if (errnum)
+               return (errnum);
+
+       /* get block id from index */
+       if (zap->zap_ptrtbl.zt_numblks != 0) {
+               /* external pointer tables not supported */
+               return (ERR_FSYS_CORRUPT);
+       }
+       idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
+
+       /*
+        * The table is indexed from word 1<<(blksft-3-1) of the header
+        * block, which holds 1<<(blksft-3) 64-bit words in total.  Reject
+        * an index (derived from the on-disk zt_shift) that would reach
+        * past the end of the block.
+        */
+       if (idx >= (1ULL << (blksft - 3 - 1)))
+               return (ERR_FSYS_CORRUPT);
+       blkid = ((uint64_t *)zap)[idx + (1<<(blksft-3-1))];
+
+       /* Get the leaf block */
+       l = (zap_leaf_phys_t *)stack;
+       stack += 1<<blksft;
+       if ((errnum = dmu_read(ffi, zap_dnode, blkid, l, stack)))
+               return (errnum);
+
+       return (zap_leaf_lookup(l, blksft, hash, name, value));
+}
+
+/*
+ * Look up the property "name" in a zap object and store its value in *val.
+ *
+ * The first block of the object is read to the start of the scratch area;
+ * its leading 64-bit word tells whether the object is a micro zap or a
+ * fat zap, and the matching lookup routine is invoked.
+ *
+ * Return:
+ *     0 - success
+ *     errnum - failure
+ */
+static int
+zap_lookup(fsi_file_t *ffi, dnode_phys_t *zap_dnode, char *name,
+    uint64_t *val, char *stack)
+{
+       void *first_blk = stack;
+       int blk_bytes = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+       uint64_t kind;
+
+       /* Keep the block; the remaining scratch space starts after it. */
+       stack += blk_bytes;
+
+       errnum = dmu_read(ffi, zap_dnode, 0, first_blk, stack);
+       if (errnum)
+               return (errnum);
+
+       kind = *((uint64_t *)first_blk);
+
+       if (kind == ZBT_MICRO)
+               return (mzap_lookup(first_blk, blk_bytes, name, val));
+
+       /* a header block means this is a fat zap */
+       if (kind == ZBT_HEADER)
+               return (fzap_lookup(ffi, zap_dnode, first_blk, name,
+                   val, stack));
+
+       return (ERR_FSYS_CORRUPT);
+}
+
+/*
+ * Get the dnode of an object number from the metadnode of an object set.
+ *
+ * The block of dnodes containing objnum is read with dmu_read() and the
+ * requested dnode is copied into buf.  When dnode_buf is set, it serves as
+ * a one-block cache keyed by the metadnode pointer and the object number
+ * range [dnode_start, dnode_end).
+ *
+ * Input
+ *     mdn - metadnode to get the object dnode
+ *     objnum - object number for the object dnode
+ *     type - dnode type handed to VERIFY_DN_TYPE for the returned dnode
+ *     buf - data buffer that holds the returning dnode
+ *     stack - scratch area
+ *
+ * Return:
+ *     0 - success
+ *     errnum - failure
+ *
+ * NOTE(review): VERIFY_DN_TYPE is a macro defined elsewhere; presumably it
+ * returns an error from this function on a type mismatch - confirm.
+ */
+static int
+dnode_get(fsi_file_t *ffi, dnode_phys_t *mdn, uint64_t objnum,
+    uint8_t type, dnode_phys_t *buf, char *stack)
+{
+       uint64_t blkid, blksz; /* the block id this object dnode is in */
+       int epbs; /* shift of number of dnodes in a block */
+       int idx; /* index within a block */
+       dnode_phys_t *dnbuf;
+       /*
+        * Not referenced by name below; presumably used by the dnode_*
+        * macros - confirm before removing.
+        */
+       zfs_bootarea_t *zfs_ba = (zfs_bootarea_t *)ffi->ff_fsi->f_data;
+
+       blksz = mdn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+       epbs = zfs_log2(blksz) - DNODE_SHIFT;
+       blkid = objnum >> epbs;
+       idx = objnum & ((1<<epbs)-1);
+
+       /* Cache hit: the block holding objnum is already in dnode_buf. */
+       if (dnode_buf != NULL && dnode_mdn == mdn &&
+           objnum >= dnode_start && objnum < dnode_end) {
+               grub_memmove(buf, &dnode_buf[idx], DNODE_SIZE);
+               VERIFY_DN_TYPE(buf, type);
+               return (0);
+       }
+
+       /*
+        * Read into the cache buffer only when the block has exactly the
+        * size of the cache (1<<DNODE_BLOCK_SHIFT); otherwise read into
+        * the scratch area and leave the cache untouched.
+        */
+       if (dnode_buf && blksz == 1<<DNODE_BLOCK_SHIFT) {
+               dnbuf = dnode_buf;
+               dnode_mdn = mdn;
+               dnode_start = blkid << epbs;
+               dnode_end = (blkid + 1) << epbs;
+       } else {
+               dnbuf = (dnode_phys_t *)stack;
+               stack += blksz;
+       }
+
+       if ((errnum = dmu_read(ffi, mdn, blkid, (char *)dnbuf, stack)))
+               return (errnum);
+
+       grub_memmove(buf, &dnbuf[idx], DNODE_SIZE);
+       VERIFY_DN_TYPE(buf, type);
+
+       return (0);
+}
+
+/*
+ * Check if this is a special file that resides at the top
+ * dataset of the pool. Currently this is the GRUB menu,
+ * boot signature and boot signature backup.
+ * str starts with '/'.
+ *
+ * A name matches when one of the following holds:
+ *     - its first "menu.lst" is preceded by '/' and followed by the end
+ *       of the string or a space
+ *     - it starts with BOOTSIGN_DIR followed by '/'
+ *     - it equals BOOTSIGN_BACKUP
+ *
+ * Return:
+ *     1 - str names a top dataset file
+ *     0 - otherwise
+ */
+static int
+is_top_dataset_file(char *str)
+{
+       char *tptr;
+
+       /*
+        * Only look at the character before the match when there is one;
+        * a string that begins with "menu.lst" has nothing in front of it.
+        */
+       tptr = strstr(str, "menu.lst");
+       if (tptr != NULL && tptr > str && *(tptr-1) == '/' &&
+           (tptr[8] == '\0' || tptr[8] == ' '))
+               return (1);
+
+       if (strncmp(str, BOOTSIGN_DIR"/",
+           strlen(BOOTSIGN_DIR) + 1) == 0)
+               return (1);
+
+       if (strcmp(str, BOOTSIGN_BACKUP) == 0)
+               return (1);
+
+       return (0);
+}
+
+/*
+ * Resolve a file name to its dnode.  mdn is the meta dnode of the ZFS
+ * object set to search; on success the dnode of the file is left in dn.
+ *
+ * The path is walked one '/'-separated component at a time, starting at
+ * the object named by ZFS_ROOT_OBJ in the master node.  Each component is
+ * temporarily null-terminated in place, so 'path' is mangled while the
+ * walk is in progress and stays mangled if a component lookup fails.
+ *
+ * Return:
+ *     0 - success
+ *     errnum - failure
+ */
+static int
+dnode_get_path(fsi_file_t *ffi, dnode_phys_t *mdn, char *path,
+    dnode_phys_t *dn, char *stack)
+{
+       uint64_t objnum, version;
+       char *component, saved;
+
+       /* The master node carries the ZPL version and the root object. */
+       errnum = dnode_get(ffi, mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
+           dn, stack);
+       if (errnum)
+               return (errnum);
+
+       errnum = zap_lookup(ffi, dn, ZPL_VERSION_STR, &version, stack);
+       if (errnum)
+               return (errnum);
+       if (version > ZPL_VERSION)
+               return (-1);
+
+       errnum = zap_lookup(ffi, dn, ZFS_ROOT_OBJ, &objnum, stack);
+       if (errnum)
+               return (errnum);
+
+       errnum = dnode_get(ffi, mdn, objnum, DMU_OT_DIRECTORY_CONTENTS,
+           dn, stack);
+       if (errnum)
+               return (errnum);
+
+       /* skip leading slashes */
+       for (; *path == '/'; path++)
+               ;
+
+       while (*path && !isspace(*path)) {
+
+               /* isolate the next component name */
+               component = path;
+               for (; *path && !isspace(*path) && *path != '/'; path++)
+                       ;
+               saved = *path;
+               *path = 0;   /* ensure null termination */
+
+               errnum = zap_lookup(ffi, dn, component, &objnum, stack);
+               if (errnum)
+                       return (errnum);
+
+               objnum = ZFS_DIRENT_OBJ(objnum);
+               errnum = dnode_get(ffi, mdn, objnum, 0, dn, stack);
+               if (errnum)
+                       return (errnum);
+
+               /* put the separator back and skip any run of slashes */
+               *path = saved;
+               for (; *path == '/'; path++)
+                       ;
+       }
+
+       /* We found the dnode for this file. Verify if it is a plain file. */
+       VERIFY_DN_TYPE(dn, DMU_OT_PLAIN_FILE_CONTENTS);
+
+       return (0);
+}
+
+/*
+ * Get the default 'bootfs' property value from the rootpool.
+ *
+ * The pool directory object of the MOS is searched for DMU_POOL_PROPS,
+ * and that object in turn for ZPOOL_PROP_BOOTFS.  The resulting object
+ * number is stored in *obj; a value of zero counts as not found.
+ *
+ * Return:
+ *     0 - success
+ *     errnum - failure
+ */
+static int
+get_default_bootfsobj(fsi_file_t *ffi, dnode_phys_t *mosmdn,
+    uint64_t *obj, char *stack)
+{
+       /* one dnode is carved off the front of the scratch area */
+       dnode_phys_t *scratch_dn = (dnode_phys_t *)stack;
+       uint64_t found = 0;
+
+       stack += DNODE_SIZE;
+
+       errnum = dnode_get(ffi, mosmdn, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_OT_OBJECT_DIRECTORY, scratch_dn, stack);
+       if (errnum)
+               return (errnum);
+
+       /* object number of 'pool_props', then the dnode behind it */
+       if (zap_lookup(ffi, scratch_dn, DMU_POOL_PROPS, &found, stack) != 0)
+               return (ERR_FILESYSTEM_NOT_FOUND);
+
+       errnum = dnode_get(ffi, mosmdn, found, DMU_OT_POOL_PROPS, scratch_dn,
+           stack);
+       if (errnum)
+               return (errnum);
+
+       /* a missing or zero 'bootfs' property both mean "not found" */
+       if (zap_lookup(ffi, scratch_dn, ZPOOL_PROP_BOOTFS, &found,
+           stack) != 0 || found == 0)
+               return (ERR_FILESYSTEM_NOT_FOUND);
+
+       *obj = found;
+       return (0);
+}
+
+/*
+ * Given a MOS metadnode, get the metadnode of a given filesystem name
+ * (fsname), e.g. pool/rootfs, or a given object number (obj), e.g. the
+ * object number of pool/rootfs.
+ *
+ * If no fsname and no obj are given, return the DSL_DIR metadnode.
+ * If fsname is given, return its metadnode and its matching object number.
+ * If only obj is given, return the metadnode for this object number.
+ *
+ * Each component of fsname is temporarily null-terminated in place while
+ * it is looked up; the original character is put back on every path out
+ * of this function, so the caller's string is unchanged on return.
+ *
+ * Input
+ *     mosmdn - MOS metadnode
+ *     fsname - filesystem name, or NULL
+ *     obj - in: object number when fsname is NULL; out: object number of
+ *         the head dataset when fsname is given; may be NULL
+ *     mdn - receives the metadnode of the object set
+ *     stack - scratch area
+ *
+ * Return:
+ *     0 - success
+ *     errnum - failure
+ */
+static int
+get_objset_mdn(fsi_file_t *ffi, dnode_phys_t *mosmdn, char *fsname,
+    uint64_t *obj, dnode_phys_t *mdn, char *stack)
+{
+       uint64_t objnum, headobj;
+       char *cname, ch;
+       blkptr_t *bp;
+       objset_phys_t *osp;
+
+       if (fsname == NULL && obj) {
+               headobj = *obj;
+               goto skip;
+       }
+
+       if ((errnum = dnode_get(ffi, mosmdn, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_OT_OBJECT_DIRECTORY, mdn, stack)))
+               return (errnum);
+
+       if ((errnum = zap_lookup(ffi, mdn, DMU_POOL_ROOT_DATASET, &objnum,
+           stack)))
+               return (errnum);
+
+       if ((errnum = dnode_get(ffi, mosmdn, objnum, DMU_OT_DSL_DIR, mdn,
+           stack)))
+               return (errnum);
+
+       if (fsname == NULL) {
+               headobj =
+                   ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
+               goto skip;
+       }
+
+       /* take out the pool name */
+       while (*fsname && !isspace(*fsname) && *fsname != '/')
+               fsname++;
+
+       while (*fsname && !isspace(*fsname)) {
+               uint64_t childobj;
+
+               while (*fsname == '/')
+                       fsname++;
+
+               cname = fsname;
+               while (*fsname && !isspace(*fsname) && *fsname != '/')
+                       fsname++;
+               ch = *fsname;
+               *fsname = 0;
+
+               /*
+                * From here until *fsname is restored, every error return
+                * must put the saved character back: the caller may pass a
+                * long-lived name that is reused on later calls.
+                */
+               childobj =
+                   ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_child_dir_zapobj;
+               if ((errnum = dnode_get(ffi, mosmdn, childobj,
+                   DMU_OT_DSL_DIR_CHILD_MAP, mdn, stack))) {
+                       *fsname = ch;
+                       return (errnum);
+               }
+
+               if (zap_lookup(ffi, mdn, cname, &objnum, stack)) {
+                       *fsname = ch;
+                       return (ERR_FILESYSTEM_NOT_FOUND);
+               }
+
+               if ((errnum = dnode_get(ffi, mosmdn, objnum, DMU_OT_DSL_DIR,
+                   mdn, stack))) {
+                       *fsname = ch;
+                       return (errnum);
+               }
+
+               *fsname = ch;
+       }
+       headobj = ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
+       if (obj)
+               *obj = headobj;
+
+skip:
+       if ((errnum = dnode_get(ffi, mosmdn, headobj, DMU_OT_DSL_DATASET, mdn,
+           stack)))
+               return (errnum);
+
+       /* TODO: Add snapshot support here - for fsname=snapshot-name */
+
+       /* read the object set the dataset points at and keep its metadnode */
+       bp = &((dsl_dataset_phys_t *)DN_BONUS(mdn))->ds_bp;
+       osp = (objset_phys_t *)stack;
+       stack += sizeof (objset_phys_t);
+       if ((errnum = zio_read(ffi, bp, osp, stack)))
+               return (errnum);
+
+       grub_memmove((char *)mdn, (char *)&osp->os_meta_dnode, DNODE_SIZE);
+
+       return (0);
+}
+
+/*
+ * For a given XDR packed nvlist, verify the first 4 bytes and move on.
+ *
+ * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
+ *
+ *      encoding method/host endian     (4 bytes)
+ *      nvl_version                     (4 bytes)
+ *      nvl_nvflag                      (4 bytes)
+ *     encoded nvpairs:
+ *             encoded size of the nvpair      (4 bytes)
+ *             decoded size of the nvpair      (4 bytes)
+ *             name string size                (4 bytes)
+ *             name string data                (sizeof(NV_ALIGN4(string))
+ *             data type                       (4 bytes)
+ *             # of elements in the nvpair     (4 bytes)
+ *             data
+ *      2 zero's for the last nvpair
+ *             (end of the entire list)        (8 bytes)
+ *
+ * On success *out points just past the 4-byte encoding header, i.e. at
+ * nvl_version.  *out is left untouched on failure.
+ *
+ * Return:
+ *     0 - success
+ *     1 - failure
+ */
+static int
+nvlist_unpack(char *nvlist, char **out)
+{
+       /* byte 0 is the encoding method, byte 1 the endianness */
+       if (nvlist[0] == NV_ENCODE_XDR && nvlist[1] == HOST_ENDIAN) {
+               *out = nvlist + 4;
+               return (0);
+       }
+
+       return (1);
+}
+
+/*
+ * Return a pointer to element 'index' of a packed array of nvlists that
+ * starts at 'nvlist'.  The preceding elements are stepped over one by
+ * one, using the encoded size stored at the front of every nvpair.
+ */
+static char *
+nvlist_array(char *nvlist, int index)
+{
+       int remaining, pair_size;
+
+       for (remaining = index; remaining > 0; remaining--) {
+               /* two 4-byte words precede the nvpairs of each element */
+               nvlist += 2 * 4;
+
+               /* hop from nvpair to nvpair until the zero encoded size */
+               for (;;) {
+                       pair_size = BSWAP_32(*(uint32_t *)nvlist);
+                       if (pair_size == 0)
+                               break;
+                       nvlist += pair_size;
+               }
+
+               /* step over the 8 zero bytes that end the element */
+               nvlist += 2 * 4;
+       }
+
+       return (nvlist);
+}
+
+/*
+ * Search a packed nvlist for the nvpair called 'name' with data type
+ * 'valtype' and hand back its value.
+ *
+ * Input
+ *     nvlist - packed nvlist, pointing at nvl_version
+ *     name - null-terminated name of the nvpair
+ *     val - where the value is stored, depending on valtype:
+ *         DATA_TYPE_STRING - the string is copied and null-terminated;
+ *             the buffer must be large enough, no bound is applied here
+ *         DATA_TYPE_UINT64 - a uint64_t is stored
+ *         DATA_TYPE_NVLIST, DATA_TYPE_NVLIST_ARRAY - a pointer into the
+ *             packed data is stored
+ *     nelmp - if not NULL, receives the element count for
+ *         DATA_TYPE_NVLIST_ARRAY
+ *
+ * Return:
+ *     0 - success
+ *     1 - failure (no such nvpair, or an element count below 1)
+ */
+static int
+nvlist_lookup_value(char *nvlist, char *name, void *val, int valtype,
+    int *nelmp)
+{
+       int name_len, type, slen, encode_size;
+       int want_len = (int)strlen(name);
+       char *nvpair, *nvp_name, *strval = val;
+       uint64_t *intval = val;
+
+       /* skip the header, nvl_version, and nvl_nvflag */
+       nvlist = nvlist + 4 * 2;
+
+       /*
+        * Loop thru the nvpair list
+        * The XDR representation of an integer is in big-endian byte order.
+        */
+       while ((encode_size = BSWAP_32(*(uint32_t *)nvlist)))  {
+
+               nvpair = nvlist + 4 * 2; /* skip the encode/decode size */
+
+               name_len = BSWAP_32(*(uint32_t *)nvpair);
+               nvpair += 4;
+
+               nvp_name = nvpair;
+               nvpair = nvpair + ((name_len + 3) & ~3); /* align */
+
+               type = BSWAP_32(*(uint32_t *)nvpair);
+               nvpair += 4;
+
+               /*
+                * The stored name is name_len bytes long and is not
+                * null-terminated.  Require equal lengths as well, so that
+                * a pair whose name is merely a prefix of 'name' does not
+                * match.
+                */
+               if (name_len == want_len &&
+                   strncmp(nvp_name, name, name_len) == 0 &&
+                   type == valtype) {
+                       int nelm;
+
+                       if (((nelm = BSWAP_32(*(uint32_t *)nvpair)) < 1))
+                               return (1);
+                       nvpair += 4;
+
+                       switch (valtype) {
+                       case DATA_TYPE_STRING:
+                               slen = BSWAP_32(*(uint32_t *)nvpair);
+                               nvpair += 4;
+                               grub_memmove(strval, nvpair, slen);
+                               strval[slen] = '\0';
+                               return (0);
+
+                       case DATA_TYPE_UINT64:
+                               *intval = BSWAP_64(*(uint64_t *)nvpair);
+                               return (0);
+
+                       case DATA_TYPE_NVLIST:
+                               *(void **)val = (void *)nvpair;
+                               return (0);
+
+                       case DATA_TYPE_NVLIST_ARRAY:
+                               *(void **)val = (void *)nvpair;
+                               if (nelmp)
+                                       *nelmp = nelm;
+                               return (0);
+                       }
+               }
+
+               nvlist += encode_size; /* goto the next nvpair */
+       }
+
+       return (1);
+}
+
+/*
+ * Check if this vdev is online and is in a good state.
+ *
+ * The vdev is rejected as soon as its nvlist contains any of the
+ * offline, faulted, degraded or removed entries as a uint64 value.
+ *
+ * Return:
+ *     0 - none of the entries is present
+ *     ERR_DEV_VALUES - at least one of them is present
+ */
+static int
+vdev_validate(char *nv)
+{
+       char *bad_state[] = {
+               ZPOOL_CONFIG_OFFLINE,
+               ZPOOL_CONFIG_FAULTED,
+               ZPOOL_CONFIG_DEGRADED,
+               ZPOOL_CONFIG_REMOVED
+       };
+       uint64_t ignored;
+       int k;
+
+       /* the mere presence of an entry disqualifies the vdev */
+       for (k = 0; k < 4; k++) {
+               if (nvlist_lookup_value(nv, bad_state[k], &ignored,
+                   DATA_TYPE_UINT64, NULL) == 0)
+                       return (ERR_DEV_VALUES);
+       }
+
+       return (0);
+}
+
+/*
+ * Get a list of valid vdev pathname from the boot device.
+ * The caller should already allocate MAXNAMELEN memory for bootpath.
+ *
+ * For a disk vdev, bootpath receives its physical path.  For a mirror,
+ * the physical paths of all children that pass vdev_validate() are
+ * joined with single spaces.  Any other vdev type yields no path.
+ *
+ * Return:
+ *     0 - success, bootpath is a non-empty string
+ *     ERR_FSYS_CORRUPT - the type or the children entry is missing
+ *     ERR_WONT_FIT - the joined paths do not fit in MAXNAMELEN bytes
+ *     ERR_NO_BOOTPATH - no usable physical path was found
+ */
+static int
+vdev_get_bootpath(char *nv, char *bootpath)
+{
+       char type[16];
+
+       bootpath[0] = '\0';
+       if (nvlist_lookup_value(nv, ZPOOL_CONFIG_TYPE, &type, DATA_TYPE_STRING,
+           NULL))
+               return (ERR_FSYS_CORRUPT);
+
+       if (strcmp(type, VDEV_TYPE_DISK) == 0) {
+               if (vdev_validate(nv) != 0 ||
+                   nvlist_lookup_value(nv, ZPOOL_CONFIG_PHYS_PATH, bootpath,
+                   DATA_TYPE_STRING, NULL) != 0)
+                       return (ERR_NO_BOOTPATH);
+
+       } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0) {
+               int nelm, i;
+               char *child;
+
+               if (nvlist_lookup_value(nv, ZPOOL_CONFIG_CHILDREN, &child,
+                   DATA_TYPE_NVLIST_ARRAY, &nelm))
+                       return (ERR_FSYS_CORRUPT);
+
+               for (i = 0; i < nelm; i++) {
+                       char tmp_path[MAXNAMELEN];
+                       char *child_i;
+                       int cur_len, need;
+
+                       /* children that are not healthy are skipped */
+                       child_i = nvlist_array(child, i);
+                       if (vdev_validate(child_i) != 0)
+                               continue;
+
+                       if (nvlist_lookup_value(child_i, ZPOOL_CONFIG_PHYS_PATH,
+                           tmp_path, DATA_TYPE_STRING, NULL) != 0)
+                               return (ERR_NO_BOOTPATH);
+
+                       /*
+                        * Bytes needed after appending: both strings, the
+                        * separating space (if bootpath is not empty) and
+                        * the terminating null.
+                        */
+                       cur_len = strlen(bootpath);
+                       need = cur_len + strlen(tmp_path) + 1;
+                       if (cur_len != 0)
+                               need++;
+                       if (need > MAXNAMELEN)
+                               return (ERR_WONT_FIT);
+
+                       /*
+                        * Append in place; bootpath must not be used as
+                        * both source and destination of a sprintf().
+                        */
+                       if (cur_len != 0)
+                               strcat(bootpath, " ");
+                       strcat(bootpath, tmp_path);
+               }
+       }
+
+       return (strlen(bootpath) > 0 ? 0 : ERR_NO_BOOTPATH);
+}
+
+/*
+ * Check the disk label information and retrieve needed vdev name-value pairs.
+ *
+ * The name-value pair list of label number 'label' is read to the start
+ * of the scratch area.  The pool must not be destroyed and must have a
+ * non-zero txg.  The pool name is stored in current_rootpool and the
+ * boot path derived from the vdev tree in current_bootpath.
+ *
+ * Return:
+ *     0 - success
+ *     ERR_* - failure
+ */
+static int
+check_pool_label(fsi_file_t *ffi, int label, char *stack)
+{
+       zfs_bootarea_t *zfs_ba = (zfs_bootarea_t *)ffi->ff_fsi->f_data;
+       uint64_t sector, pool_state, txg = 0;
+       vdev_phys_t *phys;
+       char *nvl, *vdev_tree;
+
+       /* sector at which the name-value pairs of this label start */
+       sector = (label * sizeof (vdev_label_t) + VDEV_SKIP_SIZE +
+           VDEV_BOOT_HEADER_SIZE) >> SPA_MINBLOCKSHIFT;
+
+       /* Read in the vdev name-value pair list (112K). */
+       if (!devread(ffi, sector, 0, VDEV_PHYS_SIZE, stack))
+               return (ERR_READ);
+
+       phys = (vdev_phys_t *)stack;
+
+       if (nvlist_unpack(phys->vp_nvlist, &nvl) != 0)
+               return (ERR_FSYS_CORRUPT);
+
+       /* a destroyed pool is treated as if it did not exist */
+       if (nvlist_lookup_value(nvl, ZPOOL_CONFIG_POOL_STATE, &pool_state,
+           DATA_TYPE_UINT64, NULL) != 0)
+               return (ERR_FSYS_CORRUPT);
+       if (pool_state == POOL_STATE_DESTROYED)
+               return (ERR_FILESYSTEM_NOT_FOUND);
+
+       if (nvlist_lookup_value(nvl, ZPOOL_CONFIG_POOL_NAME,
+           current_rootpool, DATA_TYPE_STRING, NULL) != 0)
+               return (ERR_FSYS_CORRUPT);
+
+       /* a txg of zero means this is not an active device */
+       if (nvlist_lookup_value(nvl, ZPOOL_CONFIG_POOL_TXG, &txg,
+           DATA_TYPE_UINT64, NULL) != 0)
+               return (ERR_FSYS_CORRUPT);
+       if (txg == 0)
+               return (ERR_NO_BOOTPATH);
+
+       if (nvlist_lookup_value(nvl, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree,
+           DATA_TYPE_NVLIST, NULL) != 0)
+               return (ERR_FSYS_CORRUPT);
+
+       /* whatever goes wrong here is reported as a missing boot path */
+       if (vdev_get_bootpath(vdev_tree, current_bootpath) != 0)
+               return (ERR_NO_BOOTPATH);
+
+       return (0);
+}
+
+/*
+ * zfs_mount() locates a valid uberblock of the root pool and read in its MOS
+ * to the memory address MOS.
+ *
+ * The small data area of the fsi handle is replaced by a newly allocated
+ * zfs_bootarea_t.  If 'options' is not NULL and shorter than MAXNAMELEN,
+ * it is copied to current_bootfs.
+ *
+ * Return:
+ *     1 - success
+ *     0 - failure
+ */
+int
+zfs_mount(fsi_file_t *ffi, const char *options)
+{
+       char *stack;
+       int label = 0;
+       uberblock_phys_t *ub_array, *ubbest = NULL;
+       objset_phys_t *osp;
+       zfs_bootarea_t *zfs_ba;
+
+       /* if zfs is already mounted, don't do it again */
+       if (is_zfs_mount == 1)
+               return (1);
+
+       /*
+        * get much bigger data block for zfs
+        *
+        * Without it nothing can be mounted: report failure rather than
+        * success, so that nobody goes on to treat the small f_data area
+        * as a zfs_bootarea_t.
+        */
+       if (((zfs_ba = malloc(sizeof (zfs_bootarea_t))) == NULL)) {
+               return (0);
+       }
+       bzero(zfs_ba, sizeof (zfs_bootarea_t));
+
+       /* replace small data area in fsi with big one */
+       free(ffi->ff_fsi->f_data);
+       ffi->ff_fsi->f_data = (void *)zfs_ba;
+
+       /* If an boot filesystem is passed in, set it to current_bootfs */
+       if (options != NULL) {
+               if (strlen(options) < MAXNAMELEN) {
+                       strcpy(current_bootfs, options);
+               }
+       }
+
+       /* carve the uberblock ring and one objset out of the scratch area */
+       stackbase = ZFS_SCRATCH;
+       stack = stackbase;
+       ub_array = (uberblock_phys_t *)stack;
+       stack += VDEV_UBERBLOCK_RING;
+
+       osp = (objset_phys_t *)stack;
+       stack += sizeof (objset_phys_t);
+
+       /* XXX add back labels support? */
+       for (label = 0; ubbest == NULL && label < (VDEV_LABELS/2); label++) {
+               uint64_t sector = (label * sizeof (vdev_label_t) +
+                   VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE +
+                   VDEV_PHYS_SIZE) >> SPA_MINBLOCKSHIFT;
+
+
+               /* Read in the uberblock ring (128K). */
+               if (devread(ffi, sector, 0, VDEV_UBERBLOCK_RING,
+                   (char *)ub_array) == 0)
+                       continue;
+
+               if ((ubbest = find_bestub(ffi, ub_array, label)) != NULL &&
+                   zio_read(ffi, &ubbest->ubp_uberblock.ub_rootbp, osp, stack)
+                   == 0) {
+
+                       VERIFY_OS_TYPE(osp, DMU_OST_META);
+
+                       /* Got the MOS. Save it at the memory addr MOS. */
+                       grub_memmove(MOS, &osp->os_meta_dnode, DNODE_SIZE);
+
+                       if (check_pool_label(ffi, label, stack))
+                               return (0);
+
+                       /*
+                        * Copy fsi->f_data to ffi->ff_data since
+                        * fsig_mount copies from ff_data to f_data
+                        * overwriting fsi->f_data.
+                        */
+                       bcopy(zfs_ba, fsig_file_buf(ffi), FSYS_BUFLEN);
+
+                       is_zfs_mount = 1;
+                       return (1);
+               }
+       }
+
+       return (0);
+}
+
+/*
+ * zfs_open() locates a file in the rootpool by following the
+ * MOS and places the dnode of the file in the memory address DNODE.
+ *
+ * Files accepted by is_top_dataset_file() are looked up in the object set
+ * returned for a NULL filesystem name; all other files are looked up in
+ * current_bootfs or, when that is empty, in the pool's default bootfs.
+ * On the first such open (is_zfs_open == 0) a boot string of the form
+ *     zfs-bootfs=<rootpool>/<bootfs object number>,bootpath='<bootpath>'
+ * is stored in a buffer obtained from fsi_bootstring_alloc().
+ *
+ * On success filemax is set to the file size and filepos to 0.
+ *
+ * Return:
+ *     1 - success
+ *     0 - failure
+ */
+int
+zfs_open(fsi_file_t *ffi, char *filename)
+{
+       char *stack;
+       dnode_phys_t *mdn;
+       char *bootstring;
+       /*
+        * Not referenced by name below; presumably used by macros such as
+        * file_buf or current_bootfs - confirm before removing.
+        */
+       zfs_bootarea_t *zfs_ba = (zfs_bootarea_t *)ffi->ff_fsi->f_data;
+
+       /* drop the data block kept by zfs_read() and restart the scratch area */
+       file_buf = NULL;
+       stackbase = ZFS_SCRATCH;
+       stack = stackbase;
+
+       /* room for the metadnode of the object set being searched */
+       mdn = (dnode_phys_t *)stack;
+       stack += sizeof (dnode_phys_t);
+
+       /* one block of dnodes, used by dnode_get() as a cache */
+       dnode_mdn = NULL;
+       dnode_buf = (dnode_phys_t *)stack;
+       stack += 1<<DNODE_BLOCK_SHIFT;
+
+       /*
+        * menu.lst is placed at the root pool filesystem level,
+        * do not goto 'current_bootfs'.
+        */
+       if (is_top_dataset_file(filename)) {
+               if ((errnum = get_objset_mdn(ffi, MOS, NULL, NULL, mdn, stack)))
+                       return (0);
+
+               current_bootfs_obj = 0;
+       } else {
+               if (current_bootfs[0] == '\0') {
+                       /* Get the default root filesystem object number */
+                       if ((errnum = get_default_bootfsobj(ffi, MOS,
+                           &current_bootfs_obj, stack)))
+                               return (0);
+                       if ((errnum = get_objset_mdn(ffi, MOS, NULL,
+                           &current_bootfs_obj, mdn, stack)))
+                               return (0);
+               } else {
+                       if ((errnum = get_objset_mdn(ffi, MOS,
+                           current_bootfs, &current_bootfs_obj, mdn, stack)))
+                               return (0);
+               }
+
+               /*
+                * Put zfs rootpool and boot obj number into bootstring.
+                */
+               if (is_zfs_open == 0) {
+                       char temp[25];          /* needs to hold long long */
+                       int alloc_size;
+                       char zfs_bootstr[] = "zfs-bootfs=";
+                       char zfs_bootpath[] = ",bootpath='";
+
+                       sprintf(temp, "%llu", (unsigned long long)
+                           current_bootfs_obj);
+                       /* the 3 extra bytes are for "/", "'" and the null */
+                       alloc_size = strlen(zfs_bootstr) +
+                           strlen(current_rootpool) +
+                           strlen(temp) + strlen(zfs_bootpath) +
+                           strlen(current_bootpath) + 3;
+                       bootstring = fsi_bootstring_alloc(ffi->ff_fsi,
+                           alloc_size);
+                       /* if no buffer is returned, is_zfs_open stays 0 */
+                       if (bootstring != NULL) {
+                               strcpy(bootstring, zfs_bootstr);
+                               strcat(bootstring, current_rootpool);
+                               strcat(bootstring, "/");
+                               strcat(bootstring, temp);
+                               strcat(bootstring, zfs_bootpath);
+                               strcat(bootstring, current_bootpath);
+                               strcat(bootstring, "'");
+                               is_zfs_open = 1;
+                       }
+               }
+       }
+
+       /* any lookup failure is reported as "file not found" */
+       if (dnode_get_path(ffi, mdn, filename, DNODE, stack)) {
+               errnum = ERR_FILE_NOT_FOUND;
+               return (0);
+       }
+
+       /* get the file size and set the file position to 0 */
+       filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size;
+       filepos = 0;
+
+       /* the dnode cache points into the scratch area; stop using it */
+       dnode_buf = NULL;
+       return (1);
+}
+
+/*
+ * zfs_read reads in the data blocks pointed by the DNODE.
+ *
+ * Data is read one block at a time into file_buf, starting at filepos;
+ * the last block read stays in file_buf and serves requests that fall
+ * entirely inside [file_start, file_end).  filepos is advanced by the
+ * number of bytes handed out.
+ *
+ * Return:
+ *     len - the length successfully read in to the buffer
+ *     0   - failure
+ */
+int
+zfs_read(fsi_file_t *ffi, char *buf, int len)
+{
+       char *stack;
+       int blksz, length, movesize;
+       zfs_bootarea_t *zfs_ba = (zfs_bootarea_t *)ffi->ff_fsi->f_data;
+
+       /* first read after an open: reserve room for one data block */
+       if (file_buf == NULL) {
+               file_buf = stackbase;
+               stackbase += SPA_MAXBLOCKSIZE;
+               file_start = file_end = 0;
+       }
+       stack = stackbase;
+
+       /*
+        * If offset is in memory, move it into the buffer provided and return.
+        */
+       if (filepos >= file_start && filepos+len <= file_end) {
+               grub_memmove(buf, file_buf + filepos - file_start, len);
+               filepos += len;
+               return (len);
+       }
+
+       blksz = DNODE->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+
+       /*
+        * The block size comes from the on-disk dnode and is used as a
+        * divisor below; a dnode claiming an empty block is corrupt.
+        */
+       if (blksz <= 0) {
+               errnum = ERR_FSYS_CORRUPT;
+               return (0);
+       }
+
+       /*
+        * Entire Dnode is too big to fit into the space available.  We
+        * will need to read it in chunks.  This could be optimized to
+        * read in as large a chunk as there is space available, but for
+        * now, this only reads in one data block at a time.
+        */
+       length = len;
+       while (length) {
+               /*
+                * Find requested blkid and the offset within that block.
+                */
+               uint64_t blkid = filepos / blksz;
+
+               if ((errnum = dmu_read(ffi, DNODE, blkid, file_buf, stack)))
+                       return (0);
+
+               file_start = blkid * blksz;
+               file_end = file_start + blksz;
+
+               /* hand out no more than what is left of this block */
+               movesize = MIN(length, file_end - filepos);
+
+               grub_memmove(buf, file_buf + filepos - file_start,
+                   movesize);
+               buf += movesize;
+               length -= movesize;
+               filepos += movesize;
+       }
+
+       return (len);
+}
+
+/*
+ * No-Op
+ *
+ * Both arguments are ignored and 1 is always returned.
+ */
+int
+zfs_embed(int *start_sector, int needed_sectors)
+{
+       (void) start_sector;
+       (void) needed_sectors;
+
+       return (1);
+}
+
+/*
+ * Plugin entry point: report the name "zfs" through *name and hand the
+ * mount, open and read routines of this file to fsig_init(), whose
+ * result is returned.  The 'version' argument is not examined.
+ */
+fsi_plugin_ops_t *
+fsi_init_plugin(int version, fsi_plugin_t *fp, const char **name)
+{
+       /* static: a pointer to the table is passed on to fsig_init() */
+       static fsig_plugin_ops_t zfs_ops = {
+               FSIMAGE_PLUGIN_VERSION,
+               .fpo_mount = zfs_mount,
+               .fpo_dir = zfs_open,
+               .fpo_read = zfs_read
+       };
+
+       *name = "zfs";
+
+       return (fsig_init(fp, &zfs_ops));
+}
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/fsys_zfs.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/fsys_zfs.h   Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,203 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#ifndef _FSYS_ZFS_H
+#define        _FSYS_ZFS_H
+
+#include <fsimage_grub.h>
+#include <fsimage_priv.h>
+
+#include "zfs-include/zfs.h"
+#include "zfs-include/dmu.h"
+#include "zfs-include/spa.h"
+#include "zfs-include/zio.h"
+#include "zfs-include/zio_checksum.h"
+#include "zfs-include/vdev_impl.h"
+#include "zfs-include/zap_impl.h"
+#include "zfs-include/zap_leaf.h"
+#include "zfs-include/uberblock_impl.h"
+#include "zfs-include/dnode.h"
+#include "zfs-include/dsl_dir.h"
+#include "zfs-include/zfs_acl.h"
+#include "zfs-include/zfs_znode.h"
+#include "zfs-include/dsl_dataset.h"
+#include "zfs-include/zil.h"
+#include "zfs-include/dmu_objset.h"
+
+/*
+ * MOS/DNODE/scratch storage carved from zfs_data; macros expand a local "ffi".
+ */
+#define        MOS             ((dnode_phys_t *)(((zfs_bootarea_t *) \
+                           (ffi->ff_fsi->f_data))->zfs_data))
+#define        DNODE           (MOS+1) /* move sizeof(dnode_phys_t) bytes */
+#define        ZFS_SCRATCH     ((char *)(DNODE+1))
+
+#define        MAXNAMELEN      256
+
+typedef struct zfs_bootarea {
+       char zfs_current_bootpath[MAXNAMELEN];
+       char zfs_current_rootpool[MAXNAMELEN];
+       char zfs_current_bootfs[MAXNAMELEN];
+       uint64_t zfs_current_bootfs_obj;
+       int zfs_open;
+
+       /* cache for a file block of the currently zfs_open()-ed file */
+       void *zfs_file_buf;
+       uint64_t zfs_file_start;
+       uint64_t zfs_file_end;
+
+       /* cache for a dnode block */
+       dnode_phys_t *zfs_dnode_buf;
+       dnode_phys_t *zfs_dnode_mdn;
+       uint64_t zfs_dnode_start;
+       uint64_t zfs_dnode_end;
+
+       char *zfs_stackbase;
+       char zfs_data[0x400000];        /* 4 MiB: MOS, DNODE, then ZFS_SCRATCH */
+} zfs_bootarea_t;
+
+/*
+ * Verify dnode type; a zero type skips the check.  On mismatch the *caller*
+ * returns ERR_FSYS_CORRUPT, so use only where non-0 means failure.
+ */
+#define        VERIFY_DN_TYPE(dnp, type) \
+       if ((type) && (dnp)->dn_type != (type)) { \
+               return (ERR_FSYS_CORRUPT); \
+       }
+
+/*
+ * Verify object set type; a zero type skips the check.  On mismatch sets
+ * errnum and makes the *caller* return 0, so use only where 0 means failure.
+ */
+#define        VERIFY_OS_TYPE(osp, type) \
+       if ((type) && (osp)->os_type != (type)) { \
+               errnum = ERR_FSYS_CORRUPT; \
+               return (0); \
+       }
+
+#define        ZPOOL_PROP_BOOTFS               "bootfs"
+
+/* General macros.  NOTE(review): BSWAP_64 shifts by 32, so x must be 64-bit */
+#define        BSWAP_8(x)      ((x) & 0xff)
+#define        BSWAP_16(x)     ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
+#define        BSWAP_32(x)     ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
+#define        BSWAP_64(x)     ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
+#define        P2ROUNDUP(x, align)     (-(-(x) & -(align)))
+
+/*
+ * XXX Match these macros up with real zfs once we have nvlist support so
+ * that we can support large sector disks.
+ */
+#define        UBERBLOCK_SIZE          (1ULL << UBERBLOCK_SHIFT)
+#undef offsetof        /* replaced by the local, fully parenthesized version */
+#define        offsetof(t, m)   ((size_t)(&(((t *)0)->m)))
+#define        VDEV_UBERBLOCK_SHIFT    UBERBLOCK_SHIFT
+#define        VDEV_UBERBLOCK_OFFSET(n) \
+offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT])
+
+typedef struct uberblock uberblock_t;
+
+/* XXX Uberblock_phys_t is no longer in the kernel zfs */
+typedef struct uberblock_phys {
+       uberblock_t     ubp_uberblock;
+       char            ubp_pad[UBERBLOCK_SIZE - sizeof (uberblock_t) -
+                               sizeof (zio_block_tail_t)];
+       zio_block_tail_t ubp_zbt;       /* member sizes sum to UBERBLOCK_SIZE */
+} uberblock_phys_t;
+
+/*
+ * Macros to get fields in a bp or DVA; arguments are fully parenthesized.
+ */
+#define        P2PHASE(x, align)               ((x) & ((align) - 1))
+#define        DVA_OFFSET_TO_PHYS_SECTOR(offset) \
+       (((offset) + VDEV_LABEL_START_SIZE) >> SPA_MINBLOCKSHIFT)
+
+/*
+ * For nvlist manipulation. (from nvpair.h)
+ */
+#define        NV_ENCODE_NATIVE        0
+#define        NV_ENCODE_XDR           1
+#define        HOST_ENDIAN             1       /* for x86 machine */
+#define        DATA_TYPE_UINT64        8
+#define        DATA_TYPE_STRING        9
+#define        DATA_TYPE_NVLIST        19
+#define        DATA_TYPE_NVLIST_ARRAY  20
+
+/*
+ * Decompression Entry - lzjb
+ */
+#ifndef        NBBY
+#define        NBBY    8
+#endif
+
+typedef int zfs_decomp_func_t(void *s_start, void *d_start, size_t s_len,
+                       size_t d_len);
+typedef struct decomp_entry {
+       char *name;
+       zfs_decomp_func_t *decomp_func;
+} decomp_entry_t;
+
+/*
+ * FAT ZAP data structures
+ */
+#define        ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
+#define        ZAP_HASH_IDX(hash, n)   (((n) == 0) ? 0 : ((hash) >> (64 - (n))))
+#define        CHAIN_END       0xffff  /* end of the chunk chain */
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2)
+ */
+#define        ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+#define        ZAP_LEAF_HASH_SHIFT(bs) ((bs) - 5)
+#define        ZAP_LEAF_HASH_NUMENTRIES(bs) (1 << ZAP_LEAF_HASH_SHIFT(bs))
+#define        LEAF_HASH(bs, h) /* reads the caller's variable l */ \
+       ((ZAP_LEAF_HASH_NUMENTRIES(bs)-1) & \
+       ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(bs)-l->l_hdr.lh_prefix_len)))
+
+/*
+ * The amount of space available for chunks is:
+ * block size (1<<bs) - hash entry size (2) * number of hash
+ * entries - header space (2*chunksize)
+ */
+#define        ZAP_LEAF_NUMCHUNKS(bs) \
+       (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES(bs)) / \
+       ZAP_LEAF_CHUNKSIZE - 2)
+
+/*
+ * The chunks start immediately after the hash table.  The end of the
+ * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a
+ * chunk_t.  The leaf argument may be any pointer expression.
+ */
+#define        ZAP_LEAF_CHUNK(l, bs, idx) \
+       ((zap_leaf_chunk_t *)((l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(bs)))[idx]
+#define        ZAP_LEAF_ENTRY(l, bs, idx) (&ZAP_LEAF_CHUNK(l, bs, idx).l_entry)
+
+extern void fletcher_2_native(const void *, uint64_t, zio_cksum_t *);
+extern void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *);
+extern void fletcher_4_native(const void *, uint64_t, zio_cksum_t *);
+extern void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *);
+extern void zio_checksum_SHA256(const void *, uint64_t, zio_cksum_t *);
+extern int lzjb_decompress(void *, void *, size_t, size_t);
+
+#endif /* !_FSYS_ZFS_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/mb_info.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/mb_info.h    Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,217 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 2000,2003  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ *  The structure type "mod_list" is used by the "multiboot_info" structure.
+ */
+
+struct mod_list
+{
+  /* the memory used goes from bytes 'mod_start' to 'mod_end-1' inclusive */
+  unsigned long mod_start;
+  unsigned long mod_end;
+  
+  /* Module command line */
+  unsigned long cmdline;
+  
+  /* padding to take it to 16 bytes (must be zero) */
+  unsigned long pad;
+};
+
+
+/*
+ *  INT-15, AX=E820 style "AddressRangeDescriptor"
+ *  ...with a "size" parameter on the front which is the structure size - 4,
+ *  pointing to the next one, up until the full buffer length of the memory
+ *  map has been reached.
+ */
+
+struct AddrRangeDesc
+{
+  unsigned long size;
+  unsigned long long BaseAddr;
+  unsigned long long Length;
+  unsigned long Type;
+  
+  /* unspecified optional padding... */
+} __attribute__ ((packed));
+
+/* usable memory "Type", all others are reserved.  */
+#define MB_ARD_MEMORY          1
+
+
+/* Drive Info structure.  */
+struct drive_info
+{
+  /* The size of this structure.  */
+  unsigned long size;
+
+  /* The BIOS drive number.  */
+  unsigned char drive_number;
+
+  /* The access mode (see below).  */
+  unsigned char drive_mode;
+
+  /* The BIOS geometry.  */
+  unsigned short drive_cylinders;
+  unsigned char drive_heads;
+  unsigned char drive_sectors;
+
+  /* The array of I/O ports used for the drive.  */
+  unsigned short drive_ports[0];
+};
+
+/* Drive Mode.  */
+#define MB_DI_CHS_MODE         0
+#define MB_DI_LBA_MODE         1
+
+
+/* APM BIOS info.  */
+struct apm_info
+{
+  unsigned short version;
+  unsigned short cseg;
+  unsigned long offset;
+  unsigned short cseg_16;
+  unsigned short dseg_16;
+  unsigned short cseg_len;
+  unsigned short cseg_16_len;
+  unsigned short dseg_16_len;
+};
+
+
+/*
+ *  MultiBoot Info description
+ *
+ *  This is the struct passed to the boot image.  This is done by placing
+ *  its address in the EBX register (EAX carries the MULTIBOOT_VALID magic).
+ */
+
+struct multiboot_info
+{
+  /* MultiBoot info version number */
+  unsigned long flags;
+  
+  /* Available memory from BIOS */
+  unsigned long mem_lower;
+  unsigned long mem_upper;
+  
+  /* "root" partition */
+  unsigned long boot_device;
+  
+  /* Kernel command line */
+  unsigned long cmdline;
+  
+  /* Boot-Module list */
+  unsigned long mods_count;
+  unsigned long mods_addr;
+  
+  union
+  {
+    struct
+    {
+      /* (a.out) Kernel symbol table info */
+      unsigned long tabsize;
+      unsigned long strsize;
+      unsigned long addr;
+      unsigned long pad;
+    }
+    a;
+    
+    struct
+    {
+      /* (ELF) Kernel section header table */
+      unsigned long num;
+      unsigned long size;
+      unsigned long addr;
+      unsigned long shndx;
+    }
+    e;
+  }
+  syms;
+  
+  /* Memory Mapping buffer */
+  unsigned long mmap_length;
+  unsigned long mmap_addr;
+  
+  /* Drive Info buffer */
+  unsigned long drives_length;
+  unsigned long drives_addr;
+  
+  /* ROM configuration table */
+  unsigned long config_table;
+  
+  /* Boot Loader Name */
+  unsigned long boot_loader_name;
+
+  /* APM table */
+  unsigned long apm_table;
+
+  /* Video */
+  unsigned long vbe_control_info;
+  unsigned long vbe_mode_info;
+  unsigned short vbe_mode;
+  unsigned short vbe_interface_seg;
+  unsigned short vbe_interface_off;
+  unsigned short vbe_interface_len;
+};
+
+/*
+ *  Flags to be set in the 'flags' parameter above
+ */
+
+/* is there basic lower/upper memory information? */
+#define MB_INFO_MEMORY                 0x00000001
+/* is there a boot device set? */
+#define MB_INFO_BOOTDEV                        0x00000002
+/* is the command-line defined? */
+#define MB_INFO_CMDLINE                        0x00000004
+/* are there modules to do something with? */
+#define MB_INFO_MODS                   0x00000008
+
+/* These next two are mutually exclusive */
+
+/* is there a symbol table loaded? */
+#define MB_INFO_AOUT_SYMS              0x00000010
+/* is there an ELF section header table? */
+#define MB_INFO_ELF_SHDR               0x00000020
+
+/* is there a full memory map? */
+#define MB_INFO_MEM_MAP                        0x00000040
+
+/* Is there drive info?  */
+#define MB_INFO_DRIVE_INFO             0x00000080
+
+/* Is there a config table?  */
+#define MB_INFO_CONFIG_TABLE           0x00000100
+
+/* Is there a boot loader name?  */
+#define MB_INFO_BOOT_LOADER_NAME       0x00000200
+
+/* Is there a APM table?  */
+#define MB_INFO_APM_TABLE              0x00000400
+
+/* Is there video information?  */
+#define MB_INFO_VIDEO_INFO             0x00000800
+
+/*
+ *  The following value must be present in the EAX register.
+ */
+
+#define MULTIBOOT_VALID                        0x2BADB002
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/dmu.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/dmu.h    Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,105 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_DMU_H
+#define        _SYS_DMU_H
+
+/*
+ * This file describes the interface that the DMU provides for its
+ * consumers.
+ *
+ * The DMU also interacts with the SPA.  That interface is described in
+ * dmu_spa.h.
+ */
+typedef enum dmu_object_type {
+       DMU_OT_NONE,
+       /* general: */
+       DMU_OT_OBJECT_DIRECTORY,        /* ZAP */
+       DMU_OT_OBJECT_ARRAY,            /* UINT64 */
+       DMU_OT_PACKED_NVLIST,           /* UINT8 (XDR by nvlist_pack/unpack) */
+       DMU_OT_PACKED_NVLIST_SIZE,      /* UINT64 */
+       DMU_OT_BPLIST,                  /* UINT64 */
+       DMU_OT_BPLIST_HDR,              /* UINT64 */
+       /* spa: */
+       DMU_OT_SPACE_MAP_HEADER,        /* UINT64 */
+       DMU_OT_SPACE_MAP,               /* UINT64 */
+       /* zil: */
+       DMU_OT_INTENT_LOG,              /* UINT64 */
+       /* dmu: */
+       DMU_OT_DNODE,                   /* DNODE */
+       DMU_OT_OBJSET,                  /* OBJSET */
+       /* dsl: */
+       DMU_OT_DSL_DIR,                 /* UINT64 */
+       DMU_OT_DSL_DIR_CHILD_MAP,       /* ZAP */
+       DMU_OT_DSL_DS_SNAP_MAP,         /* ZAP */
+       DMU_OT_DSL_PROPS,               /* ZAP */
+       DMU_OT_DSL_DATASET,             /* UINT64 */
+       /* zpl: */
+       DMU_OT_ZNODE,                   /* ZNODE */
+       DMU_OT_ACL,                     /* ACL */
+       DMU_OT_PLAIN_FILE_CONTENTS,     /* UINT8 */
+       DMU_OT_DIRECTORY_CONTENTS,      /* ZAP */
+       DMU_OT_MASTER_NODE,             /* ZAP */
+       DMU_OT_UNLINKED_SET,            /* ZAP */
+       /* zvol: */
+       DMU_OT_ZVOL,                    /* UINT8 */
+       DMU_OT_ZVOL_PROP,               /* ZAP */
+       /* other; for testing only! */
+       DMU_OT_PLAIN_OTHER,             /* UINT8 */
+       DMU_OT_UINT64_OTHER,            /* UINT64 */
+       DMU_OT_ZAP_OTHER,               /* ZAP */
+       /* new object types: */
+       DMU_OT_ERROR_LOG,               /* ZAP */
+       DMU_OT_SPA_HISTORY,             /* UINT8 */
+       DMU_OT_SPA_HISTORY_OFFSETS,     /* spa_his_phys_t */
+       DMU_OT_POOL_PROPS,              /* ZAP */
+
+       DMU_OT_NUMTYPES
+} dmu_object_type_t;
+
+typedef enum dmu_objset_type {
+       DMU_OST_NONE,
+       DMU_OST_META,
+       DMU_OST_ZFS,
+       DMU_OST_ZVOL,
+       DMU_OST_OTHER,                  /* For testing only! */
+       DMU_OST_ANY,                    /* Be careful! */
+       DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+/*
+ * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
+ */
+#define        DMU_POOL_DIRECTORY_OBJECT       1
+#define        DMU_POOL_CONFIG                 "config"
+#define        DMU_POOL_ROOT_DATASET           "root_dataset"
+#define        DMU_POOL_SYNC_BPLIST            "sync_bplist"
+#define        DMU_POOL_ERRLOG_SCRUB           "errlog_scrub"
+#define        DMU_POOL_ERRLOG_LAST            "errlog_last"
+#define        DMU_POOL_SPARES                 "spares"
+#define        DMU_POOL_DEFLATE                "deflate"
+#define        DMU_POOL_HISTORY                "history"
+#define        DMU_POOL_PROPS                  "pool_props"
+#define        DMU_POOL_L2CACHE                "l2cache"
+
+#endif /* _SYS_DMU_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/dmu_objset.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/dmu_objset.h     Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,35 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_DMU_OBJSET_H
+#define        _SYS_DMU_OBJSET_H
+
+typedef struct objset_phys {
+       dnode_phys_t os_meta_dnode;
+       zil_header_t os_zil_header;
+       uint64_t os_type;
+       char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) -
+           sizeof (uint64_t)];         /* member sizes sum to 1024 bytes */
+} objset_phys_t;
+
+#endif /* _SYS_DMU_OBJSET_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/dnode.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/dnode.h  Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,76 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_DNODE_H
+#define        _SYS_DNODE_H
+
+/*
+ * Fixed constants.
+ */
+#define        DNODE_SHIFT             9       /* 512 bytes */
+#define        DN_MIN_INDBLKSHIFT      10      /* 1k */
+#define        DN_MAX_INDBLKSHIFT      14      /* 16k */
+#define        DNODE_BLOCK_SHIFT       14      /* 16k */
+#define        DNODE_CORE_SIZE         64      /* 64 bytes for dnode sans blkptrs */
+#define        DN_MAX_OBJECT_SHIFT     48      /* 256 trillion (zfs_fid_t limit) */
+#define        DN_MAX_OFFSET_SHIFT     64      /* 2^64 bytes in a dnode */
+
+/*
+ * Derived constants.
+ */
+#define        DNODE_SIZE      (1 << DNODE_SHIFT)
+#define        DN_MAX_NBLKPTR  ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
+#define        DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+#define        DN_MAX_OBJECT   (1ULL << DN_MAX_OBJECT_SHIFT)
+
+#define        DNODES_PER_BLOCK_SHIFT  (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
+#define        DNODES_PER_BLOCK        (1ULL << DNODES_PER_BLOCK_SHIFT)
+#define        DNODES_PER_LEVEL_SHIFT  (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
+
+#define        DN_BONUS(dnp)   ((void*)((dnp)->dn_bonus + \
+       (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
+
+typedef struct dnode_phys {
+       uint8_t dn_type;                /* dmu_object_type_t */
+       uint8_t dn_indblkshift;         /* ln2(indirect block size) */
+       uint8_t dn_nlevels;             /* 1=dn_blkptr->data blocks */
+       uint8_t dn_nblkptr;             /* length of dn_blkptr */
+       uint8_t dn_bonustype;           /* type of data in bonus buffer */
+       uint8_t dn_checksum;            /* ZIO_CHECKSUM type */
+       uint8_t dn_compress;            /* ZIO_COMPRESS type */
+       uint8_t dn_flags;               /* DNODE_FLAG_* */
+       uint16_t dn_datablkszsec;       /* data block size in 512b sectors */
+       uint16_t dn_bonuslen;           /* length of dn_bonus */
+       uint8_t dn_pad2[4];
+
+       /* accounting is protected by dn_dirty_mtx */
+       uint64_t dn_maxblkid;           /* largest allocated block ID */
+       uint64_t dn_used;               /* bytes (or sectors) of disk space */
+
+       uint64_t dn_pad3[4];
+
+       blkptr_t dn_blkptr[1];
+       uint8_t dn_bonus[DN_MAX_BONUSLEN];
+} dnode_phys_t;
+
+#endif /* _SYS_DNODE_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/dsl_dataset.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/dsl_dataset.h    Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,53 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_DSL_DATASET_H
+#define        _SYS_DSL_DATASET_H
+
+typedef struct dsl_dataset_phys {
+       uint64_t ds_dir_obj;
+       uint64_t ds_prev_snap_obj;
+       uint64_t ds_prev_snap_txg;
+       uint64_t ds_next_snap_obj;
+       uint64_t ds_snapnames_zapobj;   /* zap obj of snaps; ==0 for snaps */
+       uint64_t ds_num_children;       /* clone/snap children; ==0 for head */
+       uint64_t ds_creation_time;      /* seconds since 1970 */
+       uint64_t ds_creation_txg;
+       uint64_t ds_deadlist_obj;
+       uint64_t ds_used_bytes;
+       uint64_t ds_compressed_bytes;
+       uint64_t ds_uncompressed_bytes;
+       uint64_t ds_unique_bytes;       /* only relevant to snapshots */
+       /*
+        * The ds_fsid_guid is a 56-bit ID that can change to avoid
+        * collisions.  The ds_guid is a 64-bit ID that will never
+        * change, so there is a small probability that it will collide.
+        */
+       uint64_t ds_fsid_guid;
+       uint64_t ds_guid;
+       uint64_t ds_flags;
+       blkptr_t ds_bp;
+       uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+#endif /* _SYS_DSL_DATASET_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/dsl_dir.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/dsl_dir.h        Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,49 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_DSL_DIR_H
+#define        _SYS_DSL_DIR_H
+
+typedef struct dsl_dir_phys {
+       uint64_t dd_creation_time; /* not actually used */
+       uint64_t dd_head_dataset_obj;
+       uint64_t dd_parent_obj;
+       uint64_t dd_clone_parent_obj;
+       uint64_t dd_child_dir_zapobj;
+       /*
+        * how much space our children are accounting for; for leaf
+        * datasets, == physical space used by fs + snaps
+        */
+       uint64_t dd_used_bytes;
+       uint64_t dd_compressed_bytes;
+       uint64_t dd_uncompressed_bytes;
+       /* Administrative quota setting */
+       uint64_t dd_quota;
+       /* Administrative reservation setting */
+       uint64_t dd_reserved;
+       uint64_t dd_props_zapobj;
+       uint64_t dd_deleg_zapobj;       /* dataset permissions */
+       uint64_t dd_pad[20]; /* pad out to 256 bytes for good measure */
+} dsl_dir_phys_t;
+
+#endif /* _SYS_DSL_DIR_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/spa.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/spa.h    Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,283 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SPA_H
+#define        _SYS_SPA_H
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define        BF32_DECODE(x, low, len)        P2PHASE((x) >> (low), 1U << (len))
+#define        BF64_DECODE(x, low, len)        P2PHASE((x) >> (low), 1ULL << (len))
+#define        BF32_ENCODE(x, low, len)        (P2PHASE((x), 1U << (len)) << (low))
+#define        BF64_ENCODE(x, low, len)        (P2PHASE((x), 1ULL << (len)) << (low))
+
+#define        BF32_GET(x, low, len)           BF32_DECODE(x, low, len)
+#define        BF64_GET(x, low, len)           BF64_DECODE(x, low, len)
+/* BFxx_SET: xor the (old ^ val) field bits into x so the field becomes val. */
+#define        BF32_SET(x, low, len, val)      \
+       ((x) ^= BF32_ENCODE(((x) >> (low)) ^ (val), low, len))
+#define        BF64_SET(x, low, len, val)      \
+       ((x) ^= BF64_ENCODE(((x) >> (low)) ^ (val), low, len))
+
+#define        BF32_GET_SB(x, low, len, shift, bias)   \
+       ((BF32_GET(x, low, len) + (bias)) << (shift))
+#define        BF64_GET_SB(x, low, len, shift, bias)   \
+       ((BF64_GET(x, low, len) + (bias)) << (shift))
+
+#define        BF32_SET_SB(x, low, len, shift, bias, val)      \
+       BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define        BF64_SET_SB(x, low, len, shift, bias, val)      \
+       BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+
+/*
+ * We currently support nine block sizes, from 512 bytes to 128K.
+ * We could go higher, but the benefits are near-zero and the cost
+ * of COWing a giant block to modify one byte would become excessive.
+ */
+#define        SPA_MINBLOCKSHIFT       9       /* 2^9  = 512 bytes */
+#define        SPA_MAXBLOCKSHIFT       17      /* 2^17 = 128K */
+#define        SPA_MINBLOCKSIZE        (1ULL << SPA_MINBLOCKSHIFT)
+#define        SPA_MAXBLOCKSIZE        (1ULL << SPA_MAXBLOCKSHIFT)
+
+#define        SPA_BLOCKSIZES          (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+
+/*
+ * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
+ * The ASIZE encoding should be at least 64 times larger (6 more bits)
+ * to support up to 4-way RAID-Z mirror mode with worst-case gang block
+ * overhead, three DVAs per bp, plus one more bit in case we do anything
+ * else that expands the ASIZE.
+ */
+#define        SPA_LSIZEBITS           16      /* LSIZE up to 32M (2^16 * 512) */
+#define        SPA_PSIZEBITS           16      /* PSIZE up to 32M (2^16 * 512) */
+#define        SPA_ASIZEBITS           24      /* ASIZE up to 64 times larger  */
+
+/*
+ * All SPA data is represented by 128-bit data virtual addresses (DVAs).
+ * The members of the dva_t should be considered opaque outside the SPA.
+ */
+typedef struct dva {
+       uint64_t        dva_word[2];    /* [0] vdev|GRID|ASIZE, [1] G|offset */
+} dva_t;
+
+/*
+ * Each block has a 256-bit checksum -- strong enough for cryptographic hashes.
+ */
+typedef struct zio_cksum {
+       uint64_t        zc_word[4];     /* 4 x 64 bits = 256 bits */
+} zio_cksum_t;
+
+/*
+ * Each block is described by its DVAs, time of birth, checksum, etc.
+ * The word-by-word, bit-by-bit layout of the blkptr is as follows:
+ *
+ *     64      56      48      40      32      24      16      8       0
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0   |               vdev1           | GRID  |         ASIZE         |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 1   |G|                      offset1                                |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 2   |               vdev2           | GRID  |         ASIZE         |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 3   |G|                      offset2                                |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 4   |               vdev3           | GRID  |         ASIZE         |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 5   |G|                      offset3                                |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6   |E| lvl | type  | cksum | comp  |     PSIZE     |     LSIZE     |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7   |                       padding                                 |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 8   |                       padding                                 |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 9   |                       padding                                 |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a   |                       birth txg                               |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b   |                       fill count                              |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * c   |                       checksum[0]                             |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * d   |                       checksum[1]                             |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * e   |                       checksum[2]                             |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ * f   |                       checksum[3]                             |
+ *     +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * vdev                virtual device ID
+ * offset      offset into virtual device
+ * LSIZE       logical size
+ * PSIZE       physical size (after compression)
+ * ASIZE       allocated size (including RAID-Z parity and gang block headers)
+ * GRID                RAID-Z layout information (reserved for future use)
+ * cksum       checksum function
+ * comp                compression function
+ * G           gang block indicator
+ * E           endianness
+ * type                DMU object type
+ * lvl         level of indirection
+ * birth txg   transaction group in which the block was born
+ * fill count  number of non-zero blocks under this bp
+ * checksum[4] 256-bit checksum of the data this bp describes
+ */
+typedef struct blkptr {
+       dva_t           blk_dva[3];     /* 3 x 128-bit Data Virtual Address */
+       uint64_t        blk_prop;       /* size, compression, type, etc */
+       uint64_t        blk_pad[3];     /* Extra space for the future   */
+       uint64_t        blk_birth;      /* transaction group at birth   */
+       uint64_t        blk_fill;       /* fill count                   */
+       zio_cksum_t     blk_cksum;      /* 256-bit checksum             */
+} blkptr_t;                            /* 16 x 64-bit words = 128 bytes */
+
+#define        SPA_BLKPTRSHIFT 7               /* blkptr_t is 128 bytes        */
+#define        SPA_DVAS_PER_BP 3               /* Number of DVAs in a bp       */
+
+/*
+ * Macros to get and set fields in a bp or DVA.
+ */
+#define        DVA_GET_ASIZE(dva)      \
+       BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
+#define        DVA_SET_ASIZE(dva, x)   \
+       BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0, x)
+
+#define        DVA_GET_GRID(dva)       BF64_GET((dva)->dva_word[0], 24, 8)
+#define        DVA_SET_GRID(dva, x)    BF64_SET((dva)->dva_word[0], 24, 8, x)
+
+#define        DVA_GET_VDEV(dva)       BF64_GET((dva)->dva_word[0], 32, 32)
+#define        DVA_SET_VDEV(dva, x)    BF64_SET((dva)->dva_word[0], 32, 32, x)
+
+#define        DVA_GET_OFFSET(dva)     \
+       BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
+#define        DVA_SET_OFFSET(dva, x)  \
+       BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
+
+#define        DVA_GET_GANG(dva)       BF64_GET((dva)->dva_word[1], 63, 1)
+#define        DVA_SET_GANG(dva, x)    BF64_SET((dva)->dva_word[1], 63, 1, x)
+
+#define        BP_GET_LSIZE(bp)        \
+       (BP_IS_HOLE(bp) ? 0 : \
+       BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define        BP_SET_LSIZE(bp, x)     \
+       BF64_SET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+
+#define        BP_GET_PSIZE(bp)        \
+       BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
+#define        BP_SET_PSIZE(bp, x)     \
+       BF64_SET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
+
+#define        BP_GET_COMPRESS(bp)     BF64_GET((bp)->blk_prop, 32, 8)
+#define        BP_SET_COMPRESS(bp, x)  BF64_SET((bp)->blk_prop, 32, 8, x)
+
+#define        BP_GET_CHECKSUM(bp)     BF64_GET((bp)->blk_prop, 40, 8)
+#define        BP_SET_CHECKSUM(bp, x)  BF64_SET((bp)->blk_prop, 40, 8, x)
+
+#define        BP_GET_TYPE(bp)         BF64_GET((bp)->blk_prop, 48, 8)
+#define        BP_SET_TYPE(bp, x)      BF64_SET((bp)->blk_prop, 48, 8, x)
+
+#define        BP_GET_LEVEL(bp)        BF64_GET((bp)->blk_prop, 56, 5)
+#define        BP_SET_LEVEL(bp, x)     BF64_SET((bp)->blk_prop, 56, 5, x)
+
+#define        BP_GET_BYTEORDER(bp)    (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define        BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
+
+#define        BP_GET_ASIZE(bp)        \
+       (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+               DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+/* BP_GET_UCSIZE is an expression: no trailing ';', so it can be nested. */
+#define        BP_GET_UCSIZE(bp) \
+       ((BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) ? \
+       BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
+
+#define        BP_GET_NDVAS(bp)        \
+       (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+       !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+       !!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define        BP_COUNT_GANG(bp)       \
+       (DVA_GET_GANG(&(bp)->blk_dva[0]) + \
+       DVA_GET_GANG(&(bp)->blk_dva[1]) + \
+       DVA_GET_GANG(&(bp)->blk_dva[2]))
+
+#define        DVA_EQUAL(dva1, dva2)   \
+       (((dva1)->dva_word[1] == (dva2)->dva_word[1]) && \
+       ((dva1)->dva_word[0] == (dva2)->dva_word[0]))
+
+#define        ZIO_CHECKSUM_EQUAL(zc1, zc2) \
+       ((((zc1).zc_word[0] ^ (zc2).zc_word[0]) | \
+       ((zc1).zc_word[1] ^ (zc2).zc_word[1]) | \
+       ((zc1).zc_word[2] ^ (zc2).zc_word[2]) | \
+       ((zc1).zc_word[3] ^ (zc2).zc_word[3])) == 0)
+
+
+#define        DVA_IS_VALID(dva)       (0 != DVA_GET_ASIZE(dva))
+/* Statement macro: store four 64-bit words into *zcp; terminate with ';'. */
+#define        ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3)   \
+do {                                           \
+       (zcp)->zc_word[0] = (w0);               \
+       (zcp)->zc_word[1] = (w1);               \
+       (zcp)->zc_word[2] = (w2);               \
+       (zcp)->zc_word[3] = (w3);               \
+} while (0)
+
+#define        BP_IDENTITY(bp)         (&(bp)->blk_dva[0])
+#define        BP_IS_GANG(bp)          DVA_GET_GANG(BP_IDENTITY(bp))
+#define        BP_IS_HOLE(bp)          ((bp)->blk_birth == 0)
+#define        BP_IS_OLDER(bp, txg)    (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
+/* Statement macro: clear every word of *bp; terminate with ';'. */
+#define        BP_ZERO(bp)                             \
+do {                                           \
+       (bp)->blk_dva[0].dva_word[0] = 0;       \
+       (bp)->blk_dva[0].dva_word[1] = 0;       \
+       (bp)->blk_dva[1].dva_word[0] = 0;       \
+       (bp)->blk_dva[1].dva_word[1] = 0;       \
+       (bp)->blk_dva[2].dva_word[0] = 0;       \
+       (bp)->blk_dva[2].dva_word[1] = 0;       \
+       (bp)->blk_prop = 0;                     \
+       (bp)->blk_pad[0] = 0;                   \
+       (bp)->blk_pad[1] = 0;                   \
+       (bp)->blk_pad[2] = 0;                   \
+       (bp)->blk_birth = 0;                    \
+       (bp)->blk_fill = 0;                     \
+       ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
+} while (0)
+
+/*
+ * Note: the byteorder is either 0 or -1, both of which are palindromes.
+ * This simplifies the endianness handling a bit.
+ */
+#ifdef _BIG_ENDIAN
+#define        ZFS_HOST_BYTEORDER      (0ULL)
+#else
+#define        ZFS_HOST_BYTEORDER      (-1ULL)
+#endif
+
+#define        BP_SHOULD_BYTESWAP(bp)  (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
+
+#define        BP_SPRINTF_LEN  320
+
+#endif /* _SYS_SPA_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/uberblock_impl.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/uberblock_impl.h Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,49 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_UBERBLOCK_IMPL_H
+#define        _SYS_UBERBLOCK_IMPL_H
+
+/*
+ * The uberblock version is incremented whenever an incompatible on-disk
+ * format change is made to the SPA, DMU, or ZAP.
+ *
+ * Note: the first two fields should never be moved.  When a storage pool
+ * is opened, the uberblock must be read off the disk before the version
+ * can be checked.  If the ub_version field is moved, we may not detect
+ * version mismatch.  If the ub_magic field is moved, applications that
+ * expect the magic number in the first word won't work.
+ */
+#define        UBERBLOCK_MAGIC         0x00bab10c              /* oo-ba-bloc!  */
+#define        UBERBLOCK_SHIFT         10                      /* up to 1K     */
+
+struct uberblock {
+       uint64_t        ub_magic;       /* UBERBLOCK_MAGIC              */
+       uint64_t        ub_version;     /* SPA_VERSION                  */
+       uint64_t        ub_txg;         /* txg of last sync             */
+       uint64_t        ub_guid_sum;    /* sum of all vdev guids        */
+       uint64_t        ub_timestamp;   /* UTC time of last sync        */
+       blkptr_t        ub_rootbp;      /* MOS objset_phys_t            */
+};
+
+#endif /* _SYS_UBERBLOCK_IMPL_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/vdev_impl.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/vdev_impl.h      Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,70 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VDEV_IMPL_H
+#define        _SYS_VDEV_IMPL_H
+
+#define        VDEV_SKIP_SIZE          (8 << 10)
+#define        VDEV_BOOT_HEADER_SIZE   (8 << 10)
+#define        VDEV_PHYS_SIZE          (112 << 10)
+#define        VDEV_UBERBLOCK_RING     (128 << 10)
+
+/* ZFS boot block */
+#define        VDEV_BOOT_MAGIC         0x2f5b007b10cULL
+#define        VDEV_BOOT_VERSION       1               /* version number       */
+
+typedef struct vdev_boot_header {
+       uint64_t        vb_magic;               /* VDEV_BOOT_MAGIC      */
+       uint64_t        vb_version;             /* VDEV_BOOT_VERSION    */
+       uint64_t        vb_offset;              /* start offset (bytes) */
+       uint64_t        vb_size;                /* size (bytes)         */
+       char            vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)];
+} vdev_boot_header_t;  /* vb_pad rounds this up to VDEV_BOOT_HEADER_SIZE */
+
+typedef struct vdev_phys {
+       char            vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)];
+       zio_block_tail_t vp_zbt;        /* tail occupying the rest of the area */
+} vdev_phys_t;         /* sized so vp_nvlist + vp_zbt span VDEV_PHYS_SIZE */
+
+typedef struct vdev_label {
+       char            vl_pad[VDEV_SKIP_SIZE];                 /*   8K */
+       vdev_boot_header_t vl_boot_header;                      /*   8K */
+       vdev_phys_t     vl_vdev_phys;                           /* 112K */
+       char            vl_uberblock[VDEV_UBERBLOCK_RING];      /* 128K */
+} vdev_label_t;                                                        /* 256K total */
+
+/*
+ * Size and offset of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define        VDEV_BOOT_OFFSET        (2 * sizeof (vdev_label_t))
+#define        VDEV_BOOT_SIZE          (7ULL << 19)                    /* 3.5M */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
+ */
+#define        VDEV_LABEL_START_SIZE   (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
+#define        VDEV_LABEL_END_SIZE     (2 * sizeof (vdev_label_t))
+#define        VDEV_LABELS             4
+
+#endif /* _SYS_VDEV_IMPL_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/zap_impl.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/zap_impl.h       Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,110 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_ZAP_IMPL_H
+#define        _SYS_ZAP_IMPL_H
+
+#define        ZAP_MAGIC 0x2F52AB2ABULL
+
+#define        ZAP_HASHBITS            28
+#define        MZAP_ENT_LEN            64      /* sizeof (mzap_ent_phys_t) */
+#define        MZAP_NAME_LEN           (MZAP_ENT_LEN - 8 - 4 - 2)      /* = 50 */
+#define        MZAP_MAX_BLKSHIFT       SPA_MAXBLOCKSHIFT
+#define        MZAP_MAX_BLKSZ          (1 << MZAP_MAX_BLKSHIFT)
+
+typedef struct mzap_ent_phys {
+       uint64_t mze_value;
+       uint32_t mze_cd;
+       uint16_t mze_pad;       /* in case we want to chain them someday */
+       char mze_name[MZAP_NAME_LEN];   /* fills the entry to MZAP_ENT_LEN */
+} mzap_ent_phys_t;
+
+typedef struct mzap_phys {
+       uint64_t mz_block_type; /* ZBT_MICRO */
+       uint64_t mz_salt;
+       uint64_t mz_pad[6];     /* header totals 64 bytes before mz_chunk */
+       mzap_ent_phys_t mz_chunk[1];
+       /* actually variable size depending on block size */
+} mzap_phys_t;
+
+/*
+ * The (fat) zap is stored in one object. It is an array of
+ * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of:
+ *
+ * ptrtbl fits in first block:
+ *     [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ...
+ *
+ * ptrtbl too big for first block:
+ *     [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ...
+ *
+ */
+
+#define        ZBT_LEAF                ((1ULL << 63) + 0)
+#define        ZBT_HEADER              ((1ULL << 63) + 1)
+#define        ZBT_MICRO               ((1ULL << 63) + 3)
+/* Any other value in the block-type word means a ptrtbl block. */
+
+/*
+ * The embedded pointer table takes up half a block, so its index shift is
+ * block shift - 3 (8-byte entries, 2^3) - 1 (half the block).
+ */
+#define        ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1)
+
+/*
+ * The embedded pointer table starts half-way through the block.  Since
+ * the pointer table itself is half the block, it starts at (64-bit)
+ * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
+ */
+#define        ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
+       ((uint64_t *)(zap)->zap_f.zap_phys) \
+       [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
+
+/*
+ * TAKE NOTE:
+ * If zap_phys_t is modified, zap_byteswap() must be modified.
+ */
+typedef struct zap_phys {
+       uint64_t zap_block_type;        /* ZBT_HEADER */
+       uint64_t zap_magic;             /* ZAP_MAGIC */
+
+       struct zap_table_phys {
+               uint64_t zt_blk;        /* starting block number */
+               uint64_t zt_numblks;    /* number of blocks */
+               uint64_t zt_shift;      /* bits to index it */
+               uint64_t zt_nextblk;    /* next (larger) copy start block */
+               uint64_t zt_blks_copied; /* number source blocks copied */
+       } zap_ptrtbl;
+
+       uint64_t zap_freeblk;           /* the next free block */
+       uint64_t zap_num_leafs;         /* number of leafs */
+       uint64_t zap_num_entries;       /* number of entries */
+       uint64_t zap_salt;              /* salt to stir into hash function */
+       /*
+        * This structure is followed by padding, and then the embedded
+        * pointer table.  The embedded pointer table takes up the second
+        * half of the block.  It is accessed using the
+        * ZAP_EMBEDDED_PTRTBL_ENT() macro.
+        */
+} zap_phys_t;
+
+#endif /* _SYS_ZAP_IMPL_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/zap_leaf.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/zap_leaf.h       Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,100 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_ZAP_LEAF_H
+#define        _SYS_ZAP_LEAF_H
+
+#define        ZAP_LEAF_MAGIC 0x2AB1EAF
+
+/* Size in bytes of one zap_leaf_chunk_t (24) */
+#define        ZAP_LEAF_CHUNKSIZE 24
+
+/*
+ * The amount of space within the chunk available for the array is:
+ * chunk size - space for type (1) - space for next pointer (2) = 21 bytes
+ */
+#define        ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3)
+
+typedef enum zap_chunk_type {
+       ZAP_CHUNK_FREE = 253,           /* tag in zap_leaf_free.lf_type */
+       ZAP_CHUNK_ENTRY = 252,          /* tag in zap_leaf_entry.le_type */
+       ZAP_CHUNK_ARRAY = 251,          /* tag in zap_leaf_array.la_type */
+       ZAP_CHUNK_TYPE_MAX = 250
+} zap_chunk_type_t;
+
+/*
+ * TAKE NOTE:
+ * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified.
+ */
+typedef struct zap_leaf_phys {
+       struct zap_leaf_header {
+               uint64_t lh_block_type;         /* ZBT_LEAF */
+               uint64_t lh_pad1;
+               uint64_t lh_prefix;             /* hash prefix of this leaf */
+               uint32_t lh_magic;              /* ZAP_LEAF_MAGIC */
+               uint16_t lh_nfree;              /* number free chunks */
+               uint16_t lh_nentries;           /* number of entries */
+               uint16_t lh_prefix_len;         /* num bits used to id this */
+
+/* above is accessible to zap, below is zap_leaf private */
+
+               uint16_t lh_freelist;           /* chunk head of free list */
+               uint8_t lh_pad2[12];
+       } l_hdr; /* 48 bytes = 2 24-byte chunks */
+
+       /*
+        * The header is followed by a hash table with
+        * ZAP_LEAF_HASH_NUMENTRIES(zap) entries.  The hash table is
+        * followed by an array of ZAP_LEAF_NUMCHUNKS(zap)
+        * zap_leaf_chunk structures.  These structures are accessed
+        * with the ZAP_LEAF_CHUNK() macro.
+        */
+
+       uint16_t l_hash[1];     /* first hash-table slot; the rest follow */
+} zap_leaf_phys_t;
+
+typedef union zap_leaf_chunk {
+       struct zap_leaf_entry {
+               uint8_t le_type;                /* always ZAP_CHUNK_ENTRY */
+               uint8_t le_int_size;            /* size of ints */
+               uint16_t le_next;               /* next entry in hash chain */
+               uint16_t le_name_chunk;         /* first chunk of the name */
+               uint16_t le_name_length;        /* bytes in name, incl null */
+               uint16_t le_value_chunk;        /* first chunk of the value */
+               uint16_t le_value_length;       /* value length in ints */
+               uint32_t le_cd;                 /* collision differentiator */
+               uint64_t le_hash;               /* hash value of the name */
+       } l_entry;      /* 1 + 1 + 5 * 2 + 4 + 8 = 24 bytes */
+       struct zap_leaf_array {
+               uint8_t la_type;                /* always ZAP_CHUNK_ARRAY */
+               uint8_t la_array[ZAP_LEAF_ARRAY_BYTES];
+               uint16_t la_next;               /* next blk or CHAIN_END */
+       } l_array;      /* 1 + 21 + 2 = 24 bytes */
+       struct zap_leaf_free {
+               uint8_t lf_type;                /* always ZAP_CHUNK_FREE */
+               uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES];
+               uint16_t lf_next;       /* next in free list, or CHAIN_END */
+       } l_free;       /* 1 + 21 + 2 = 24 bytes */
+} zap_leaf_chunk_t;
+
+#endif /* _SYS_ZAP_LEAF_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/zfs.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/zfs.h    Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,112 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_FS_ZFS_H
+#define        _SYS_FS_ZFS_H
+
+
+/*
+ * On-disk version number.
+ */
+#define        SPA_VERSION_1                   1ULL
+#define        SPA_VERSION_2                   2ULL
+#define        SPA_VERSION_3                   3ULL
+#define        SPA_VERSION_4                   4ULL
+#define        SPA_VERSION_5                   5ULL
+#define        SPA_VERSION_6                   6ULL
+#define        SPA_VERSION_7                   7ULL
+#define        SPA_VERSION_8                   8ULL
+#define        SPA_VERSION_9                   9ULL
+#define        SPA_VERSION_10                  10ULL
+#define        SPA_VERSION                     SPA_VERSION_10  /* highest listed */
+
+/*
+ * Names (string keys) used in the nvlist that describes a pool's
+ * configuration.
+ */
+#define        ZPOOL_CONFIG_VERSION            "version"
+#define        ZPOOL_CONFIG_POOL_NAME          "name"
+#define        ZPOOL_CONFIG_POOL_STATE         "state"
+#define        ZPOOL_CONFIG_POOL_TXG           "txg"
+#define        ZPOOL_CONFIG_POOL_GUID          "pool_guid"
+#define        ZPOOL_CONFIG_CREATE_TXG         "create_txg"
+#define        ZPOOL_CONFIG_TOP_GUID           "top_guid"
+#define        ZPOOL_CONFIG_VDEV_TREE          "vdev_tree"
+#define        ZPOOL_CONFIG_TYPE               "type"
+#define        ZPOOL_CONFIG_CHILDREN           "children"
+#define        ZPOOL_CONFIG_ID                 "id"
+#define        ZPOOL_CONFIG_GUID               "guid"
+#define        ZPOOL_CONFIG_PATH               "path"
+#define        ZPOOL_CONFIG_DEVID              "devid"
+#define        ZPOOL_CONFIG_METASLAB_ARRAY     "metaslab_array"
+#define        ZPOOL_CONFIG_METASLAB_SHIFT     "metaslab_shift"
+#define        ZPOOL_CONFIG_ASHIFT             "ashift"
+#define        ZPOOL_CONFIG_ASIZE              "asize"
+#define        ZPOOL_CONFIG_DTL                "DTL"
+#define        ZPOOL_CONFIG_STATS              "stats"
+#define        ZPOOL_CONFIG_WHOLE_DISK         "whole_disk"
+#define        ZPOOL_CONFIG_ERRCOUNT           "error_count"
+#define        ZPOOL_CONFIG_NOT_PRESENT        "not_present"
+#define        ZPOOL_CONFIG_SPARES             "spares"
+#define        ZPOOL_CONFIG_IS_SPARE           "is_spare"
+#define        ZPOOL_CONFIG_NPARITY            "nparity"
+#define        ZPOOL_CONFIG_PHYS_PATH          "phys_path"
+#define        ZPOOL_CONFIG_L2CACHE            "l2cache"
+/*
+ * The persistent vdev state is stored as separate values rather than a single
+ * 'vdev_state' entry.  This is because a device can be in multiple states, such
+ * as offline and degraded.
+ */
+#define        ZPOOL_CONFIG_OFFLINE            "offline"
+#define        ZPOOL_CONFIG_FAULTED            "faulted"
+#define        ZPOOL_CONFIG_DEGRADED           "degraded"
+#define        ZPOOL_CONFIG_REMOVED            "removed"
+
+#define        VDEV_TYPE_ROOT                  "root"
+#define        VDEV_TYPE_MIRROR                "mirror"
+#define        VDEV_TYPE_REPLACING             "replacing"
+#define        VDEV_TYPE_RAIDZ                 "raidz"
+#define        VDEV_TYPE_DISK                  "disk"
+#define        VDEV_TYPE_FILE                  "file"
+#define        VDEV_TYPE_MISSING               "missing"
+#define        VDEV_TYPE_SPARE                 "spare"
+#define        VDEV_TYPE_L2CACHE               "l2cache" /* same string as ZPOOL_CONFIG_L2CACHE */
+
+/*
+ * pool state.  The following states are written to disk as part of the normal
+ * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE.  The remaining
+ * states are software abstractions used at various levels to communicate pool
+ * state.
+ */
+typedef enum pool_state {
+       POOL_STATE_ACTIVE = 0,          /* 0: In active use             */
+       POOL_STATE_EXPORTED,            /* 1: Explicitly exported       */
+       POOL_STATE_DESTROYED,           /* 2: Explicitly destroyed      */
+       POOL_STATE_SPARE,               /* 3: Reserved for hot spare use */
+       POOL_STATE_L2CACHE,             /* 4: Level 2 ARC device        */
+       POOL_STATE_UNINITIALIZED,       /* 5: Internal spa_t state      */
+       POOL_STATE_UNAVAIL,             /* 6: Internal libzfs state     */
+       POOL_STATE_POTENTIALLY_ACTIVE   /* 7: Internal libzfs state     */
+} pool_state_t;
+
+#endif /* _SYS_FS_ZFS_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/zfs_acl.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/zfs_acl.h        Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,55 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_FS_ZFS_ACL_H
+#define        _SYS_FS_ZFS_ACL_H
+
+typedef struct zfs_oldace {
+       uint32_t        z_fuid;         /* "who" */
+       uint32_t        z_access_mask;  /* access mask */
+       uint16_t        z_flags;        /* flags, i.e inheritance */
+       uint16_t        z_type;         /* type of entry allow/deny */
+} zfs_oldace_t;
+
+#define        ACE_SLOT_CNT    6
+
+typedef struct zfs_znode_acl_v0 {
+       uint64_t        z_acl_extern_obj;         /* ext acl pieces */
+       uint32_t        z_acl_count;              /* Number of ACEs */
+       uint16_t        z_acl_version;            /* acl version */
+       uint16_t        z_acl_pad;                /* pad */
+       zfs_oldace_t    z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */
+} zfs_znode_acl_v0_t;
+
+#define        ZFS_ACE_SPACE   (sizeof (zfs_oldace_t) * ACE_SLOT_CNT)
+
+typedef struct zfs_znode_acl {
+       uint64_t        z_acl_extern_obj;         /* ext acl pieces */
+       uint32_t        z_acl_size;               /* Number of bytes in ACL */
+       uint16_t        z_acl_version;            /* acl version */
+       uint16_t        z_acl_count;              /* ace count */
+       uint8_t         z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
+} zfs_znode_acl_t;
+
+
+#endif /* _SYS_FS_ZFS_ACL_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/zfs_znode.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/zfs_znode.h      Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,68 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_FS_ZFS_ZNODE_H
+#define        _SYS_FS_ZFS_ZNODE_H
+
+#define        MASTER_NODE_OBJ 1
+#define        ZFS_ROOT_OBJ            "ROOT"
+#define        ZPL_VERSION_STR         "VERSION"
+
+#define        ZPL_VERSION             3ULL
+
+#define        ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
+
+/*
+ * This is the persistent portion of the znode.  It is stored
+ * in the "bonus buffer" of the file.  Short symbolic links
+ * are also stored in the bonus buffer.
+ */
+typedef struct znode_phys {
+       uint64_t zp_atime[2];           /*  0 - last file access time */
+       uint64_t zp_mtime[2];           /* 16 - last file modification time */
+       uint64_t zp_ctime[2];           /* 32 - last file change time */
+       uint64_t zp_crtime[2];          /* 48 - creation time */
+       uint64_t zp_gen;                /* 64 - generation (txg of creation) */
+       uint64_t zp_mode;               /* 72 - file mode bits */
+       uint64_t zp_size;               /* 80 - size of file */
+       uint64_t zp_parent;             /* 88 - directory parent (`..') */
+       uint64_t zp_links;              /* 96 - number of links to file */
+       uint64_t zp_xattr;              /* 104 - DMU object for xattrs */
+       uint64_t zp_rdev;               /* 112 - dev_t for VBLK & VCHR files */
+       uint64_t zp_flags;              /* 120 - persistent flags */
+       uint64_t zp_uid;                /* 128 - file owner */
+       uint64_t zp_gid;                /* 136 - owning group */
+       uint64_t zp_pad[4];             /* 144 - future */
+       zfs_znode_acl_t zp_acl;         /* 176 - 263 ACL */
+       /*
+        * Data may pad out any remaining bytes in the znode buffer, eg:
+        *
+        * |<---------------------- dnode_phys (512) ------------------------>|
+        * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->|
+        *                      |<---- znode (264) ---->|<---- data (56) ---->|
+        *
+        * At present, we only use this space to store symbolic links.
+        */
+} znode_phys_t;
+
+#endif /* _SYS_FS_ZFS_ZNODE_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/zil.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/zil.h    Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,51 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef        _SYS_ZIL_H
+#define        _SYS_ZIL_H
+
+/*
+ * Intent log format:
+ *
+ * Each objset has its own intent log.  The log header (zil_header_t)
+ * for objset N's intent log is kept in the Nth object of the SPA's
+ * intent_log objset.  The log header points to a chain of log blocks,
+ * each of which contains log records (i.e., transactions) followed by
+ * a log block trailer (zil_trailer_t).  The format of a log record
+ * depends on the record (or transaction) type, but all records begin
+ * with a common structure that defines the type, length, and txg.
+ */
+
+/*
+ * Intent log header - this on disk structure holds fields to manage
+ * the log.  All fields are 64 bit to easily handle cross architectures.
+ */
+typedef struct zil_header {
+       uint64_t zh_claim_txg;  /* txg in which log blocks were claimed */
+       uint64_t zh_replay_seq; /* highest replayed sequence number */
+       blkptr_t zh_log;        /* log chain */
+       uint64_t zh_claim_seq;  /* highest claimed sequence number */
+       uint64_t zh_pad[5];
+} zil_header_t;
+
+#endif /* _SYS_ZIL_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/zio.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/zio.h    Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,81 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _ZIO_H
+#define        _ZIO_H
+
+#define        ZBT_MAGIC       0x210da7ab10c7a11ULL    /* zio data bloc tail */
+
+typedef struct zio_block_tail {
+       uint64_t        zbt_magic;      /* for validation, endianness   */
+       zio_cksum_t     zbt_cksum;      /* 256-bit checksum             */
+} zio_block_tail_t;
+
+/*
+ * Gang block headers are self-checksumming and contain an array
+ * of block pointers.
+ */
+#define        SPA_GANGBLOCKSIZE       SPA_MINBLOCKSIZE
+#define        SPA_GBH_NBLKPTRS        ((SPA_GANGBLOCKSIZE - \
+       sizeof (zio_block_tail_t)) / sizeof (blkptr_t))
+#define        SPA_GBH_FILLER          ((SPA_GANGBLOCKSIZE - \
+       sizeof (zio_block_tail_t) - \
+       (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
+       sizeof (uint64_t))
+
+#define        ZIO_GET_IOSIZE(zio)     \
+       (BP_IS_GANG((zio)->io_bp) ? \
+       SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
+
+typedef struct zio_gbh {
+       blkptr_t                zg_blkptr[SPA_GBH_NBLKPTRS];
+       uint64_t                zg_filler[SPA_GBH_FILLER];
+       zio_block_tail_t        zg_tail;
+} zio_gbh_phys_t;
+
+enum zio_checksum {
+       ZIO_CHECKSUM_INHERIT = 0,
+       ZIO_CHECKSUM_ON,
+       ZIO_CHECKSUM_OFF,
+       ZIO_CHECKSUM_LABEL,
+       ZIO_CHECKSUM_GANG_HEADER,
+       ZIO_CHECKSUM_ZILOG,
+       ZIO_CHECKSUM_FLETCHER_2,
+       ZIO_CHECKSUM_FLETCHER_4,
+       ZIO_CHECKSUM_SHA256,
+       ZIO_CHECKSUM_FUNCTIONS
+};
+
+#define        ZIO_CHECKSUM_ON_VALUE   ZIO_CHECKSUM_FLETCHER_2
+#define        ZIO_CHECKSUM_DEFAULT    ZIO_CHECKSUM_ON
+
+enum zio_compress {
+       ZIO_COMPRESS_INHERIT = 0,
+       ZIO_COMPRESS_ON,
+       ZIO_COMPRESS_OFF,
+       ZIO_COMPRESS_LZJB,
+       ZIO_COMPRESS_EMPTY,
+       ZIO_COMPRESS_FUNCTIONS
+};
+
+#endif /* _ZIO_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs-include/zio_checksum.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs-include/zio_checksum.h   Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,42 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZIO_CHECKSUM_H
+#define        _SYS_ZIO_CHECKSUM_H
+
+/*
+ * Signature for checksum functions.
+ */
+typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+
+/*
+ * Information about each checksum function.
+ */
+typedef struct zio_checksum_info {
+       zio_checksum_t  *ci_func[2]; /* checksum function for each byteorder */
+       int             ci_correctable; /* number of correctable bits   */
+       int             ci_zbt;         /* uses zio block tail? */
+       char            *ci_name;       /* descriptive name */
+} zio_checksum_info_t;
+
+#endif /* _SYS_ZIO_CHECKSUM_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs_fletcher.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs_fletcher.c       Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,93 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include "fsys_zfs.h"
+
+
+/*
+ * Fletcher-2 checksum over native-endian 64-bit words.
+ *
+ * The buffer is treated as two interleaved streams (even and odd words).
+ * Each stream keeps a running sum and a running sum of those sums.  Every
+ * iteration reads both word[0] and word[1], so 'size' is expected to be a
+ * multiple of 16 bytes.  The four accumulators are passed, in the order
+ * (even sum, odd sum, even sum-of-sums, odd sum-of-sums), to
+ * ZIO_SET_CHECKSUM() along with zcp.
+ */
+void
+fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+       const uint64_t *word = buf;
+       const uint64_t *const end = word + (size / sizeof (uint64_t));
+       uint64_t sum_even = 0, sum_odd = 0;
+       uint64_t acc_even = 0, acc_odd = 0;
+
+       while (word < end) {
+               sum_even += word[0];
+               sum_odd += word[1];
+               acc_even += sum_even;
+               acc_odd += sum_odd;
+               word += 2;
+       }
+
+       ZIO_SET_CHECKSUM(zcp, sum_even, sum_odd, acc_even, acc_odd);
+}
+
+/*
+ * Fletcher-2 checksum over 64-bit words of the opposite byte order.
+ *
+ * Same accumulation as the native variant, except that every word goes
+ * through BSWAP_64() before it is added.  Two words are read per
+ * iteration, so 'size' is expected to be a multiple of 16 bytes.
+ */
+void
+fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+       const uint64_t *word = buf;
+       const uint64_t *const end = word + (size / sizeof (uint64_t));
+       uint64_t sum_even = 0, sum_odd = 0;
+       uint64_t acc_even = 0, acc_odd = 0;
+
+       while (word < end) {
+               sum_even += BSWAP_64(word[0]);
+               sum_odd += BSWAP_64(word[1]);
+               acc_even += sum_even;
+               acc_odd += sum_odd;
+               word += 2;
+       }
+
+       ZIO_SET_CHECKSUM(zcp, sum_even, sum_odd, acc_even, acc_odd);
+}
+
+/*
+ * Fletcher-4 checksum over native-endian 32-bit words.
+ *
+ * Four 64-bit accumulators are chained: each input word is added to the
+ * first, the first to the second, the second to the third and the third
+ * to the fourth, once per word.  Bytes beyond a whole number of 32-bit
+ * words are not read.  The four sums are passed, in order, to
+ * ZIO_SET_CHECKSUM() along with zcp.
+ */
+void
+fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+       const uint32_t *word = buf;
+       const uint32_t *const end = word + (size / sizeof (uint32_t));
+       uint64_t s1 = 0, s2 = 0, s3 = 0, s4 = 0;
+
+       while (word < end) {
+               s1 += *word++;
+               s2 += s1;
+               s3 += s2;
+               s4 += s3;
+       }
+
+       ZIO_SET_CHECKSUM(zcp, s1, s2, s3, s4);
+}
+
+/*
+ * Fletcher-4 checksum over 32-bit words of the opposite byte order.
+ *
+ * Same four chained 64-bit accumulators as the native variant, except
+ * that every word goes through BSWAP_32() before it is added.
+ */
+void
+fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+       const uint32_t *word = buf;
+       const uint32_t *const end = word + (size / sizeof (uint32_t));
+       uint64_t s1 = 0, s2 = 0, s3 = 0, s4 = 0;
+
+       while (word < end) {
+               s1 += BSWAP_32(word[0]);
+               s2 += s1;
+               s3 += s2;
+               s4 += s3;
+               word++;
+       }
+
+       ZIO_SET_CHECKSUM(zcp, s1, s2, s3, s4);
+}
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs_lzjb.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs_lzjb.c   Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,60 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include "fsys_zfs.h"
+
+#define        MATCH_BITS      6
+#define        MATCH_MIN       3
+#define        OFFSET_MASK     ((1 << (16 - MATCH_BITS)) - 1)
+
+
+/*
+ * Decompress an LZJB stream.
+ *
+ * s_start/s_len - compressed input and its length in bytes
+ * d_start/d_len - output buffer and the number of bytes to produce
+ *
+ * The stream is a sequence of groups: one "copymap" byte whose bits, taken
+ * from least to most significant, describe the next NBBY items.  A clear
+ * bit means one literal byte follows; a set bit means a two-byte match
+ * (length in the top MATCH_BITS bits, back-offset in the remaining bits)
+ * that is copied from earlier output.
+ *
+ * Returns 0 once d_len bytes have been written, or -1 if the stream is
+ * corrupt: a match that points before the start of the output, or input
+ * that runs out before the output is complete.
+ */
+int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len)
+{
+       unsigned char *src = s_start;
+       unsigned char *s_end = (unsigned char *)s_start + s_len;
+       unsigned char *dst = d_start;
+       unsigned char *d_end = (unsigned char *)d_start + d_len;
+       unsigned char *cpy;
+       unsigned char copymap = '\0';
+       int copymask = 1 << (NBBY - 1);
+
+       while (dst < d_end) {
+               if ((copymask <<= 1) == (1 << NBBY)) {
+                       /* Start of a new group: fetch its copymap byte. */
+                       if (src >= s_end)
+                               return (-1);
+                       copymask = 1;
+                       copymap = *src++;
+               }
+               if (copymap & (unsigned char)copymask) {
+                       int mlen, offset;
+
+                       /* A match descriptor occupies two input bytes. */
+                       if (s_end - src < 2)
+                               return (-1);
+                       mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+                       offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+                       src += 2;
+                       /*
+                        * Compare distances rather than forming a pointer
+                        * that may lie before the output buffer.
+                        */
+                       if (offset > dst - (unsigned char *)d_start)
+                               return (-1);
+                       cpy = dst - offset;
+                       while (--mlen >= 0 && dst < d_end)
+                               *dst++ = *cpy++;
+               } else {
+                       if (src >= s_end)
+                               return (-1);
+                       *dst++ = *src++;
+               }
+       }
+       return (0);
+}
diff -r f2457c7aff8d -r 611787b6ca35 tools/libfsimage/zfs/zfs_sha256.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libfsimage/zfs/zfs_sha256.c Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,124 @@
+/*
+ *  GRUB  --  GRand Unified Bootloader
+ *  Copyright (C) 1999,2000,2001,2002,2003,2004  Free Software Foundation, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include "fsys_zfs.h"
+
+/*
+ * SHA-256 checksum, as specified in FIPS 180-2, available at:
+ * http://csrc.nist.gov/cryptval
+ *
+ * This is a very compact implementation of SHA-256.
+ * It is designed to be simple and portable, not to be fast.
+ */
+
+/*
+ * The literal definitions according to FIPS180-2 would be:
+ *
+ *     Ch(x, y, z)     (((x) & (y)) ^ ((~(x)) & (z)))
+ *     Maj(x, y, z)    (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+ *
+ * We use logical equivalents which require one less op.
+ */
+#define        Ch(x, y, z)     ((z) ^ ((x) & ((y) ^ (z))))
+#define        Maj(x, y, z)    (((x) & (y)) ^ ((z) & ((x) ^ (y))))
+/* Rotate the 32-bit value x right by s bits; s must be in 1..31. */
+#define        Rot32(x, s)     (((x) >> (s)) | ((x) << (32 - (s))))
+#define        SIGMA0(x)       (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
+#define        SIGMA1(x)       (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
+#define        sigma0(x)       (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
+#define        sigma1(x)       (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
+
+static const uint32_t SHA256_K[64] = {
+       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/*
+ * Run the SHA-256 compression function over one 64-byte block.
+ *
+ * H  - the eight 32-bit chaining values; updated in place
+ * cp - the 64 input bytes, read as sixteen big-endian 32-bit words
+ */
+static void
+SHA256Transform(uint32_t *H, const uint8_t *cp)
+{
+       uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
+
+       /*
+        * Widen each byte before shifting: a bare uint8_t promotes to int,
+        * and shifting a value >= 0x80 left by 24 overflows a signed int.
+        */
+       for (t = 0; t < 16; t++, cp += 4)
+               W[t] = ((uint32_t)cp[0] << 24) | ((uint32_t)cp[1] << 16) |
+                   ((uint32_t)cp[2] << 8) | (uint32_t)cp[3];
+
+       /* Expand the sixteen input words into the 64-entry schedule. */
+       for (t = 16; t < 64; t++)
+               W[t] = sigma1(W[t - 2]) + W[t - 7] +
+                   sigma0(W[t - 15]) + W[t - 16];
+
+       a = H[0]; b = H[1]; c = H[2]; d = H[3];
+       e = H[4]; f = H[5]; g = H[6]; h = H[7];
+
+       /* 64 rounds over the working variables. */
+       for (t = 0; t < 64; t++) {
+               T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
+               T2 = SIGMA0(a) + Maj(a, b, c);
+               h = g; g = f; f = e; e = d + T1;
+               d = c; c = b; b = a; a = T1 + T2;
+       }
+
+       /* Fold the result back into the chaining values. */
+       H[0] += a; H[1] += b; H[2] += c; H[3] += d;
+       H[4] += e; H[5] += f; H[6] += g; H[7] += h;
+}
+
+/*
+ * Compute the SHA-256 digest of 'size' bytes at 'buf'.
+ *
+ * Whole 64-byte blocks are hashed straight from the caller's buffer.  The
+ * remaining 0..63 bytes are copied into a local pad area, followed by the
+ * 0x80 marker, zero fill and the 64-bit big-endian bit count, and hashed
+ * from there.  The eight 32-bit state words are packed pairwise into four
+ * 64-bit values and passed to ZIO_SET_CHECKSUM() along with zcp.
+ */
+void
+zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+       uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+           0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+       const uint8_t *data = buf;
+       uint8_t pad[128];
+       /* 64-bit offsets so buffers of 2GB and more cannot wrap the index. */
+       uint64_t whole = size & ~(uint64_t)63;
+       uint64_t off;
+       int padsize = size & 63;
+       int i;
+
+       for (off = 0; off < whole; off += 64)
+               SHA256Transform(H, data + off);
+
+       /* The tail starts after the last whole block, not at buf[0]. */
+       for (i = 0; i < padsize; i++)
+               pad[i] = data[whole + i];
+
+       /* 0x80 terminator, then zeros up to the 8-byte length field. */
+       for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
+               pad[padsize] = 0;
+
+       /* Message length in bits, most significant byte first. */
+       for (i = 0; i < 8; i++)
+               pad[padsize++] = (size << 3) >> (56 - 8 * i);
+
+       /* One or two padded blocks, depending on the tail length. */
+       for (i = 0; i < padsize; i += 64)
+               SHA256Transform(H, pad + i);
+
+       ZIO_SET_CHECKSUM(zcp,
+           (uint64_t)H[0] << 32 | H[1],
+           (uint64_t)H[2] << 32 | H[3],
+           (uint64_t)H[4] << 32 | H[5],
+           (uint64_t)H[6] << 32 | H[7]);
+}
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libxc/Makefile      Thu May 08 18:40:07 2008 +0900
@@ -53,6 +53,7 @@ GUEST_SRCS-y                 += xc_dom_c
 GUEST_SRCS-y                 += xc_dom_compat_linux.c
 
 GUEST_SRCS-$(CONFIG_X86)     += xc_dom_x86.c
+GUEST_SRCS-$(CONFIG_X86)     += xc_cpuid_x86.c
 GUEST_SRCS-$(CONFIG_IA64)    += xc_dom_ia64.c
 GUEST_SRCS-$(CONFIG_POWERPC) += xc_dom_powerpc.c
 endif
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/xc_cpufeature.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_cpufeature.h       Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,115 @@
+#ifndef __LIBXC_CPUFEATURE_H
+#define __LIBXC_CPUFEATURE_H
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
+#define X86_FEATURE_FPU                (0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME                (0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE         (0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE        (0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC                (0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR                (0*32+ 5) /* Model-Specific Registers, RDMSR, WRMSR */
+#define X86_FEATURE_PAE                (0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE                (0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8                (0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC       (0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP                (0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR       (0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE                (0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA                (0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV       (0*32+15) /* CMOV instruction (FCMOVCC and FCOMI too if FPU present) */
+#define X86_FEATURE_PAT                (0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36      (0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN         (0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLSH     (0*32+19) /* Supports the CLFLUSH instruction */
+#define X86_FEATURE_DS         (0*32+21) /* Debug Store */
+#define X86_FEATURE_ACPI       (0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX                (0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR       (0*32+24) /* FXSAVE and FXRSTOR instructions (fast save and restore */
+                                         /* of FPU context), and CR4.OSFXSR available */
+#define X86_FEATURE_XMM                (0*32+25) /* Streaming SIMD Extensions */
+#define X86_FEATURE_XMM2       (0*32+26) /* Streaming SIMD Extensions-2 */
+#define X86_FEATURE_SELFSNOOP  (0*32+27) /* CPU self snoop */
+#define X86_FEATURE_HT         (0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC                (0*32+29) /* Automatic clock control */
+#define X86_FEATURE_IA64       (0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE                (0*32+31) /* Pending Break Enable */
+
+/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
+/* Don't duplicate feature flags which are redundant with Intel! */
+#define X86_FEATURE_SYSCALL    (1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP         (1*32+19) /* MP Capable. */
+#define X86_FEATURE_NX         (1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT     (1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FFXSR       (1*32+25) /* FFXSR instruction optimizations */
+#define X86_FEATURE_PAGE1GB    (1*32+26) /* 1Gb large page support */
+#define X86_FEATURE_RDTSCP     (1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM         (1*32+29) /* Long Mode (x86-64) */
+#define X86_FEATURE_3DNOWEXT   (1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW      (1*32+31) /* 3DNow! */
+
+/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
+#define X86_FEATURE_RECOVERY   (2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN    (2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI       (2*32+ 3) /* LongRun table interface */
+
+/* Other features, Linux-defined mapping, word 3 */
+/* This range is used for feature bits which conflict or are synthesized */
+#define X86_FEATURE_CXMMX      (3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR    (3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR  (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR        (3*32+ 3) /* Centaur MCRs (= MTRRs) */
+/* cpu types for specific tunings: */
+#define X86_FEATURE_K8         (3*32+ 4) /* Opteron, Athlon64 */
+#define X86_FEATURE_K7         (3*32+ 5) /* Athlon */
+#define X86_FEATURE_P3         (3*32+ 6) /* P3 */
+#define X86_FEATURE_P4         (3*32+ 7) /* P4 */
+#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
+
+/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
+#define X86_FEATURE_XMM3       (4*32+ 0) /* Streaming SIMD Extensions-3 */
+#define X86_FEATURE_DTES64     (4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT      (4*32+ 3) /* Monitor/Mwait support */
+#define X86_FEATURE_DSCPL      (4*32+ 4) /* CPL Qualified Debug Store */
+#define X86_FEATURE_VMXE       (4*32+ 5) /* Virtual Machine Extensions */
+#define X86_FEATURE_SMXE       (4*32+ 6) /* Safer Mode Extensions */
+#define X86_FEATURE_EST                (4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2                (4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3      (4*32+ 9) /* Supplemental Streaming SIMD Extensions-3 */
+#define X86_FEATURE_CID                (4*32+10) /* Context ID */
+#define X86_FEATURE_CX16        (4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR       (4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM       (4*32+15) /* Perf/Debug Capability MSR */
+#define X86_FEATURE_DCA                (4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_SSE4_1     (4*32+19) /* Streaming SIMD Extensions 4.1 */
+#define X86_FEATURE_SSE4_2     (4*32+20) /* Streaming SIMD Extensions 4.2 */
+#define X86_FEATURE_POPCNT     (4*32+23) /* POPCNT instruction */
+
+/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
+#define X86_FEATURE_XSTORE     (5*32+ 2) /* on-CPU RNG present (xstore insn) */
+#define X86_FEATURE_XSTORE_EN  (5*32+ 3) /* on-CPU RNG enabled */
+#define X86_FEATURE_XCRYPT     (5*32+ 6) /* on-CPU crypto (xcrypt insn) */
+#define X86_FEATURE_XCRYPT_EN  (5*32+ 7) /* on-CPU crypto enabled */
+#define X86_FEATURE_ACE2       (5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN    (5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE                (5*32+ 10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN     (5*32+ 11) /* PHE enabled */
+#define X86_FEATURE_PMM                (5*32+ 12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN     (5*32+ 13) /* PMM enabled */
+
+/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
+#define X86_FEATURE_LAHF_LM    (6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVME        (6*32+ 2) /* Secure Virtual Machine */
+#define X86_FEATURE_EXTAPICSPACE (6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_ALTMOVCR   (6*32+ 4) /* LOCK MOV CR accesses CR+8 */
+#define X86_FEATURE_ABM                (6*32+ 5) /* Advanced Bit Manipulation */
+#define X86_FEATURE_SSE4A      (6*32+ 6) /* AMD Streaming SIMD Extensions-4a */
+#define X86_FEATURE_MISALIGNSSE        (6*32+ 7) /* Misaligned SSE Access */
+#define X86_FEATURE_3DNOWPF    (6*32+ 8) /* 3DNow! Prefetch */
+#define X86_FEATURE_OSVW       (6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS                (6*32+ 10) /* Instruction Based Sampling */
+#define X86_FEATURE_SSE5       (6*32+ 11) /* AMD Streaming SIMD Extensions-5 */
+#define X86_FEATURE_SKINIT     (6*32+ 12) /* SKINIT, STGI/CLGI, DEV */
+#define X86_FEATURE_WDT                (6*32+ 13) /* Watchdog Timer */
+
+#endif /* __LIBXC_CPUFEATURE_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/xc_cpuid_x86.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_cpuid_x86.c        Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,433 @@
+/******************************************************************************
+ * xc_cpuid_x86.c 
+ *
+ * Compute cpuid of a domain.
+ *
+ * Copyright (c) 2008, Citrix Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <stdlib.h>
+#include "xc_private.h"
+#include "xc_cpufeature.h"
+#include <xen/hvm/params.h>
+
+#define bitmaskof(idx)      (1u << ((idx) & 31))
+#define clear_bit(idx, dst) ((dst) &= ~(1u << (idx)))
+#define set_bit(idx, dst)   ((dst) |= (1u << (idx)))
+
+#define DEF_MAX_BASE 0x00000004u
+#define DEF_MAX_EXT  0x80000008u
+
+/*
+ * AMD-specific adjustment of one CPUID leaf.
+ *
+ * @input[0] selects the leaf; @regs (eax, ebx, ecx, edx) is edited in place.
+ * Leaves other than 0x00000001, 0x00000002, 0x00000004 and 0x80000001 are
+ * left untouched.
+ */
+static void amd_xc_cpuid_policy(
+    int xc, domid_t domid, const unsigned int *input, unsigned int *regs)
+{
+    unsigned long pae_enabled = 0;
+    unsigned int ecx_keep, edx_keep;
+
+    xc_get_hvm_param(xc, domid, HVM_PARAM_PAE_ENABLED, &pae_enabled);
+
+    if ( input[0] == 0x00000001 )
+    {
+        /* Mask Intel-only features. */
+        regs[2] &= ~(bitmaskof(X86_FEATURE_SSSE3) |
+                     bitmaskof(X86_FEATURE_SSE4_1) |
+                     bitmaskof(X86_FEATURE_SSE4_2));
+        return;
+    }
+
+    if ( (input[0] == 0x00000002) || (input[0] == 0x00000004) )
+    {
+        /* eax, ebx and ecx are zeroed; edx is deliberately left alone. */
+        regs[0] = 0;
+        regs[1] = 0;
+        regs[2] = 0;
+        return;
+    }
+
+    if ( input[0] != 0x80000001 )
+        return;
+
+    /* PAE is only visible when enabled for the domain; PSE36 never is. */
+    if ( !pae_enabled )
+        regs[3] &= ~bitmaskof(X86_FEATURE_PAE);
+    regs[3] &= ~bitmaskof(X86_FEATURE_PSE36);
+
+    /* Filter all other features according to a whitelist. */
+    ecx_keep = bitmaskof(X86_FEATURE_LAHF_LM) |
+               bitmaskof(X86_FEATURE_ALTMOVCR) |
+               bitmaskof(X86_FEATURE_ABM) |
+               bitmaskof(X86_FEATURE_SSE4A) |
+               bitmaskof(X86_FEATURE_MISALIGNSSE) |
+               bitmaskof(X86_FEATURE_3DNOWPF);
+    edx_keep = 0x0183f3ff | /* features shared with 0x00000001:EDX */
+               bitmaskof(X86_FEATURE_NX) |
+               bitmaskof(X86_FEATURE_LM) |
+               bitmaskof(X86_FEATURE_SYSCALL) |
+               bitmaskof(X86_FEATURE_MP) |
+               bitmaskof(X86_FEATURE_MMXEXT) |
+               bitmaskof(X86_FEATURE_FFXSR) |
+               bitmaskof(X86_FEATURE_3DNOW) |
+               bitmaskof(X86_FEATURE_3DNOWEXT);
+
+    regs[2] &= ecx_keep;
+    regs[3] &= edx_keep;
+}
+
+/*
+ * Intel-specific adjustment of one CPUID leaf.
+ *
+ * @input[0] selects the leaf; @regs (eax, ebx, ecx, edx) is edited in place.
+ * @xc and @domid are accepted for symmetry with the AMD variant but are not
+ * used here.
+ */
+static void intel_xc_cpuid_policy(
+    int xc, domid_t domid, const unsigned int *input, unsigned int *regs)
+{
+    const unsigned int leaf = input[0];
+
+    if ( leaf == 0x00000001 )
+    {
+        /* Mask AMD-only features. */
+        regs[2] &= ~(bitmaskof(X86_FEATURE_POPCNT));
+    }
+    else if ( leaf == 0x00000004 )
+    {
+        /* Keep only the low ten bits of eax and edx. */
+        regs[0] &= 0x3FF;
+        regs[3] &= 0x3FF;
+    }
+    else if ( leaf == 0x80000001 )
+    {
+        /* Only a few features are advertised in Intel's 0x80000001. */
+        regs[2] &= (bitmaskof(X86_FEATURE_LAHF_LM));
+        regs[3] &= (bitmaskof(X86_FEATURE_NX) |
+                    bitmaskof(X86_FEATURE_LM) |
+                    bitmaskof(X86_FEATURE_SYSCALL));
+    }
+}
+
+/*
+ * Execute the CPUID instruction on the host processor.
+ *
+ * input[0] is loaded into %eax and input[1] into %ecx; an input[1] equal to
+ * XEN_CPUID_INPUT_UNUSED is replaced by 0.  The results are stored as
+ * regs[0..3] = eax, ebx, ecx, edx.
+ *
+ * NOTE(review): only the 32-bit %ebx is saved and restored, and %ebx is not
+ * listed as clobbered -- presumably written with 32-bit PIC builds in mind.
+ * Confirm this is also safe on x86_64 and that the "=r" output can never be
+ * allocated to %ebx.
+ */
+static void cpuid(const unsigned int *input, unsigned int *regs)
+{
+    unsigned int count = (input[1] == XEN_CPUID_INPUT_UNUSED) ? 0 : input[1];
+    unsigned int bx_temp;
+    /* Stash %ebx, run cpuid, copy the %ebx result out, then restore %ebx. */
+    asm ( "mov %%ebx,%4; cpuid; mov %%ebx,%1; mov %4,%%ebx"
+          : "=a" (regs[0]), "=r" (regs[1]),
+          "=c" (regs[2]), "=d" (regs[3]), "=m" (bx_temp)
+          : "0" (input[0]), "2" (count) );
+}
+
+/*
+ * Get the manufacturer brand name of the host processor.
+ *
+ * Writes the 12 bytes returned by CPUID leaf 0 in ebx, edx, ecx (in that
+ * order) to @str, followed by a terminating NUL.  @str must therefore have
+ * room for at least 13 bytes.
+ */
+static void xc_cpuid_brand_get(char *str)
+{
+    unsigned int input[2] = { 0, 0 };
+    unsigned int regs[4];
+
+    cpuid(input, regs);
+
+    /*
+     * Copy with memcpy() rather than storing through a casted uint32_t
+     * pointer: @str is a plain char buffer with no alignment guarantee, and
+     * accessing it as uint32_t violates the aliasing rules.
+     */
+    memcpy(str + 0, &regs[1], sizeof(uint32_t));
+    memcpy(str + 4, &regs[3], sizeof(uint32_t));
+    memcpy(str + 8, &regs[2], sizeof(uint32_t));
+    str[12] = '\0';
+}
+
+/*
+ * Apply the default CPUID policy to one leaf.
+ *
+ * @input[0] selects the leaf and @regs (eax, ebx, ecx, edx) holds the host
+ * values on entry; it is filtered in place.  The vendor-neutral filtering
+ * below runs first, then the AMD or Intel specific policy is applied
+ * depending on whether the host vendor string contains "AMD".
+ */
+static void xc_cpuid_policy(
+    int xc, domid_t domid, const unsigned int *input, unsigned int *regs)
+{
+    char brand[13];
+    /*
+     * Default to "PAE disabled": the result of xc_get_hvm_param() is not
+     * checked, so without an initial value a failed query would leave this
+     * variable indeterminate.  Matches amd_xc_cpuid_policy().
+     */
+    unsigned long pae = 0;
+
+    xc_get_hvm_param(xc, domid, HVM_PARAM_PAE_ENABLED, &pae);
+
+    switch( input[0] )
+    {
+    case 0x00000000:
+        /* Clamp the highest advertised basic leaf. */
+        if ( regs[0] > DEF_MAX_BASE )
+            regs[0] = DEF_MAX_BASE;
+        break;
+
+    case 0x00000001:
+        /* Whitelist of feature bits passed through from the host. */
+        regs[2] &= (bitmaskof(X86_FEATURE_XMM3) |
+                    bitmaskof(X86_FEATURE_SSSE3) |
+                    bitmaskof(X86_FEATURE_CX16) |
+                    bitmaskof(X86_FEATURE_SSE4_1) |
+                    bitmaskof(X86_FEATURE_SSE4_2) |
+                    bitmaskof(X86_FEATURE_POPCNT));
+
+        regs[3] &= (bitmaskof(X86_FEATURE_FPU) |
+                    bitmaskof(X86_FEATURE_VME) |
+                    bitmaskof(X86_FEATURE_DE) |
+                    bitmaskof(X86_FEATURE_PSE) |
+                    bitmaskof(X86_FEATURE_TSC) |
+                    bitmaskof(X86_FEATURE_MSR) |
+                    bitmaskof(X86_FEATURE_PAE) |
+                    bitmaskof(X86_FEATURE_MCE) |
+                    bitmaskof(X86_FEATURE_CX8) |
+                    bitmaskof(X86_FEATURE_APIC) |
+                    bitmaskof(X86_FEATURE_SEP) |
+                    bitmaskof(X86_FEATURE_MTRR) |
+                    bitmaskof(X86_FEATURE_PGE) |
+                    bitmaskof(X86_FEATURE_MCA) |
+                    bitmaskof(X86_FEATURE_CMOV) |
+                    bitmaskof(X86_FEATURE_PAT) |
+                    bitmaskof(X86_FEATURE_CLFLSH) |
+                    bitmaskof(X86_FEATURE_MMX) |
+                    bitmaskof(X86_FEATURE_FXSR) |
+                    bitmaskof(X86_FEATURE_XMM) |
+                    bitmaskof(X86_FEATURE_XMM2));
+
+        /* We always support MTRR MSRs. */
+        regs[3] |= bitmaskof(X86_FEATURE_MTRR);
+
+        if ( !pae )
+            clear_bit(X86_FEATURE_PAE & 31, regs[3]);
+        break;
+
+    case 0x80000000:
+        /* Clamp the highest advertised extended leaf. */
+        if ( regs[0] > DEF_MAX_EXT )
+            regs[0] = DEF_MAX_EXT;
+        break;
+
+    case 0x80000001:
+        /* NX is hidden when PAE is not enabled for the domain. */
+        if ( !pae )
+            clear_bit(X86_FEATURE_NX & 31, regs[3]);
+        break;
+
+
+    case 0x80000008:
+        /* Keep only the low 16 bits of eax; clear the other registers. */
+        regs[0] &= 0x0000ffffu;
+        regs[1] = regs[2] = regs[3] = 0;
+        break;
+
+    /* These leaves are passed through unchanged. */
+    case 0x00000002:
+    case 0x00000004:
+    case 0x80000002:
+    case 0x80000003:
+    case 0x80000004:
+    case 0x80000006:
+        break;
+
+    default:
+        /* Every other leaf reads as all zeroes. */
+        regs[0] = regs[1] = regs[2] = regs[3] = 0;
+        break;
+    }
+
+    xc_cpuid_brand_get(brand);
+    if ( strstr(brand, "AMD") )
+        amd_xc_cpuid_policy(xc, domid, input, regs);
+    else
+        intel_xc_cpuid_policy(xc, domid, input, regs);
+}
+
+/*
+ * Issue XEN_DOMCTL_set_cpuid for @domid: the leaf/sub-leaf pair in @input is
+ * given the response @regs (eax, ebx, ecx, edx).  Returns the do_domctl()
+ * result.
+ */
+static int xc_cpuid_do_domctl(
+    int xc, domid_t domid,
+    const unsigned int *input, const unsigned int *regs)
+{
+    DECLARE_DOMCTL;
+
+    memset(&domctl, 0, sizeof (domctl));
+
+    /* Request header. */
+    domctl.cmd = XEN_DOMCTL_set_cpuid;
+    domctl.domain = domid;
+
+    /* Leaf selector. */
+    domctl.u.cpuid.input[1] = input[1];
+    domctl.u.cpuid.input[0] = input[0];
+
+    /* Register values to report for that leaf. */
+    domctl.u.cpuid.edx = regs[3];
+    domctl.u.cpuid.ecx = regs[2];
+    domctl.u.cpuid.ebx = regs[1];
+    domctl.u.cpuid.eax = regs[0];
+
+    return do_domctl(xc, &domctl);
+}
+
+/*
+ * Allocate a zero-filled 33-byte buffer: room for a 32-character register
+ * bit string plus its terminating NUL.
+ *
+ * Returns NULL if the allocation fails.  The caller owns the buffer and
+ * releases it with free().
+ */
+static char *alloc_str(void)
+{
+    char *s = malloc(33);
+
+    /* Do not touch the buffer when malloc() failed. */
+    if ( s != NULL )
+        memset(s, 0, 33);
+    return s;
+}
+
+/*
+ * Render four registers as 32-character strings of '0'/'1', most
+ * significant bit first.
+ *
+ * @regs: four register values.
+ * @strs: four output slots; each receives a newly allocated string that the
+ *        caller must free().  A slot is left NULL if its allocation fails.
+ */
+void xc_cpuid_to_str(const unsigned int *regs, char **strs)
+{
+    int i, j;
+
+    for ( i = 0; i < 4; i++ )
+    {
+        strs[i] = alloc_str();
+        /* Out of memory: leave this slot NULL instead of dereferencing it. */
+        if ( strs[i] == NULL )
+            continue;
+        for ( j = 0; j < 32; j++ )
+            strs[i][j] = (regs[i] & (1U << (31 - j))) ? '1' : '0';
+    }
+}
+
+/*
+ * Apply the default CPUID policy to every leaf of a domain.
+ *
+ * Walks the basic leaves (0 .. base_max) and then the extended leaves
+ * (0x80000000 .. ext_max), where both limits are the host maximum clamped to
+ * DEF_MAX_BASE / DEF_MAX_EXT.  Each leaf is read from the host, filtered by
+ * xc_cpuid_policy() and, if any register is non-zero, sent to Xen.
+ *
+ * Returns 0 on success, or the first non-zero xc_cpuid_do_domctl() result.
+ */
+int xc_cpuid_apply_policy(int xc, domid_t domid)
+{
+    unsigned int input[2] = { 0, 0 }, regs[4];
+    unsigned int base_max, ext_max;
+    int rc;
+
+    /* Highest basic and extended leaves to visit. */
+    cpuid(input, regs);
+    base_max = (regs[0] <= DEF_MAX_BASE) ? regs[0] : DEF_MAX_BASE;
+    input[0] = 0x80000000;
+    cpuid(input, regs);
+    ext_max = (regs[0] <= DEF_MAX_EXT) ? regs[0] : DEF_MAX_EXT;
+
+    input[0] = 0;
+    input[1] = XEN_CPUID_INPUT_UNUSED;
+    for ( ; ; )
+    {
+        cpuid(input, regs);
+        xc_cpuid_policy(xc, domid, input, regs);
+
+        /* Leaves that are entirely zero after filtering are not sent. */
+        if ( regs[0] || regs[1] || regs[2] || regs[3] )
+        {
+            rc = xc_cpuid_do_domctl(xc, domid, input, regs);
+            if ( rc )
+                return rc;
+
+            /* Intel cache descriptor leaves. */
+            if ( input[0] == 4 )
+            {
+                input[1]++;
+                /* More to do? Then loop keeping %%eax==0x00000004. */
+                if ( (regs[0] & 0x1f) != 0 )
+                    continue;
+            }
+        }
+
+        /* Next leaf; only leaf 4 starts with an explicit sub-leaf of 0. */
+        input[0]++;
+        input[1] = (input[0] == 4) ? 0 : XEN_CPUID_INPUT_UNUSED;
+        /* Past the last basic leaf: jump to the extended range. */
+        if ( !(input[0] & 0x80000000u) && (input[0] > base_max ) )
+            input[0] = 0x80000000u;
+
+        /* Past the last extended leaf: done. */
+        if ( (input[0] & 0x80000000u) && (input[0] > ext_max) )
+            break;
+    }
+
+    return 0;
+}
+
+/*
+ * Check whether a VM is allowed to launch on this host's processor type.
+ *
+ * @config format is similar to that of xc_cpuid_set():
+ *  '1' -> the bit must be set to 1
+ *  '0' -> must be 0
+ *  'x' -> we don't care
+ *  's' -> (same) must be the same
+ *
+ * @config holds four strings (eax, ebx, ecx, edx), each either NULL or
+ * exactly 32 characters long, most significant bit first.
+ *
+ * On success returns 0 and, for every non-NULL @config[i], stores a newly
+ * allocated copy in @config_transformed[i] with each 's' replaced by the
+ * host's bit value; the caller frees those strings.  On failure every
+ * @config_transformed entry is NULL and the result is -EPERM (bad or short
+ * string, or a '0'/'1' requirement the host does not meet) or -ENOMEM.
+ */
+int xc_cpuid_check(
+    int xc, const unsigned int *input,
+    const char **config,
+    char **config_transformed)
+{
+    int i, j, rc = -EPERM;
+    unsigned int regs[4];
+
+    memset(config_transformed, 0, 4 * sizeof(*config_transformed));
+
+    cpuid(input, regs);
+
+    for ( i = 0; i < 4; i++ )
+    {
+        if ( config[i] == NULL )
+            continue;
+
+        config_transformed[i] = alloc_str();
+        if ( config_transformed[i] == NULL )
+        {
+            rc = -ENOMEM;
+            goto fail;
+        }
+
+        for ( j = 0; j < 32; j++ )
+        {
+            unsigned char val = !!((regs[i] & (1U << (31 - j))));
+            char c = config[i][j];
+
+            /*
+             * strchr() also matches the terminating NUL, so test for it
+             * explicitly: a string shorter than 32 characters must be
+             * rejected here rather than read past its end.
+             */
+            if ( (c == '\0') || !strchr("10xs", c) ||
+                 ((c == '1') && !val) ||
+                 ((c == '0') && val) )
+                goto fail;
+
+            config_transformed[i][j] = (c == 's') ? ('0' + val) : c;
+        }
+    }
+
+    return 0;
+
+ fail:
+    for ( i = 0; i < 4; i++ )
+    {
+        free(config_transformed[i]);
+        config_transformed[i] = NULL;
+    }
+    return rc;
+}
+
+/*
+ * Configure a single input with the information from config.
+ *
+ * Config is an array of strings:
+ *   config[0] = eax
+ *   config[1] = ebx
+ *   config[2] = ecx
+ *   config[3] = edx
+ *
+ * Each string is either NULL (use the default policy for that register) or
+ * exactly 32 characters long, most significant bit first.
+ *
+ * The format of the string is the following:
+ *   '1' -> force to 1
+ *   '0' -> force to 0
+ *   'x' -> we don't care (use default)
+ *   'k' -> pass through host value
+ *   's' -> pass through the first time and then keep the same value
+ *          across save/restore and migration.
+ *
+ * On success returns 0 and stores in @config_transformed[i] a newly
+ * allocated copy of each non-NULL @config[i], with every 's' replaced by
+ * the value that was applied; the caller frees those strings.  On failure
+ * every @config_transformed entry is NULL and the result is -EINVAL (bad or
+ * short string), -ENOMEM, or the non-zero xc_cpuid_do_domctl() result.
+ */
+int xc_cpuid_set(
+    int xc, domid_t domid, const unsigned int *input,
+    const char **config, char **config_transformed)
+{
+    int rc;
+    unsigned int i, j, regs[4], polregs[4];
+
+    memset(config_transformed, 0, 4 * sizeof(*config_transformed));
+
+    /* regs holds the raw host values, polregs the default-policy values. */
+    cpuid(input, regs);
+
+    memcpy(polregs, regs, sizeof(regs));
+    xc_cpuid_policy(xc, domid, input, polregs);
+
+    for ( i = 0; i < 4; i++ )
+    {
+        if ( config[i] == NULL )
+        {
+            regs[i] = polregs[i];
+            continue;
+        }
+
+        config_transformed[i] = alloc_str();
+        if ( config_transformed[i] == NULL )
+        {
+            rc = -ENOMEM;
+            goto fail;
+        }
+
+        for ( j = 0; j < 32; j++ )
+        {
+            unsigned char val = !!((regs[i] & (1U << (31 - j))));
+            unsigned char polval = !!((polregs[i] & (1U << (31 - j))));
+            char c = config[i][j];
+
+            /*
+             * strchr() also matches the terminating NUL, so test for it
+             * explicitly: a string shorter than 32 characters must be
+             * rejected here rather than read past its end.
+             */
+            rc = -EINVAL;
+            if ( (c == '\0') || !strchr("10xks", c) )
+                goto fail;
+
+            /* 'k' and 's' keep the host value already held in val. */
+            if ( c == '1' )
+                val = 1;
+            else if ( c == '0' )
+                val = 0;
+            else if ( c == 'x' )
+                val = polval;
+
+            if ( val )
+                set_bit(31 - j, regs[i]);
+            else
+                clear_bit(31 - j, regs[i]);
+
+            config_transformed[i][j] = (c == 's') ? ('0' + val) : c;
+        }
+    }
+
+    rc = xc_cpuid_do_domctl(xc, domid, input, regs);
+    if ( rc == 0 )
+        return 0;
+
+ fail:
+    for ( i = 0; i < 4; i++ )
+    {
+        free(config_transformed[i]);
+        config_transformed[i] = NULL;
+    }
+    return rc;
+}
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c   Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libxc/xc_domain.c   Thu May 08 18:40:07 2008 +0900
@@ -795,6 +795,32 @@ int xc_deassign_device(
     return do_domctl(xc_handle, &domctl);
 }
 
+/*
+ * Issue XEN_DOMCTL_bind_pt_irq with irq_type PT_IRQ_TYPE_MSI for @domid,
+ * passing @pirq as the machine irq together with the guest vector @gvec and
+ * @gflags.  Returns the do_domctl() result.
+ */
+int xc_domain_update_msi_irq(
+    int xc_handle,
+    uint32_t domid,
+    uint32_t gvec,
+    uint32_t pirq,
+    uint32_t gflags)
+{
+    DECLARE_DOMCTL;
+    xen_domctl_bind_pt_irq_t *msi_bind = &(domctl.u.bind_pt_irq);
+
+    domctl.domain = (domid_t)domid;
+    domctl.cmd = XEN_DOMCTL_bind_pt_irq;
+
+    msi_bind->irq_type = PT_IRQ_TYPE_MSI;
+    msi_bind->hvm_domid = domid;
+    msi_bind->machine_irq = pirq;
+    msi_bind->u.msi.gflags = gflags;
+    msi_bind->u.msi.gvec = gvec;
+
+    return do_domctl(xc_handle, &domctl);
+}
+
 /* Pass-through: binds machine irq to guests irq */
 int xc_domain_bind_pt_irq(
     int xc_handle,
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/xc_minios.c
--- a/tools/libxc/xc_minios.c   Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libxc/xc_minios.c   Thu May 08 18:40:07 2008 +0900
@@ -178,7 +178,7 @@ static void evtchn_handler(evtchn_port_t
        printk("Unknown port for handle %d\n", xce_handle);
        return;
     }
-    files[xce_handle].evtchn.ports[i].pending++;
+    files[xce_handle].evtchn.ports[i].pending = 1;
     files[xce_handle].read = 1;
     wake_up(&event_queue);
 }
@@ -278,7 +278,7 @@ evtchn_port_or_error_t xc_evtchn_pending
     for (i = 0; i < MAX_EVTCHN_PORTS; i++) {
        evtchn_port_t port = files[xce_handle].evtchn.ports[i].port;
        if (port != -1 && files[xce_handle].evtchn.ports[i].pending) {
-           files[xce_handle].evtchn.ports[i].pending--;
+           files[xce_handle].evtchn.ports[i].pending = 0;
            local_irq_restore(flags);
            return port;
        }
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libxc/xc_misc.c     Thu May 08 18:40:07 2008 +0900
@@ -222,6 +222,37 @@ int xc_hvm_set_pci_link_route(
     arg.domid   = dom;
     arg.link    = link;
     arg.isa_irq = isa_irq;
+
+    if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 )
+    {
+        PERROR("Could not lock memory");
+        return rc;
+    }
+
+    rc = do_xen_hypercall(xc_handle, &hypercall);
+
+    unlock_pages(&arg, sizeof(arg));
+
+    return rc;
+}
+
+int xc_hvm_track_dirty_vram(
+    int xc_handle, domid_t dom,
+    uint64_t first_pfn, uint64_t nr,
+    unsigned long *dirty_bitmap)
+{
+    DECLARE_HYPERCALL;
+    struct xen_hvm_track_dirty_vram arg;
+    int rc;
+
+    hypercall.op     = __HYPERVISOR_hvm_op;
+    hypercall.arg[0] = HVMOP_track_dirty_vram;
+    hypercall.arg[1] = (unsigned long)&arg;
+
+    arg.domid     = dom;
+    arg.first_pfn = first_pfn;
+    arg.nr        = nr;
+    set_xen_guest_handle(arg.dirty_bitmap, (uint8_t *)dirty_bitmap);
 
     if ( (rc = lock_pages(&arg, sizeof(arg))) != 0 )
     {
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/xc_pagetab.c
--- a/tools/libxc/xc_pagetab.c  Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libxc/xc_pagetab.c  Thu May 08 18:40:07 2008 +0900
@@ -141,7 +141,7 @@ unsigned long xc_translate_foreign_addre
 
     /* Page Table */
 
-    if (pde & 0x00000008) { /* 4M page (or 2M in PAE mode) */
+    if (pde & 0x00000080) { /* 4M page (or 2M in PAE mode) */
         DPRINTF("Cannot currently cope with 2/4M pages\n");
         exit(-1);
     } else { /* 4k page */
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/xc_physdev.c
--- a/tools/libxc/xc_physdev.c  Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libxc/xc_physdev.c  Thu May 08 18:40:07 2008 +0900
@@ -19,3 +19,75 @@ int xc_physdev_pci_access_modify(int xc_
     errno = ENOSYS;
     return -1;
 }
+
+/*
+ * Issue PHYSDEVOP_map_pirq for @domid with the given @type and @index.
+ *
+ * @pirq is in/out: its value is passed in the request and, when the
+ * operation returns 0, overwritten with the pirq reported back.
+ * Returns -EINVAL if @pirq is NULL, otherwise the do_physdev_op() result.
+ */
+int xc_physdev_map_pirq(int xc_handle,
+                        int domid,
+                        int type,
+                        int index,
+                        int *pirq)
+{
+    struct physdev_map_pirq req;
+    int ret;
+
+    if ( pirq == NULL )
+        return -EINVAL;
+
+    req.pirq = *pirq;
+    req.index = index;
+    req.type = type;
+    req.domid = domid;
+
+    ret = do_physdev_op(xc_handle, PHYSDEVOP_map_pirq, &req);
+    if ( ret == 0 )
+        *pirq = req.pirq;
+
+    return ret;
+}
+
+/*
+ * Same request as xc_physdev_map_pirq(), with the msi_info fields (devfn,
+ * bus, msi) of the PHYSDEVOP_map_pirq request filled in as well.
+ *
+ * @pirq is in/out: its value is passed in the request and, when the
+ * operation returns 0, overwritten with the pirq reported back.
+ * Returns -EINVAL if @pirq is NULL, otherwise the do_physdev_op() result.
+ */
+int xc_physdev_map_pirq_msi(int xc_handle,
+                            int domid,
+                            int type,
+                            int index,
+                            int *pirq,
+                            int devfn,
+                            int bus,
+                            int msi_type)
+{
+    struct physdev_map_pirq req;
+    int ret;
+
+    if ( pirq == NULL )
+        return -EINVAL;
+
+    /* MSI-specific part of the request. */
+    req.msi_info.msi = msi_type;
+    req.msi_info.bus = bus;
+    req.msi_info.devfn = devfn;
+
+    /* Common part of the request. */
+    req.pirq = *pirq;
+    req.index = index;
+    req.type = type;
+    req.domid = domid;
+
+    ret = do_physdev_op(xc_handle, PHYSDEVOP_map_pirq, &req);
+    if ( ret == 0 )
+        *pirq = req.pirq;
+
+    return ret;
+}
+
+/*
+ * Issue PHYSDEVOP_unmap_pirq for @pirq of @domid and return the
+ * do_physdev_op() result.
+ */
+int xc_physdev_unmap_pirq(int xc_handle,
+                          int domid,
+                          int pirq)
+{
+    struct physdev_unmap_pirq req;
+
+    req.pirq = pirq;
+    req.domid = domid;
+
+    return do_physdev_op(xc_handle, PHYSDEVOP_unmap_pirq, &req);
+}
+
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/xc_private.h
--- a/tools/libxc/xc_private.h  Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libxc/xc_private.h  Thu May 08 18:40:07 2008 +0900
@@ -24,10 +24,12 @@
 #define DECLARE_HYPERCALL privcmd_hypercall_t hypercall = { 0 }
 #define DECLARE_DOMCTL struct xen_domctl domctl = { 0 }
 #define DECLARE_SYSCTL struct xen_sysctl sysctl = { 0 }
+#define DECLARE_PHYSDEV_OP struct physdev_op physdev_op = { 0 }
 #else
 #define DECLARE_HYPERCALL privcmd_hypercall_t hypercall
 #define DECLARE_DOMCTL struct xen_domctl domctl
 #define DECLARE_SYSCTL struct xen_sysctl sysctl
+#define DECLARE_PHYSDEV_OP struct physdev_op physdev_op
 #endif
 
 #undef PAGE_SHIFT
@@ -94,6 +96,34 @@ static inline int do_xen_version(int xc_
     hypercall.arg[1] = (unsigned long) dest;
 
     return do_xen_hypercall(xc_handle, &hypercall);
+}
+
+/*
+ * Issue a __HYPERVISOR_physdev_op hypercall.
+ *
+ * @cmd is passed as the first hypercall argument and the address of @op as
+ * the second.  Returns -1 if the argument memory cannot be locked,
+ * otherwise the do_xen_hypercall() result.
+ *
+ * NOTE(review): @op is a `void *`, so sizeof(*op) is not the size of the
+ * caller's structure (GCC evaluates sizeof(void) as 1 as an extension).
+ * The lock/unlock calls below therefore presumably cover only the start of
+ * the argument; fixing this needs an explicit length from the caller --
+ * confirm.
+ */
+static inline int do_physdev_op(int xc_handle, int cmd, void *op)
+{
+    int ret = -1;
+
+    DECLARE_HYPERCALL;
+    hypercall.op = __HYPERVISOR_physdev_op;
+    hypercall.arg[0] = (unsigned long) cmd;
+    hypercall.arg[1] = (unsigned long) op;
+
+    if ( lock_pages(op, sizeof(*op)) != 0 )
+    {
+        PERROR("Could not lock memory for Xen hypercall");
+        goto out1;
+    }
+
+    if ( (ret = do_xen_hypercall(xc_handle, &hypercall)) < 0 )
+    {
+        /* Only EACCES gets a diagnostic; the error is returned either way. */
+        if ( errno == EACCES )
+            DPRINTF("physdev operation failed -- need to"
+                    " rebuild the user-space tool set?\n");
+    }
+
+    unlock_pages(op, sizeof(*op));
+
+out1:
+    return ret;
 }
 
 static inline int do_domctl(int xc_handle, struct xen_domctl *domctl)
diff -r f2457c7aff8d -r 611787b6ca35 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/libxc/xenctrl.h     Thu May 08 18:40:07 2008 +0900
@@ -21,6 +21,7 @@
 #include <stdint.h>
 #include <xen/xen.h>
 #include <xen/domctl.h>
+#include <xen/physdev.h>
 #include <xen/sysctl.h>
 #include <xen/version.h>
 #include <xen/event_channel.h>
@@ -849,6 +850,25 @@ int xc_gnttab_set_max_grants(int xcg_han
 int xc_gnttab_set_max_grants(int xcg_handle,
                             uint32_t count);
 
+int xc_physdev_map_pirq(int xc_handle,
+                        int domid,
+                        int type,
+                        int index,
+                        int *pirq);
+
+int xc_physdev_map_pirq_msi(int xc_handle,
+                            int domid,
+                            int type,
+                            int index,
+                            int *pirq,
+                            int devfn,
+                            int bus,
+                            int msi_type);
+
+int xc_physdev_unmap_pirq(int xc_handle,
+                          int domid,
+                          int pirq);
+
 int xc_hvm_set_pci_intx_level(
     int xc_handle, domid_t dom,
     uint8_t domain, uint8_t bus, uint8_t device, uint8_t intx,
@@ -862,6 +882,22 @@ int xc_hvm_set_pci_link_route(
     int xc_handle, domid_t dom, uint8_t link, uint8_t isa_irq);
 
 
+/*
+ * Track dirty bit changes in the VRAM area
+ *
+ * All of this is done atomically:
+ * - get the dirty bitmap since the last call
+ * - set up dirty tracking area for period up to the next call
+ * - clear the dirty tracking area.
+ *
+ * Returns -ENODATA and does not fill bitmap if the area has changed since the
+ * last call.
+ */
+int xc_hvm_track_dirty_vram(
+    int xc_handle, domid_t dom,
+    uint64_t first_pfn, uint64_t nr,
+    unsigned long *bitmap);
+
 typedef enum {
   XC_ERROR_NONE = 0,
   XC_INTERNAL_ERROR = 1,
@@ -948,6 +984,13 @@ int xc_domain_ioport_mapping(int xc_hand
                              uint32_t first_mport,
                              uint32_t nr_ports,
                              uint32_t add_mapping);
+
+int xc_domain_update_msi_irq(
+    int xc_handle,
+    uint32_t domid,
+    uint32_t gvec,
+    uint32_t pirq,
+    uint32_t gflags);
 
 int xc_domain_bind_pt_irq(int xc_handle,
                           uint32_t domid,
@@ -983,4 +1026,20 @@ int xc_domain_set_target(int xc_handle,
                          uint32_t domid,
                          uint32_t target);
 
+#if defined(__i386__) || defined(__x86_64__)
+int xc_cpuid_check(int xc,
+                   const unsigned int *input,
+                   const char **config,
+                   char **config_transformed);
+int xc_cpuid_set(int xc,
+                 domid_t domid,
+                 const unsigned int *input,
+                 const char **config,
+                 char **config_transformed);
+int xc_cpuid_apply_policy(int xc,
+                          domid_t domid);
+void xc_cpuid_to_str(const unsigned int *regs,
+                     char **strs);
+#endif
+
 #endif /* XENCTRL_H */
diff -r f2457c7aff8d -r 611787b6ca35 tools/pygrub/src/fsimage/fsimage.c
--- a/tools/pygrub/src/fsimage/fsimage.c        Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/pygrub/src/fsimage/fsimage.c        Thu May 08 18:40:07 2008 +0900
@@ -17,7 +17,7 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  *
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -281,6 +281,22 @@ fsimage_open(PyObject *o, PyObject *args
        return (PyObject *)fs;
 }
 
+/*
+ * getbootstring(fs): pass the fsi_t held by the given fsimage object to
+ * fsi_fs_bootstring() and return the result converted with the "s" format
+ * of Py_BuildValue().
+ */
+static PyObject *
+fsimage_getbootstring(PyObject *o, PyObject *args)
+{
+       PyObject *fsobj;
+
+       if (!PyArg_ParseTuple(args, "O", &fsobj))
+               return (NULL);
+
+       return Py_BuildValue("s",
+           fsi_fs_bootstring(((fsimage_fs_t *)fsobj)->fs));
+}
+
 PyDoc_STRVAR(fsimage_open__doc__,
     "open(name, [offset=off]) - Open the given file as a filesystem image.\n"
     "\n"
@@ -288,9 +304,15 @@ PyDoc_STRVAR(fsimage_open__doc__,
     "offset - offset of file system within file image.\n"
     "options - mount options string.\n");
 
+PyDoc_STRVAR(fsimage_getbootstring__doc__,
+    "getbootstring(fs) - Return the boot string needed for this file system "
+    "or NULL if none is needed.\n");
+
 static struct PyMethodDef fsimage_module_methods[] = {
        { "open", (PyCFunction)fsimage_open,
            METH_VARARGS|METH_KEYWORDS, fsimage_open__doc__ },
+       { "getbootstring", (PyCFunction)fsimage_getbootstring,
+           METH_VARARGS, fsimage_getbootstring__doc__ },
        { NULL, NULL, 0, NULL }
 };
 
diff -r f2457c7aff8d -r 611787b6ca35 tools/pygrub/src/pygrub
--- a/tools/pygrub/src/pygrub   Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/pygrub/src/pygrub   Thu May 08 18:40:07 2008 +0900
@@ -646,7 +646,13 @@ if __name__ == "__main__":
         print "  args: %s" % chosencfg["args"]
         sys.exit(0)
 
-    fs = fsimage.open(file, get_fs_offset(file))
+    # if boot filesystem is set then pass to fsimage.open
+    bootfsargs = '"%s"' % incfg["args"]
+    bootfsgroup = re.findall('zfs-bootfs=(.*?)[\s\,\"]', bootfsargs)
+    if bootfsgroup:
+        fs = fsimage.open(file, get_fs_offset(file), bootfsgroup[0])
+    else:
+        fs = fsimage.open(file, get_fs_offset(file))
 
     chosencfg = sniff_solaris(fs, incfg)
 
@@ -672,7 +678,15 @@ if __name__ == "__main__":
     if bootcfg["ramdisk"]:
         sxp += "(ramdisk %s)" % bootcfg["ramdisk"]
     if chosencfg["args"]:
-        sxp += "(args \"%s\")" % chosencfg["args"]
+        zfsinfo = fsimage.getbootstring(fs)
+        if zfsinfo is None:
+            sxp += "(args \"%s\")" % chosencfg["args"]
+        else:
+            e = re.compile("zfs-bootfs=[\w\-\.\:@/]+" )
+            (chosencfg["args"],count) = e.subn(zfsinfo, chosencfg["args"])
+            if count == 0:
+               chosencfg["args"] += " -B %s" % zfsinfo
+            sxp += "(args \"%s\")" % (chosencfg["args"])
 
     sys.stdout.flush()
     os.write(fd, sxp)
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/lowlevel/xc/xc.c Thu May 08 18:40:07 2008 +0900
@@ -611,6 +611,110 @@ static PyObject *pyxc_set_os_type(XcObje
 }
 #endif /* __ia64__ */
 
+
+#if defined(__i386__) || defined(__x86_64__)
+/*
+ * Extract the "eax", "ebx", "ecx" and "edx" entries of a Python dict.
+ *
+ * @config: the Python object supplied by the caller.
+ * @regs:   four output slots.  Each is set to the character data of the
+ *          matching dict value (borrowed from the Python string, not to be
+ *          freed), or to NULL when @config is not a dict, the key is absent,
+ *          or the value is not a string.
+ */
+static void pyxc_dom_extract_cpuid(PyObject *config,
+                                  char **regs)
+{
+    const char *regs_extract[4] = { "eax", "ebx", "ecx", "edx" };
+    PyObject *obj;
+    int i;
+
+    memset(regs, 0, 4*sizeof(*regs));
+
+    if ( !PyDict_Check(config) )
+        return;
+
+    for ( i = 0; i < 4; i++ )
+    {
+        obj = PyDict_GetItemString(config, regs_extract[i]);
+        /*
+         * PyString_AS_STRING() does no type checking, so only use it on
+         * values that really are strings.
+         */
+        if ( (obj != NULL) && PyString_Check(obj) )
+            regs[i] = PyString_AS_STRING(obj);
+    }
+}
+
+/*
+ * Build a Python dict {"eax": ..., "ebx": ..., "ecx": ..., "edx": ...} from
+ * four malloc()ed strings.
+ *
+ * NULL entries of @regs are skipped.  Every non-NULL entry is freed and
+ * reset to NULL, whether or not the dict could be built.  Returns a new
+ * reference to the dict, or NULL if a Python allocation fails.
+ */
+static PyObject *pyxc_create_cpuid_dict(char **regs)
+{
+   const char *regs_extract[4] = { "eax", "ebx", "ecx", "edx" };
+   PyObject *dict, *str;
+   int i;
+
+   dict = PyDict_New();
+   for ( i = 0; i < 4; i++ )
+   {
+       if ( regs[i] == NULL )
+           continue;
+
+       if ( dict != NULL )
+       {
+           str = PyString_FromString(regs[i]);
+           if ( (str == NULL) ||
+                (PyDict_SetItemString(dict, regs_extract[i], str) != 0) )
+           {
+               Py_DECREF(dict);
+               dict = NULL;
+           }
+           /*
+            * PyDict_SetItemString() takes its own reference to the value,
+            * so ours must be dropped or the string object leaks.
+            */
+           Py_XDECREF(str);
+       }
+
+       free(regs[i]);
+       regs[i] = NULL;
+   }
+   return dict;
+}
+
+/*
+ * domain_check_cpuid(input, sub_input, config): run xc_cpuid_check() for
+ * one CPUID input and return the transformed register strings as a dict.
+ * The sub-input is used only when it is a Python long; otherwise
+ * XEN_CPUID_INPUT_UNUSED is passed.
+ */
+static PyObject *pyxc_dom_check_cpuid(XcObject *self,
+                                      PyObject *args)
+{
+    char *wanted[4], *transformed[4];
+    unsigned int input[2];
+    PyObject *sub_input, *config;
+
+    if ( !PyArg_ParseTuple(args, "iOO", &input[0], &sub_input, &config) )
+        return NULL;
+
+    pyxc_dom_extract_cpuid(config, wanted);
+
+    if ( PyLong_Check(sub_input) )
+        input[1] = PyLong_AsUnsignedLong(sub_input);
+    else
+        input[1] = XEN_CPUID_INPUT_UNUSED;
+
+    if ( xc_cpuid_check(self->xc_handle, input,
+                        (const char **)wanted, transformed) )
+        return pyxc_error_to_exception();
+
+    return pyxc_create_cpuid_dict(transformed);
+}
+
+/*
+ * domain_set_policy_cpuid(dom): apply the default CPUID policy to a domain.
+ * Returns 0 on success; raises the libxc error as an exception otherwise.
+ */
+static PyObject *pyxc_dom_set_policy_cpuid(XcObject *self,
+                                           PyObject *args)
+{
+    /*
+     * The "i" format stores a full C int.  Parse into an int and narrow
+     * afterwards: writing it straight into a domid_t (a 16-bit type in the
+     * Xen public headers) would overrun the variable.
+     */
+    int domid;
+
+    if ( !PyArg_ParseTuple(args, "i", &domid) )
+        return NULL;
+
+    if ( xc_cpuid_apply_policy(self->xc_handle, (domid_t)domid) )
+        return pyxc_error_to_exception();
+
+    Py_INCREF(zero);
+    return zero;
+}
+
+
+/*
+ * domain_set_cpuid(dom, input, sub_input, config): run xc_cpuid_set() for
+ * one CPUID input of a domain and return the transformed register strings
+ * as a dict.  The sub-input is used only when it is a Python long; otherwise
+ * XEN_CPUID_INPUT_UNUSED is passed.
+ */
+static PyObject *pyxc_dom_set_cpuid(XcObject *self,
+                                    PyObject *args)
+{
+    /*
+     * The "I" format stores a full unsigned int.  Parse into one and narrow
+     * afterwards: writing it straight into a domid_t (a 16-bit type in the
+     * Xen public headers) would overrun the variable.
+     */
+    unsigned int dom;
+    PyObject *sub_input, *config;
+    unsigned int input[2];
+    char *regs[4], *regs_transform[4];
+
+    if ( !PyArg_ParseTuple(args, "IIOO", &dom,
+                           &input[0], &sub_input, &config) )
+        return NULL;
+
+    pyxc_dom_extract_cpuid(config, regs);
+
+    input[1] = XEN_CPUID_INPUT_UNUSED;
+    if ( PyLong_Check(sub_input) )
+        input[1] = PyLong_AsUnsignedLong(sub_input);
+
+    if ( xc_cpuid_set(self->xc_handle, (domid_t)dom, input,
+                      (const char **)regs, regs_transform) )
+        return pyxc_error_to_exception();
+
+    return pyxc_create_cpuid_dict(regs_transform);
+}
+
+#endif /* __i386__ || __x86_64__ */
+
 static PyObject *pyxc_hvm_build(XcObject *self,
                                 PyObject *args,
                                 PyObject *kwds)
@@ -693,6 +797,26 @@ static PyObject *pyxc_evtchn_reset(XcObj
 
     Py_INCREF(zero);
     return zero;
+}
+
+/*
+ * physdev_map_pirq(domid, index, pirq): call xc_physdev_map_pirq() with
+ * type MAP_PIRQ_TYPE_GSI and return the resulting pirq as a Python long, or
+ * raise the libxc error as an exception.
+ */
+static PyObject *pyxc_physdev_map_pirq(PyObject *self,
+                                       PyObject *args,
+                                       PyObject *kwds)
+{
+    static char *kwd_list[] = {"domid", "index", "pirq", NULL};
+    XcObject *xc = (XcObject *)self;
+    uint32_t dom;
+    int index, pirq;
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iii", kwd_list,
+                                      &dom, &index, &pirq) )
+        return NULL;
+
+    if ( xc_physdev_map_pirq(xc->xc_handle, dom, MAP_PIRQ_TYPE_GSI,
+                             index, &pirq) != 0 )
+          return pyxc_error_to_exception();
+
+    return PyLong_FromUnsignedLong(pirq);
 }
 
 static PyObject *pyxc_physdev_pci_access_modify(XcObject *self,
@@ -1485,6 +1609,15 @@ static PyMethodDef pyxc_methods[] = {
       "Reset all connections.\n"
       " dom [int]: Domain to reset.\n" },
 
+    { "physdev_map_pirq",
+      (PyCFunction)pyxc_physdev_map_pirq,
+      METH_VARARGS | METH_KEYWORDS, "\n"
+      "map physical irq to guest pirq.\n"
+      " dom     [int]:      Identifier of domain to map for.\n"
+      " index   [int]:      physical irq.\n"
+      " pirq    [int]:      guest pirq.\n"
+      "Returns: [long] value of the param.\n" },
+
     { "physdev_pci_access_modify",
       (PyCFunction)pyxc_physdev_pci_access_modify,
       METH_VARARGS | METH_KEYWORDS, "\n"
@@ -1635,6 +1768,37 @@ static PyMethodDef pyxc_methods[] = {
       " log [int]: Specifies the area's size.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 #endif /* __powerpc */
+  
+#if defined(__i386__) || defined(__x86_64__)
+    { "domain_check_cpuid", 
+      (PyCFunction)pyxc_dom_check_cpuid, 
+      METH_VARARGS, "\n"
+      "Apply checks to host CPUID.\n"
+      " input [long]: Input for cpuid instruction (eax)\n"
+      " sub_input [long]: Second input (optional, may be None) for cpuid "
+      "                     instruction (ecx)\n"
+      " config [dict]: Dictionary of register\n"
+      " config [dict]: Dictionary of register, use for checking\n\n"
+      "Returns: [int] 0 on success; exception on error.\n" },
+    
+    { "domain_set_cpuid", 
+      (PyCFunction)pyxc_dom_set_cpuid, 
+      METH_VARARGS, "\n"
+      "Set cpuid response for an input and a domain.\n"
+      " dom [int]: Identifier of domain.\n"
+      " input [long]: Input for cpuid instruction (eax)\n"
+      " sub_input [long]: Second input (optional, may be None) for cpuid "
+      "                     instruction (ecx)\n"
+      " config [dict]: Dictionary of register\n\n"
+      "Returns: [int] 0 on success; exception on error.\n" },
+
+    { "domain_set_policy_cpuid", 
+      (PyCFunction)pyxc_dom_set_policy_cpuid, 
+      METH_VARARGS, "\n"
+      "Set the default cpuid policy for a domain.\n"
+      " dom [int]: Identifier of domain.\n\n"
+      "Returns: [int] 0 on success; exception on error.\n" },
+#endif
 
     { NULL, NULL, 0, NULL }
 };
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/util/acmpolicy.py
--- a/tools/python/xen/util/acmpolicy.py        Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/util/acmpolicy.py        Thu May 08 18:40:07 2008 +0900
@@ -48,8 +48,6 @@ ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY = 2
 ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY = 2
 ACM_POLICY_UNDEFINED = 15
 
-
-ACM_SCHEMA_FILE = ACM_POLICIES_DIR + "security_policy.xsd"
 
 ACM_LABEL_UNLABELED = "__UNLABELED__"
 ACM_LABEL_UNLABELED_DISPLAY = "unlabeled"
@@ -118,6 +116,153 @@ DEFAULT_policy = \
 "  </SecurityLabelTemplate>\n" +\
 "</SecurityPolicyDefinition>\n"
 
+ACM_SCHEMA="""<?xml version="1.0" encoding="UTF-8"?>
+<!-- Author: Ray Valdez, Reiner Sailer {rvaldez,sailer}@us.ibm.com -->
+<!--         This file defines the schema, which is used to define -->
+<!--         the security policy and the security labels in Xen.    -->
+
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema" targetNamespace="http://www.ibm.com" xmlns="http://www.ibm.com" elementFormDefault="qualified">
+       <xsd:element name="SecurityPolicyDefinition">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element ref="PolicyHeader" minOccurs="1" maxOccurs="1"></xsd:element>
+                               <xsd:element ref="SimpleTypeEnforcement" minOccurs="0" maxOccurs="1"></xsd:element>
+                               <xsd:element ref="ChineseWall" minOccurs="0" maxOccurs="1"></xsd:element>
+                               <xsd:element ref="SecurityLabelTemplate" minOccurs="1" maxOccurs="1"></xsd:element>
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="PolicyHeader">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element name="PolicyName" minOccurs="1" maxOccurs="1" type="xsd:string"></xsd:element>
+                               <xsd:element name="PolicyUrl" minOccurs="0" maxOccurs="1" type="xsd:string"></xsd:element>
+                               <xsd:element name="Reference" type="xsd:string" minOccurs="0" maxOccurs="1" />
+                               <xsd:element name="Date" minOccurs="0" maxOccurs="1" type="xsd:string"></xsd:element>
+                               <xsd:element name="NameSpaceUrl" minOccurs="0" maxOccurs="1" type="xsd:string"></xsd:element>
+                               <xsd:element name="Version" minOccurs="1" maxOccurs="1" type="VersionFormat"/>
+                               <xsd:element ref="FromPolicy" minOccurs="0" maxOccurs="1"/>
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="ChineseWall">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element ref="ChineseWallTypes" minOccurs="1" maxOccurs="1" />
+                               <xsd:element ref="ConflictSets" minOccurs="0" maxOccurs="1" />
+                       </xsd:sequence>
+                       <xsd:attribute name="priority" type="PolicyOrder" use="optional"></xsd:attribute>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="SimpleTypeEnforcement">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element ref="SimpleTypeEnforcementTypes" />
+                       </xsd:sequence>
+                       <xsd:attribute name="priority" type="PolicyOrder" use="optional"></xsd:attribute>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="SecurityLabelTemplate">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element name="SubjectLabels" minOccurs="0" maxOccurs="1">
+                                       <xsd:complexType>
+                                               <xsd:sequence>
+                                                       <xsd:element ref="VirtualMachineLabel" minOccurs="1" maxOccurs="unbounded"></xsd:element>
+                                               </xsd:sequence>
+                                               <xsd:attribute name="bootstrap" type="xsd:string" use="required"></xsd:attribute>
+                                       </xsd:complexType>
+                               </xsd:element>
+                               <xsd:element name="ObjectLabels" minOccurs="0" maxOccurs="1">
+                                       <xsd:complexType>
+                                               <xsd:sequence>
+                                                       <xsd:element ref="ResourceLabel" minOccurs="1" maxOccurs="unbounded"></xsd:element>
+                                               </xsd:sequence>
+                                       </xsd:complexType>
+                               </xsd:element>
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="ChineseWallTypes">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element maxOccurs="unbounded" minOccurs="1" ref="Type" />
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="ConflictSets">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element maxOccurs="unbounded" minOccurs="1" ref="Conflict" />
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="SimpleTypeEnforcementTypes">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element maxOccurs="unbounded" minOccurs="1" ref="Type" />
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="Conflict">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element maxOccurs="unbounded" minOccurs="1" ref="Type" />
+                       </xsd:sequence>
+                       <xsd:attribute name="name" type="xsd:string" use="required"></xsd:attribute>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="VirtualMachineLabel">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element name="Name" type="NameWithFrom"></xsd:element>
+                               <xsd:element ref="SimpleTypeEnforcementTypes" minOccurs="0" maxOccurs="unbounded" />
+                               <xsd:element ref="ChineseWallTypes" minOccurs="0" maxOccurs="unbounded" />
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="ResourceLabel">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element name="Name" type="NameWithFrom"></xsd:element>
+                               <xsd:element name="SimpleTypeEnforcementTypes" type="SingleSimpleTypeEnforcementType" />
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:element name="Name" type="xsd:string" />
+       <xsd:element name="Type" type="xsd:string" />
+       <xsd:simpleType name="PolicyOrder">
+               <xsd:restriction base="xsd:string">
+                       <xsd:enumeration value="PrimaryPolicyComponent"></xsd:enumeration>
+               </xsd:restriction>
+       </xsd:simpleType>
+       <xsd:element name="FromPolicy">
+               <xsd:complexType>
+                       <xsd:sequence>
+                               <xsd:element name="PolicyName" minOccurs="1" maxOccurs="1" type="xsd:string"/>
+                               <xsd:element name="Version" minOccurs="1" maxOccurs="1" type="VersionFormat"/>
+                       </xsd:sequence>
+               </xsd:complexType>
+       </xsd:element>
+       <xsd:simpleType name="VersionFormat">
+               <xsd:restriction base="xsd:string">
+                       <xsd:pattern value="[0-9]{1,8}.[0-9]{1,8}"></xsd:pattern>
+               </xsd:restriction>
+       </xsd:simpleType>
+       <xsd:complexType name="NameWithFrom">
+               <xsd:simpleContent>
+                       <xsd:extension base="xsd:string">
+                               <xsd:attribute name="from" type="xsd:string" use="optional"></xsd:attribute>
+                       </xsd:extension>
+               </xsd:simpleContent>
+       </xsd:complexType>
+       <xsd:complexType name="SingleSimpleTypeEnforcementType">
+               <xsd:sequence>
+                       <xsd:element maxOccurs="1" minOccurs="1" ref="Type" />
+               </xsd:sequence>
+       </xsd:complexType>
+</xsd:schema>"""
+
 
 def get_DEFAULT_policy(dom0label=""):
     fromnode = ""
@@ -133,18 +278,7 @@ def initialize():
 
     instdir = security.install_policy_dir_prefix
     DEF_policy_file = "DEFAULT-security_policy.xml"
-    xsd_file = "security_policy.xsd"
-
-    files = [ xsd_file ]
-
-    for file in files:
-        if not os.path.isfile(policiesdir + "/" + file ):
-            try:
-                shutil.copyfile(instdir + "/" + file,
-                                policiesdir + "/" + file)
-            except Exception, e:
-                log.info("could not copy '%s': %s" %
-                         (file, str(e)))
+
     #Install default policy.
     f = open(policiesdir + "/" + DEF_policy_file, 'w')
     if f:
@@ -219,7 +353,8 @@ class ACMPolicy(XSPolicy):
             log.warn("Libxml2 python-wrapper is not installed on the system.")
             return xsconstants.XSERR_SUCCESS
         try:
-            parserctxt = libxml2.schemaNewParserCtxt(ACM_SCHEMA_FILE)
+            parserctxt = libxml2.schemaNewMemParserCtxt(ACM_SCHEMA,
+                                                        len(ACM_SCHEMA))
             schemaparser = parserctxt.schemaParse()
             valid = schemaparser.schemaNewValidCtxt()
             doc = libxml2.parseDoc(self.toxml())
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/util/blkif.py
--- a/tools/python/xen/util/blkif.py    Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/util/blkif.py    Thu May 08 18:40:07 2008 +0900
@@ -42,10 +42,12 @@ def blkdev_name_to_number(name):
     if re.match( '/dev/xvd[a-p]([1-9]|1[0-5])?', n):
         return 202 * 256 + 16 * (ord(n[8:9]) - ord('a')) + int(n[9:] or 0)
 
-    # see if this is a hex device number
-    if re.match( '^(0x)?[0-9a-fA-F]+$', name ):
+    if re.match( '^(0x)[0-9a-fA-F]+$', name ):
         return string.atoi(name,16)
-        
+
+    if re.match('^[0-9]+$', name):
+        return string.atoi(name, 10)
+
     return None
 
 def blkdev_segment(name):
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/util/bootloader.py
--- a/tools/python/xen/util/bootloader.py       Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/util/bootloader.py       Thu May 08 18:40:07 2008 +0900
@@ -25,8 +25,6 @@ from xen.xend.XendLogging import log
 from xen.xend.XendLogging import log
 from xen.util import mkdir
 import xen.util.xsm.xsm as security
-
-__bootloader = None
 
 #
 # Functions for modifying entries in the bootloader, i.e. adding
@@ -513,8 +511,11 @@ class LatePolicyLoader(Bootloader):
         Bootloader.__init__(self)
 
     def probe(self):
-        _dir=os.path.dirname(self.FILENAME)
-        mkdir.parents(_dir, stat.S_IRWXU)
+        try:
+            _dir=os.path.dirname(self.FILENAME)
+            mkdir.parents(_dir, stat.S_IRWXU)
+        except:
+            return False
         return True
 
     def get_default_title(self):
@@ -614,10 +615,12 @@ class LatePolicyLoader(Bootloader):
 
 __bootloader = Bootloader()
 
-grub = Grub()
-if grub.probe() == True:
-    __bootloader = grub
-else:
-    late = LatePolicyLoader()
-    if late.probe() == True:
-        __bootloader = late
+def init():
+    global __bootloader
+    grub = Grub()
+    if grub.probe() == True:
+        __bootloader = grub
+    else:
+        late = LatePolicyLoader()
+        if late.probe() == True:
+            __bootloader = late
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/util/pci.py
--- a/tools/python/xen/util/pci.py      Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/util/pci.py      Thu May 08 18:40:07 2008 +0900
@@ -7,6 +7,7 @@
 
 import sys
 import os, os.path
+import resource
 
 PROC_MNT_PATH = '/proc/mounts'
 PROC_PCI_PATH = '/proc/bus/pci/devices'
@@ -14,6 +15,7 @@ PROC_PCI_NUM_RESOURCES = 7
 
 SYSFS_PCI_DEVS_PATH = '/bus/pci/devices'
 SYSFS_PCI_DEV_RESOURCE_PATH = '/resource'
+SYSFS_PCI_DEV_CONFIG_PATH = '/config'
 SYSFS_PCI_DEV_IRQ_PATH = '/irq'
 SYSFS_PCI_DEV_DRIVER_DIR_PATH = '/driver'
 SYSFS_PCI_DEV_VENDOR_PATH = '/vendor'
@@ -24,7 +26,21 @@ PCI_BAR_IO = 0x01
 PCI_BAR_IO = 0x01
 PCI_BAR_IO_MASK = ~0x03
 PCI_BAR_MEM_MASK = ~0x0f
-
+PCI_STATUS_CAP_MASK = 0x10
+PCI_STATUS_OFFSET = 0x6
+PCI_CAP_OFFSET = 0x34
+MSIX_BIR_MASK = 0x7
+MSIX_SIZE_MASK = 0x7ff
+
+#Calculate PAGE_SHIFT: number of bits to shift an address to get the page number
+PAGE_SIZE = resource.getpagesize()
+PAGE_SHIFT = 0
+t = PAGE_SIZE
+while not (t&1):
+    t>>=1
+    PAGE_SHIFT+=1
+
+PAGE_MASK=~(PAGE_SIZE - 1)
 # Definitions from Linux: include/linux/pci.h
 def PCI_DEVFN(slot, func):
     return ((((slot) & 0x1f) << 3) | ((func) & 0x07))
@@ -74,10 +90,73 @@ class PciDevice:
         self.device = None
         self.subvendor = None
         self.subdevice = None
-
+        self.msix = 0
+        self.msix_iomem = []
         self.get_info_from_sysfs()
 
+    def find_capability(self, type):
+        try:
+            sysfs_mnt = find_sysfs_mnt()
+        except IOError, (errno, strerr):
+            raise PciDeviceParseError(('Failed to locate sysfs mount: %s (%d)' %
+                (PROC_PCI_PATH, strerr, errno)))
+
+        if sysfs_mnt == None:
+            return False
+        path = sysfs_mnt+SYSFS_PCI_DEVS_PATH+'/'+ \
+               self.name+SYSFS_PCI_DEV_CONFIG_PATH
+        try:
+            conf_file = open(path, 'rb')
+            conf_file.seek(PCI_STATUS_OFFSET)
+            status = ord(conf_file.read(1))
+            if status&PCI_STATUS_CAP_MASK:
+                conf_file.seek(PCI_CAP_OFFSET)
+                capa_pointer = ord(conf_file.read(1))
+                while capa_pointer:
+                    conf_file.seek(capa_pointer)
+                    capa_id = ord(conf_file.read(1))
+                    capa_pointer = ord(conf_file.read(1))
+                    if capa_id == type:
+                        # get the type
+                        message_cont_lo = ord(conf_file.read(1))
+                        message_cont_hi = ord(conf_file.read(1))
+                        self.msix=1
+                        self.msix_entries = (message_cont_lo + \
+                                             (message_cont_hi << 8)) \
+                                             & MSIX_SIZE_MASK
+                        t_off=conf_file.read(4)
+                        p_off=conf_file.read(4)
+                        self.table_offset=ord(t_off[0]) | (ord(t_off[1])<<8) | \
+                                          (ord(t_off[2])<<16)|  \
+                                          (ord(t_off[3])<<24)
+                        self.pba_offset=ord(p_off[0]) | (ord(p_off[1]) << 8)| \
+                                        (ord(p_off[2])<<16) | \
+                                        (ord(p_off[3])<<24)
+                        self.table_index = self.table_offset & MSIX_BIR_MASK
+                        self.table_offset = self.table_offset & ~MSIX_BIR_MASK
+                        self.pba_index = self.pba_offset & MSIX_BIR_MASK
+                        self.pba_offset = self.pba_offset & ~MSIX_BIR_MASK
+                        break
+        except IOError, (errno, strerr):
+            raise PciDeviceParseError(('Failed to locate sysfs mount: %s (%d)' %
+                (PROC_PCI_PATH, strerr, errno)))
+
+    def remove_msix_iomem(self, index, start, size):
+        if (index == self.table_index):
+            table_start = start+self.table_offset
+            table_end = table_start + self.msix_entries * 16
+            table_start = table_start & PAGE_MASK
+            table_end = (table_end + PAGE_SIZE) & PAGE_MASK
+            self.msix_iomem.append((table_start, table_end-table_start))
+        if (index==self.pba_index):
+            pba_start = start + self.pba_offset
+            pba_end = pba_start + self.msix_entries/8
+            pba_start = pba_start & PAGE_MASK
+            pba_end = (pba_end + PAGE_SIZE) & PAGE_MASK
+            self.msix_iomem.append((pba_start, pba_end-pba_start))
+
     def get_info_from_sysfs(self):
+        self.find_capability(0x11)
         try:
             sysfs_mnt = find_sysfs_mnt()
         except IOError, (errno, strerr):
@@ -108,6 +187,10 @@ class PciDevice:
                         self.ioports.append( (start,size) )
                     else:
                         self.iomem.append( (start,size) )
+                    if (self.msix):
+                        self.remove_msix_iomem(i, start, size)
+
+
 
         except IOError, (errno, strerr):
             raise PciDeviceParseError(('Failed to open & read %s: %s (%d)' %
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/util/xsm/acm/acm.py
--- a/tools/python/xen/util/xsm/acm/acm.py      Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/util/xsm/acm/acm.py      Thu May 08 18:40:07 2008 +0900
@@ -156,7 +156,9 @@ def on():
     returns none if security policy is off (not compiled),
     any string otherwise, use it: if not security.on() ...
     """
-    return (get_active_policy_name() not in ['INACTIVE', 'NULL'])
+    if get_active_policy_name() not in ['INACTIVE', 'NULL', '']:
+        return xsconstants.XS_POLICY_ACM
+    return 0
 
 
 def calc_dom_ssidref_from_info(info):
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/util/xsm/flask/flask.py
--- a/tools/python/xen/util/xsm/flask/flask.py  Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/util/xsm/flask/flask.py  Thu May 08 18:40:07 2008 +0900
@@ -12,7 +12,7 @@ def err(msg):
     raise XSMError(msg)
 
 def on():
-    return 1
+    return 0 #xsconstants.XS_POLICY_FLASK
 
 def ssidref2label(ssidref):
     try:
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/web/tcp.py
--- a/tools/python/xen/web/tcp.py       Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/web/tcp.py       Thu May 08 18:40:07 2008 +0900
@@ -64,3 +64,43 @@ class TCPListener(connection.SocketListe
                 sock.close()
             except:
                 pass
+
+class SSLTCPListener(TCPListener):
+
+    def __init__(self, protocol_class, port, interface, hosts_allow,
+                 ssl_key_file = None, ssl_cert_file = None):
+        if not ssl_key_file or not ssl_cert_file:
+            raise ValueError("SSLXMLRPCServer requires ssl_key_file "
+                             "and ssl_cert_file to be set.")
+
+        self.ssl_key_file = ssl_key_file
+        self.ssl_cert_file = ssl_cert_file
+
+        TCPListener.__init__(self, protocol_class, port, interface, hosts_allow)
+
+
+    def createSocket(self):
+        from OpenSSL import SSL
+        # make a SSL socket
+        ctx = SSL.Context(SSL.SSLv23_METHOD)
+        ctx.set_options(SSL.OP_NO_SSLv2)
+        ctx.use_privatekey_file (self.ssl_key_file)
+        ctx.use_certificate_file(self.ssl_cert_file)
+        sock = SSL.Connection(ctx,
+                              socket.socket(socket.AF_INET, socket.SOCK_STREAM))
+        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+
+        # SO_REUSEADDR does not always ensure that we do not get an address
+        # in use error when restarted quickly
+        # we implement a timeout to try and avoid failing unnecessarily
+        timeout = time.time() + 30
+        while True:
+            try:
+                sock.bind((self.interface, self.port))
+                return sock
+            except socket.error, (_errno, strerrno):
+                if _errno == errno.EADDRINUSE and time.time() < timeout:
+                    time.sleep(0.5)
+                else:
+                    raise
+
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py   Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/XendCheckpoint.py   Thu May 08 18:40:07 2008 +0900
@@ -309,6 +309,7 @@ def restore(xd, fd, dominfo = None, paus
                 else:
                     break
             os.close(qemu_fd)
+            restore_image.setCpuid()
 
 
         os.read(fd, 1)           # Wait for source to close connection
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/XendConfig.py       Thu May 08 18:40:07 2008 +0900
@@ -203,6 +203,8 @@ XENAPI_CFG_TYPES = {
     'target': int,
     'security_label': str,
     'pci': str,
+    'cpuid' : dict,
+    'cpuid_check' : dict,
 }
 
 # List of legacy configuration keys that have no equivalent in the
@@ -497,6 +499,32 @@ class XendConfig(dict):
         if 'handle' in dominfo:
             self['uuid'] = uuid.toString(dominfo['handle'])
             
+    def parse_cpuid(self, cfg, field):
+       def int2bin(n, count=32):
+           return "".join([str((n >> y) & 1) for y in range(count-1, -1, -1)])
+
+       for input, regs in cfg[field].iteritems():
+           if not regs is dict:
+               cfg[field][input] = dict(regs)
+
+       cpuid = {}
+       for input in cfg[field]:
+           inputs = input.split(',')
+           if inputs[0][0:2] == '0x':
+               inputs[0] = str(int(inputs[0], 16))
+           if len(inputs) == 2:
+               if inputs[1][0:2] == '0x':
+                   inputs[1] = str(int(inputs[1], 16))
+           new_input = ','.join(inputs)
+           cpuid[new_input] = {} # new input
+           for reg in cfg[field][input]:
+               val = cfg[field][input][reg]
+               if val[0:2] == '0x':
+                   cpuid[new_input][reg] = int2bin(int(val, 16))
+               else:
+                   cpuid[new_input][reg] = val
+       cfg[field] = cpuid
+
     def _parse_sxp(self, sxp_cfg):
         """ Populate this XendConfig using the parsed SXP.
 
@@ -653,8 +681,14 @@ class XendConfig(dict):
                 except ValueError, e:
                     raise XendConfigError('cpus = %s: %s' % (cfg['cpus'], e))
 
+        # Parse cpuid
+        if 'cpuid' in cfg:
+            self.parse_cpuid(cfg, 'cpuid')
+        if 'cpuid_check' in cfg:
+            self.parse_cpuid(cfg, 'cpuid_check')
+
         import xen.util.xsm.xsm as security
-        if security.on():
+        if security.on() == xsconstants.XS_POLICY_ACM:
             from xen.util.acmpolicy import ACM_LABEL_UNLABELED
             if not 'security' in cfg and sxp.child_value(sxp_cfg, 'security'):
                 cfg['security'] = sxp.child_value(sxp_cfg, 'security')
@@ -900,6 +934,16 @@ class XendConfig(dict):
         self['vcpus_params']['weight'] = \
             int(self['vcpus_params'].get('weight', 256))
         self['vcpus_params']['cap'] = int(self['vcpus_params'].get('cap', 0))
+
+    def cpuid_to_sxp(self, sxpr, field):
+        regs_list = []
+        for input, regs in self[field].iteritems():
+            reg_list = []
+            for reg, val in regs.iteritems():
+                reg_list.append([reg, val])
+            regs_list.append([input, reg_list])
+        sxpr.append([field, regs_list])
+
 
     def to_sxp(self, domain = None, ignore_devices = False, ignore = [],
                legacy_only = True):
@@ -1011,6 +1055,13 @@ class XendConfig(dict):
             except:
                 txn.abort()
                 raise
+
+        if 'cpuid' in self:
+            self.cpuid_to_sxp(sxpr, 'cpuid')
+        if 'cpuid_check' in self:
+            self.cpuid_to_sxp(sxpr, 'cpuid_check')
+
+        log.debug(sxpr)
 
         return sxpr    
     
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py       Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/XendDomain.py       Thu May 08 18:40:07 2008 +0900
@@ -1258,7 +1258,7 @@ class XendDomain:
 
         return val       
 
-    def domain_migrate(self, domid, dst, live=False, resource=0, port=0, node=-1):
+    def domain_migrate(self, domid, dst, live=False, port=0, node=-1):
         """Start domain migration.
         
         @param domid: Domain ID or Name
@@ -1269,7 +1269,6 @@ class XendDomain:
         @type port: int        
         @keyword live: Live migration
         @type live: bool
-        @keyword resource: not used??
         @rtype: None
         @keyword node: use node number for target
         @rtype: int 
@@ -1293,8 +1292,16 @@ class XendDomain:
 
         if port == 0:
             port = xoptions.get_xend_relocation_port()
-        try:
-            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+
+        try:
+            tls = xoptions.get_xend_relocation_tls()
+            if tls:
+                from OpenSSL import SSL
+                ctx = SSL.Context(SSL.SSLv23_METHOD)
+                sock = SSL.Connection(ctx, socket.socket(socket.AF_INET, socket.SOCK_STREAM))
+                sock.set_connect_state()
+            else:
+                sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
             sock.connect((dst, port))
         except socket.error, err:
             raise XendError("can't connect: %s" % err[1])
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/XendDomainInfo.py   Thu May 08 18:40:07 2008 +0900
@@ -37,6 +37,7 @@ from xen.util import asserts
 from xen.util import asserts
 from xen.util.blkif import blkdev_uname_to_file, blkdev_uname_to_taptype
 import xen.util.xsm.xsm as security
+from xen.util import xsconstants
 
 from xen.xend import balloon, sxp, uuid, image, arch, osdep
 from xen.xend import XendOptions, XendNode, XendConfig
@@ -1973,7 +1974,7 @@ class XendDomainInfo:
         balloon.free(2*1024) # 2MB should be plenty
 
         ssidref = 0
-        if security.on():
+        if security.on() == xsconstants.XS_POLICY_ACM:
             ssidref = security.calc_dom_ssidref_from_info(self.info)
             if security.has_authorization(ssidref) == False:
                 raise VmError("VM is not authorized to run.")
@@ -1987,7 +1988,7 @@ class XendDomainInfo:
                 target = self.info.target())
         except Exception, e:
             # may get here if due to ACM the operation is not permitted
-            if security.on():
+            if security.on() == xsconstants.XS_POLICY_ACM:
                 raise VmError('Domain in conflict set with running domain?')
 
         if self.domid < 0:
@@ -2135,8 +2136,13 @@ class XendDomainInfo:
             # set memory limit
             xc.domain_setmaxmem(self.domid, maxmem)
 
+            # Reserve 1 page per MiB of RAM for separate VT-d page table.
+            vtd_mem = 4 * (self.info['memory_static_max'] / 1024 / 1024)
+            # Round vtd_mem up to a multiple of a MiB.
+            vtd_mem = ((vtd_mem + 1023) / 1024) * 1024
+
             # Make sure there's enough RAM available for the domain
-            balloon.free(memory + shadow)
+            balloon.free(memory + shadow + vtd_mem)
 
             # Set up the shadow memory
             shadow_cur = xc.shadow_mem_control(self.domid, shadow / 1024)
@@ -2848,7 +2854,6 @@ class XendDomainInfo:
         is_policy_update = (xspol_old != None)
 
         from xen.xend.XendXSPolicyAdmin import XSPolicyAdminInstance
-        from xen.util import xsconstants
 
         state = self._stateGet()
         # Relabel only HALTED or RUNNING or PAUSED domains
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/XendOptions.py
--- a/tools/python/xen/xend/XendOptions.py      Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/XendOptions.py      Thu May 08 18:40:07 2008 +0900
@@ -192,6 +192,12 @@ class XendOptions:
         return self.get_config_bool("xend-relocation-server",
                                     self.xend_relocation_server_default)
 
+    def get_xend_relocation_server_ssl_key_file(self):
+        return self.get_config_string("xend-relocation-server-ssl-key-file")
+
+    def get_xend_relocation_server_ssl_cert_file(self):
+        return self.get_config_string("xend-relocation-server-ssl-cert-file")
+
     def get_xend_port(self):
         """Get the port xend listens at for its HTTP interface.
         """
@@ -202,6 +208,11 @@ class XendOptions:
         """
         return self.get_config_int('xend-relocation-port',
                                    self.xend_relocation_port_default)
+
+    def get_xend_relocation_tls(self):
+        """Whether to use tls when relocating.
+        """
+        return self.get_config_bool('xend-relocation-tls', 'no')
 
     def get_xend_relocation_hosts_allow(self):
         return self.get_config_string("xend-relocation-hosts-allow",
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/XendXSPolicyAdmin.py
--- a/tools/python/xen/xend/XendXSPolicyAdmin.py        Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/XendXSPolicyAdmin.py        Thu May 08 18:40:07 2008 +0900
@@ -46,7 +46,12 @@ class XSPolicyAdmin:
         self.maxpolicies = maxpolicies
         self.policies = {}
         self.xsobjs = {}
-
+        bootloader.init()
+
+        if security.on() == xsconstants.XS_POLICY_ACM:
+            self.__acm_init()
+
+    def __acm_init(self):
         act_pol_name = self.get_hv_loaded_policy_name()
         initialize()
 
@@ -73,7 +78,7 @@ class XSPolicyAdmin:
             This currently only checks for ACM-enablement.
         """
         rc = 0
-        if security.on():
+        if security.on() == xsconstants.XS_POLICY_ACM:
             rc |= xsconstants.XS_POLICY_ACM
         return rc
 
@@ -103,6 +108,8 @@ class XSPolicyAdmin:
 
     def __add_acmpolicy_to_system(self, xmltext, flags, overwrite):
         errors = ""
+        if security.on() != xsconstants.XS_POLICY_ACM:
+            raise SecurityError(-xsconstants.XSERR_POLICY_TYPE_UNSUPPORTED)
         loadedpol = self.get_loaded_policy()
         if loadedpol:
             # This is meant as an update to a currently loaded policy
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/image.py    Thu May 08 18:40:07 2008 +0900
@@ -551,6 +551,38 @@ class HVMImageHandler(ImageHandler):
         self.acpi = int(vmConfig['platform'].get('acpi', 0))
         self.guest_os_type = vmConfig['platform'].get('guest_os_type')
 
+        self.vmConfig = vmConfig
+           
+    def setCpuid(self):
+        xc.domain_set_policy_cpuid(self.vm.getDomid())
+
+        if 'cpuid' in self.vmConfig:
+            cpuid = self.vmConfig['cpuid']
+            transformed = {}
+            for sinput, regs in cpuid.iteritems():
+                inputs = sinput.split(',')
+                input = long(inputs[0])
+                sub_input = None
+                if len(inputs) == 2:
+                    sub_input = long(inputs[1])
+                t = xc.domain_set_cpuid(self.vm.getDomid(),
+                                        input, sub_input, regs)
+                transformed[sinput] = t
+            self.vmConfig['cpuid'] = transformed
+
+        if 'cpuid_check' in self.vmConfig:
+            cpuid_check = self.vmConfig['cpuid_check']
+            transformed = {}
+            for sinput, regs_check in cpuid_check.iteritems():
+                inputs = sinput.split(',')
+                input = long(inputs[0])
+                sub_input = None
+                if len(inputs) == 2:
+                    sub_input = long(inputs[1])
+                t = xc.domain_check_cpuid(input, sub_input, regs_check)
+                transformed[sinput] = t
+            self.vmConfig['cpuid_check'] = transformed
+
     # Return a list of cmd line args to the device models based on the
     # xm config file
     def parseDeviceModelArgs(self, vmConfig):
@@ -718,6 +750,7 @@ class X86_HVM_ImageHandler(HVMImageHandl
 
     def buildDomain(self):
         xc.hvm_set_param(self.vm.getDomid(), HVM_PARAM_PAE_ENABLED, self.pae)
+        self.setCpuid()
         return HVMImageHandler.buildDomain(self)
 
     def getRequiredAvailableMemory(self, mem_kb):
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/server/SrvDomain.py
--- a/tools/python/xen/xend/server/SrvDomain.py Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/server/SrvDomain.py Thu May 08 18:40:07 2008 +0900
@@ -115,7 +115,6 @@ class SrvDomain(SrvDir):
                     [['dom',         'int'],
                      ['destination', 'str'],
                      ['live',        'int'],
-                     ['resource',    'int'],
                      ['port',        'int']])
         return fn(req.args, {'dom': self.dom.domid})
 
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/server/blkif.py
--- a/tools/python/xen/xend/server/blkif.py     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/server/blkif.py     Thu May 08 18:40:07 2008 +0900
@@ -23,6 +23,7 @@ import xen.util.xsm.xsm as security
 import xen.util.xsm.xsm as security
 from xen.xend.XendError import VmError
 from xen.xend.server.DevController import DevController
+from xen.util import xsconstants
 
 class BlkifController(DevController):
     """Block device interface controller. Handles all block devices
@@ -72,7 +73,7 @@ class BlkifController(DevController):
         if uuid:
             back['uuid'] = uuid
 
-        if security.on():
+        if security.on() == xsconstants.XS_POLICY_ACM:
             self.do_access_control(config, uname)
 
         devid = blkif.blkdev_name_to_number(dev)
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/server/irqif.py
--- a/tools/python/xen/xend/server/irqif.py     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/server/irqif.py     Thu May 08 18:40:07 2008 +0900
@@ -69,5 +69,10 @@ class IRQController(DevController):
             #todo non-fatal
             raise VmError(
                 'irq: Failed to configure irq: %d' % (pirq))
-
+        rc = xc.physdev_map_pirq(domid = self.getDomid(),
+                                index = pirq,
+                                pirq  = pirq)
+        if rc < 0:
+            raise VmError(
+                'irq: Failed to map irq %x' % (pirq))
         return (None, {}, {})
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/server/netif.py
--- a/tools/python/xen/xend/server/netif.py     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/server/netif.py     Thu May 08 18:40:07 2008 +0900
@@ -29,6 +29,7 @@ from xen.xend.XendError import VmError
 from xen.xend.XendError import VmError
 from xen.xend.XendXSPolicyAdmin import XSPolicyAdminInstance
 import xen.util.xsm.xsm as security
+from xen.util import xsconstants
 
 from xen.xend.XendLogging import log
 
@@ -155,7 +156,7 @@ class NetifController(DevController):
             front = { 'handle' : "%i" % devid,
                       'mac'    : mac }
 
-        if security.on():
+        if security.on() == xsconstants.XS_POLICY_ACM:
             self.do_access_control(config)
 
         return (devid, back, front)
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/server/pciif.py
--- a/tools/python/xen/xend/server/pciif.py     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/server/pciif.py     Thu May 08 18:40:07 2008 +0900
@@ -271,6 +271,25 @@ class PciController(DevController):
             if rc<0:
                 raise VmError(('pci: failed to configure I/O memory on device 
'+
                             '%s - errno=%d')%(dev.name,rc))
+            rc = xc.physdev_map_pirq(domid = fe_domid,
+                                   index = dev.irq,
+                                   pirq  = dev.irq)
+            if rc < 0:
+                raise VmError(('pci: failed to map irq on device '+
+                            '%s - errno=%d')%(dev.name,rc))
+
+        if dev.msix:
+            for (start, size) in dev.msix_iomem:
+                start_pfn = start>>PAGE_SHIFT
+                nr_pfns = (size+(PAGE_SIZE-1))>>PAGE_SHIFT
+                log.debug('pci-msix: remove permission for 0x%x/0x%x 
0x%x/0x%x' % \
+                         (start,size, start_pfn, nr_pfns))
+                rc = xc.domain_iomem_permission(domid = fe_domid,
+                                                first_pfn = start_pfn,
+                                                nr_pfns = nr_pfns,
+                                                allow_access = False)
+                if rc<0:
+                    raise VmError(('pci: failed to remove msi-x iomem'))
 
         if dev.irq>0:
             log.debug('pci: enabling irq %d'%dev.irq)
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xend/server/relocate.py
--- a/tools/python/xen/xend/server/relocate.py  Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xend/server/relocate.py  Thu May 08 18:40:07 2008 +0900
@@ -132,5 +132,14 @@ def listenRelocation():
         else:
             hosts_allow = map(re.compile, hosts_allow.split(" "))
 
-        tcp.TCPListener(RelocationProtocol, port, interface = interface,
-                        hosts_allow = hosts_allow)
+        ssl_key_file = xoptions.get_xend_relocation_server_ssl_key_file()
+        ssl_cert_file = xoptions.get_xend_relocation_server_ssl_cert_file()
+
+        if ssl_key_file and ssl_cert_file:
+            tcp.SSLTCPListener(RelocationProtocol, port, interface = interface,
+                               hosts_allow = hosts_allow,
+                               ssl_key_file = ssl_key_file,
+                               ssl_cert_file = ssl_cert_file)
+        else:
+            tcp.TCPListener(RelocationProtocol, port, interface = interface,
+                            hosts_allow = hosts_allow)
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xm/addlabel.py
--- a/tools/python/xen/xm/addlabel.py   Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xm/addlabel.py   Thu May 08 18:40:07 2008 +0900
@@ -205,17 +205,17 @@ def main(argv):
     policy_type = ""
     if len(argv) not in (4, 5):
         raise OptionError('Needs either 2 or 3 arguments')
-    
+
     label = argv[1]
-    
+
     if len(argv) == 5:
         policyref = argv[4]
-    elif security.on():
+    elif security.on() == xsconstants.XS_POLICY_ACM:
         policyref = security.active_policy
         policy_type = xsconstants.ACM_POLICY_ID
     else:
-        raise OptionError("No active policy. Must specify policy on the "
-                          "command line.")
+        raise OptionError("ACM security is not enabled. You must specify "\
+                          "the policy on the command line.")
 
     if argv[2].lower() == "dom":
         configfile = argv[3]
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xm/create.py     Thu May 08 18:40:07 2008 +0900
@@ -548,6 +548,14 @@ gopts.var('hap', val='HAP',
           fn=set_int, default=1,
           use="""Hap status (0=hap is disabled;
           1=hap is enabled.""")
+
+gopts.var('cpuid', val="IN[,SIN]:eax=EAX,ebx=EBX,exc=ECX,edx=EDX",
+          fn=append_value, default=[],
+          use="""Cpuid description.""")
+
+gopts.var('cpuid_check', val="IN[,SIN]:eax=EAX,ebx=EBX,exc=ECX,edx=EDX",
+          fn=append_value, default=[],
+          use="""Cpuid check description.""")
 
 def err(msg):
     """Print an error to stderr and exit.
@@ -755,7 +763,7 @@ def configure_hvm(config_image, vals):
              'vnc', 'vncdisplay', 'vncunused', 'vncconsole', 'vnclisten',
              'sdl', 'display', 'xauthority', 'rtc_timeoffset', 'monitor',
              'acpi', 'apic', 'usb', 'usbdevice', 'keymap', 'pci', 'hpet',
-             'guest_os_type', 'hap', 'opengl']
+             'guest_os_type', 'hap', 'opengl', 'cpuid', 'cpuid_check']
 
     for a in args:
         if a in vals.__dict__ and vals.__dict__[a] is not None:
@@ -779,7 +787,8 @@ def make_config(vals):
     map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
                    'restart', 'on_poweroff',
                    'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail', 'features',
-                   'on_xend_start', 'on_xend_stop', 'target'])
+                   'on_xend_start', 'on_xend_stop', 'target', 'cpuid',
+                   'cpuid_check'])
 
     if vals.uuid is not None:
         config.append(['uuid', vals.uuid])
@@ -842,6 +851,26 @@ def preprocess_disk(vals):
             err('Invalid disk specifier: ' + v)
         disk.append(d)
     vals.disk = disk
+
+def preprocess_cpuid(vals, attr_name):
+    if not vals.cpuid: return
+    cpuid = {} 
+    for cpuid_input in getattr(vals, attr_name):
+        input_re = "(0x)?[0-9A-Fa-f]+(,(0x)?[0-9A-Fa-f]+)?"
+        cpuid_match = re.match(r'(?P<input>%s):(?P<regs>.*)' % \
+                               input_re, cpuid_input)
+        if cpuid_match != None:
+            res_cpuid = cpuid_match.groupdict()
+            input = res_cpuid['input']
+            regs = res_cpuid['regs'].split(',')
+            cpuid[input]= {} # New input
+            for reg in regs:
+                reg_match = re.match(r"(?P<reg>eax|ebx|ecx|edx)=(?P<val>.*)", 
reg)
+                if reg_match == None:
+                    err("cpuid's syntax is (eax|ebx|ecx|edx)=value")
+                res = reg_match.groupdict()
+                cpuid[input][res['reg']] = res['val'] # new register
+    setattr(vals, attr_name, cpuid)
 
 def preprocess_pci(vals):
     if not vals.pci: return
@@ -1047,6 +1076,8 @@ def preprocess(vals):
     preprocess_vnc(vals)
     preprocess_vtpm(vals)
     preprocess_access_control(vals)
+    preprocess_cpuid(vals, 'cpuid')
+    preprocess_cpuid(vals, 'cpuid_check')
 
 
 def comma_sep_kv_to_dict(c):
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xm/dry-run.py
--- a/tools/python/xen/xm/dry-run.py    Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xm/dry-run.py    Thu May 08 18:40:07 2008 +0900
@@ -22,6 +22,7 @@ import xen.util.xsm.xsm as security
 import xen.util.xsm.xsm as security
 from xen.xm import create
 from xen.xend import sxp
+from xen.util import xsconstants
 from xen.xm.opts import OptionError
 
 def help():
@@ -40,7 +41,7 @@ def check_domain_label(config, verbose):
     answer = 0
     default_label = None
     secon = 0
-    if security.on():
+    if security.on() == xsconstants.XS_POLICY_ACM:
         default_label = security.ssidref2label(security.NULL_SSIDREF)
         secon = 1
 
@@ -90,7 +91,7 @@ def config_security_check(config, verbos
             domain_policy = sxp.child_value(sxp.name(sxp.child0(x)), 'policy')
 
     # if no domain label, use default
-    if not domain_label and security.on():
+    if not domain_label and security.on() == xsconstants.XS_POLICY_ACM:
         try:
             domain_label = security.ssidref2label(security.NULL_SSIDREF)
         except:
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xm/main.py       Thu May 08 18:40:07 2008 +0900
@@ -133,7 +133,7 @@ SUBCOMMAND_HELP = {
                      'Read and/or clear Xend\'s message buffer.'),
     'domid'       : ('<DomainName>', 'Convert a domain name to domain id.'),
     'domname'     : ('<DomId>', 'Convert a domain id to domain name.'),
-    'dump-core'   : ('[-L|--live] [-C|--crash] <Domain> [Filename]',
+    'dump-core'   : ('[-L|--live] [-C|--crash] [-R|--reset] <Domain> 
[Filename]',
                      'Dump core for a specific domain.'),
     'info'        : ('[-c|--config]', 'Get information about Xen host.'),
     'log'         : ('', 'Print Xend log'),
@@ -243,6 +243,7 @@ SUBCOMMAND_OPTIONS = {
     'dump-core': (
        ('-L', '--live', 'Dump core without pausing the domain'),
        ('-C', '--crash', 'Crash domain after dumping core'),
+       ('-R', '--reset', 'Reset domain after dumping core'),
     ),
     'start': (
        ('-p', '--paused', 'Do not unpause domain after starting it'),
@@ -417,10 +418,11 @@ def cmdHelp(cmd):
 def cmdHelp(cmd):
     """Print help for a specific subcommand."""
     
-    for fc in SUBCOMMAND_HELP.keys():
-        if fc[:len(cmd)] == cmd:
-            cmd = fc
-            break
+    if not SUBCOMMAND_HELP.has_key(cmd):
+        for fc in SUBCOMMAND_HELP.keys():
+            if fc[:len(cmd)] == cmd:
+                cmd = fc
+                break
     
     try:
         args, desc = SUBCOMMAND_HELP[cmd]
@@ -1279,14 +1281,19 @@ def xm_dump_core(args):
 def xm_dump_core(args):
     live = False
     crash = False
+    reset = False
     try:
-        (options, params) = getopt.gnu_getopt(args, 'LC', ['live','crash'])
+        (options, params) = getopt.gnu_getopt(args, 'LCR', ['live', 'crash', 
'reset'])
         for (k, v) in options:
             if k in ('-L', '--live'):
                 live = True
-            if k in ('-C', '--crash'):
+            elif k in ('-C', '--crash'):
                 crash = True
-
+            elif k in ('-R', '--reset'):
+                reset = True
+
+        if crash and reset:
+            raise OptionError("You may not specify more than one '-CR' option")
         if len(params) not in (1, 2):
             raise OptionError("Expects 1 or 2 argument(s)")
     except getopt.GetoptError, e:
@@ -1308,8 +1315,11 @@ def xm_dump_core(args):
         if crash:
             print "Destroying domain: %s ..." % str(dom)
             server.xend.domain.destroy(dom)
+        elif reset:
+            print "Resetting domain: %s ..." % str(dom)
+            server.xend.domain.reset(dom)
     finally:
-        if not live and not crash and ds == DOM_STATE_RUNNING:
+        if not live and not crash and not reset and ds == DOM_STATE_RUNNING:
             server.xend.domain.unpause(dom)
 
 def xm_rename(args):
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xm/migrate.py
--- a/tools/python/xen/xm/migrate.py    Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xm/migrate.py    Thu May 08 18:40:07 2008 +0900
@@ -47,10 +47,6 @@ gopts.opt('node', short='n', val='nodenu
           fn=set_int, default=-1,
           use="Use specified NUMA node on target.")
 
-gopts.opt('resource', short='r', val='MBIT',
-          fn=set_int, default=0,
-          use="Set level of resource usage for migration.")
-
 def help():
     return str(gopts)
     
@@ -69,13 +65,11 @@ def main(argv):
         vm_ref = get_single_vm(dom)
         other_config = {
             "port":     opts.vals.port,
-            "resource": opts.vals.resource,
             "node":     opts.vals.node
             }
         server.xenapi.VM.migrate(vm_ref, dst, bool(opts.vals.live),
                                  other_config)
     else:
         server.xend.domain.migrate(dom, dst, opts.vals.live,
-                                   opts.vals.resource,
                                    opts.vals.port,
                                    opts.vals.node)
diff -r f2457c7aff8d -r 611787b6ca35 tools/python/xen/xm/xenapi_create.py
--- a/tools/python/xen/xm/xenapi_create.py      Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/python/xen/xm/xenapi_create.py      Thu May 08 18:40:07 2008 +0900
@@ -485,9 +485,9 @@ class xenapi_create:
                 vm_ref,
             "protocol":
                 console.attributes["protocol"].value,
-            "other_params":
+            "other_config":
                 get_child_nodes_as_dict(console,
-                  "other_param", "key", "value")
+                  "other_config", "key", "value")
             }
 
         return server.xenapi.console.create(console_record)
diff -r f2457c7aff8d -r 611787b6ca35 tools/xenstore/xenstore_client.c
--- a/tools/xenstore/xenstore_client.c  Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/xenstore/xenstore_client.c  Thu May 08 18:40:07 2008 +0900
@@ -450,8 +450,9 @@ static enum mode lookup_mode(const char 
        return MODE_write;
     else if (strcmp(m, "read") == 0)
        return MODE_read;
-    else
-       errx(1, "unknown mode %s\n", m);
+
+    errx(1, "unknown mode %s\n", m);
+    return 0;
 }
 
 int
diff -r f2457c7aff8d -r 611787b6ca35 tools/xenstore/xenstored_core.c
--- a/tools/xenstore/xenstored_core.c   Fri Apr 25 20:13:52 2008 +0900
+++ b/tools/xenstore/xenstored_core.c   Thu May 08 18:40:07 2008 +0900
@@ -1929,7 +1929,7 @@ int main(int argc, char *argv[])
 
        /* Main loop. */
        for (;;) {
-               struct connection *conn, *old_conn;
+               struct connection *conn, *next;
 
                if (select(max+1, &inset, &outset, NULL, timeout) < 0) {
                        if (errno == EINTR)
@@ -1953,27 +1953,39 @@ int main(int argc, char *argv[])
                if (evtchn_fd != -1 && FD_ISSET(evtchn_fd, &inset))
                        handle_event();
 
-               conn = list_entry(connections.next, typeof(*conn), list);
-               while (&conn->list != &connections) {
-                       talloc_increase_ref_count(conn);
+               next = list_entry(connections.next, typeof(*conn), list);
+               while (&next->list != &connections) {
+                       conn = next;
+
+                       next = list_entry(conn->list.next,
+                                         typeof(*conn), list);
 
                        if (conn->domain) {
+                               talloc_increase_ref_count(conn);
                                if (domain_can_read(conn))
                                        handle_input(conn);
+                               if (talloc_free(conn) == 0)
+                                       continue;
+
+                               talloc_increase_ref_count(conn);
                                if (domain_can_write(conn) &&
                                    !list_empty(&conn->out_list))
                                        handle_output(conn);
+                               if (talloc_free(conn) == 0)
+                                       continue;
                        } else {
+                               talloc_increase_ref_count(conn);
                                if (FD_ISSET(conn->fd, &inset))
                                        handle_input(conn);
+                               if (talloc_free(conn) == 0)
+                                       continue;
+
+                               talloc_increase_ref_count(conn);
                                if (FD_ISSET(conn->fd, &outset))
                                        handle_output(conn);
+                               if (talloc_free(conn) == 0)
+                                       continue;
                        }
-
-                       old_conn = conn;
-                       conn = list_entry(old_conn->list.next,
-                                         typeof(*conn), list);
-                       talloc_free(old_conn);
                }
 
                max = initialize_set(&inset, &outset, *sock, *ro_sock,
diff -r f2457c7aff8d -r 611787b6ca35 xen/Makefile
--- a/xen/Makefile      Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/Makefile      Thu May 08 18:40:07 2008 +0900
@@ -5,6 +5,9 @@ export XEN_EXTRAVERSION ?= -unstable$(XE
 export XEN_EXTRAVERSION ?= -unstable$(XEN_VENDORVERSION)
 export XEN_FULLVERSION   = $(XEN_VERSION).$(XEN_SUBVERSION)$(XEN_EXTRAVERSION)
 -include xen-version
+
+export XEN_WHOAMI      ?= $(USER)
+export XEN_DOMAIN      ?= $(shell ([ -x /bin/dnsdomainname ] && 
/bin/dnsdomainname) || ([ -x /bin/domainname ] && /bin/domainname || echo 
[unknown]))
 
 export BASEDIR := $(CURDIR)
 
@@ -81,8 +84,8 @@ include/xen/compile.h: include/xen/compi
 include/xen/compile.h: include/xen/compile.h.in .banner
        @sed -e 's/@@date@@/$(shell LC_ALL=C date)/g' \
            -e 's/@@time@@/$(shell LC_ALL=C date +%T)/g' \
-           -e 's/@@whoami@@/$(USER)/g' \
-           -e 's/@@domain@@/$(shell ([ -x /bin/dnsdomainname ] && 
/bin/dnsdomainname) || ([ -x /bin/domainname ] && /bin/domainname || echo 
[unknown]))/g' \
+           -e 's/@@whoami@@/$(XEN_WHOAMI)/g' \
+           -e 's/@@domain@@/$(XEN_DOMAIN)/g' \
            -e 's/@@hostname@@/$(shell hostname)/g' \
            -e 's!@@compiler@@!$(shell $(CC) $(CFLAGS) -v 2>&1 | grep -i 
"gcc.*version")!g' \
            -e 's/@@version@@/$(XEN_VERSION)/g' \
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/ia64/vmx/vmx_hypercall.c
--- a/xen/arch/ia64/vmx/vmx_hypercall.c Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/ia64/vmx/vmx_hypercall.c Thu May 08 18:40:07 2008 +0900
@@ -200,6 +200,10 @@ do_hvm_op(unsigned long op, XEN_GUEST_HA
         rc = 0;
         break;
 
+    case HVMOP_track_dirty_vram:
+        rc = -ENOSYS;
+        break;
+
     default:
         gdprintk(XENLOG_INFO, "Bad HVM op %ld.\n", op);
         rc = -ENOSYS;
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/Makefile     Thu May 08 18:40:07 2008 +0900
@@ -24,6 +24,7 @@ obj-y += i387.o
 obj-y += i387.o
 obj-y += i8259.o
 obj-y += io_apic.o
+obj-y += msi.o
 obj-y += ioport_emulate.o
 obj-y += irq.o
 obj-y += microcode.o
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/acpi/Makefile
--- a/xen/arch/x86/acpi/Makefile        Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/acpi/Makefile        Thu May 08 18:40:07 2008 +0900
@@ -1,2 +1,2 @@ obj-y += boot.o
 obj-y += boot.o
-obj-y += power.o suspend.o wakeup_prot.o
+obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/acpi/boot.c
--- a/xen/arch/x86/acpi/boot.c  Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/acpi/boot.c  Thu May 08 18:40:07 2008 +0900
@@ -462,6 +462,28 @@ bad:
 }
 #endif
 
+/*
+ * Copy the FADT (bounded by its own header length and by sizeof(*fadt))
+ * into acpi_gbl_FADT and, when the table reaches past the start of
+ * xpm1b_event_block, set up the PM1a/PM1b enable register descriptors from
+ * the extended PM1 event blocks.
+ */
+static void __init
+acpi_fadt_parse_reg(struct acpi_table_fadt *fadt)
+{
+       unsigned int copy_len =
+               min_t(unsigned int, fadt->header.length, sizeof(*fadt));
+       unsigned int half_evt_len;
+
+       memcpy(&acpi_gbl_FADT, fadt, copy_len);
+
+       /* Table ends before the extended event blocks: nothing more to do. */
+       if (copy_len <= offsetof(struct acpi_table_fadt, xpm1b_event_block))
+               return;
+
+       memcpy(&acpi_gbl_xpm1a_enable, &fadt->xpm1a_event_block,
+              sizeof(acpi_gbl_xpm1a_enable));
+       memcpy(&acpi_gbl_xpm1b_enable, &fadt->xpm1b_event_block,
+              sizeof(acpi_gbl_xpm1b_enable));
+
+       /* Each enable descriptor starts half an event block further on. */
+       half_evt_len = acpi_gbl_FADT.pm1_event_length / 2;
+       acpi_gbl_xpm1a_enable.address += half_evt_len;
+       /* A zero PM1b address is left untouched. */
+       if (acpi_gbl_xpm1b_enable.address)
+               acpi_gbl_xpm1b_enable.address += half_evt_len;
+}
+
 static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
 {
        struct acpi_table_fadt *fadt = NULL;
@@ -508,6 +530,8 @@ static int __init acpi_parse_fadt(unsign
        acpi_smi_cmd       = fadt->smi_command;
        acpi_enable_value  = fadt->acpi_enable;
        acpi_disable_value = fadt->acpi_disable;
+
+       acpi_fadt_parse_reg(fadt);
 
 #ifdef CONFIG_ACPI_SLEEP
        acpi_fadt_parse_sleep_info(fadt);
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/acpi/cpu_idle.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpu_idle.c      Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,957 @@
+/*
+ * cpu_idle - xen idle state module derived from Linux 
+ *            drivers/acpi/processor_idle.c & 
+ *            arch/x86/kernel/acpi/cstate.c
+ *
+ *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@xxxxxxxxx>
+ *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@xxxxxxxxx>
+ *  Copyright (C) 2004, 2005 Dominik Brodowski <linux@xxxxxxxx>
+ *  Copyright (C) 2004  Anil S Keshavamurthy <anil.s.keshavamurthy@xxxxxxxxx>
+ *                      - Added processor hotplug support
+ *  Copyright (C) 2005  Venkatesh Pallipadi <venkatesh.pallipadi@xxxxxxxxx>
+ *                      - Added support for C3 on SMP
+ *  Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/acpi.h>
+#include <xen/smp.h>
+#include <asm/cache.h>
+#include <asm/io.h>
+#include <xen/guest_access.h>
+#include <public/platform.h>
+#include <asm/processor.h>
+#include <xen/keyhandler.h>
+
+#define DEBUG_PM_CX
+
+/*
+ * Convert microseconds to PM timer ticks (assumes PM_TIMER_FREQUENCY is
+ * expressed in Hz).  The argument is parenthesised so that expressions
+ * such as US_TO_PM_TIMER_TICKS(a + b) expand correctly.
+ */
+#define US_TO_PM_TIMER_TICKS(t)     (((t) * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define C2_OVERHEAD         4   /* 1us (3.579 ticks per us) */
+#define C3_OVERHEAD         4   /* 1us (3.579 ticks per us) */
+
+/* Size of the per-CPU states[] array in struct acpi_processor_power. */
+#define ACPI_PROCESSOR_MAX_POWER        8
+#define ACPI_PROCESSOR_MAX_C2_LATENCY   100
+#define ACPI_PROCESSOR_MAX_C3_LATENCY   1000
+
+/* I/O port read with inl() as a dummy wait after an I/O port C-state entry. */
+extern u32 pmtmr_ioport;
+extern void (*pm_idle) (void);
+
+/* Called by acpi_processor_idle() when the CPU has no current Cx state. */
+static void (*pm_idle_save) (void) __read_mostly;
+/* NOTE(review): presumably the deepest C-state allowed -- confirm at use. */
+unsigned int max_cstate __read_mostly = 2;
+integer_param("max_cstate", max_cstate);
+/*
+ * bm_history -- bit-mask with a bit per jiffy of bus-master activity
+ * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
+ * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
+ * 100 HZ: 0x0000000F: 4 jiffies = 40ms
+ * reduce history for more aggressive entry into C3
+ */
+unsigned int bm_history __read_mostly =
+    (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
+integer_param("bm_history", bm_history);
+
+struct acpi_processor_cx;
+
+/*
+ * Promotion/demotion bookkeeping attached to each Cx state (see the
+ * 'promotion' and 'demotion' members of struct acpi_processor_cx).
+ */
+struct acpi_processor_cx_policy
+{
+    /* Counter; acpi_processor_power_activate() resets it to 0. */
+    u32 count;
+    /* State this policy refers to; a NULL pointer is dumped as "C-1". */
+    struct acpi_processor_cx *state;
+    /* NOTE(review): units and exact meaning of the limits are not visible
+     * in this part of the file -- confirm where they are set. */
+    struct
+    {
+        u32 time;
+        u32 ticks;
+        u32 count;
+        u32 bm;
+    } threshold;
+};
+
+/* Description and statistics of a single processor idle (Cx) state. */
+struct acpi_processor_cx
+{
+    u8 valid;
+    /* Compared against ACPI_STATE_C3 when switching states. */
+    u8 type;
+    /* Port read with inb(), or the eax hint given to mwait, depending on
+     * space_id (see acpi_idle_do_entry()). */
+    u32 address;
+    /* ACPI_ADR_SPACE_FIXED_HARDWARE selects the mwait-based entry path. */
+    u8 space_id;
+    u32 latency;
+    u32 latency_ticks;
+    u32 power;
+    u32 usage;
+    u64 time;
+    struct acpi_processor_cx_policy promotion;
+    struct acpi_processor_cx_policy demotion;
+};
+
+/* Per-CPU one-bit flags used by the idle code. */
+struct acpi_processor_flags
+{
+    u8 bm_control:1;
+    /* When set, bus-master status/reload registers are read and written. */
+    u8 bm_check:1;
+    u8 has_cst:1;
+    u8 power_setup_done:1;
+    u8 bm_rld_set:1;
+};
+
+/* Per-CPU idle state bookkeeping. */
+struct acpi_processor_power
+{
+    struct acpi_processor_flags flags;
+    /* Current Cx state; NULL makes acpi_processor_idle() fall back to
+     * pm_idle_save() or acpi_safe_halt(). */
+    struct acpi_processor_cx *state;
+    /* Compared with NOW() to age bm_activity. */
+    s_time_t bm_check_timestamp;
+    u32 default_state;
+    /* Bit history of observed bus-master activity (bit 0 = most recent). */
+    u32 bm_activity;
+    /* Number of entries of states[] walked by print_acpi_power(). */
+    u32 count;
+    struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+};
+
+/* Indexed by CPU id (acpi_processor_idle() uses smp_processor_id()). */
+static struct acpi_processor_power processor_powers[NR_CPUS];
+
+/*
+ * Print the saved Cx bookkeeping of one CPU: the summary fields of 'power'
+ * followed by every entry of power->states[] below power->count, including
+ * its promotion and demotion policies.
+ */
+static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
+{
+    uint32_t idx;
+
+    printk("saved cpu%d cx acpi info:\n", cpu);
+    printk("\tcurrent state is C%d\n",
+           power->state ? power->state->type : -1);
+    printk("\tbm_check_timestamp = %"PRId64"\n", power->bm_check_timestamp);
+    printk("\tdefault_state = %d\n", power->default_state);
+    printk("\tbm_activity = 0x%08x\n", power->bm_activity);
+    printk("\tcount = %d\n", power->count);
+
+    for ( idx = 0; idx < power->count; idx++ )
+    {
+        struct acpi_processor_cx *cx = &power->states[idx];
+        struct acpi_processor_cx_policy *promo = &cx->promotion;
+        struct acpi_processor_cx_policy *demo = &cx->demotion;
+
+        printk("\tstates[%d]:\n", idx);
+        printk("\t\tvalid   = %d\n", cx->valid);
+        printk("\t\ttype    = %d\n", cx->type);
+        printk("\t\taddress = 0x%x\n", cx->address);
+        printk("\t\tspace_id = 0x%x\n", cx->space_id);
+        printk("\t\tlatency = %d\n", cx->latency);
+        printk("\t\tpower   = %d\n", cx->power);
+        printk("\t\tlatency_ticks = %d\n", cx->latency_ticks);
+        printk("\t\tusage   = %d\n", cx->usage);
+        printk("\t\ttime    = %"PRId64"\n", cx->time);
+
+        /* A policy without a target state is shown as "C-1". */
+        printk("\t\tpromotion policy:\n");
+        printk("\t\t\tcount    = %d\n", promo->count);
+        printk("\t\t\tstate    = C%d\n",
+               promo->state ? promo->state->type : -1);
+        printk("\t\t\tthreshold.time = %d\n", promo->threshold.time);
+        printk("\t\t\tthreshold.ticks = %d\n", promo->threshold.ticks);
+        printk("\t\t\tthreshold.count = %d\n", promo->threshold.count);
+        printk("\t\t\tthreshold.bm = %d\n", promo->threshold.bm);
+
+        printk("\t\tdemotion policy:\n");
+        printk("\t\t\tcount    = %d\n", demo->count);
+        printk("\t\t\tstate    = C%d\n",
+               demo->state ? demo->state->type : -1);
+        printk("\t\t\tthreshold.time = %d\n", demo->threshold.time);
+        printk("\t\t\tthreshold.ticks = %d\n", demo->threshold.ticks);
+        printk("\t\t\tthreshold.count = %d\n", demo->threshold.count);
+        printk("\t\t\tthreshold.bm = %d\n", demo->threshold.bm);
+    }
+}
+
+/*
+ * Keyhandler: print the saved Cx information of every online CPU.
+ * 'key' is the key that triggered the handler; it is not used.
+ */
+static void dump_cx(unsigned char key)
+{
+    unsigned int cpu;
+
+    /*
+     * Walk the online map instead of counting 0..num_online_cpus()-1:
+     * online CPU ids need not be contiguous, and processor_powers[] is
+     * indexed by CPU id.
+     */
+    for_each_online_cpu ( cpu )
+        print_acpi_power(cpu, &processor_powers[cpu]);
+}
+
+/* Initcall: register dump_cx() as the handler for key 'c'.  Returns 0. */
+static int __init cpu_idle_key_init(void)
+{
+    register_keyhandler('c', dump_cx, "dump cx structures");
+
+    return 0;
+}
+__initcall(cpu_idle_key_init);
+
+/*
+ * Return the number of ticks between two 32-bit counter samples, t1 taken
+ * first and t2 second.
+ *
+ * Unsigned subtraction is modulo 2^32, so a single wrap of the counter
+ * between the two samples needs no special case.  (The former wrap branch,
+ * (0xFFFFFFFF - t1) + t2, came out one tick short.)
+ *
+ * NOTE(review): this assumes a full 32-bit counter; a narrower counter
+ * (e.g. 24 bits) would need the result masked -- confirm against the
+ * timer in use.
+ */
+static inline u32 ticks_elapsed(u32 t1, u32 t2)
+{
+    return t2 - t1;
+}
+
+/*
+ * Make 'new' the current idle state of 'power'.
+ *
+ * Resets the promotion counter of the previous state (if any) and the
+ * demotion counter of the new one.  When power->flags.bm_check is set,
+ * bus master reload is disabled on leaving C3 and enabled on entering C3.
+ * Does nothing if either argument is NULL.
+ */
+static void acpi_processor_power_activate(struct acpi_processor_power *power,
+                                          struct acpi_processor_cx *new)
+{
+    struct acpi_processor_cx *old;
+
+    if ( !power || !new )
+        return;
+
+    /* May be NULL: no state has been activated on this CPU yet. */
+    old = power->state;
+
+    if ( old )
+        old->promotion.count = 0;
+    new->demotion.count = 0;
+
+    /* Cleanup from old state. */
+    if ( old )
+    {
+        switch ( old->type )
+        {
+        case ACPI_STATE_C3:
+            /* Disable bus master reload */
+            if ( new->type != ACPI_STATE_C3 && power->flags.bm_check )
+                acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
+            break;
+        }
+    }
+
+    /* Prepare to use new state. */
+    switch ( new->type )
+    {
+    case ACPI_STATE_C3:
+        /*
+         * Enable bus master reload.  'old' must be tested before it is
+         * dereferenced: with no previous state this is a transition into
+         * C3 as well.
+         */
+        if ( (!old || old->type != ACPI_STATE_C3) && power->flags.bm_check )
+            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
+        break;
+    }
+
+    power->state = new;
+
+    return;
+}
+
+/*
+ * Halt the CPU via safe_halt(), issuing smp_mb__after_clear_bit() first.
+ * NOTE(review): the barrier presumably orders an earlier flag clear by the
+ * caller against the halt -- confirm.
+ */
+static void acpi_safe_halt(void)
+{
+    smp_mb__after_clear_bit();
+    safe_halt();
+}
+
+/* ecx value passed to __mwait() by acpi_processor_ffh_cstate_enter(). */
+#define MWAIT_ECX_INTERRUPT_BREAK   (0x1)
+
+/*
+ * Arm __monitor() on 'current', issue smp_mb(), then execute __mwait()
+ * with the given eax/ecx values.  The three steps must stay in this order.
+ */
+static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+    __monitor((void *)current, 0, 0);
+    smp_mb();
+    __mwait(eax, ecx);
+}
+
+/*
+ * Enter a fixed-hardware (FFH) C-state: cx->address is used as the mwait
+ * eax hint, with MWAIT_ECX_INTERRUPT_BREAK as the ecx value.
+ */
+static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+{
+    mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
+}
+
+/*
+ * Enter the idle state described by 'cx': through the FFH (mwait) path
+ * when cx->space_id is ACPI_ADR_SPACE_FIXED_HARDWARE, otherwise by reading
+ * the I/O port cx->address.
+ */
+static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
+{
+    int unused;
+
+    if ( cx->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE )
+    {
+        /* Call into architectural FFH based C-state */
+        acpi_processor_ffh_cstate_enter(cx);
+        return;
+    }
+
+    /* IO port based C-state */
+    inb(cx->address);
+
+    /*
+     * Dummy wait op - must do something useless after P_LVL2 read
+     * because chipsets cannot guarantee that STPCLK# signal
+     * gets asserted in time to freeze execution properly.
+     */
+    unused = inl(pmtmr_ioport);
+}
+
+/*
+ * Number of CPUs currently between the C3 entry and exit bookkeeping in
+ * acpi_processor_idle() (only counted when bm_check and bm_control are set).
+ */
+static atomic_t c3_cpu_count;
+
+/*
+ * ACPI C-state idle handler; set_cx_pminfo() installs it as pm_idle.
+ *
+ * On each call, for the current CPU:
+ *  1. if bm_check is set, sample bus master activity and, when it is active
+ *     and the current state has a bm demotion threshold, pick the demotion
+ *     state without sleeping;
+ *  2. otherwise enter the current C-state (C1/C2/C3) and, for C2/C3, measure
+ *     the time slept in PM timer ticks;
+ *  3. apply the promotion/demotion policy to choose the next state and
+ *     activate it if it differs from the current one.
+ *
+ * When no C-state has been selected yet it falls back to pm_idle_save() or,
+ * failing that, acpi_safe_halt().
+ */
+static void acpi_processor_idle(void)
+{
+    struct acpi_processor_power *power = NULL;
+    struct acpi_processor_cx *cx = NULL;
+    struct acpi_processor_cx *next_state = NULL;
+    int sleep_ticks = 0;
+    u32 t1, t2 = 0;
+
+    power = &processor_powers[smp_processor_id()];
+
+    /*
+     * Interrupts must be disabled during bus mastering calculations and
+     * for C2/C3 transitions.
+     */
+    local_irq_disable();
+    cx = power->state;
+    if ( !cx )
+    {
+        if ( pm_idle_save )
+        {
+            printk(XENLOG_DEBUG "call pm_idle_save()\n");
+            pm_idle_save();
+        }
+        else
+        {
+            printk(XENLOG_DEBUG "call acpi_safe_halt()\n");
+            acpi_safe_halt();
+        }
+        return;
+    }
+
+    /*
+     * Check BM Activity
+     * -----------------
+     * Check for bus mastering activity (if required), record, and check
+     * for demotion.
+     */
+    if ( power->flags.bm_check )
+    {
+        u32 bm_status = 0;
+        unsigned long diff = (NOW() - power->bm_check_timestamp) >> 23;
+
+        if ( diff > 31 )
+            diff = 31;
+
+        power->bm_activity <<= diff;
+
+        acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
+        if ( bm_status )
+        {
+            power->bm_activity |= 0x1;
+            acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
+        }
+        /*
+         * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
+         * the true state of bus mastering activity; forcing us to
+         * manually check the BMIDEA bit of each IDE channel.
+         */
+        /*else if ( errata.piix4.bmisx )
+        {
+            if ( (inb_p(errata.piix4.bmisx + 0x02) & 0x01)
+                || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01) )
+                pr->power.bm_activity |= 0x1;
+        }*/
+
+        power->bm_check_timestamp = NOW();
+
+        /*
+         * If bus mastering is or was active this jiffy, demote
+         * to avoid a faulty transition.  Note that the processor
+         * won't enter a low-power state during this call (to this
+         * function) but should upon the next.
+         *
+         * TBD: A better policy might be to fallback to the demotion
+         *      state (use it for this quantum only) instead of
+         *      demoting -- and rely on duration as our sole demotion
+         *      qualification.  This may, however, introduce DMA
+         *      issues (e.g. floppy DMA transfer overrun/underrun).
+         */
+        if ( (power->bm_activity & 0x1) && cx->demotion.threshold.bm )
+        {
+            local_irq_enable();
+            next_state = cx->demotion.state;
+            goto end;
+        }
+    }
+
+    /*
+     * Sleep:
+     * ------
+     * Invoke the current Cx state to put the processor to sleep.
+     */
+    if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 )
+        smp_mb__after_clear_bit();
+
+    switch ( cx->type )
+    {
+    case ACPI_STATE_C1:
+        /*
+         * Invoke C1.
+         * Use the appropriate idle routine, the one that would
+         * be used without acpi C-states.
+         */
+        if ( pm_idle_save )
+            pm_idle_save();
+        else 
+            acpi_safe_halt();
+
+        /*
+         * TBD: Can't get time duration while in C1, as resumes
+         *      go to an ISR rather than here.  Need to instrument
+         *      base interrupt handler.
+         */
+        sleep_ticks = 0xFFFFFFFF;
+        break;
+
+    case ACPI_STATE_C2:
+        /* Get start time (ticks) */
+        t1 = inl(pmtmr_ioport);
+        /* Invoke C2 */
+        acpi_idle_do_entry(cx);
+        /* Get end time (ticks) */
+        t2 = inl(pmtmr_ioport);
+
+        /* Re-enable interrupts */
+        local_irq_enable();
+        /* Compute time (ticks) that we were actually asleep */
+        sleep_ticks =
+            ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+        break;
+
+    case ACPI_STATE_C3:
+        /*
+         * disable bus master
+         * bm_check implies we need ARB_DIS
+         * !bm_check implies we need cache flush
+         * bm_control implies whether we can do ARB_DIS
+         *
+         * That leaves a case where bm_check is set and bm_control is
+         * not set. In that case we cannot do much, we enter C3
+         * without doing anything.
+         */
+        if ( power->flags.bm_check && power->flags.bm_control )
+        {
+            atomic_inc(&c3_cpu_count);
+            if ( atomic_read(&c3_cpu_count) == num_online_cpus() )
+            {
+                /*
+                 * All CPUs are trying to go to C3
+                 * Disable bus master arbitration
+                 */
+                acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
+            }
+        }
+        else if ( !power->flags.bm_check )
+        {
+            /* SMP with no shared cache... Invalidate cache  */
+            ACPI_FLUSH_CPU_CACHE();
+        }
+
+        /* Get start time (ticks) */
+        t1 = inl(pmtmr_ioport);
+
+        /*
+         * FIXME: Before invoking C3, be aware that TSC/APIC timer may be 
+         * stopped by H/W. Without carefully handling of TSC/APIC stop issues,
+         * deep C state can't work correctly.
+         */
+        /* preparing TSC stop */
+        cstate_save_tsc();
+        /* placeholder for preparing APIC stop */
+
+        /* Invoke C3 */
+        acpi_idle_do_entry(cx);
+
+        /* placeholder for recovering APIC */
+
+        /* recovering TSC */
+        cstate_restore_tsc();
+
+        /* Get end time (ticks) */
+        t2 = inl(pmtmr_ioport);
+        if ( power->flags.bm_check && power->flags.bm_control )
+        {
+            /* Enable bus master arbitration */
+            atomic_dec(&c3_cpu_count);
+            acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+        }
+
+        /* Compute time (ticks) that we were actually asleep */
+        sleep_ticks = ticks_elapsed(t1, t2);
+        /* Re-enable interrupts */
+        local_irq_enable();
+        /* Do not account our idle-switching overhead: */
+        sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
+
+        break;
+
+    default:
+        local_irq_enable();
+        return;
+    }
+
+    /* Account the visit; slept time is only accumulated for C2/C3. */
+    cx->usage++;
+    if ( (cx->type != ACPI_STATE_C1) && (sleep_ticks > 0) )
+        cx->time += sleep_ticks;
+
+    next_state = power->state;
+
+    /*
+     * Promotion?
+     * ----------
+     * Track the number of longs (time asleep is greater than threshold)
+     * and promote when the count threshold is reached.  Note that bus
+     * mastering activity may prevent promotions.
+     * Do not promote above max_cstate.
+     */
+    if ( cx->promotion.state &&
+         ((cx->promotion.state - power->states) <= max_cstate) )
+    {
+        if ( sleep_ticks > cx->promotion.threshold.ticks )
+        {
+            cx->promotion.count++;
+            cx->demotion.count = 0;
+            if ( cx->promotion.count >= cx->promotion.threshold.count )
+            {
+                if ( power->flags.bm_check )
+                {
+                    if ( !(power->bm_activity & cx->promotion.threshold.bm) )
+                    {
+                        next_state = cx->promotion.state;
+                        goto end;
+                    }
+                }
+                else
+                {
+                    next_state = cx->promotion.state;
+                    goto end;
+                }
+            }
+        }
+    }
+
+    /*
+     * Demotion?
+     * ---------
+     * Track the number of shorts (time asleep is less than time threshold)
+     * and demote when the usage threshold is reached.
+     */
+    if ( cx->demotion.state )
+    {
+        if ( sleep_ticks < cx->demotion.threshold.ticks )
+        {
+            cx->demotion.count++;
+            cx->promotion.count = 0;
+            if ( cx->demotion.count >= cx->demotion.threshold.count )
+            {
+                next_state = cx->demotion.state;
+                goto end;
+            }
+        }
+    }
+
+end:
+    /*
+     * Demote if current state exceeds max_cstate
+     */
+    if ( (power->state - power->states) > max_cstate )
+    {
+        if ( cx->demotion.state )
+            next_state = cx->demotion.state;
+    }
+
+    /*
+     * New Cx State?
+     * -------------
+     * If we're going to start using a new Cx state we must clean up
+     * from the previous and prepare to use the new.
+     */
+    if ( next_state != power->state )
+        acpi_processor_power_activate(power, next_state);
+}
+
+/*
+ * Install the default C-state policy (OS idle handler) for 'power'.
+ *
+ * The scheme promotes quickly to C2 but more conservatively to C3, favouring
+ * C2 for its low latency, good power savings and tolerance of bus mastering
+ * activity.  The policy is plain data and can be altered dynamically.
+ *
+ * Returns 0 on success, -EINVAL if 'power' is NULL, or -ENODEV if no state
+ * from index 1 upwards is valid.
+ */
+static int acpi_processor_set_power_policy(struct acpi_processor_power *power)
+{
+    struct acpi_processor_cx *neighbour;
+    struct acpi_processor_cx *cur;
+    unsigned int idx;
+
+    if ( power == NULL )
+        return -EINVAL;
+
+    /* Startup state: the first valid entry, scanning from index 1. */
+    for ( idx = 1; idx < ACPI_PROCESSOR_MAX_POWER; idx++ )
+        if ( power->states[idx].valid )
+            break;
+
+    if ( idx >= ACPI_PROCESSOR_MAX_POWER )
+        return -ENODEV;
+
+    power->state = &power->states[idx];
+
+    /* Demotion: walk upwards, linking each valid state to the one below. */
+    neighbour = NULL;
+    for ( idx = 1; idx < ACPI_PROCESSOR_MAX_POWER; idx++ )
+    {
+        cur = &power->states[idx];
+        if ( !cur->valid )
+            continue;
+
+        if ( neighbour != NULL )
+        {
+            cur->demotion.state = neighbour;
+            cur->demotion.threshold.ticks = cur->latency_ticks;
+            cur->demotion.threshold.count = 1;
+            if ( cur->type == ACPI_STATE_C3 )
+                cur->demotion.threshold.bm = bm_history;
+        }
+
+        neighbour = cur;
+    }
+
+    /* Promotion: walk downwards, linking each valid state to the one above. */
+    neighbour = NULL;
+    for ( idx = ACPI_PROCESSOR_MAX_POWER - 1; idx > 0; idx-- )
+    {
+        cur = &power->states[idx];
+        if ( !cur->valid )
+            continue;
+
+        if ( neighbour != NULL )
+        {
+            cur->promotion.state = neighbour;
+            cur->promotion.threshold.ticks = cur->latency_ticks;
+            cur->promotion.threshold.count =
+                (cur->type >= ACPI_STATE_C2) ? 4 : 10;
+            if ( neighbour->type == ACPI_STATE_C3 )
+                cur->promotion.threshold.bm = bm_history;
+        }
+
+        neighbour = cur;
+    }
+
+    return 0;
+}
+
+/*
+ * Reset 'acpi_power' to its baseline: everything zeroed, then C0 and C1
+ * marked valid (C1 also gets its type set) and the state count set to 2.
+ * Always returns 0.
+ */
+static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
+{
+    struct acpi_processor_cx *c0, *c1;
+
+    memset(acpi_power, 0, sizeof(*acpi_power));
+
+    c0 = &acpi_power->states[ACPI_STATE_C0];
+    c1 = &acpi_power->states[ACPI_STATE_C1];
+
+    c0->valid = 1;
+
+    c1->valid = 1;
+    c1->type  = ACPI_STATE_C1;
+
+    acpi_power->count = 2;
+
+    return 0;
+}
+
+#define CPUID_MWAIT_LEAF (5)
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
+#define CPUID5_ECX_INTERRUPT_BREAK      (0x2)
+
+#define MWAIT_ECX_INTERRUPT_BREAK       (0x1)
+
+#define MWAIT_SUBSTATE_MASK (0xf)
+#define MWAIT_SUBSTATE_SIZE (4)
+
+/*
+ * Check that the MWAIT hint in cx->reg.address can be used on this CPU.
+ *
+ * The part of the hint above the low MWAIT_SUBSTATE_SIZE bits selects which
+ * MWAIT_SUBSTATE_SIZE-bit field of CPUID leaf 5 EDX to consult; the low bits
+ * are the requested sub-state, which must not exceed the count found there.
+ * The CPU must also report the MWAIT ECX extensions and interrupt-break
+ * support.
+ *
+ * Returns 0 if the hint is usable, -EFAULT otherwise.
+ */
+static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
+{
+    struct cpuinfo_x86 *c = &current_cpu_data;
+    unsigned int eax, ebx, ecx, edx;
+    unsigned int edx_part;
+    unsigned int cstate_type; /* C-state type and not ACPI C-state type */
+    unsigned int num_cstate_subtype;
+
+    if ( c->cpuid_level < CPUID_MWAIT_LEAF )
+    {
+        printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
+        return -EFAULT;
+    }
+
+    cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+    printk(XENLOG_DEBUG "cpuid.MWAIT[.eax=%x, .ebx=%x, .ecx=%x, .edx=%x]\n",
+           eax, ebx, ecx, edx);
+
+    /*
+     * The hint comes from the caller.  It selects field (hint >> SIZE) + 1
+     * of EDX below, so reject any value whose field would lie beyond the
+     * register: shifting by the register width or more is undefined.
+     */
+    if ( (cx->reg.address >> MWAIT_SUBSTATE_SIZE) >=
+         (sizeof(edx) * 8 / MWAIT_SUBSTATE_SIZE) - 1 )
+        return -EFAULT;
+
+    /* Check whether this particular cx_type (in CST) is supported or not */
+    cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
+    edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
+    num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
+
+    if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
+        return -EFAULT;
+
+    /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
+    if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+         !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
+        return -EFAULT;
+
+    printk(XENLOG_INFO "Monitor-Mwait will be used to enter C-%d state\n",
+           cx->type);
+    return 0;
+}
+
+/*
+ * Set flags->bm_check according to the CPU cache properties.
+ *
+ * On SMP the answer depends on the cache configuration:
+ * - cache not shared among all CPUs: the cache is flushed before entering
+ *   C3, so bm_check stays clear;
+ * - cache shared among all CPUs: the bm_check mechanism is used, exactly
+ *   as in the UP case.
+ *
+ * This routine is called only after all the CPUs are online.
+ */
+static void acpi_processor_power_init_bm_check(
+    struct acpi_processor_flags *flags)
+{
+    struct cpuinfo_x86 *c = &current_cpu_data;
+    int use_bm_check = 0;
+
+    if ( num_online_cpus() == 1 )
+    {
+        use_bm_check = 1;
+    }
+    else if ( c->x86_vendor == X86_VENDOR_INTEL )
+    {
+        /*
+         * Today all CPUs that support C3 share cache.
+         * TBD: This needs to look at cache shared map, once
+         * multi-core detection patch makes to the base.
+         */
+        use_bm_check = 1;
+    }
+
+    flags->bm_check = use_bm_check;
+}
+
+#define VENDOR_INTEL                   (1)
+#define NATIVE_CSTATE_BEYOND_HALT      (2)
+
+/*
+ * Validate one C-state description before it is accepted.
+ *
+ * - System I/O states need a non-zero port address.
+ * - Fixed-hardware states deeper than C1 must carry the Intel vendor code in
+ *   bit_width and the "native C-state beyond halt" class in bit_offset, and
+ *   must pass the MWAIT probe.
+ * - Any other register space is rejected.
+ * - C3 additionally settles power->flags.bm_check (computed once, then
+ *   reused for every later call) and checks the matching prerequisite: bus
+ *   master control (or _CST data) when bm_check is set, a working WBINVD
+ *   when it is not.
+ *
+ * Returns 0 if the state can be used, a non-zero value otherwise.
+ */
+static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
+{
+    static int bm_check_flag;
+
+    if ( cx == NULL )
+        return -EINVAL;
+
+    switch ( cx->reg.space_id )
+    {
+    case ACPI_ADR_SPACE_SYSTEM_IO:
+        if ( cx->reg.address == 0 )
+            return -EINVAL;
+        break;
+
+    case ACPI_ADR_SPACE_FIXED_HARDWARE:
+        if ( cx->type <= ACPI_STATE_C1 )
+            break;
+
+        if ( cx->reg.bit_width != VENDOR_INTEL ||
+             cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
+            return -EINVAL;
+
+        /* assume all logical cpu has the same support for mwait */
+        if ( acpi_processor_ffh_cstate_probe(cx) )
+            return -EFAULT;
+        break;
+
+    default:
+        return -ENODEV;
+    }
+
+    /* Only C3 has further requirements. */
+    if ( cx->type != ACPI_STATE_C3 )
+        return 0;
+
+    /* All the logic here assumes flags.bm_check is same across all CPUs */
+    if ( bm_check_flag )
+    {
+        power->flags.bm_check = bm_check_flag;
+    }
+    else
+    {
+        /* Determine whether bm_check is needed based on CPU  */
+        acpi_processor_power_init_bm_check(&(power->flags));
+        bm_check_flag = power->flags.bm_check;
+    }
+
+    if ( !power->flags.bm_check )
+    {
+        /*
+         * WBINVD should be set in fadt, for C3 state to be
+         * supported on when bm_check is not required.
+         */
+        if ( !(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD) )
+        {
+            ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+                      "Cache invalidation should work properly"
+                      " for C3 to be enabled on SMP systems\n"));
+            return -1;
+        }
+        acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
+        return 0;
+    }
+
+    if ( power->flags.bm_control )
+        return 0;
+
+    if ( power->flags.has_cst != 1 )
+    {
+        /* bus mastering control is necessary */
+        ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+            "C3 support requires BM control\n"));
+        return -1;
+    }
+
+    /* Here we enter C3 without bus mastering */
+    ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+        "C3 support without BM control\n"));
+
+    return 0;
+}
+
+/*
+ * Record one caller-supplied C-state in 'acpi_power'.
+ *
+ * The entry is validated with check_cx() and then stored in
+ * acpi_power->states[], indexed by its type.  acpi_power->count is bumped
+ * the first time a given slot becomes valid.
+ *
+ * Returns 0 on success, -EFAULT if check_cx() rejects the entry, or -EINVAL
+ * if the type does not fit the states[] array.
+ */
+static int set_cx(struct acpi_processor_power *acpi_power,
+                  xen_processor_cx_t *xen_cx)
+{
+    struct acpi_processor_cx *cx;
+
+    /* skip unsupported acpi cstate */
+    if ( check_cx(acpi_power, xen_cx) )
+        return -EFAULT;
+
+    /*
+     * The type is supplied by the caller and is used as an array index
+     * below; refuse anything outside the ACPI_PROCESSOR_MAX_POWER slots
+     * instead of writing past the end of states[].
+     */
+    if ( (unsigned int)xen_cx->type >= ACPI_PROCESSOR_MAX_POWER )
+        return -EINVAL;
+
+    cx = &acpi_power->states[xen_cx->type];
+    if ( !cx->valid )
+        acpi_power->count++;
+
+    cx->valid    = 1;
+    cx->type     = xen_cx->type;
+    cx->address  = xen_cx->reg.address;
+    cx->space_id = xen_cx->reg.space_id;
+    cx->latency  = xen_cx->latency;
+    cx->power    = xen_cx->power;
+
+    cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+
+    return 0;
+}
+
+/*
+ * Translate an ACPI processor id into a CPU index.
+ *
+ * The id is first mapped to an APIC id through x86_acpiid_to_apicid[]
+ * (0xff there means "no mapping"), then x86_cpu_to_apicid[] is searched for
+ * that APIC id.  Returns the CPU index, or -1 if either step fails.
+ */
+static int get_cpu_id(u8 acpi_id)
+{
+    u8 apic_id = x86_acpiid_to_apicid[acpi_id];
+    int cpu = 0;
+
+    if ( apic_id == 0xff )
+        return -1;
+
+    while ( cpu < NR_CPUS )
+    {
+        if ( x86_cpu_to_apicid[cpu] == apic_id )
+            return cpu;
+        cpu++;
+    }
+
+    return -1;
+}
+
+#ifdef DEBUG_PM_CX
+/*
+ * Debug aid: dump the C-state information handed in for 'cpu'.
+ *
+ * Prints the flags, then every state copied from the guest handle together
+ * with its dependency record when one is attached.  Stops silently at the
+ * first failed guest copy.  Compiles to nothing unless DEBUG_PM_CX is set.
+ */
+static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+    XEN_GUEST_HANDLE(xen_processor_cx_t) cx_handle;
+    XEN_GUEST_HANDLE(xen_processor_csd_t) dep_handle;
+    xen_processor_cx_t  entry;
+    xen_processor_csd_t dep;
+    uint32_t idx;
+
+    printk("cpu%d cx acpi info:\n", cpu);
+    printk("\tcount = %d\n", power->count);
+    printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
+           "\t       pwr_setup_done[%d], bm_rld_set[%d]\n",
+           power->flags.bm_control, power->flags.bm_check,
+           power->flags.has_cst,
+           power->flags.power_setup_done, power->flags.bm_rld_set);
+
+    cx_handle = power->states;
+
+    for ( idx = 0; idx < power->count; idx++ )
+    {
+        if ( unlikely(copy_from_guest_offset(&entry, cx_handle, idx, 1)) )
+            return;
+
+        printk("\tstates[%d]:\n", idx);
+        printk("\t\treg.space_id = 0x%x\n", entry.reg.space_id);
+        printk("\t\treg.bit_width = 0x%x\n", entry.reg.bit_width);
+        printk("\t\treg.bit_offset = 0x%x\n", entry.reg.bit_offset);
+        printk("\t\treg.access_size = 0x%x\n", entry.reg.access_size);
+        printk("\t\treg.address = 0x%"PRIx64"\n", entry.reg.address);
+        printk("\t\ttype    = %d\n", entry.type);
+        printk("\t\tlatency = %d\n", entry.latency);
+        printk("\t\tpower   = %d\n", entry.power);
+
+        dep_handle = entry.dp;
+        printk("\t\tdp(@0x%p)\n", dep_handle.p);
+
+        /* No dependency record attached to this state. */
+        if ( dep_handle.p == NULL )
+            continue;
+
+        if ( unlikely(copy_from_guest(&dep, dep_handle, 1)) )
+            return;
+        printk("\t\t\tdomain = %d\n", dep.domain);
+        printk("\t\t\tcoord_type   = %d\n", dep.coord_type);
+        printk("\t\t\tnum = %d\n", dep.num);
+    }
+}
+#else
+#define print_cx_pminfo(c, p)
+#endif
+
+/*
+ * Load the C-state information for one processor.
+ *
+ * 'cpu' is an ACPI processor id; it is translated to a CPU index with
+ * get_cpu_id().  That CPU's state table is reset, the bm_check, bm_control
+ * and has_cst flags are copied over, and each supplied state is offered to
+ * set_cx() (states it rejects are simply skipped).  The default promotion
+ * and demotion policy is then installed.  The first successful call for
+ * CPU 0 also saves the current pm_idle routine and replaces it with
+ * acpi_processor_idle().
+ *
+ * Returns 0 on success, or -EFAULT if the state handle is not accessible,
+ * a state cannot be copied in, or the ACPI id maps to no CPU.
+ */
+long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
+{
+    XEN_GUEST_HANDLE(xen_processor_cx_t) states;
+    xen_processor_cx_t xen_cx;
+    struct acpi_processor_power *acpi_power;
+    int cpu_id;
+    unsigned int i;
+
+    if ( unlikely(!guest_handle_okay(power->states, power->count)) )
+        return -EFAULT;
+
+    print_cx_pminfo(cpu, power);
+
+    /*
+     * map from acpi_id to cpu_id
+     *
+     * get_cpu_id() takes an 8-bit id.  An id that does not fit must be
+     * treated as unknown; truncating it would silently select some other
+     * processor's table.
+     */
+    cpu_id = (cpu == (u8)cpu) ? get_cpu_id((u8)cpu) : -1;
+    if ( cpu_id == -1 )
+    {
+        printk(XENLOG_ERR "no cpu_id for acpi_id %d\n", cpu);
+        return -EFAULT;
+    }
+
+    acpi_power = &processor_powers[cpu_id];
+
+    init_cx_pminfo(acpi_power);
+
+    acpi_power->flags.bm_check = power->flags.bm_check;
+    acpi_power->flags.bm_control = power->flags.bm_control;
+    acpi_power->flags.has_cst = power->flags.has_cst;
+
+    states = power->states;
+
+    for ( i = 0; i < power->count; i++ )
+    {
+        if ( unlikely(copy_from_guest_offset(&xen_cx, states, i, 1)) )
+            return -EFAULT;
+
+        /* Unsupported states are skipped on purpose; see set_cx(). */
+        set_cx(acpi_power, &xen_cx);
+    }
+
+    /* FIXME: C-state dependency is not supported so far */
+
+    /* initialize default policy */
+    acpi_processor_set_power_policy(acpi_power);
+
+    print_acpi_power(cpu_id, acpi_power);
+
+    /* Hook the idle loop once, remembering the routine being replaced. */
+    if ( cpu_id == 0 && pm_idle_save == NULL )
+    {
+        pm_idle_save = pm_idle;
+        pm_idle = acpi_processor_idle;
+    }
+
+    return 0;
+}
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/apic.c
--- a/xen/arch/x86/apic.c       Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/apic.c       Thu May 08 18:40:07 2008 +0900
@@ -47,6 +47,8 @@ int enable_local_apic __initdata = 0; /*
  */
 int apic_verbosity;
 
+int x2apic_enabled __read_mostly = 0;
+
 
 static void apic_pm_activate(void);
 
@@ -306,7 +308,10 @@ int __init verify_local_APIC(void)
      */
     reg0 = apic_read(APIC_LVR);
     apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
-    apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
+
+    /* We don't try writing LVR in x2APIC mode since that incurs #GP. */
+    if ( !x2apic_enabled )
+        apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
     reg1 = apic_read(APIC_LVR);
     apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
 
@@ -610,7 +615,8 @@ int lapic_suspend(void)
     apic_pm_state.apic_id = apic_read(APIC_ID);
     apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
     apic_pm_state.apic_ldr = apic_read(APIC_LDR);
-    apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+    if ( !x2apic_enabled )
+        apic_pm_state.apic_dfr = apic_read(APIC_DFR);
     apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
     apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
     apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
@@ -643,14 +649,20 @@ int lapic_resume(void)
      * FIXME! This will be wrong if we ever support suspend on
      * SMP! We'll need to do this as part of the CPU restore!
      */
-    rdmsr(MSR_IA32_APICBASE, l, h);
-    l &= ~MSR_IA32_APICBASE_BASE;
-    l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
-    wrmsr(MSR_IA32_APICBASE, l, h);
+    if ( !x2apic_enabled )
+    {
+        rdmsr(MSR_IA32_APICBASE, l, h);
+        l &= ~MSR_IA32_APICBASE_BASE;
+        l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+        wrmsr(MSR_IA32_APICBASE, l, h);
+    }
+    else
+        enable_x2apic();
 
     apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
     apic_write(APIC_ID, apic_pm_state.apic_id);
-    apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+    if ( !x2apic_enabled )
+        apic_write(APIC_DFR, apic_pm_state.apic_dfr);
     apic_write(APIC_LDR, apic_pm_state.apic_ldr);
     apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
     apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
@@ -809,10 +821,29 @@ no_apic:
     return -1;
 }
 
+/*
+ * Put the local APIC into x2APIC mode (unless the EXTD bit is already set,
+ * i.e. firmware did it) and record that in x2apic_enabled.
+ */
+void enable_x2apic(void)
+{
+    u32 lo, hi;
+
+    rdmsr(MSR_IA32_APICBASE, lo, hi);
+    if ( !(lo & MSR_IA32_APICBASE_EXTD) )
+    {
+        lo |= MSR_IA32_APICBASE_ENABLE | MSR_IA32_APICBASE_EXTD;
+        /* Read-modify-write: hand back the high half exactly as read. */
+        wrmsr(MSR_IA32_APICBASE, lo, hi);
+        printk("x2APIC mode enabled.\n");
+    }
+    else
+        printk("x2APIC mode enabled by BIOS.\n");
+
+    x2apic_enabled = 1;
+}
+
 void __init init_apic_mappings(void)
 {
     unsigned long apic_phys;
 
+    if ( x2apic_enabled )
+        goto __next;
     /*
      * If no local APIC can be found then set up a fake all
      * zeroes page to simulate the local APIC and another
@@ -828,12 +859,13 @@ void __init init_apic_mappings(void)
     apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
                 apic_phys);
 
+__next:
     /*
      * Fetch the APIC ID of the BSP in case we have a
      * default configuration (or the MP table is broken).
      */
     if (boot_cpu_physical_apicid == -1U)
-        boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+        boot_cpu_physical_apicid = get_apic_id();
 
 #ifdef CONFIG_X86_IO_APIC
     {
@@ -1271,7 +1303,7 @@ int __init APIC_init_uniprocessor (void)
      * might be zero if read from MP tables. Get it from LAPIC.
      */
 #ifdef CONFIG_CRASH_DUMP
-    boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+    boot_cpu_physical_apicid = get_apic_id();
 #endif
     phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
 
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/domain.c     Thu May 08 18:40:07 2008 +0900
@@ -56,6 +56,9 @@ DEFINE_PER_CPU(u64, efer);
 DEFINE_PER_CPU(u64, efer);
 DEFINE_PER_CPU(unsigned long, cr4);
 
+static void default_idle(void);
+void (*pm_idle) (void) = default_idle;
+
 static void unmap_vcpu_info(struct vcpu *v);
 
 static void paravirt_ctxt_switch_from(struct vcpu *v);
@@ -105,7 +108,7 @@ void idle_loop(void)
         if ( cpu_is_offline(smp_processor_id()) )
             play_dead();
         page_scrub_schedule_work();
-        default_idle();
+        (*pm_idle)();
         do_softirq();
     }
 }
@@ -440,10 +443,9 @@ int arch_domain_create(struct domain *d,
 {
 #ifdef __x86_64__
     struct page_info *pg;
-    int i;
 #endif
     l1_pgentry_t gdt_l1e;
-    int vcpuid, pdpt_order, paging_initialised = 0;
+    int i, vcpuid, pdpt_order, paging_initialised = 0;
     int rc = -ENOMEM;
 
     d->arch.hvm_domain.hap_enabled =
@@ -526,6 +528,8 @@ int arch_domain_create(struct domain *d,
             goto fail;
     }
 
+    spin_lock_init(&d->arch.irq_lock);
+
     if ( is_hvm_domain(d) )
     {
         if ( (rc = hvm_domain_initialise(d)) != 0 )
@@ -539,6 +543,13 @@ int arch_domain_create(struct domain *d,
         /* 32-bit PV guest by default only if Xen is not 64-bit. */
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo =
             (CONFIG_PAGING_LEVELS != 4);
+    }
+
+    memset(d->arch.cpuids, 0, sizeof(d->arch.cpuids));
+    for ( i = 0; i < MAX_CPUID_INPUT; i++ )
+    {
+        d->arch.cpuids[i].input[0] = XEN_CPUID_INPUT_UNUSED;
+        d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
     }
 
     return 0;
@@ -1910,6 +1921,37 @@ void arch_dump_vcpu_info(struct vcpu *v)
     paging_dump_vcpu_info(v);
 }
 
+/*
+ * Look up CPUID leaf 'input' / sub-leaf 'sub_input' in d's cpuid table and
+ * return the stored register values.  A table entry whose sub-leaf is
+ * XEN_CPUID_INPUT_UNUSED matches any sub_input.  All four outputs are set
+ * to zero when no entry matches.
+ */
+void domain_cpuid(
+    struct domain *d,
+    unsigned int  input,
+    unsigned int  sub_input,
+    unsigned int  *eax,
+    unsigned int  *ebx,
+    unsigned int  *ecx,
+    unsigned int  *edx)
+{
+    int idx;
+
+    for ( idx = 0; idx < MAX_CPUID_INPUT; idx++ )
+    {
+        cpuid_input_t *entry = &d->arch.cpuids[idx];
+
+        if ( entry->input[0] != input )
+            continue;
+
+        if ( (entry->input[1] != XEN_CPUID_INPUT_UNUSED) &&
+             (entry->input[1] != sub_input) )
+            continue;
+
+        *eax = entry->eax;
+        *ebx = entry->ebx;
+        *ecx = entry->ecx;
+        *edx = entry->edx;
+        return;
+    }
+
+    *eax = *ebx = *ecx = *edx = 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c     Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/domctl.c     Thu May 08 18:40:07 2008 +0900
@@ -10,6 +10,7 @@
 #include <xen/mm.h>
 #include <xen/guest_access.h>
 #include <xen/compat.h>
+#include <xen/pci.h>
 #include <public/domctl.h>
 #include <xen/sched.h>
 #include <xen/domain.h>
@@ -539,7 +540,7 @@ long arch_do_domctl(
         if ( device_assigned(bus, devfn) )
         {
             gdprintk(XENLOG_ERR, "XEN_DOMCTL_test_assign_device: "
-                     "%x:%x:%x already assigned\n",
+                     "%x:%x:%x already assigned, or non-existent\n",
                      bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
             break;
         }
@@ -568,7 +569,7 @@ long arch_do_domctl(
         if ( device_assigned(bus, devfn) )
         {
             gdprintk(XENLOG_ERR, "XEN_DOMCTL_assign_device: "
-                     "%x:%x:%x already assigned\n",
+                     "%x:%x:%x already assigned, or non-existent\n",
                      bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
             break;
         }
@@ -842,6 +843,45 @@ long arch_do_domctl(
     }
     break;
 
+    case XEN_DOMCTL_set_cpuid:
+    {
+        struct domain *d;
+        xen_domctl_cpuid_t *ctl = &domctl->u.cpuid;
+        cpuid_input_t *cpuid = NULL; 
+        int i;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+            break;
+
+        for ( i = 0; i < MAX_CPUID_INPUT; i++ )
+        {
+            cpuid = &d->arch.cpuids[i];
+
+            if ( cpuid->input[0] == XEN_CPUID_INPUT_UNUSED )
+                break;
+
+            if ( (cpuid->input[0] == ctl->input[0]) &&
+                 ((cpuid->input[1] == XEN_CPUID_INPUT_UNUSED) ||
+                  (cpuid->input[1] == ctl->input[1])) )
+                break;
+        }
+        
+        if ( i == MAX_CPUID_INPUT )
+        {
+            ret = -ENOENT;
+        }
+        else
+        {
+            memcpy(cpuid, ctl, sizeof(cpuid_input_t));
+            ret = 0;
+        }
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
     default:
         ret = -ENOSYS;
         break;
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/genapic/Makefile
--- a/xen/arch/x86/genapic/Makefile     Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/genapic/Makefile     Thu May 08 18:40:07 2008 +0900
@@ -1,4 +1,5 @@ obj-y += bigsmp.o
 obj-y += bigsmp.o
+obj-y += x2apic.o
 obj-y += default.o
 obj-y += delivery.o
 obj-y += probe.o
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/genapic/delivery.c
--- a/xen/arch/x86/genapic/delivery.c   Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/genapic/delivery.c   Thu May 08 18:40:07 2008 +0900
@@ -17,7 +17,7 @@ void init_apic_ldr_flat(void)
 
        apic_write_around(APIC_DFR, APIC_DFR_FLAT);
        val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
-       val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+       val |= SET_xAPIC_LOGICAL_ID(1UL << smp_processor_id());
        apic_write_around(APIC_LDR, val);
 }
 
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/genapic/probe.c
--- a/xen/arch/x86/genapic/probe.c      Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/genapic/probe.c      Thu May 08 18:40:07 2008 +0900
@@ -14,6 +14,7 @@
 #include <asm/apicdef.h>
 #include <asm/genapic.h>
 
+extern struct genapic apic_x2apic;
 extern struct genapic apic_summit;
 extern struct genapic apic_bigsmp;
 extern struct genapic apic_default;
@@ -21,6 +22,7 @@ struct genapic *genapic;
 struct genapic *genapic;
 
 struct genapic *apic_probe[] __initdata = { 
+       &apic_x2apic, 
        &apic_summit,
        &apic_bigsmp, 
        &apic_default,  /* must be last */
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/genapic/x2apic.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/genapic/x2apic.c     Thu May 08 18:40:07 2008 +0900
@@ -0,0 +1,79 @@
+/*
+ * x2APIC driver.
+ *
+ * Copyright (c) 2008, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/cpumask.h>
+#include <asm/apicdef.h>
+#include <asm/genapic.h>
+#include <xen/smp.h>
+#include <asm/mach-default/mach_mpparse.h>
+
+__init int probe_x2apic(void)
+{
+    return x2apic_is_available();
+}
+
+struct genapic apic_x2apic= {
+    APIC_INIT("x2apic", probe_x2apic),
+    GENAPIC_X2APIC
+};
+
+void init_apic_ldr_x2apic(void)
+{
+    /* Intentionally a no-op:
+     * only physical delivery mode is used by this driver. */
+}
+
+void clustered_apic_check_x2apic(void)
+{
+    /* Intentionally a no-op:
+     * only physical delivery mode is used by this driver. */
+}
+
+cpumask_t target_cpus_x2apic(void)
+{
+    /* Deliver interrupts only to CPU0 for now */
+    return cpumask_of_cpu(0);
+}
+
+unsigned int cpu_mask_to_apicid_x2apic(cpumask_t cpumask)
+{
+    return cpu_physical_id(first_cpu(cpumask));
+}
+
+void send_IPI_mask_x2apic(cpumask_t cpumask, int vector)
+{
+    /* Fixed delivery mode, physical destination, no destination shorthand. */
+    const u32 icr_low = APIC_DM_FIXED | APIC_DEST_PHYSICAL | vector;
+    unsigned long irq_state;
+    unsigned int cpu;
+
+    ASSERT(cpus_subset(cpumask, cpu_online_map));
+    ASSERT(!cpus_empty(cpumask));
+
+    /* Issue one ICR write per CPU in the mask, bracketed by IRQ save/restore. */
+    local_irq_save(irq_state);
+    for_each_cpu_mask(cpu, cpumask)
+    {
+        u32 apicid = cpu_physical_id(cpu);
+        apic_icr_write(icr_low, apicid);
+    }
+
+    local_irq_restore(irq_state);
+}
+
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/hvm/Makefile
--- a/xen/arch/x86/hvm/Makefile Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/hvm/Makefile Thu May 08 18:40:07 2008 +0900
@@ -16,4 +16,5 @@ obj-y += vlapic.o
 obj-y += vlapic.o
 obj-y += vpic.o
 obj-y += save.o
+obj-y += vmsi.o
 obj-y += stdvga.o
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/hvm/hvm.c    Thu May 08 18:40:07 2008 +0900
@@ -46,6 +46,7 @@
 #include <asm/hvm/vpt.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/cacheattr.h>
+#include <asm/hvm/trace.h>
 #include <public/sched.h>
 #include <public/hvm/ioreq.h>
 #include <public/version.h>
@@ -739,15 +740,22 @@ void hvm_send_assist_req(struct vcpu *v)
 
 void hvm_hlt(unsigned long rflags)
 {
+    struct vcpu *curr = current;
+
+    if ( hvm_event_pending(curr) )
+        return;
+
     /*
      * If we halt with interrupts disabled, that's a pretty sure sign that we
      * want to shut down. In a real processor, NMIs are the only way to break
      * out of this.
      */
     if ( unlikely(!(rflags & X86_EFLAGS_IF)) )
-        return hvm_vcpu_down(current);
+        return hvm_vcpu_down(curr);
 
     do_sched_op_compat(SCHEDOP_block, 0);
+
+    HVMTRACE_1D(HLT, curr, /* pending = */ vcpu_runnable(curr));
 }
 
 void hvm_triple_fault(void)
@@ -1594,66 +1602,15 @@ void hvm_cpuid(unsigned int input, unsig
     if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
         return;
 
-    cpuid(input, eax, ebx, ecx, edx);
-
-    switch ( input )
-    {
-    case 0x00000001:
-        /* Clear #threads count and poke initial VLAPIC ID. */
-        *ebx &= 0x0000FFFFu;
-        *ebx |= (current->vcpu_id * 2) << 24;
-
-        /* We always support MTRR MSRs. */
-        *edx |= bitmaskof(X86_FEATURE_MTRR);
-
-        *ecx &= (bitmaskof(X86_FEATURE_XMM3) |
-                 bitmaskof(X86_FEATURE_SSSE3) |
-                 bitmaskof(X86_FEATURE_CX16) |
-                 bitmaskof(X86_FEATURE_SSE4_1) |
-                 bitmaskof(X86_FEATURE_SSE4_2) |
-                 bitmaskof(X86_FEATURE_POPCNT));
-
-        *edx &= (bitmaskof(X86_FEATURE_FPU) |
-                 bitmaskof(X86_FEATURE_VME) |
-                 bitmaskof(X86_FEATURE_DE) |
-                 bitmaskof(X86_FEATURE_PSE) |
-                 bitmaskof(X86_FEATURE_TSC) |
-                 bitmaskof(X86_FEATURE_MSR) |
-                 bitmaskof(X86_FEATURE_PAE) |
-                 bitmaskof(X86_FEATURE_MCE) |
-                 bitmaskof(X86_FEATURE_CX8) |
-                 bitmaskof(X86_FEATURE_APIC) |
-                 bitmaskof(X86_FEATURE_SEP) |
-                 bitmaskof(X86_FEATURE_MTRR) |
-                 bitmaskof(X86_FEATURE_PGE) |
-                 bitmaskof(X86_FEATURE_MCA) |
-                 bitmaskof(X86_FEATURE_CMOV) |
-                 bitmaskof(X86_FEATURE_PAT) |
-                 bitmaskof(X86_FEATURE_CLFLSH) |
-                 bitmaskof(X86_FEATURE_MMX) |
-                 bitmaskof(X86_FEATURE_FXSR) |
-                 bitmaskof(X86_FEATURE_XMM) |
-                 bitmaskof(X86_FEATURE_XMM2));
+    domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
+
+    if ( input == 0x00000001 )
+    {
+        /* Fix up VLAPIC details: APIC ID in EBX[31:24], APIC flag in EDX. */
+        *ebx &= 0x00FFFFFFu;
+        *ebx |= (v->vcpu_id * 2) << 24;
         if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
-            __clear_bit(X86_FEATURE_APIC & 31, edx);
-#if CONFIG_PAGING_LEVELS >= 3
-        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-#endif
-            __clear_bit(X86_FEATURE_PAE & 31, edx);
-        break;
-
-    case 0x80000001:
-#if CONFIG_PAGING_LEVELS >= 3
-        if ( !v->domain->arch.hvm_domain.params[HVM_PARAM_PAE_ENABLED] )
-#endif
-            __clear_bit(X86_FEATURE_NX & 31, edx);
-#ifdef __i386__
-        /* Mask feature for Intel ia32e or AMD long mode. */
-        __clear_bit(X86_FEATURE_LAHF_LM & 31, ecx);
-        __clear_bit(X86_FEATURE_LM & 31, edx);
-        __clear_bit(X86_FEATURE_SYSCALL & 31, edx);
-#endif
-        break;
+            __clear_bit(X86_FEATURE_APIC & 31, edx);
     }
 }
 
@@ -1663,10 +1620,14 @@ int hvm_msr_read_intercept(struct cpu_us
     uint64_t msr_content = 0;
     struct vcpu *v = current;
     uint64_t *var_range_base, *fixed_range_base;
-    int index;
+    int index, mtrr;
+    uint32_t cpuid[4];
 
     var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges;
     fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges;
+
+    hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
+    mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
 
     switch ( ecx )
     {
@@ -1695,25 +1656,37 @@ int hvm_msr_read_intercept(struct cpu_us
         break;
 
     case MSR_MTRRcap:
+        if ( !mtrr )
+            goto gp_fault;
         msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
         break;
     case MSR_MTRRdefType:
+        if ( !mtrr )
+            goto gp_fault;
         msr_content = v->arch.hvm_vcpu.mtrr.def_type
                         | (v->arch.hvm_vcpu.mtrr.enabled << 10);
         break;
     case MSR_MTRRfix64K_00000:
+        if ( !mtrr )
+            goto gp_fault;
         msr_content = fixed_range_base[0];
         break;
     case MSR_MTRRfix16K_80000:
     case MSR_MTRRfix16K_A0000:
+        if ( !mtrr )
+            goto gp_fault;
         index = regs->ecx - MSR_MTRRfix16K_80000;
         msr_content = fixed_range_base[index + 1];
         break;
     case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
+        if ( !mtrr )
+            goto gp_fault;
         index = regs->ecx - MSR_MTRRfix4K_C0000;
         msr_content = fixed_range_base[index + 3];
         break;
     case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
+        if ( !mtrr )
+            goto gp_fault;
         index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
         msr_content = var_range_base[index];
         break;
@@ -1725,6 +1698,10 @@ int hvm_msr_read_intercept(struct cpu_us
     regs->eax = (uint32_t)msr_content;
     regs->edx = (uint32_t)(msr_content >> 32);
     return X86EMUL_OKAY;
+
+gp_fault:
+    hvm_inject_exception(TRAP_gp_fault, 0, 0);
+    return X86EMUL_EXCEPTION;
 }
 
 int hvm_msr_write_intercept(struct cpu_user_regs *regs)
@@ -1739,7 +1716,11 @@ int hvm_msr_write_intercept(struct cpu_u
     uint32_t ecx = regs->ecx;
     uint64_t msr_content = (uint32_t)regs->eax | ((uint64_t)regs->edx << 32);
     struct vcpu *v = current;
-    int index;
+    int index, mtrr;
+    uint32_t cpuid[4];
+
+    hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
+    mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
 
     switch ( ecx )
     {
@@ -1758,29 +1739,41 @@ int hvm_msr_write_intercept(struct cpu_u
         break;
 
     case MSR_MTRRcap:
+        if ( !mtrr )
+            goto gp_fault;
         goto gp_fault;
     case MSR_MTRRdefType:
+        if ( !mtrr )
+            goto gp_fault;
         if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
            goto gp_fault;
         break;
     case MSR_MTRRfix64K_00000:
+        if ( !mtrr )
+            goto gp_fault;
         if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
             goto gp_fault;
         break;
     case MSR_MTRRfix16K_80000:
     case MSR_MTRRfix16K_A0000:
+        if ( !mtrr )
+            goto gp_fault;
         index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
         if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
                                      index, msr_content) )
             goto gp_fault;
         break;
     case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
+        if ( !mtrr )
+            goto gp_fault;
         index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
         if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
                                      index, msr_content) )
             goto gp_fault;
         break;
     case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
+        if ( !mtrr )
+            goto gp_fault;
         if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
                                      regs->ecx, msr_content) )
             goto gp_fault;
@@ -2360,6 +2353,54 @@ long do_hvm_op(unsigned long op, XEN_GUE
         rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -ENOSYS;
         break;
 
+    case HVMOP_track_dirty_vram:
+    {
+        struct xen_hvm_track_dirty_vram a;
+        struct domain *d;
+
+        if ( copy_from_guest(&a, arg, 1) )
+            return -EFAULT;
+
+        if ( a.domid == DOMID_SELF )
+        {
+            d = rcu_lock_current_domain();
+        }
+        else
+        {
+            if ( (d = rcu_lock_domain_by_id(a.domid)) == NULL )
+                return -ESRCH;
+            if ( !IS_PRIV_FOR(current->domain, d) )
+            {
+                rc = -EPERM;
+                goto param_fail2;
+            }
+        }
+
+        rc = -EINVAL;
+        if ( !is_hvm_domain(d) )
+            goto param_fail2;
+
+        rc = xsm_hvm_param(d, op);
+        if ( rc )
+            goto param_fail2;
+
+        rc = -ESRCH;
+        if ( d->is_dying )
+            goto param_fail2;
+
+        rc = -EINVAL;
+        if ( !shadow_mode_enabled(d))
+            goto param_fail2;
+        if ( d->vcpu[0] == NULL )
+            goto param_fail2;
+
+        rc = shadow_track_dirty_vram(d, a.first_pfn, a.nr, a.dirty_bitmap);
+
+    param_fail2:
+        rcu_unlock_domain(d);
+        break;
+    }
+
     default:
     {
         gdprintk(XENLOG_WARNING, "Bad HVM op %ld.\n", op);
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c  Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/hvm/i8254.c  Thu May 08 18:40:07 2008 +0900
@@ -206,19 +206,21 @@ static void pit_load_count(PITState *pit
 
     switch ( s->mode )
     {
-        case 2:
-            /* Periodic timer. */
-            create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, 
-                                 &pit->count_load_time[channel]);
-            break;
-        case 1:
-            /* One-shot timer. */
-            create_periodic_time(v, &pit->pt0, period, 0, 1, pit_time_fired,
-                                 &pit->count_load_time[channel]);
-            break;
-        default:
-            destroy_periodic_time(&pit->pt0);
-            break;
+    case 2:
+    case 3:
+        /* Periodic timer. */
+        create_periodic_time(v, &pit->pt0, period, 0, 0, pit_time_fired, 
+                             &pit->count_load_time[channel]);
+        break;
+    case 1:
+    case 4:
+        /* One-shot timer. */
+        create_periodic_time(v, &pit->pt0, period, 0, 1, pit_time_fired,
+                             &pit->count_load_time[channel]);
+        break;
+    default:
+        destroy_periodic_time(&pit->pt0);
+        break;
     }
 }
 
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/hvm/stdvga.c
--- a/xen/arch/x86/hvm/stdvga.c Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/hvm/stdvga.c Thu May 08 18:40:07 2008 +0900
@@ -131,14 +131,15 @@ static int stdvga_outb(uint64_t addr, ui
 
     /* When in standard vga mode, emulate here all writes to the vram buffer
      * so we can immediately satisfy reads without waiting for qemu. */
-    s->stdvga =
-        (s->sr[7] == 0x00) &&  /* standard vga mode */
-        (s->gr[6] == 0x05);    /* misc graphics register w/ MemoryMapSelect=1
-                                * 0xa0000-0xaffff (64k region), AlphaDis=1 */
+    s->stdvga = (s->sr[7] == 0x00);
 
     if ( !prev_stdvga && s->stdvga )
     {
-        s->cache = 1;       /* (re)start caching video buffer */
+        /*
+         * (Re)start caching of video buffer.
+         * XXX TODO: In case of a restart the cache could be unsynced.
+         */
+        s->cache = 1;
         gdprintk(XENLOG_INFO, "entering stdvga and caching modes\n");
     }
     else if ( prev_stdvga && !s->stdvga )
@@ -180,6 +181,40 @@ static int stdvga_intercept_pio(
     }
 
     return X86EMUL_UNHANDLEABLE; /* propagate to external ioemu */
+}
+
+static unsigned int stdvga_mem_offset(
+    struct hvm_hw_stdvga *s, unsigned int mmio_addr)
+{
+    /*
+     * Map an MMIO address onto an offset inside the window selected by
+     * bits 3:2 of graphics register 6; only the low 17 address bits are
+     * used.  Windows, as [start, start + length):
+     *   mode 0: [0x00000, 0x20000)    mode 1: [0x00000, 0x10000)
+     *   mode 2: [0x10000, 0x18000)    mode 3: [0x18000, 0x20000)
+     *
+     * Mode 1 assumes bank_offset == 0.  An address outside the selected
+     * window yields ~0u.
+     */
+    static const unsigned int window_start[4] = {
+        0x00000, 0x00000, 0x10000, 0x18000
+    };
+    static const unsigned int window_len[4] = {
+        0x20000, 0x10000, 0x08000, 0x08000
+    };
+    unsigned int map_mode = (s->gr[6] >> 2) & 3;
+    unsigned int rel;
+
+    /*
+     * Unsigned arithmetic: an address below the window start wraps to a
+     * large value, which the length check then rejects.
+     */
+    rel = (mmio_addr & 0x1ffff) - window_start[map_mode];
+
+    if ( rel >= window_len[map_mode] )
+        return ~0u;
+
+    return rel;
 }
 
 #define GET_PLANE(data, p) (((data) >> ((p) * 8)) & 0xff)
@@ -191,8 +226,8 @@ static uint8_t stdvga_mem_readb(uint64_t
     uint32_t ret, *vram_l;
     uint8_t *vram_b;
 
-    addr &= 0x1ffff;
-    if ( addr >= 0x10000 )
+    addr = stdvga_mem_offset(s, addr);
+    if ( addr == ~0u )
         return 0xff;
 
     if ( s->sr[4] & 0x08 )
@@ -273,8 +308,8 @@ static void stdvga_mem_writeb(uint64_t a
     uint32_t write_mask, bit_mask, set_mask, *vram_l;
     uint8_t *vram_b;
 
-    addr &= 0x1ffff;
-    if ( addr >= 0x10000 )
+    addr = stdvga_mem_offset(s, addr);
+    if ( addr == ~0u )
         return;
 
     if ( s->sr[4] & 0x08 )
@@ -531,7 +566,7 @@ void stdvga_init(struct domain *d)
         register_portio_handler(d, 0x3ce, 2, stdvga_intercept_pio);
         /* MMIO. */
         register_buffered_io_handler(
-            d, 0xa0000, 0x10000, stdvga_intercept_mmio);
+            d, 0xa0000, 0x20000, stdvga_intercept_mmio);
     }
 }
 
diff -r f2457c7aff8d -r 611787b6ca35 xen/arch/x86/hvm/svm/emulate.c
--- a/xen/arch/x86/hvm/svm/emulate.c    Fri Apr 25 20:13:52 2008 +0900
+++ b/xen/arch/x86/hvm/svm/emulate.c    Thu May 08 18:40:07 2008 +0900
@@ -28,18 +28,6 @@
 #include <asm/hvm/svm/emulate.h>
 
 #define MAX_INST_LEN 15
-
-static int inst_copy_from_guest(
-    unsigned char *buf, unsigned long guest_eip, int inst_len)
-{
-    struct vmcb_struct *vmcb = current->arch.hvm_svm.vmcb;
-    uint32_t pfec = (vmcb->cpl == 3) ? PFEC_user_mode : 0;
-    if ( (inst_len > MAX_INST_LEN) || (inst_len <= 0) )
-        return 0;
-    if ( hvm_fetch_from_guest_virt_nofault(buf, guest_eip, inst_len, pfec) )
-        return 0;
-    return inst_len;
-}
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.