
[Xen-changelog] [xen-unstable] merge with xen-unstable.hg



# HG changeset patch
# User Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
# Date 1221198460 -32400
# Node ID ec8eaab557d867dca3e8cbb3e0384d797929102a
# Parent  4ddd63b4be9be2440d213da60b10c20327e5c515
# Parent  346c073ed6a4f0debca36588039d649e2efd93c3
merge with xen-unstable.hg
---
 .hgignore                                             |    1 
 Config.mk                                             |    4 
 docs/misc/vtd.txt                                     |   27 
 docs/src/user.tex                                     |    4 
 stubdom/README                                        |    8 
 tools/examples/init.d/xendomains                      |    6 
 tools/examples/xend-config.sxp                        |    4 
 tools/examples/xmexample.hvm                          |    2 
 tools/examples/xmexample.hvm-stubdom                  |    2 
 tools/flask/policy/Makefile                           |  234 +++++
 tools/flask/policy/Rules.modular                      |  166 +++
 tools/flask/policy/Rules.monolithic                   |  196 ++++
 tools/flask/policy/policy/constraints                 |   27 
 tools/flask/policy/policy/flask/Makefile              |   41 
 tools/flask/policy/policy/flask/access_vectors        |  166 +++
 tools/flask/policy/policy/flask/initial_sids          |   17 
 tools/flask/policy/policy/flask/mkaccess_vector.sh    |  227 +++++
 tools/flask/policy/policy/flask/mkflask.sh            |   95 ++
 tools/flask/policy/policy/flask/security_classes      |   20 
 tools/flask/policy/policy/global_booleans             |    5 
 tools/flask/policy/policy/global_tunables             |    6 
 tools/flask/policy/policy/mcs                         |  324 +++++++
 tools/flask/policy/policy/mls                         |  354 ++++++++
 tools/flask/policy/policy/modules.conf                |   21 
 tools/flask/policy/policy/modules/xen/xen.if          |    1 
 tools/flask/policy/policy/modules/xen/xen.te          |  135 +++
 tools/flask/policy/policy/support/loadable_module.spt |  166 +++
 tools/flask/policy/policy/support/misc_macros.spt     |   32 
 tools/flask/policy/policy/systemuser                  |   19 
 tools/flask/policy/policy/users                       |   39 
 tools/ioemu/hw/cirrus_vga.c                           |    3 
 tools/ioemu/hw/pass-through.c                         |  146 +++
 tools/ioemu/hw/pass-through.h                         |   15 
 tools/ioemu/hw/pci.c                                  |    5 
 tools/ioemu/hw/pt-msi.c                               |    2 
 tools/ioemu/hw/vga.c                                  |    8 
 tools/ioemu/hw/xen_machine_fv.c                       |    4 
 tools/ioemu/vl.h                                      |    2 
 tools/libxc/ia64/xc_ia64_linux_save.c                 |    6 
 tools/libxc/xc_domain_save.c                          |   65 -
 tools/libxc/xc_evtchn.c                               |   15 
 tools/libxc/xc_private.c                              |   10 
 tools/libxc/xenctrl.h                                 |    6 
 tools/libxc/xenguest.h                                |    2 
 tools/python/Makefile                                 |   26 
 tools/python/xen/util/xsconstants.py                  |    6 
 tools/python/xen/util/xsm/flask/flask.py              |    8 
 tools/python/xen/util/xsm/xsm.py                      |   20 
 tools/python/xen/xend/XendConfig.py                   |    2 
 tools/python/xen/xend/XendDomainInfo.py               |    6 
 tools/python/xen/xend/XendOptions.py                  |    8 
 tools/python/xen/xend/server/blkif.py                 |    2 
 tools/python/xen/xend/server/netif.py                 |    2 
 tools/python/xen/xend/server/pciif.py                 |    2 
 tools/python/xen/xm/create.py                         |    6 
 tools/python/xen/xm/main.py                           |    2 
 tools/xcutils/lsevtchn.c                              |   48 -
 tools/xcutils/xc_save.c                               |  117 +-
 tools/xenstore/xs.c                                   |    7 
 tools/xentrace/formats                                |  149 ++-
 tools/xentrace/xentrace.c                             |  399 ++++++++-
 xen/arch/x86/acpi/Makefile                            |    2 
 xen/arch/x86/acpi/cpu_idle.c                          |  434 ++-------
 xen/arch/x86/acpi/cpufreq/cpufreq.c                   |   26 
 xen/arch/x86/acpi/cpufreq/powernow.c                  |    4 
 xen/arch/x86/acpi/cpuidle_menu.c                      |  132 +++
 xen/arch/x86/domain.c                                 |   24 
 xen/arch/x86/domain_build.c                           |    1 
 xen/arch/x86/domctl.c                                 |   47 -
 xen/arch/x86/hpet.c                                   |   30 
 xen/arch/x86/hvm/hvm.c                                |    5 
 xen/arch/x86/hvm/svm/intr.c                           |    4 
 xen/arch/x86/hvm/svm/svm.c                            |   36 
 xen/arch/x86/hvm/vmx/intr.c                           |    2 
 xen/arch/x86/hvm/vmx/vmx.c                            |   49 -
 xen/arch/x86/io_apic.c                                |   13 
 xen/arch/x86/irq.c                                    |   23 
 xen/arch/x86/mm.c                                     |  783 +++++++++++-------
 xen/arch/x86/mm/hap/hap.c                             |    1 
 xen/arch/x86/mm/shadow/common.c                       |   71 +
 xen/arch/x86/mm/shadow/multi.c                        |  210 ++++
 xen/arch/x86/mm/shadow/private.h                      |   43 
 xen/arch/x86/physdev.c                                |   80 -
 xen/arch/x86/platform_hypercall.c                     |   16 
 xen/arch/x86/smpboot.c                                |   40 
 xen/arch/x86/time.c                                   |    7 
 xen/arch/x86/traps.c                                  |   45 +
 xen/common/domain.c                                   |    4 
 xen/common/domctl.c                                   |   19 
 xen/common/event_channel.c                            |   21 
 xen/common/rangeset.c                                 |    9 
 xen/common/sched_credit.c                             |    5 
 xen/common/schedule.c                                 |  123 ++
 xen/common/sysctl.c                                   |   12 
 xen/common/trace.c                                    |   45 -
 xen/drivers/acpi/hwregs.c                             |    2 
 xen/drivers/passthrough/iommu.c                       |    4 
 xen/drivers/passthrough/vtd/iommu.c                   |   22 
 xen/include/asm-ia64/shadow.h                         |    2 
 xen/include/asm-x86/bitops.h                          |    4 
 xen/include/asm-x86/guest_access.h                    |    6 
 xen/include/asm-x86/hvm/trace.h                       |   49 -
 xen/include/asm-x86/io_apic.h                         |    2 
 xen/include/asm-x86/mm.h                              |   38 
 xen/include/asm-x86/msr-index.h                       |   12 
 xen/include/asm-x86/shadow.h                          |    2 
 xen/include/public/trace.h                            |   51 -
 xen/include/xen/cpuidle.h                             |   82 +
 xen/include/xen/iommu.h                               |    1 
 xen/include/xen/sched.h                               |   22 
 xen/include/xen/trace.h                               |    2 
 xen/include/xsm/xsm.h                                 |  148 ++-
 xen/xsm/dummy.c                                       |  130 ++
 xen/xsm/flask/hooks.c                                 |  318 ++++++-
 xen/xsm/flask/include/av_perm_to_string.h             |   21 
 xen/xsm/flask/include/av_permissions.h                |   63 -
 xen/xsm/flask/include/flask.h                         |   11 
 xen/xsm/flask/include/initial_sid_to_string.h         |    3 
 xen/xsm/flask/include/security.h                      |    6 
 xen/xsm/flask/ss/policydb.h                           |   13 
 xen/xsm/flask/ss/services.c                           |   40 
 121 files changed, 5439 insertions(+), 1429 deletions(-)

diff -r 4ddd63b4be9b -r ec8eaab557d8 .hgignore
--- a/.hgignore Fri Sep 12 14:32:45 2008 +0900
+++ b/.hgignore Fri Sep 12 14:47:40 2008 +0900
@@ -185,7 +185,6 @@
 ^tools/misc/xenperf$
 ^tools/pygrub/build/.*$
 ^tools/python/build/.*$
-^tools/python/xen/util/xsm/xsm\.py$
 ^tools/security/secpol_tool$
 ^tools/security/xen/.*$
 ^tools/security/xensec_tool$
diff -r 4ddd63b4be9b -r ec8eaab557d8 Config.mk
--- a/Config.mk Fri Sep 12 14:32:45 2008 +0900
+++ b/Config.mk Fri Sep 12 14:47:40 2008 +0900
@@ -86,11 +86,7 @@ QEMU_REMOTE=http://xenbits.xensource.com
 # Mercurial in-tree version, or a local directory, or a git URL.
 # CONFIG_QEMU   ?= ioemu
 # CONFIG_QEMU   ?= ../qemu-xen.git
-ifeq ($(XEN_TARGET_ARCH),ia64)
-CONFIG_QEMU   ?= ioemu
-else
 CONFIG_QEMU   ?= $(QEMU_REMOTE)
-endif
 
 # Optional components
 XENSTAT_XENTOP     ?= y
diff -r 4ddd63b4be9b -r ec8eaab557d8 docs/misc/vtd.txt
--- a/docs/misc/vtd.txt Fri Sep 12 14:32:45 2008 +0900
+++ b/docs/misc/vtd.txt Fri Sep 12 14:47:40 2008 +0900
@@ -1,8 +1,9 @@ Title   : How to do PCI Passthrough with
 Title   : How to do PCI Passthrough with VT-d
 Authors : Allen Kay    <allen.m.kay@xxxxxxxxx>
           Weidong Han  <weidong.han@xxxxxxxxx>
+          Yuji Shimada <shimada-yxb@xxxxxxxxxxxxxxx>
 Created : October-24-2007
-Updated : August-06-2008
+Updated : September-09-2008
 
 How to turn on VT-d in Xen
 --------------------------
@@ -106,3 +107,27 @@ http://h10010.www1.hp.com/wwpc/us/en/en/
 
 For more information, please refer to http://wiki.xensource.com/xenwiki/VTdHowTo.
 
+
+Assigning devices to HVM domains
+--------------------------------
+
+Most device types such as NIC, HBA, EHCI and UHCI can be assigned to
+an HVM domain.
+
+But some devices have design features which make them unsuitable for
+assignment to an HVM domain. Examples include:
+
+ * The device has an internal resource, such as private memory, which
+   is mapped into the memory address space through a BAR (Base Address
+   Register).
+ * The driver submits commands containing a pointer to a buffer within
+   that internal resource. The device decodes the pointer (address)
+   and accesses the buffer.
+
+In an HVM domain, the BAR is virtualized, so the host-BAR value and
+the guest-BAR value differ. Consequently, the addresses of the internal
+resource, and of any buffer within it, differ between the device's
+view and the driver's view. As a result, the device cannot access the
+buffer specified by the driver.
+
+Such devices currently do not work when assigned to an HVM domain.
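
For illustration (the numbers here are invented): if the host BAR is
programmed at 0xf0000000 while the guest sees a virtualized BAR of
0xe0000000, the guest driver hands the device a buffer pointer such as
0xe0001000; the device decodes it relative to 0xf0000000, so the
pointer no longer refers to the buffer the driver intended.
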
diff -r 4ddd63b4be9b -r ec8eaab557d8 docs/src/user.tex
--- a/docs/src/user.tex Fri Sep 12 14:32:45 2008 +0900
+++ b/docs/src/user.tex Fri Sep 12 14:47:40 2008 +0900
@@ -4252,7 +4252,7 @@ directory of the Xen source distribution
 \section{Online References}
 
 The official Xen web site can be found at:
-\begin{quote} {\tt http://www.xensource.com}
+\begin{quote} {\tt http://www.xen.org}
 \end{quote}
 
 This contains links to the latest versions of all online
@@ -4282,7 +4282,7 @@ mailing lists and subscription informati
   Subscribe at: \\
   {\small {\tt http://lists.xensource.com/xen-announce}}
 \item[xen-changelog@xxxxxxxxxxxxxxxxxxx] Changelog feed
-  from the unstable and 2.0 trees - developer oriented.  Subscribe at: \\
+  from the unstable and 3.x trees - developer oriented.  Subscribe at: \\
   {\small {\tt http://lists.xensource.com/xen-changelog}}
 \end{description}
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 stubdom/README
--- a/stubdom/README    Fri Sep 12 14:32:45 2008 +0900
+++ b/stubdom/README    Fri Sep 12 14:47:40 2008 +0900
@@ -27,7 +27,7 @@ device_model = '/usr/lib/xen/bin/stubdom
 - disable anything related to dom0, like pty serial assignments
 
 
-Create /etc/xen/stubdom-hvmconfig (where "hvmconfig" is the name of your HVM
+Create /etc/xen/hvmconfig-dm (where "hvmconfig" is the name of your HVM
 guest) with
 
 kernel = "/usr/lib/xen/boot/ioemu-stubdom.gz"
@@ -52,7 +52,7 @@ vnc = 0
 vnc = 0
 sdl = 0
 
-  - In stubdom-hvmconfig, set an sdl vfb:
+  - In hvmconfig-dm, set an sdl vfb:
 
 vfb = [ 'type=sdl' ]
 
@@ -65,7 +65,7 @@ vnc = 1
 vnc = 1
 vnclisten = "172.30.206.1"
 
-  - In stubdom-hvmconfig, fill the reserved vif with the same IP, for instance:
+  - In hvmconfig-dm, fill the reserved vif with the same IP, for instance:
 
 vif = [ 'ip=172.30.206.1', 'ip=10.0.1.1,mac=aa:00:00:12:23:34']
 
@@ -76,7 +76,7 @@ vnc = 0
 vnc = 0
 sdl = 0
 
-  - In stubdom-hvmconfig, set a vnc vfb:
+  - In hvmconfig-dm, set a vnc vfb:
 
 vfb = [ 'type=vnc' ]
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/init.d/xendomains
--- a/tools/examples/init.d/xendomains  Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/examples/init.d/xendomains  Fri Sep 12 14:47:40 2008 +0900
@@ -327,15 +327,17 @@ stop()
        if test $id = 0; then continue; fi
        echo -n " $name"
        if test "$XENDOMAINS_AUTO_ONLY" = "true"; then
-           case $name in
+           eval "
+           case \"\$name\" in
                ($NAMES)
                    # nothing
                    ;;
                (*)
-                   echo -n "(skip)"
+                   echo -n '(skip)'
                    continue
                    ;;
            esac
+           "
        fi
        # XENDOMAINS_SYSRQ could be something like just "s"
        # or "s e i u" or even "s e s i u o"
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/xend-config.sxp
--- a/tools/examples/xend-config.sxp    Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/examples/xend-config.sxp    Fri Sep 12 14:47:40 2008 +0900
@@ -14,6 +14,10 @@
 #(logfile /var/log/xen/xend.log)
 #(loglevel DEBUG)
 
+# Uncomment the line below.  Set the value to flask, acm, or dummy to 
+# select a security module.
+
+#(xsm_module_name dummy)
 
 # The Xen-API server configuration.
 #
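
For example, to have xend start with the FLASK module selected, the new
directive would be uncommented and set to:

    (xsm_module_name flask)
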
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/xmexample.hvm
--- a/tools/examples/xmexample.hvm      Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/examples/xmexample.hvm      Fri Sep 12 14:47:40 2008 +0900
@@ -220,7 +220,7 @@ serial='pty'
 #   Configure guest CPUID responses:
 #
 #cpuid=[ '1:ecx=xxxxxxxxxxx00xxxxxxxxxxxxxxxxxxx,
-#           eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ]
+#           eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ]
 # - Unset the SSE4 features (CPUID.1[ECX][20-19])
 # - Default behaviour for all other bits in ECX And EAX registers.
 # 
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/xmexample.hvm-stubdom
--- a/tools/examples/xmexample.hvm-stubdom      Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/examples/xmexample.hvm-stubdom      Fri Sep 12 14:47:40 2008 +0900
@@ -236,7 +236,7 @@ stdvga=0
 #   Configure guest CPUID responses:
 #
 #cpuid=[ '1:ecx=xxxxxxxxxxx00xxxxxxxxxxxxxxxxxxx,
-#           eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ]
+#           eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ]
 # - Unset the SSE4 features (CPUID.1[ECX][20-19])
 # - Default behaviour for all other bits in ECX And EAX registers.
 # 
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/Makefile       Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,234 @@
+#
+# Makefile for the security policy.
+#
+# Targets:
+# 
+# install       - compile and install the policy configuration, and context files.
+# load          - compile, install, and load the policy configuration.
+# reload        - compile, install, and load/reload the policy configuration.
+# policy        - compile the policy configuration locally for testing/development.
+#
+# The default target is 'policy'.
+#
+
+########################################
+#
+# Configurable portions of the Makefile
+#
+
+# Policy version
+# By default, checkpolicy will create the highest
+# version policy it supports.  Setting this will
+# override the version.
+OUTPUT_POLICY = 20
+
+# Policy Type
+# strict, targeted,
+# strict-mls, targeted-mls,
+# strict-mcs, targeted-mcs
+TYPE = strict
+
+# Policy Name
+# If set, this will be used as the policy
+# name.  Otherwise the policy type will be
+# used for the name.
+NAME = xenrefpolicy
+
+# Distribution
+# Some distributions have portions of policy
+# for programs or configurations specific to the
+# distribution.  Setting this will enable options
+# for the distribution.
+# redhat, gentoo, debian, and suse are current options.
+# Fedora users should enable redhat.
+#DISTRO = 
+
+# Build monolithic policy.  Putting n here
+# will build a loadable module policy.
+MONOLITHIC=y
+
+# Uncomment this to disable command echoing
+#QUIET:=@
+
+########################################
+#
+# NO OPTIONS BELOW HERE
+#
+
+# executable paths
+PREFIX := /usr
+BINDIR := $(PREFIX)/bin
+SBINDIR := $(PREFIX)/sbin
+CHECKPOLICY := $(BINDIR)/checkpolicy
+CHECKMODULE := $(BINDIR)/checkmodule
+SEMOD_PKG := $(BINDIR)/semodule_package
+LOADPOLICY := $(SBINDIR)/flask-loadpolicy
+
+CFLAGS := -Wall
+
+# policy source layout
+POLDIR := policy
+MODDIR := $(POLDIR)/modules
+FLASKDIR := $(POLDIR)/flask
+SECCLASS := $(FLASKDIR)/security_classes
+ISIDS := $(FLASKDIR)/initial_sids
+AVS := $(FLASKDIR)/access_vectors
+
+#policy building support tools
+SUPPORT := support
+FCSORT := tmp/fc_sort
+
+# config file paths
+GLOBALTUN := $(POLDIR)/global_tunables
+GLOBALBOOL := $(POLDIR)/global_booleans
+MOD_CONF := $(POLDIR)/modules.conf
+TUNABLES := $(POLDIR)/tunables.conf
+BOOLEANS := $(POLDIR)/booleans.conf
+
+# install paths
+TOPDIR = $(DESTDIR)/etc/xen/
+INSTALLDIR = $(TOPDIR)/$(NAME)
+SRCPATH = $(INSTALLDIR)/src
+USERPATH = $(INSTALLDIR)/users
+CONTEXTPATH = $(INSTALLDIR)/contexts
+
+# enable MLS if requested.
+ifneq ($(findstring -mls,$(TYPE)),)
+       override M4PARAM += -D enable_mls
+       CHECKPOLICY += -M
+       CHECKMODULE += -M
+endif
+
+# enable MLS if MCS requested.
+ifneq ($(findstring -mcs,$(TYPE)),)
+       override M4PARAM += -D enable_mcs
+       CHECKPOLICY += -M
+       CHECKMODULE += -M
+endif
+
+# compile targeted policy if requested.
+ifneq ($(findstring targeted,$(TYPE)),)
+       override M4PARAM += -D targeted_policy
+endif
+
+# enable distribution-specific policy
+ifneq ($(DISTRO),)
+       override M4PARAM += -D distro_$(DISTRO)
+endif
+
+ifneq ($(OUTPUT_POLICY),)
+       CHECKPOLICY += -c $(OUTPUT_POLICY)
+endif
+
+ifeq ($(NAME),)
+       NAME := $(TYPE)
+endif
+
+# determine the policy version and current kernel version if possible
+PV := $(shell $(CHECKPOLICY) -V |cut -f 1 -d ' ')
+KV := $(shell cat /selinux/policyvers)
+
+# don't print version warnings if we are unable to determine
+# the currently running kernel's policy version
+ifeq ($(KV),)
+       KV := $(PV)
+endif
+
+FC := file_contexts
+POLVER := policy.$(PV)
+
+M4SUPPORT = $(wildcard $(POLDIR)/support/*.spt)
+
+APPCONF := config/appconfig-$(TYPE)
+APPDIR := $(CONTEXTPATH)
+APPFILES := $(INSTALLDIR)/booleans
+CONTEXTFILES += $(wildcard $(APPCONF)/*_context*) $(APPCONF)/media
+USER_FILES := $(POLDIR)/systemuser $(POLDIR)/users
+
+ALL_LAYERS := $(filter-out $(MODDIR)/CVS,$(shell find $(wildcard $(MODDIR)/*) -maxdepth 0 -type d))
+
+GENERATED_TE := $(basename $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.te.in)))
+GENERATED_IF := $(basename $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.if.in)))
+GENERATED_FC := $(basename $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.fc.in)))
+
+# sort here since it removes duplicates, which can happen
+# when a generated file is already generated
+DETECTED_MODS := $(sort $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.te)) $(GENERATED_TE))
+
+# modules.conf setting for base module
+MODBASE := base
+
+# modules.conf setting for module
+MODMOD := module
+
+# extract settings from modules.conf
+BASE_MODS := $(foreach mod,$(shell awk '/^[[:blank:]]*[[:alpha:]]/{ if ($$3 == "$(MODBASE)") print $$1 }' $(MOD_CONF) 2> /dev/null),$(subst ./,,$(shell find -iname $(mod).te)))
+MOD_MODS := $(foreach mod,$(shell awk '/^[[:blank:]]*[[:alpha:]]/{ if ($$3 == "$(MODMOD)") print $$1 }' $(MOD_CONF) 2> /dev/null),$(subst ./,,$(shell find -iname $(mod).te)))
+
+HOMEDIR_TEMPLATE = tmp/homedir_template
+
+########################################
+#
+# Load appropriate rules
+#
+
+ifeq ($(MONOLITHIC),y)
+       include Rules.monolithic
+else
+       include Rules.modular
+endif
+
+########################################
+#
+# Create config files
+#
+conf: $(MOD_CONF) $(BOOLEANS) $(GENERATED_TE) $(GENERATED_IF) $(GENERATED_FC)
+
+$(MOD_CONF) $(BOOLEANS): $(POLXML)
+       @echo "Updating $(MOD_CONF) and $(BOOLEANS)"
+       $(QUIET) cd $(DOCS) && ../$(GENDOC) -t ../$(BOOLEANS) -m ../$(MOD_CONF) -x ../$(POLXML)
+
+########################################
+#
+# Appconfig files
+#
+install-appconfig: $(APPFILES)
+
+$(INSTALLDIR)/booleans: $(BOOLEANS)
+       @mkdir -p $(INSTALLDIR)
+       $(QUIET) egrep '^[[:blank:]]*[[:alpha:]]' $(BOOLEANS) \
+               | sed -e 's/false/0/g' -e 's/true/1/g' > tmp/booleans
+       $(QUIET) install -m 644 tmp/booleans $@
+
+########################################
+#
+# Install policy sources
+#
+install-src:
+       rm -rf $(SRCPATH)/policy.old
+       -mv $(SRCPATH)/policy $(SRCPATH)/policy.old
+       mkdir -p $(SRCPATH)/policy
+       cp -R . $(SRCPATH)/policy
+
+########################################
+#
+# Clean everything
+#
+bare: clean
+       rm -f $(POLXML)
+       rm -f $(SUPPORT)/*.pyc
+       rm -f $(FCSORT)
+       rm -f $(MOD_CONF)
+       rm -f $(BOOLEANS)
+       rm -fR $(HTMLDIR)
+ifneq ($(GENERATED_TE),)
+       rm -f $(GENERATED_TE)
+endif
+ifneq ($(GENERATED_IF),)
+       rm -f $(GENERATED_IF)
+endif
+ifneq ($(GENERATED_FC),)
+       rm -f $(GENERATED_FC)
+endif
+
+.PHONY: install-src install-appconfig conf html bare
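
As a usage sketch for the new policy Makefile (assuming the default
MONOLITHIC=y, so Rules.monolithic is in effect, and that checkpolicy is
installed at the paths given above):

    cd tools/flask/policy
    make policy TYPE=strict-mcs    # compile locally, with MCS enabled
    make install load              # install under /etc/xen and load it
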
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/Rules.modular
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/Rules.modular  Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,166 @@
+########################################
+#
+# Rules and Targets for building modular policies
+#
+
+ALL_MODULES := $(filter $(BASE_MODS) $(MOD_MODS),$(DETECTED_MODS))
+ALL_INTERFACES := $(ALL_MODULES:.te=.if)
+
+BASE_PKG := base.pp
+BASE_FC := base.fc
+
+BASE_SECTIONS := tmp/pre_te_files.conf tmp/generated_definitions.conf tmp/all_interfaces.conf tmp/all_attrs_types.conf $(GLOBALBOOL) $(GLOBALTUN) tmp/only_te_rules.conf tmp/all_post.conf
+
+BASE_PRE_TE_FILES := $(SECCLASS) $(ISIDS) $(AVS) $(M4SUPPORT) $(POLDIR)/mls $(POLDIR)/mcs
+BASE_TE_FILES := $(BASE_MODS)
+BASE_POST_TE_FILES := $(POLDIR)/systemuser $(POLDIR)/constraints
+BASE_FC_FILES := $(BASE_MODS:.te=.fc)
+
+MOD_MODULES := $(MOD_MODS:.te=.mod)
+MOD_PKGS := $(notdir $(MOD_MODS:.te=.pp))
+
+# search layer dirs for source files
+vpath %.te $(ALL_LAYERS)
+vpath %.if $(ALL_LAYERS)
+vpath %.fc $(ALL_LAYERS)
+
+########################################
+#
+# default action: create all module packages
+#
+default: base
+
+base: $(BASE_PKG)
+
+modules: $(MOD_PKGS)
+
+#policy: $(POLVER)
+#install: $(LOADPATH) $(FCPATH) $(APPFILES) $(USERPATH)/local.users
+#load: tmp/load
+
+########################################
+#
+# Create a base module package
+#
+$(BASE_PKG): tmp/base.mod $(BASE_FC)
+       @echo "Creating $(NAME) base module package"
+       $(QUIET) $(SEMOD_PKG) $@ $^
+
+########################################
+#
+# Compile a base module
+#
+tmp/base.mod: base.conf
+       @echo "Compiling $(NAME) base module"
+       $(QUIET) $(CHECKMODULE) $^ -o $@
+
+########################################
+#
+# Construct a base module policy.conf
+#
+base.conf: $(BASE_SECTIONS)
+       @echo "Creating $(NAME) base module policy.conf"
+# checkpolicy can use the #line directives provided by -s for error reporting:
+       $(QUIET) m4 -D self_contained_policy $(M4PARAM) -s $^ > tmp/$@.tmp
+       $(QUIET) sed -e /^portcon/d -e /^nodecon/d -e /^netifcon/d < tmp/$@.tmp > $@
+# the ordering of these ocontexts matters:
+       $(QUIET) grep ^portcon tmp/$@.tmp >> $@ || true
+       $(QUIET) grep ^netifcon tmp/$@.tmp >> $@ || true
+       $(QUIET) grep ^nodecon tmp/$@.tmp >> $@ || true
+
+tmp/pre_te_files.conf: $(BASE_PRE_TE_FILES)
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) cat $^ > $@
+
+tmp/generated_definitions.conf: $(ALL_LAYERS) $(BASE_TE_FILES)
+       @test -d tmp || mkdir -p tmp
+# define all available object classes
+       $(QUIET) $(GENPERM) $(AVS) $(SECCLASS) > $@
+# per-userdomain templates
+       $(QUIET) echo "define(\`per_userdomain_templates',\`" >> $@
+       $(QUIET) for i in $(patsubst %.te,%,$(notdir $(ALL_MODULES))); do \
+               echo "ifdef(\`""$$i""_per_userdomain_template',\`""$$i""_per_userdomain_template("'$$*'")')" \
+                       >> $@ ;\
+       done
+       $(QUIET) echo "')" >> $@
+# define foo.te
+       $(QUIET) for i in $(notdir $(BASE_TE_FILES)); do \
+               echo "define(\`$$i')" >> $@ ;\
+       done
+       $(QUIET) $(SETTUN) $(BOOLEANS) >> $@
+
+tmp/all_interfaces.conf: $(M4SUPPORT) $(ALL_INTERFACES)
+ifeq ($(ALL_INTERFACES),)
+       $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) m4 $^ | sed -e s/dollarsstar/\$$\*/g > $@
+
+tmp/all_te_files.conf: $(BASE_TE_FILES)
+ifeq ($(BASE_TE_FILES),)
+       $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) cat $^ > $@
+
+tmp/post_te_files.conf: $(BASE_POST_TE_FILES)
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) cat $^ > $@
+
+# extract attributes and put them first. extract post te stuff
+# like genfscon and put last.  portcon, nodecon, and netifcon
+# is delayed since they are generated by m4
+tmp/all_attrs_types.conf tmp/only_te_rules.conf tmp/all_post.conf: tmp/all_te_files.conf tmp/post_te_files.conf
+       $(QUIET) grep ^attribute tmp/all_te_files.conf > tmp/all_attrs_types.conf || true
+       $(QUIET) grep '^type ' tmp/all_te_files.conf >> tmp/all_attrs_types.conf
+       $(QUIET) cat tmp/post_te_files.conf > tmp/all_post.conf
+       $(QUIET) grep '^sid ' tmp/all_te_files.conf >> tmp/all_post.conf || true
+       $(QUIET) egrep '^fs_use_(xattr|task|trans)' tmp/all_te_files.conf >> tmp/all_post.conf || true
+       $(QUIET) grep ^genfscon tmp/all_te_files.conf >> tmp/all_post.conf || true
+       $(QUIET) sed -r -e /^attribute/d -e '/^type /d' -e /^genfscon/d \
+                       -e '/^sid /d' -e '/^fs_use_(xattr|task|trans)/d' \
+                       < tmp/all_te_files.conf > tmp/only_te_rules.conf
+
+########################################
+#
+# Construct base module file contexts
+#
+$(BASE_FC): $(M4SUPPORT) tmp/generated_definitions.conf $(BASE_FC_FILES) $(FCSORT)
+ifeq ($(BASE_FC_FILES),)
+       $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+       @echo "Creating $(NAME) base module file contexts."
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) m4 $(M4PARAM) $(M4SUPPORT) tmp/generated_definitions.conf $(BASE_FC_FILES) > tmp/$@.tmp
+       $(QUIET) grep -e HOME -e ROLE tmp/$@.tmp > $(HOMEDIR_TEMPLATE)
+       $(QUIET) sed -i -e /HOME/d -e /ROLE/d tmp/$@.tmp
+       $(QUIET) $(FCSORT) tmp/$@.tmp $@
+
+########################################
+#
+# Build module packages
+#
+tmp/%.mod: $(M4SUPPORT) tmp/generated_definitions.conf tmp/all_interfaces.conf %.te
+       @if test -z "$(filter $^,$(MOD_MODS))"; then \
+               echo "The $(notdir $(basename $@)) module is not configured to be compiled as a loadable module." ;\
+               false ;\
+       fi
+       @echo "Compiling $(NAME) $(@F) module"
+       $(QUIET) m4 $(M4PARAM) -s $^ > $(@:.mod=.tmp)
+       $(QUIET) $(CHECKMODULE) -m $(@:.mod=.tmp) -o $@
+
+%.pp: tmp/%.mod %.fc
+       @echo "Creating $(NAME) $(@F) policy package"
+       $(QUIET) $(SEMOD_PKG) $@ $^
+
+########################################
+#
+# Clean the sources
+#
+clean:
+       rm -fR tmp
+       rm -f base.conf
+       rm -f *.pp
+       rm -f $(BASE_FC)
+
+.PHONY: default base modules clean
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/Rules.monolithic
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/Rules.monolithic       Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,196 @@
+########################################
+#
+# Rules and Targets for building monolithic policies
+#
+
+# install paths
+POLICYPATH = $(INSTALLDIR)/policy
+LOADPATH = $(POLICYPATH)/$(POLVER)
+FCPATH = $(CONTEXTPATH)/files/file_contexts
+HOMEDIRPATH = $(CONTEXTPATH)/files/homedir_template
+
+# for monolithic policy use all base and module to create policy
+ENABLEMOD := $(BASE_MODS) $(MOD_MODS)
+
+ALL_MODULES := $(filter $(ENABLEMOD),$(DETECTED_MODS))
+
+ALL_INTERFACES := $(ALL_MODULES:.te=.if)
+ALL_TE_FILES := $(ALL_MODULES)
+ALL_FC_FILES := $(ALL_MODULES:.te=.fc)
+
+PRE_TE_FILES := $(SECCLASS) $(ISIDS) $(AVS) $(M4SUPPORT) $(POLDIR)/mls $(POLDIR)/mcs
+POST_TE_FILES := $(POLDIR)/systemuser $(POLDIR)/users $(POLDIR)/constraints
+
+POLICY_SECTIONS := tmp/pre_te_files.conf tmp/generated_definitions.conf tmp/all_interfaces.conf tmp/all_attrs_types.conf $(GLOBALBOOL) $(GLOBALTUN) tmp/only_te_rules.conf tmp/all_post.conf
+
+########################################
+#
+# default action: build policy locally
+#
+default: policy
+
+policy: $(POLVER)
+
+install: $(LOADPATH) $(FCPATH) $(APPFILES) $(USERPATH)/local.users
+
+load: tmp/load
+
+########################################
+#
+# Build a binary policy locally
+#
+$(POLVER): policy.conf
+       @echo "Compiling $(NAME) $(POLVER)"
+ifneq ($(PV),$(KV))
+       @echo
+       @echo "WARNING: Policy version mismatch!  Is your OUTPUT_POLICY set 
correctly?"
+       @echo
+endif
+       $(QUIET) $(CHECKPOLICY) $^ -o $@
+
+########################################
+#
+# Install a binary policy
+#
+$(LOADPATH): policy.conf
+       @mkdir -p $(POLICYPATH)
+       @echo "Compiling and installing $(NAME) $(LOADPATH)"
+ifneq ($(PV),$(KV))
+       @echo
+       @echo "WARNING: Policy version mismatch!  Is your OUTPUT_POLICY set 
correctly?"
+       @echo
+endif
+       $(QUIET) $(CHECKPOLICY) $^ -o $@
+
+########################################
+#
+# Load the binary policy
+#
+reload tmp/load: $(LOADPATH) $(FCPATH)
+       @echo "Loading $(NAME) $(LOADPATH)"
+       $(QUIET) $(LOADPOLICY) -q $(LOADPATH)
+       @touch tmp/load
+
+########################################
+#
+# Construct a monolithic policy.conf
+#
+policy.conf: $(POLICY_SECTIONS)
+       @echo "Creating $(NAME) policy.conf"
+# checkpolicy can use the #line directives provided by -s for error reporting:
+       $(QUIET) m4 -D self_contained_policy $(M4PARAM) -s $^ > tmp/$@.tmp
+       $(QUIET) sed -e /^portcon/d -e /^nodecon/d -e /^netifcon/d < tmp/$@.tmp > $@
+
+tmp/pre_te_files.conf: $(PRE_TE_FILES)
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) cat $^ > $@
+
+tmp/generated_definitions.conf: $(ALL_LAYERS) $(ALL_TE_FILES)
+# per-userdomain templates:
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) echo "define(\`per_userdomain_templates',\`" > $@
+       $(QUIET) for i in $(patsubst %.te,%,$(notdir $(ALL_MODULES))); do \
+               echo "ifdef(\`""$$i""_per_userdomain_template',\`""$$i""_per_userdomain_template("'$$*'")')" \
+                       >> $@ ;\
+       done
+       $(QUIET) echo "')" >> $@
+# define foo.te
+       $(QUIET) for i in $(notdir $(ALL_MODULES)); do \
+               echo "define(\`$$i')" >> $@ ;\
+       done
+#      $(QUIET) $(SETTUN) $(BOOLEANS) >> $@
+
+tmp/all_interfaces.conf: $(M4SUPPORT) $(ALL_INTERFACES)
+ifeq ($(ALL_INTERFACES),)
+       $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) m4 $^ | sed -e s/dollarsstar/\$$\*/g > $@
+
+tmp/all_te_files.conf: $(ALL_TE_FILES)
+ifeq ($(ALL_TE_FILES),)
+       $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) cat $^ > $@
+
+tmp/post_te_files.conf: $(POST_TE_FILES)
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) cat $^ > $@
+
+# extract attributes and put them first. extract post te stuff
+# like genfscon and put last.  portcon, nodecon, and netifcon
+# is delayed since they are generated by m4
+tmp/all_attrs_types.conf tmp/only_te_rules.conf tmp/all_post.conf: tmp/all_te_files.conf tmp/post_te_files.conf
+       $(QUIET) grep ^attribute tmp/all_te_files.conf > tmp/all_attrs_types.conf || true
+       $(QUIET) grep '^type ' tmp/all_te_files.conf >> tmp/all_attrs_types.conf
+       $(QUIET) cat tmp/post_te_files.conf > tmp/all_post.conf
+       $(QUIET) grep '^sid ' tmp/all_te_files.conf >> tmp/all_post.conf || true
+       $(QUIET) egrep '^fs_use_(xattr|task|trans)' tmp/all_te_files.conf >> tmp/all_post.conf || true
+       $(QUIET) grep ^genfscon tmp/all_te_files.conf >> tmp/all_post.conf || true
+       $(QUIET) sed -r -e /^attribute/d -e '/^type /d' -e /^genfscon/d \
+                       -e '/^sid /d' -e '/^fs_use_(xattr|task|trans)/d' \
+                       < tmp/all_te_files.conf > tmp/only_te_rules.conf
+
+########################################
+#
+# Remove the dontaudit rules from the policy.conf
+#
+enableaudit: policy.conf
+       @test -d tmp || mkdir -p tmp
+       @echo "Removing dontaudit rules from policy.conf"
+       $(QUIET) grep -v dontaudit policy.conf > tmp/policy.audit
+       $(QUIET) mv tmp/policy.audit policy.conf
+
+########################################
+#
+# Construct file_contexts
+#
+$(FC): $(M4SUPPORT) tmp/generated_definitions.conf $(ALL_FC_FILES)
+ifeq ($(ALL_FC_FILES),)
+       $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf")
+endif
+       @echo "Creating $(NAME) file_contexts."
+       @test -d tmp || mkdir -p tmp
+       $(QUIET) m4 $(M4PARAM) $(M4SUPPORT) tmp/generated_definitions.conf $(ALL_FC_FILES) > tmp/$@.tmp
+#      $(QUIET) grep -e HOME -e ROLE tmp/$@.tmp > $(HOMEDIR_TEMPLATE)
+#      $(QUIET) sed -i -e /HOME/d -e /ROLE/d tmp/$@.tmp
+#      $(QUIET) $(FCSORT) tmp/$@.tmp $@
+       $(QUIET) touch $(HOMEDIR_TEMPLATE)
+       $(QUIET) touch $@
+
+########################################
+#
+# Install file_contexts
+#
+$(FCPATH): $(FC) $(LOADPATH) $(USERPATH)/system.users
+       @echo "Validating $(NAME) file_contexts."
+#      $(QUIET) $(SETFILES) -q -c $(LOADPATH) $(FC)
+       @echo "Installing file_contexts."
+       @mkdir -p $(CONTEXTPATH)/files
+       $(QUIET) install -m 644 $(FC) $(FCPATH)
+       $(QUIET) install -m 644 $(HOMEDIR_TEMPLATE) $(HOMEDIRPATH)
+#      $(QUIET) $(GENHOMEDIRCON) -d $(TOPDIR) -t $(NAME) $(USEPWD)
+
+########################################
+#
+# Run policy source checks
+#
+check: policy.conf $(FC)
+       $(SECHECK) -s --profile=development --policy=policy.conf --fcfile=$(FC) > $@.res
+
+longcheck: policy.conf $(FC)
+       $(SECHECK) -s --profile=all --policy=policy.conf --fcfile=$(FC) > $@.res
+
+########################################
+#
+# Clean the sources
+#
+clean:
+       rm -fR tmp
+       rm -f policy.conf
+       rm -f policy.$(PV)
+       rm -f $(FC)
+       rm -f *.res
+
+.PHONY: default policy install load reload enableaudit checklabels restorelabels relabel check longcheck clean
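
Stripped of the make machinery, the monolithic flow above is a two-step
pipeline; a minimal sketch, with the input file list abbreviated (the
version 20 matches OUTPUT_POLICY in the top-level Makefile):

    # 1. splice the policy sections together with m4
    m4 -D self_contained_policy -s <policy section files> > policy.conf
    # 2. compile the result into a binary policy blob
    checkpolicy -c 20 policy.conf -o policy.20
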
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/constraints
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/constraints     Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,27 @@
+
+#
+# Define the constraints
+#
+# constrain class_set perm_set expression ;
+#
+# expression : ( expression ) 
+#           | not expression
+#           | expression and expression
+#           | expression or expression
+#           | u1 op u2
+#           | r1 role_op r2
+#           | t1 op t2
+#           | u1 op names
+#           | u2 op names
+#           | r1 op names
+#           | r2 op names
+#           | t1 op names
+#           | t2 op names
+#
+# op : == | != 
+# role_op : == | != | eq | dom | domby | incomp
+#
+# names : name | { name_list }
+# name_list : name | name_list name            
+#
+
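
No constraints are defined yet. As a hypothetical example of the syntax
described above, the following would forbid domain transitions that
cross user identities:

    constrain domain transition ( u1 == u2 );
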
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/Makefile
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/Makefile  Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,41 @@
+# flask needs to know where to export the libselinux headers.
+LIBSEL ?= ../../libselinux
+
+# flask needs to know where to export the kernel headers.
+LINUXDIR ?= ../../../linux-2.6
+
+AWK = awk
+
+CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \
+          else if [ -x /bin/bash ]; then echo /bin/bash; \
+          else echo sh; fi ; fi)
+
+FLASK_H_DEPEND = security_classes initial_sids
+AV_H_DEPEND = access_vectors
+
+FLASK_H_FILES = class_to_string.h flask.h initial_sid_to_string.h
+AV_H_FILES = av_inherit.h common_perm_to_string.h av_perm_to_string.h av_permissions.h
+ALL_H_FILES = $(FLASK_H_FILES) $(AV_H_FILES)
+
+all:  $(ALL_H_FILES)
+
+$(FLASK_H_FILES): $(FLASK_H_DEPEND)
+       $(CONFIG_SHELL) mkflask.sh $(AWK) $(FLASK_H_DEPEND)
+
+$(AV_H_FILES): $(AV_H_DEPEND)
+       $(CONFIG_SHELL) mkaccess_vector.sh $(AWK) $(AV_H_DEPEND)
+
+tolib: all
+       install -m 644 flask.h av_permissions.h $(LIBSEL)/include/selinux
+       install -m 644 class_to_string.h av_inherit.h common_perm_to_string.h av_perm_to_string.h $(LIBSEL)/src
+
+tokern: all
+       install -m 644 $(ALL_H_FILES) $(LINUXDIR)/security/selinux/include
+
+install: all
+
+relabel:
+
+clean:  
+       rm -f $(FLASK_H_FILES)
+       rm -f $(AV_H_FILES)
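
Driven directly, the two generator scripts behave exactly as the rules
above invoke them; for instance, from this directory:

    sh mkflask.sh awk security_classes initial_sids
    sh mkaccess_vector.sh awk access_vectors
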
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/access_vectors
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/access_vectors    Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,166 @@
+#
+# Define common prefixes for access vectors
+#
+# common common_name { permission_name ... }
+
+#
+# Define a common prefix for file access vectors.
+#
+
+
+#
+# Define the access vectors.
+#
+# class class_name [ inherits common_name ] { permission_name ... }
+
+
+#
+# Define the access vector interpretation for file-related objects.
+#
+
+class xen
+{
+       scheduler
+       settime
+       tbufcontrol
+       readconsole
+       clearconsole
+       perfcontrol
+       mtrr_add
+       mtrr_del
+       mtrr_read
+       microcode
+       physinfo
+       quirk
+       writeconsole
+       readapic
+       writeapic
+       privprofile
+       nonprivprofile
+       kexec
+       firmware
+       sleep
+       frequency
+       getidle
+       debug
+       getcpuinfo
+       heap
+}
+
+class domain
+{
+       setvcpucontext
+       pause
+       unpause
+       resume
+       create
+       transition
+       max_vcpus
+       destroy
+       setvcpuaffinity
+       getvcpuaffinity
+       scheduler
+       getdomaininfo
+       getvcpuinfo
+       getvcpucontext
+       setdomainmaxmem
+       setdomainhandle
+       setdebugging
+       hypercall
+       settime
+       set_target
+       shutdown
+       setaddrsize
+       getaddrsize
+       trigger
+       getextvcpucontext
+       setextvcpucontext
+}
+
+class hvm
+{
+       sethvmc
+       gethvmc
+       setparam
+       getparam
+       pcilevel
+       irqlevel
+       pciroute
+       bind_irq
+       cacheattr
+}
+
+class event
+{
+       bind
+       send
+       status
+       notify
+       create
+       vector
+       reset
+}
+
+class grant
+{
+       map_read
+       map_write
+       unmap
+       transfer
+       setup
+       copy
+       query
+}
+
+class mmu
+{
+       map_read
+       map_write
+       pageinfo
+       pagelist
+       adjust
+       stat
+       translategp
+       updatemp
+       physmap
+       pinpage
+       mfnlist
+       memorymap
+}
+
+class shadow
+{
+       disable
+       enable
+       logdirty
+}
+
+class resource
+{
+       add
+       remove
+       use
+       add_irq
+       remove_irq
+       add_ioport
+       remove_ioport
+       add_iomem
+       remove_iomem
+       stat_device
+       add_device
+       remove_device
+}
+
+class security
+{
+       compute_av
+       compute_create
+       compute_member
+       check_context
+       load_policy
+       compute_relabel
+       compute_user
+       setenforce
+       setbool
+       setsecparam
+}
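
In the policy's .te files these classes and permissions are consumed by
allow rules; a hypothetical example (the type names are illustrative,
not part of this changeset):

    allow dom0_t domU_t:domain { create max_vcpus pause unpause destroy };
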
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/initial_sids
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/initial_sids      Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,17 @@
+# FLASK
+
+#
+# Define initial security identifiers 
+#
+sid xen
+sid dom0
+sid domU
+sid domio
+sid domxen
+sid unlabeled
+sid security
+sid ioport
+sid iomem
+sid pirq
+sid device
+# FLASK
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/mkaccess_vector.sh
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/mkaccess_vector.sh        Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,227 @@
+#!/bin/sh -
+#
+
+# FLASK
+
+set -e
+
+awk=$1
+shift
+
+# output files
+av_permissions="av_permissions.h"
+av_inherit="av_inherit.h"
+common_perm_to_string="common_perm_to_string.h"
+av_perm_to_string="av_perm_to_string.h"
+
+cat $* | $awk "
+BEGIN  {
+               outfile = \"$av_permissions\"
+               inheritfile = \"$av_inherit\"
+               cpermfile = \"$common_perm_to_string\"
+               avpermfile = \"$av_perm_to_string\"
+               "'
+               nextstate = "COMMON_OR_AV";
+               printf("/* This file is automatically generated.  Do not edit. 
*/\n") > outfile;
+               printf("/* This file is automatically generated.  Do not edit. 
*/\n") > inheritfile;
+               printf("/* This file is automatically generated.  Do not edit. 
*/\n") > cpermfile;
+               printf("/* This file is automatically generated.  Do not edit. 
*/\n") > avpermfile;
+;
+       }
+/^[ \t]*#/     { 
+                       next;
+               }
+$1 == "common" { 
+                       if (nextstate != "COMMON_OR_AV")
+                       {
+                               printf("Parse error:  Unexpected COMMON 
definition on line %d\n", NR);
+                               next;   
+                       }
+
+                       if ($2 in common_defined)
+                       {
+                               printf("Duplicate COMMON definition for %s on 
line %d.\n", $2, NR);
+                               next;
+                       }       
+                       common_defined[$2] = 1;
+
+                       tclass = $2;
+                       common_name = $2; 
+                       permission = 1;
+
+                       printf("TB_(common_%s_perm_to_string)\n", $2) > 
cpermfile;
+
+                       nextstate = "COMMON-OPENBRACKET";
+                       next;
+               }
+$1 == "class"  {
+                       if (nextstate != "COMMON_OR_AV" &&
+                           nextstate != "CLASS_OR_CLASS-OPENBRACKET")
+                       {
+                               printf("Parse error:  Unexpected class 
definition on line %d\n", NR);
+                               next;   
+                       }
+
+                       tclass = $2;
+
+                       if (tclass in av_defined)
+                       {
+                               printf("Duplicate access vector definition for 
%s on line %d\n", tclass, NR);
+                               next;
+                       } 
+                       av_defined[tclass] = 1;
+
+                       inherits = "";
+                       permission = 1;
+
+                       nextstate = "INHERITS_OR_CLASS-OPENBRACKET";
+                       next;
+               }
+$1 == "inherits" {                     
+                       if (nextstate != "INHERITS_OR_CLASS-OPENBRACKET")
+                       {
+                               printf("Parse error:  Unexpected INHERITS 
definition on line %d\n", NR);
+                               next;   
+                       }
+
+                       if (!($2 in common_defined))
+                       {
+                               printf("COMMON %s is not defined (line %d).\n", 
$2, NR);
+                               next;
+                       }
+
+                       inherits = $2;
+                       permission = common_base[$2];
+
+                       for (combined in common_perms)
+                       {
+                               split(combined,separate, SUBSEP);
+                               if (separate[1] == inherits)
+                               {
+                                       inherited_perms[common_perms[combined]] = separate[2];
+                               }
+                       }
+
+                        j = 1;
+                        for (i in inherited_perms) {
+                            ind[j] = i + 0;
+                            j++;
+                        }
+                        n = asort(ind);
+                       for (i = 1; i <= n; i++) {
+                               perm = inherited_perms[ind[i]];
+                               printf("#define %s__%s", toupper(tclass), 
toupper(perm)) > outfile; 
+                               spaces = 40 - (length(perm) + length(tclass));
+                               if (spaces < 1)
+                                     spaces = 1;
+                               for (j = 0; j < spaces; j++) 
+                                       printf(" ") > outfile; 
+                               printf("0x%08xUL\n", ind[i]) > outfile; 
+                       }
+                       printf("\n") > outfile;
+                        for (i in ind) delete ind[i];
+                        for (i in inherited_perms) delete inherited_perms[i];
+
+                       printf("   S_(SECCLASS_%s, %s, 0x%08xUL)\n", 
toupper(tclass), inherits, permission) > inheritfile; 
+
+                       nextstate = "CLASS_OR_CLASS-OPENBRACKET";
+                       next;
+               }
+$1 == "{"      { 
+                       if (nextstate != "INHERITS_OR_CLASS-OPENBRACKET" &&
+                           nextstate != "CLASS_OR_CLASS-OPENBRACKET" &&
+                           nextstate != "COMMON-OPENBRACKET")
+                       {
+                               printf("Parse error:  Unexpected { on line 
%d\n", NR);
+                               next;
+                       }
+
+                       if (nextstate == "INHERITS_OR_CLASS-OPENBRACKET")
+                               nextstate = "CLASS-CLOSEBRACKET";
+
+                       if (nextstate == "CLASS_OR_CLASS-OPENBRACKET")
+                               nextstate = "CLASS-CLOSEBRACKET";
+
+                       if (nextstate == "COMMON-OPENBRACKET")
+                               nextstate = "COMMON-CLOSEBRACKET";
+               }
+/[a-z][a-z_]*/ {
+                       if (nextstate != "COMMON-CLOSEBRACKET" &&
+                           nextstate != "CLASS-CLOSEBRACKET")
+                       {
+                               printf("Parse error:  Unexpected symbol %s on 
line %d\n", $1, NR);              
+                               next;
+                       }
+
+                       if (nextstate == "COMMON-CLOSEBRACKET")
+                       {
+                               if ((common_name,$1) in common_perms)
+                               {
+                                       printf("Duplicate permission %s for 
common %s on line %d.\n", $1, common_name, NR);
+                                       next;
+                               }
+
+                               common_perms[common_name,$1] = permission;
+
+                               printf("#define COMMON_%s__%s", 
toupper(common_name), toupper($1)) > outfile; 
+
+                               printf("    S_(\"%s\")\n", $1) > cpermfile;
+                       }
+                       else
+                       {
+                               if ((tclass,$1) in av_perms)
+                               {
+                                       printf("Duplicate permission %s for %s 
on line %d.\n", $1, tclass, NR);
+                                       next;
+                               }
+
+                               av_perms[tclass,$1] = permission;
+               
+                               if (inherits != "")
+                               {
+                                       if ((inherits,$1) in common_perms)
+                                       {
+                                               printf("Permission %s in %s on 
line %d conflicts with common permission.\n", $1, tclass, inherits, NR);
+                                               next;
+                                       }
+                               }
+
+                               printf("#define %s__%s", toupper(tclass), 
toupper($1)) > outfile; 
+
+                               printf("   S_(SECCLASS_%s, %s__%s, \"%s\")\n", 
toupper(tclass), toupper(tclass), toupper($1), $1) > avpermfile; 
+                       }
+
+                       spaces = 40 - (length($1) + length(tclass));
+                       if (spaces < 1)
+                             spaces = 1;
+
+                       for (i = 0; i < spaces; i++) 
+                               printf(" ") > outfile; 
+                       printf("0x%08xUL\n", permission) > outfile; 
+                       permission = permission * 2;
+               }
+$1 == "}"      {
+                       if (nextstate != "CLASS-CLOSEBRACKET" && 
+                           nextstate != "COMMON-CLOSEBRACKET")
+                       {
+                               printf("Parse error:  Unexpected } on line 
%d\n", NR);
+                               next;
+                       }
+
+                       if (nextstate == "COMMON-CLOSEBRACKET")
+                       {
+                               common_base[common_name] = permission;
+                               printf("TE_(common_%s_perm_to_string)\n\n", 
common_name) > cpermfile; 
+                       }
+
+                       printf("\n") > outfile;
+
+                       nextstate = "COMMON_OR_AV";
+               }
+END    {
+               if (nextstate != "COMMON_OR_AV" && nextstate != 
"CLASS_OR_CLASS-OPENBRACKET")
+                       printf("Parse error:  Unexpected end of file\n");
+
+       }'
+
+# FLASK
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/mkflask.sh
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/mkflask.sh        Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,95 @@
+#!/bin/sh -
+#
+
+# FLASK
+
+set -e
+
+awk=$1
+shift 1
+
+# output file
+output_file="flask.h"
+debug_file="class_to_string.h"
+debug_file2="initial_sid_to_string.h"
+
+cat $* | $awk "
+BEGIN  {
+               outfile = \"$output_file\"
+               debugfile = \"$debug_file\"
+               debugfile2 = \"$debug_file2\"
+               "'
+               nextstate = "CLASS";
+
+               printf("/* This file is automatically generated.  Do not edit. 
*/\n") > outfile;
+
+               printf("#ifndef _SELINUX_FLASK_H_\n") > outfile;
+               printf("#define _SELINUX_FLASK_H_\n") > outfile;
+               printf("\n/*\n * Security object class definitions\n */\n") > 
outfile;
+               printf("/* This file is automatically generated.  Do not edit. 
*/\n") > debugfile;
+               printf("/*\n * Security object class definitions\n */\n") > 
debugfile;
+               printf("    S_(\"null\")\n") > debugfile;
+               printf("/* This file is automatically generated.  Do not edit. 
*/\n") > debugfile2;
+               printf("static char *initial_sid_to_string[] =\n{\n") > 
debugfile2;
+               printf("    \"null\",\n") > debugfile2;
+       }
+/^[ \t]*#/     { 
+                       next;
+               }
+$1 == "class"  { 
+                       if (nextstate != "CLASS")
+                       {
+                               printf("Parse error:  Unexpected class 
definition on line %d\n", NR);
+                               next;   
+                       }
+
+                       if ($2 in class_found)
+                       {
+                               printf("Duplicate class definition for %s on 
line %d.\n", $2, NR);
+                               next;
+                       }       
+                       class_found[$2] = 1;
+
+                       class_value++;
+
+                       printf("#define SECCLASS_%s", toupper($2)) > outfile;
+                       for (i = 0; i < 40 - length($2); i++) 
+                               printf(" ") > outfile; 
+                       printf("%d\n", class_value) > outfile; 
+
+                       printf("    S_(\"%s\")\n", $2) > debugfile;
+               }
+$1 == "sid"    { 
+                       if (nextstate == "CLASS")
+                       {
+                           nextstate = "SID";
+                           printf("\n/*\n * Security identifier indices for 
initial entities\n */\n") > outfile;                           
+                       }
+
+                       if ($2 in sid_found)
+                       {
+                               printf("Duplicate SID definition for %s on line 
%d.\n", $2, NR);
+                               next;
+                       }       
+                       sid_found[$2] = 1;
+                       sid_value++;
+
+                       printf("#define SECINITSID_%s", toupper($2)) > outfile;
+                       for (i = 0; i < 37 - length($2); i++) 
+                               printf(" ") > outfile; 
+                       printf("%d\n", sid_value) > outfile; 
+                       printf("    \"%s\",\n", $2) > debugfile2;
+               }
+END    {
+               if (nextstate != "SID")
+                       printf("Parse error:  Unexpected end of file\n");
+
+               printf("\n#define SECINITSID_NUM") > outfile;
+               for (i = 0; i < 34; i++) 
+                       printf(" ") > outfile; 
+               printf("%d\n", sid_value) > outfile; 
+               printf("\n#endif\n") > outfile;
+               printf("};\n\n") > debugfile2;
+       }'
+
+# FLASK
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/security_classes
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/flask/security_classes  Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,20 @@
+# FLASK
+
+#
+# Define the security object classes 
+#
+
+# Classes marked as userspace are classes
+# for userspace object managers
+
+class xen
+class domain
+class hvm
+class mmu
+class resource
+class shadow
+class event
+class grant
+class security
+
+# FLASK
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/global_booleans
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/global_booleans Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,5 @@
+#
+# This file is for the declaration of global booleans.
+# To change the default value at build time, the booleans.conf
+# file should be used.
+#
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/global_tunables
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/global_tunables Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,6 @@
+#
+# This file is for the declaration of global tunables.
+# To change the default value at build time, the booleans.conf
+# file should be used.
+#
+
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/mcs
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/mcs     Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,324 @@
+ifdef(`enable_mcs',`
+#
+# Define sensitivities 
+#
+# Each sensitivity has a name and zero or more aliases.
+#
+# MCS is single-sensitivity.
+#
+sensitivity s0;
+
+#
+# Define the ordering of the sensitivity levels (least to greatest)
+#
+dominance { s0 }
+
+
+#
+# Define the categories
+#
+# Each category has a name and zero or more aliases.
+#
+category c0;
+category c1;
+category c2;
+category c3;
+category c4;
+category c5;
+category c6;
+category c7;
+category c8;
+category c9;
+category c10;
+category c11;
+category c12;
+category c13;
+category c14;
+category c15;
+category c16;
+category c17;
+category c18;
+category c19;
+category c20;
+category c21;
+category c22;
+category c23;
+category c24;
+category c25;
+category c26;
+category c27;
+category c28;
+category c29;
+category c30;
+category c31;
+category c32;
+category c33;
+category c34;
+category c35;
+category c36;
+category c37;
+category c38;
+category c39;
+category c40;
+category c41;
+category c42;
+category c43;
+category c44;
+category c45;
+category c46;
+category c47;
+category c48;
+category c49;
+category c50;
+category c51;
+category c52;
+category c53;
+category c54;
+category c55;
+category c56;
+category c57;
+category c58;
+category c59;
+category c60;
+category c61;
+category c62;
+category c63;
+category c64;
+category c65;
+category c66;
+category c67;
+category c68;
+category c69;
+category c70;
+category c71;
+category c72;
+category c73;
+category c74;
+category c75;
+category c76;
+category c77;
+category c78;
+category c79;
+category c80;
+category c81;
+category c82;
+category c83;
+category c84;
+category c85;
+category c86;
+category c87;
+category c88;
+category c89;
+category c90;
+category c91;
+category c92;
+category c93;
+category c94;
+category c95;
+category c96;
+category c97;
+category c98;
+category c99;
+category c100;
+category c101;
+category c102;
+category c103;
+category c104;
+category c105;
+category c106;
+category c107;
+category c108;
+category c109;
+category c110;
+category c111;
+category c112;
+category c113;
+category c114;
+category c115;
+category c116;
+category c117;
+category c118;
+category c119;
+category c120;
+category c121;
+category c122;
+category c123;
+category c124;
+category c125;
+category c126;
+category c127;
+category c128;
+category c129;
+category c130;
+category c131;
+category c132;
+category c133;
+category c134;
+category c135;
+category c136;
+category c137;
+category c138;
+category c139;
+category c140;
+category c141;
+category c142;
+category c143;
+category c144;
+category c145;
+category c146;
+category c147;
+category c148;
+category c149;
+category c150;
+category c151;
+category c152;
+category c153;
+category c154;
+category c155;
+category c156;
+category c157;
+category c158;
+category c159;
+category c160;
+category c161;
+category c162;
+category c163;
+category c164;
+category c165;
+category c166;
+category c167;
+category c168;
+category c169;
+category c170;
+category c171;
+category c172;
+category c173;
+category c174;
+category c175;
+category c176;
+category c177;
+category c178;
+category c179;
+category c180;
+category c181;
+category c182;
+category c183;
+category c184;
+category c185;
+category c186;
+category c187;
+category c188;
+category c189;
+category c190;
+category c191;
+category c192;
+category c193;
+category c194;
+category c195;
+category c196;
+category c197;
+category c198;
+category c199;
+category c200;
+category c201;
+category c202;
+category c203;
+category c204;
+category c205;
+category c206;
+category c207;
+category c208;
+category c209;
+category c210;
+category c211;
+category c212;
+category c213;
+category c214;
+category c215;
+category c216;
+category c217;
+category c218;
+category c219;
+category c220;
+category c221;
+category c222;
+category c223;
+category c224;
+category c225;
+category c226;
+category c227;
+category c228;
+category c229;
+category c230;
+category c231;
+category c232;
+category c233;
+category c234;
+category c235;
+category c236;
+category c237;
+category c238;
+category c239;
+category c240;
+category c241;
+category c242;
+category c243;
+category c244;
+category c245;
+category c246;
+category c247;
+category c248;
+category c249;
+category c250;
+category c251;
+category c252;
+category c253;
+category c254;
+category c255;
+
+
+#
+# Each MCS level specifies a sensitivity and zero or more categories which may
+# be associated with that sensitivity.
+#
+level s0:c0.c255;
+
+#
+# Define the MCS policy
+#
+# mlsconstrain class_set perm_set expression ;
+#
+# mlsvalidatetrans class_set expression ;
+#
+# expression : ( expression )
+#           | not expression
+#           | expression and expression
+#           | expression or expression
+#           | u1 op u2
+#           | r1 role_mls_op r2
+#           | t1 op t2
+#           | l1 role_mls_op l2
+#           | l1 role_mls_op h2
+#           | h1 role_mls_op l2
+#           | h1 role_mls_op h2
+#           | l1 role_mls_op h1
+#           | l2 role_mls_op h2
+#           | u1 op names
+#           | u2 op names
+#           | r1 op names
+#           | r2 op names
+#           | t1 op names
+#           | t2 op names
+#           | u3 op names (NOTE: this is only available for mlsvalidatetrans)
+#           | r3 op names (NOTE: this is only available for mlsvalidatetrans)
+#           | t3 op names (NOTE: this is only available for mlsvalidatetrans)
+#
+# op : == | !=
+# role_mls_op : == | != | eq | dom | domby | incomp
+#
+# names : name | { name_list }
+# name_list : name | name_list name
+#
+
+
+') dnl end enable_mcs
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/mls
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/mls     Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,354 @@
+
+ifdef(`enable_mls',`
+#
+# Define sensitivities 
+#
+# Each sensitivity has a name and zero or more aliases.
+#
+sensitivity s0;
+sensitivity s1;
+sensitivity s2;
+sensitivity s3;
+sensitivity s4;
+sensitivity s5;
+sensitivity s6;
+sensitivity s7;
+sensitivity s8;
+sensitivity s9;
+sensitivity s10;
+sensitivity s11;
+sensitivity s12;
+sensitivity s13;
+sensitivity s14;
+sensitivity s15;
+
+#
+# Define the ordering of the sensitivity levels (least to greatest)
+#
+dominance { s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 }
+
+
+#
+# Define the categories
+#
+# Each category has a name and zero or more aliases.
+#
+category c0;
+category c1;
+category c2;
+category c3;
+category c4;
+category c5;
+category c6;
+category c7;
+category c8;
+category c9;
+category c10;
+category c11;
+category c12;
+category c13;
+category c14;
+category c15;
+category c16;
+category c17;
+category c18;
+category c19;
+category c20;
+category c21;
+category c22;
+category c23;
+category c24;
+category c25;
+category c26;
+category c27;
+category c28;
+category c29;
+category c30;
+category c31;
+category c32;
+category c33;
+category c34;
+category c35;
+category c36;
+category c37;
+category c38;
+category c39;
+category c40;
+category c41;
+category c42;
+category c43;
+category c44;
+category c45;
+category c46;
+category c47;
+category c48;
+category c49;
+category c50;
+category c51;
+category c52;
+category c53;
+category c54;
+category c55;
+category c56;
+category c57;
+category c58;
+category c59;
+category c60;
+category c61;
+category c62;
+category c63;
+category c64;
+category c65;
+category c66;
+category c67;
+category c68;
+category c69;
+category c70;
+category c71;
+category c72;
+category c73;
+category c74;
+category c75;
+category c76;
+category c77;
+category c78;
+category c79;
+category c80;
+category c81;
+category c82;
+category c83;
+category c84;
+category c85;
+category c86;
+category c87;
+category c88;
+category c89;
+category c90;
+category c91;
+category c92;
+category c93;
+category c94;
+category c95;
+category c96;
+category c97;
+category c98;
+category c99;
+category c100;
+category c101;
+category c102;
+category c103;
+category c104;
+category c105;
+category c106;
+category c107;
+category c108;
+category c109;
+category c110;
+category c111;
+category c112;
+category c113;
+category c114;
+category c115;
+category c116;
+category c117;
+category c118;
+category c119;
+category c120;
+category c121;
+category c122;
+category c123;
+category c124;
+category c125;
+category c126;
+category c127;
+category c128;
+category c129;
+category c130;
+category c131;
+category c132;
+category c133;
+category c134;
+category c135;
+category c136;
+category c137;
+category c138;
+category c139;
+category c140;
+category c141;
+category c142;
+category c143;
+category c144;
+category c145;
+category c146;
+category c147;
+category c148;
+category c149;
+category c150;
+category c151;
+category c152;
+category c153;
+category c154;
+category c155;
+category c156;
+category c157;
+category c158;
+category c159;
+category c160;
+category c161;
+category c162;
+category c163;
+category c164;
+category c165;
+category c166;
+category c167;
+category c168;
+category c169;
+category c170;
+category c171;
+category c172;
+category c173;
+category c174;
+category c175;
+category c176;
+category c177;
+category c178;
+category c179;
+category c180;
+category c181;
+category c182;
+category c183;
+category c184;
+category c185;
+category c186;
+category c187;
+category c188;
+category c189;
+category c190;
+category c191;
+category c192;
+category c193;
+category c194;
+category c195;
+category c196;
+category c197;
+category c198;
+category c199;
+category c200;
+category c201;
+category c202;
+category c203;
+category c204;
+category c205;
+category c206;
+category c207;
+category c208;
+category c209;
+category c210;
+category c211;
+category c212;
+category c213;
+category c214;
+category c215;
+category c216;
+category c217;
+category c218;
+category c219;
+category c220;
+category c221;
+category c222;
+category c223;
+category c224;
+category c225;
+category c226;
+category c227;
+category c228;
+category c229;
+category c230;
+category c231;
+category c232;
+category c233;
+category c234;
+category c235;
+category c236;
+category c237;
+category c238;
+category c239;
+category c240;
+category c241;
+category c242;
+category c243;
+category c244;
+category c245;
+category c246;
+category c247;
+category c248;
+category c249;
+category c250;
+category c251;
+category c252;
+category c253;
+category c254;
+category c255;
+
+
+#
+# Each MLS level specifies a sensitivity and zero or more categories which may
+# be associated with that sensitivity.
+#
+level s0:c0.c255;
+level s1:c0.c255;
+level s2:c0.c255;
+level s3:c0.c255;
+level s4:c0.c255;
+level s5:c0.c255;
+level s6:c0.c255;
+level s7:c0.c255;
+level s8:c0.c255;
+level s9:c0.c255;
+level s10:c0.c255;
+level s11:c0.c255;
+level s12:c0.c255;
+level s13:c0.c255;
+level s14:c0.c255;
+level s15:c0.c255;
+
+
+#
+# Define the MLS policy
+#
+# mlsconstrain class_set perm_set expression ;
+#
+# mlsvalidatetrans class_set expression ;
+#
+# expression : ( expression )
+#           | not expression
+#           | expression and expression
+#           | expression or expression
+#           | u1 op u2
+#           | r1 role_mls_op r2
+#           | t1 op t2
+#           | l1 role_mls_op l2
+#           | l1 role_mls_op h2
+#           | h1 role_mls_op l2
+#           | h1 role_mls_op h2
+#           | l1 role_mls_op h1
+#           | l2 role_mls_op h2
+#           | u1 op names
+#           | u2 op names
+#           | r1 op names
+#           | r2 op names
+#           | t1 op names
+#           | t2 op names
+#           | u3 op names (NOTE: this is only available for mlsvalidatetrans)
+#           | r3 op names (NOTE: this is only available for mlsvalidatetrans)
+#           | t3 op names (NOTE: this is only available for mlsvalidatetrans)
+#
+# op : == | !=
+# role_mls_op : == | != | eq | dom | domby | incomp
+#
+# names : name | { name_list }
+# name_list : name | name_list name
+#
+
+
+') dnl end enable_mls
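
The dom, domby and incomp operators in the grammar above compare two levels by sensitivity order and category set. A minimal C sketch of that comparison, with illustrative types that are not part of this patch:

    #include <stdbool.h>
    #include <stdint.h>

    struct mls_level {
        unsigned int sens;    /* position in the dominance ordering */
        uint64_t cats[4];     /* bitmap of the categories c0..c255 */
    };

    /* l1 dom l2 iff l1's sensitivity is at least l2's and l1's
     * category set contains l2's; domby is the reverse, and incomp
     * means neither level dominates the other. */
    static bool level_dominates(const struct mls_level *l1,
                                const struct mls_level *l2)
    {
        int i;

        if (l1->sens < l2->sens)
            return false;
        for (i = 0; i < 4; i++)
            if (l2->cats[i] & ~l1->cats[i])
                return false;
        return true;
    }
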
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/modules.conf
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/modules.conf    Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,21 @@
+#
+# This file contains a listing of available modules.
+# To prevent a module from being used in policy
+# creation, set the module name to "off".
+#
+# For monolithic policies, modules set to "base" and "module"
+# will be built into the policy.
+#
+# For modular policies, modules set to "base" will be
+# included in the base module.  "module" will be compiled
+# as individual loadable modules.
+#
+
+# Layer: xen
+# Module: xen
+# Required in base
+#
+# Policy for xen.
+# 
+xen = base
+
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/modules/xen/xen.if
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/modules/xen/xen.if      Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,1 @@
+#
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/modules/xen/xen.te
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/modules/xen/xen.te      Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,135 @@
+attribute xen_type;
+attribute domain_type;
+attribute resource_type;
+attribute event_type;
+
+type xen_t, xen_type, domain_type;
+
+type dom0_t, domain_type;
+
+type domio_t, domain_type;
+
+type domxen_t, domain_type;
+
+type unlabeled_t, domain_type;
+
+type security_t, domain_type;
+
+type pirq_t, resource_type;
+type ioport_t, resource_type;
+type iomem_t, resource_type;
+type device_t, resource_type;
+
+################################################################################
+#
+# create_domain(priv_dom, domain, channel)
+#
+################################################################################
+define(`create_domain', `
+       type $2, domain_type;
+       allow $1 $2:domain {create max_vcpus setdomainmaxmem 
+                               setaddrsize getdomaininfo hypercall 
+                               setvcpucontext scheduler unpause 
+                               getvcpuinfo getaddrsize getvcpuaffinity};
+       allow $1 $2:shadow {enable};
+       allow $1 $2:mmu {map_read map_write memorymap adjust pinpage};
+       allow $2 $2:mmu {map_read map_write pinpage};
+       allow $2 domio_t:mmu {map_read};
+       allow $2 $2:grant {query setup};
+       allow $1 $2:grant {map_read unmap};
+       allow $1 $3:event {create};
+')
+
+################################################################################
+#
+# manage_domain(priv_dom, domain)
+#
+################################################################################
+define(`manage_domain', `
+       allow $1 $2:domain {pause destroy};
+')
+
+################################################################################
+#
+# create_channel(caller, peer, channel)
+#
+################################################################################
+define(`create_channel', `
+       type $3, event_type;
+       type_transition $1 $2:event $3;
+       allow $1 $3:event {create};
+       allow $3 $2:event {bind};
+')
+
+################################################################################
+#
+# Boot the hypervisor and dom0
+#
+################################################################################
+allow dom0_t xen_t:xen {kexec readapic writeapic mtrr_read mtrr_add mtrr_del 
+scheduler physinfo heap quirk readconsole writeconsole settime microcode};
+
+allow dom0_t domio_t:mmu {map_read map_write};
+allow dom0_t iomem_t:mmu {map_read map_write};
+allow dom0_t pirq_t:event {vector};
+allow dom0_t xen_t:mmu {memorymap};
+
+allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust};
+allow dom0_t dom0_t:grant {query setup};
+allow dom0_t dom0_t:domain {scheduler getdomaininfo getvcpuinfo getvcpuaffinity};
+
+allow xen_t dom0_t:domain {create};
+allow xen_t dom0_t:resource {add remove};
+allow xen_t ioport_t:resource {add_ioport remove_ioport};
+allow dom0_t ioport_t:resource {use};
+allow xen_t iomem_t:resource {add_iomem remove_iomem};
+allow dom0_t iomem_t:resource {use};
+allow xen_t pirq_t:resource {add_irq remove_irq};
+allow dom0_t pirq_t:resource {use};
+
+allow dom0_t security_t:security {compute_av compute_create compute_member 
+check_context load_policy compute_relabel compute_user setenforce setbool
+setsecparam};
+
+create_channel(dom0_t, dom0_t, evchn0-0_t)
+allow dom0_t evchn0-0_t:event {send};
+
+################################################################################
+#
+# Create and manage a domU w/ dom0 IO
+#
+################################################################################
+create_domain(dom0_t, domU_t, evchnU-0_t)
+
+create_channel(domU_t, domU_t, evchnU-U_t)
+allow domU_t evchnU-U_t:event {send};
+
+create_channel(dom0_t, domU_t, evchn0-U_t)
+allow dom0_t evchn0-U_t:event {send};
+
+create_channel(domU_t, dom0_t, evchnU-0_t)
+allow domU_t evchnU-0_t:event {send};
+
+manage_domain(dom0_t, domU_t)
+
+################################################################################
+#
+#
+#
+################################################################################
+sid xen gen_context(system_u:system_r:xen_t,s0)
+sid dom0 gen_context(system_u:system_r:dom0_t,s0)
+sid domU gen_context(system_u:system_r:domU_t,s0)
+sid domxen gen_context(system_u:system_r:domxen_t,s0)
+sid domio gen_context(system_u:system_r:domio_t,s0)
+sid unlabeled gen_context(system_u:system_r:unlabeled_t,s0)
+sid security gen_context(system_u:system_r:security_t,s0)
+sid pirq gen_context(system_u:object_r:pirq_t,s0)
+sid iomem gen_context(system_u:object_r:iomem_t,s0)
+sid ioport gen_context(system_u:object_r:ioport_t,s0)
+sid device gen_context(system_u:object_r:device_t,s0)
+
+role system_r types { xen_type domain_type };
+role user_r types { xen_type domain_type };
+role sysadm_r types { xen_type domain_type };
+role staff_r types { xen_type domain_type };
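
When neither enable_mls nor enable_mcs is defined, gen_context() (defined in misc_macros.spt further down) expands to just its first argument, so each initial SID above gets a plain user:role:type context. An illustrative C table of that mapping, not part of the patch:

    /* Illustrative only: initial SID names from above paired with the
     * contexts gen_context() yields in a non-MLS/MCS build. */
    static const char *initial_sid_context[][2] = {
        { "xen",       "system_u:system_r:xen_t"       },
        { "dom0",      "system_u:system_r:dom0_t"      },
        { "domU",      "system_u:system_r:domU_t"      },
        { "unlabeled", "system_u:system_r:unlabeled_t" },
        { "ioport",    "system_u:object_r:ioport_t"    },
    };
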
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/support/loadable_module.spt
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/support/loadable_module.spt     Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,166 @@
+########################################
+#
+# Macros for switching between source policy
+# and loadable policy module support
+#
+
+##############################
+#
+# For adding the module statement
+#
+define(`policy_module',`
+       ifdef(`self_contained_policy',`',`
+               module $1 $2;
+
+               require {
+                       role system_r;
+                       all_kernel_class_perms
+               }
+       ')
+')
+
+##############################
+#
+# For use in interfaces, to optionally insert a require block
+#
+define(`gen_require',`
+       ifdef(`self_contained_policy',`',`
+               define(`in_gen_require_block')
+               require {
+                       $1
+               }
+               undefine(`in_gen_require_block')
+       ')
+')
+
+##############################
+#
+# In the future interfaces should be in loadable modules
+#
+# template(name,rules)
+#
+define(`template',`
+       `define(`$1',`
+##### begin $1(dollarsstar)
+               $2
+##### end $1(dollarsstar)
+       '')
+')
+
+# helper function, since m4 won't expand macros
+# if a line is a comment (#):
+define(`policy_m4_comment',`dnl
+##### $2 depth: $1
+')dnl
+
+##############################
+#
+# In the future interfaces should be in loadable modules
+#
+# interface(name,rules)
+#
+define(`interface',`
+       `define(`$1',`
+
+       define(`policy_temp',incr(policy_call_depth))
+       pushdef(`policy_call_depth',policy_temp)
+       undefine(`policy_temp')
+
+       policy_m4_comment(policy_call_depth,begin `$1'(dollarsstar))
+
+       $2
+
+       define(`policy_temp',decr(policy_call_depth))
+       pushdef(`policy_call_depth',policy_temp)
+       undefine(`policy_temp')
+
+       policy_m4_comment(policy_call_depth,end `$1'(dollarsstar))
+
+       '')
+')
+
+define(`policy_call_depth',0)
+
+##############################
+#
+# Optional policy handling
+#
+define(`optional_policy',`
+       ifdef(`self_contained_policy',`
+               ifdef(`$1',`$2',`$3')
+       ',`
+               optional {
+                       $2
+               ifelse(`$3',`',`',`
+               } else {
+                       $3
+               ')
+               }
+       ')
+')
+
+##############################
+#
+# Determine if we should use the default
+# tunable value as specified by the policy
+# or if the override value should be used
+#
+define(`dflt_or_overr',`ifdef(`$1',$1,$2)')
+
+##############################
+#
+# Extract booleans out of an expression.
+# This needs to be reworked so expressions
+# with parentheses can work.
+
+define(`delcare_required_symbols',`
+ifelse(regexp($1, `\w'), -1, `', `dnl
+bool regexp($1, `\(\w+\)', `\1');
+delcare_required_symbols(regexp($1, `\w+\(.*\)', `\1'))dnl
+') dnl
+')
+
+##############################
+#
+# Tunable declaration
+#
+define(`gen_tunable',`
+       ifdef(`self_contained_policy',`
+               bool $1 dflt_or_overr(`$1'_conf,$2);
+       ',`
+               # loadable module tunable
+               # declaration will go here
+               # instead of bool when
+               # loadable modules support
+               # tunables
+               bool $1 dflt_or_overr(`$1'_conf,$2);
+       ')
+')
+
+##############################
+#
+# Tunable policy handling
+#
+define(`tunable_policy',`
+       ifdef(`self_contained_policy',`
+               if (`$1') {
+                       $2
+               } else {
+                       $3
+               }
+       ',`
+               # structure for tunables
+               # will go here instead of a
+               # conditional when loadable
+               # modules support tunables
+               gen_require(`
+                       delcare_required_symbols(`$1')
+               ')
+
+               if (`$1') {
+                       $2
+               } else {
+                       $3
+               }
+       ')
+')
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/support/misc_macros.spt
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/support/misc_macros.spt Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,32 @@
+
+########################################
+#
+# Helper macros
+#
+
+#
+# shiftn(num,list...)
+#
+# shift the list num times
+#
+define(`shiftn',`ifelse($1,0,`shift($*)',`shiftn(decr($1),shift(shift($*)))')')
+
+########################################
+#
+# gen_user(username, role_set, mls_defaultlevel, mls_range, [mcs_categories])
+#
+define(`gen_user',`user $1 roles { $2 }`'ifdef(`enable_mls', ` level $3 range $4')`'ifdef(`enable_mcs',` level s0 range s0`'ifelse(`$5',,,` - s0:$5')');')
+
+########################################
+#
+# gen_context(context,mls_sensitivity,[mcs_categories])
+#
+define(`gen_context',`$1`'ifdef(`enable_mls',`:$2')`'ifdef(`enable_mcs',`:s0`'ifelse(`$3',,,`:$3')')') dnl
+
+########################################
+#
+# gen_bool(name,default_value)
+#
+define(`gen_bool',`
+       bool $1 dflt_or_overr(`$1'_conf,$2);
+')
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/systemuser
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/systemuser      Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,19 @@
+##################################
+#
+# System User configuration.
+#
+
+#
+# gen_user(username, role_set, mls_defaultlevel, mls_range, [mcs_categories])
+#
+
+#
+# system_u is the user identity for system processes and objects.
+# There should be no corresponding Unix user identity for system,
+# and a user process should never be assigned the system user
+# identity.
+#
+gen_user(system_u, system_r, s0, s0 - s9:c0.c127, c0.c127)
+
+# Normal users should not be added to this file,
+# but instead added to the users file.
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/users
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/flask/policy/policy/users   Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,39 @@
+
+##################################
+#
+# Core User configuration.
+#
+
+#
+# gen_user(username, role_set, mls_defaultlevel, mls_range, [mcs_categories])
+#
+
+#
+# user_u is a generic user identity for Linux users who have no
+# SELinux user identity defined.  The modified daemons will use
+# this user identity in the security context if there is no matching
+# SELinux user identity for a Linux user.  If you do not want to
+# permit any access to such users, then remove this entry.
+#
+ifdef(`targeted_policy',`
+gen_user(user_u, user_r sysadm_r system_r, s0, s0 - s9:c0.c127)
+',`
+gen_user(user_u, user_r, s0, s0 - s9:c0.c127)
+')
+
+#
+# The following users correspond to Unix identities.
+# These identities are typically assigned as the user attribute
+# when login starts the user shell.  Users with access to the sysadm_r
+# role should use the staff_r role instead of the user_r role when
+# not in the sysadm_r.
+#
+ifdef(`targeted_policy',`
+       gen_user(root, user_r sysadm_r system_r, s0, s0 - s9:c0.c127, c0.c127)
+',`
+       ifdef(`direct_sysadm_daemon',`
+               gen_user(root, sysadm_r staff_r system_r, s0, s0 - s9:c0.c127, c0.c127)
+       ',`
+               gen_user(root, sysadm_r staff_r, s0, s0 - s9:c0.c127, c0.c127)
+       ')
+')
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/cirrus_vga.c
--- a/tools/ioemu/hw/cirrus_vga.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/cirrus_vga.c       Fri Sep 12 14:47:40 2008 +0900
@@ -2554,6 +2554,9 @@ static void set_vram_mapping(CirrusVGASt
 
     fprintf(logfile,"mapping vram to %lx - %lx\n", begin, end);
 
+    if (!s->vram_mfns)
+        return;
+
     xatp.domid = domid;
     xatp.space = XENMAPSPACE_mfn;
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pass-through.c
--- a/tools/ioemu/hw/pass-through.c     Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/pass-through.c     Fri Sep 12 14:47:40 2008 +0900
@@ -57,6 +57,10 @@ static uint32_t pt_irqpin_reg_init(struc
     struct pt_reg_info_tbl *reg, uint32_t real_offset);
 static uint32_t pt_bar_reg_init(struct pt_dev *ptdev,
     struct pt_reg_info_tbl *reg, uint32_t real_offset);
+static uint32_t pt_linkctrl_reg_init(struct pt_dev *ptdev,
+    struct pt_reg_info_tbl *reg, uint32_t real_offset);
+static uint32_t pt_devctrl2_reg_init(struct pt_dev *ptdev,
+    struct pt_reg_info_tbl *reg, uint32_t real_offset);
 static uint32_t pt_linkctrl2_reg_init(struct pt_dev *ptdev,
     struct pt_reg_info_tbl *reg, uint32_t real_offset);
 static uint32_t pt_msgctrl_reg_init(struct pt_dev *ptdev,
@@ -76,6 +80,8 @@ static uint8_t pt_msix_size_init(struct 
 static uint8_t pt_msix_size_init(struct pt_dev *ptdev,
     struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset);
 static uint8_t pt_vendor_size_init(struct pt_dev *ptdev,
+    struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset);
+static uint8_t pt_pcie_size_init(struct pt_dev *ptdev,
     struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset);
 static int pt_byte_reg_read(struct pt_dev *ptdev,
     struct pt_reg_tbl *cfg_entry,
@@ -438,7 +444,7 @@ static struct pt_reg_info_tbl pt_emu_reg
         .init_val   = 0x0000,
         .ro_mask    = 0x0000,
         .emu_mask   = 0xFFFF,
-        .init       = pt_common_reg_init,
+        .init       = pt_linkctrl_reg_init,
         .u.w.read   = pt_word_reg_read,
         .u.w.write  = pt_linkctrl_reg_write,
     },
@@ -449,7 +455,7 @@ static struct pt_reg_info_tbl pt_emu_reg
         .init_val   = 0x0000,
         .ro_mask    = 0x0000,
         .emu_mask   = 0xFFFF,
-        .init       = pt_common_reg_init,
+        .init       = pt_devctrl2_reg_init,
         .u.w.read   = pt_word_reg_read,
         .u.w.write  = pt_devctrl2_reg_write,
     },
@@ -666,8 +672,8 @@ static const struct pt_reg_grp_info_tbl 
     {
         .grp_id     = PCI_CAP_ID_EXP,
         .grp_type   = GRP_TYPE_EMU,
-        .grp_size   = 0x3C,
-        .size_init  = pt_reg_grp_size_init,
+        .grp_size   = 0xFF,
+        .size_init  = pt_pcie_size_init,
         .emu_reg_tbl= pt_emu_reg_pcie_tbl,
     },
     /* MSI-X Capability Structure reg group */
@@ -1869,12 +1875,57 @@ static uint32_t pt_bar_reg_init(struct p
     return reg_field;
 }
 
+/* initialize Link Control register */
+static uint32_t pt_linkctrl_reg_init(struct pt_dev *ptdev,
+        struct pt_reg_info_tbl *reg, uint32_t real_offset)
+{
+    uint8_t cap_ver = 0;
+    uint8_t dev_type = 0;
+
+    cap_ver = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] &
+        (uint8_t)PCI_EXP_FLAGS_VERS);
+    dev_type = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] &
+        (uint8_t)PCI_EXP_FLAGS_TYPE) >> 4;
+    
+    /* no need to initialize in case of Root Complex Integrated Endpoint
+     * with cap_ver 1.x 
+     */
+    if ((dev_type == PCI_EXP_TYPE_ROOT_INT_EP) && (cap_ver == 1))
+        return PT_INVALID_REG;
+
+    return reg->init_val;
+}
+
+/* initialize Device Control 2 register */
+static uint32_t pt_devctrl2_reg_init(struct pt_dev *ptdev,
+        struct pt_reg_info_tbl *reg, uint32_t real_offset)
+{
+    uint8_t cap_ver = 0;
+
+    cap_ver = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] &
+        (uint8_t)PCI_EXP_FLAGS_VERS);
+    
+    /* no need to initialize in case of cap_ver 1.x */
+    if (cap_ver == 1)
+        return PT_INVALID_REG;
+
+    return reg->init_val;
+}
+
 /* initialize Link Control 2 register */
 static uint32_t pt_linkctrl2_reg_init(struct pt_dev *ptdev,
         struct pt_reg_info_tbl *reg, uint32_t real_offset)
 {
     int reg_field = 0;
-
+    uint8_t cap_ver = 0;
+
+    cap_ver = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] &
+        (uint8_t)PCI_EXP_FLAGS_VERS);
+    
+    /* no need to initialize in case of cap_ver 1.x */
+    if (cap_ver == 1)
+        return PT_INVALID_REG;
+    
     /* set Supported Link Speed */
     reg_field |= 
         (0x0F & 
@@ -2034,6 +2085,91 @@ static uint8_t pt_vendor_size_init(struc
         struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset)
 {
     return ptdev->dev.config[base_offset + 0x02];
+}
+
+/* get PCI Express Capability Structure register group size */
+static uint8_t pt_pcie_size_init(struct pt_dev *ptdev,
+        struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset)
+{
+    PCIDevice *d = &ptdev->dev;
+    uint16_t exp_flag = 0;
+    uint16_t type = 0;
+    uint16_t vers = 0;
+    uint8_t pcie_size = 0;
+
+    exp_flag = *((uint16_t*)(d->config + (base_offset + PCI_EXP_FLAGS)));
+    type = (exp_flag & PCI_EXP_FLAGS_TYPE) >> 4;
+    vers = (exp_flag & PCI_EXP_FLAGS_VERS);
+
+    /* calculate size depending on capability version and device/port type */
+    /* in case of PCI Express Base Specification Rev 1.x */
+    if (vers == 1)
+    {
+        /* The PCI Express Capabilities, Device Capabilities, and Device 
+         * Status/Control registers are required for all PCI Express devices. 
+         * The Link Capabilities and Link Status/Control are required for all 
+         * Endpoints that are not Root Complex Integrated Endpoints. Endpoints 
+         * are not required to implement registers other than those listed 
+         * above and terminate the capability structure.
+         */
+        switch (type) {
+        case PCI_EXP_TYPE_ENDPOINT:
+        case PCI_EXP_TYPE_LEG_END:
+            pcie_size = 0x14;
+            break;
+        case PCI_EXP_TYPE_ROOT_INT_EP:
+            /* has no link */
+            pcie_size = 0x0C;
+            break;
+        /* only EndPoint passthrough is supported */
+        case PCI_EXP_TYPE_ROOT_PORT:
+        case PCI_EXP_TYPE_UPSTREAM:
+        case PCI_EXP_TYPE_DOWNSTREAM:
+        case PCI_EXP_TYPE_PCI_BRIDGE:
+        case PCI_EXP_TYPE_PCIE_BRIDGE:
+        case PCI_EXP_TYPE_ROOT_EC:
+        default:
+            /* exit I/O emulator */
+            PT_LOG("Internal error: Unsupported device/port type[%d]. "
+                "I/O emulator exit.\n", type);
+            exit(1);
+        }
+    }
+    /* in case of PCI Express Base Specification Rev 2.0 */
+    else if (vers == 2)
+    {
+        switch (type) {
+        case PCI_EXP_TYPE_ENDPOINT:
+        case PCI_EXP_TYPE_LEG_END:
+        case PCI_EXP_TYPE_ROOT_INT_EP:
+            /* For Functions that do not implement the registers, 
+             * these spaces must be hardwired to 0b.
+             */
+            pcie_size = 0x3C;
+            break;
+        /* only EndPoint passthrough is supported */
+        case PCI_EXP_TYPE_ROOT_PORT:
+        case PCI_EXP_TYPE_UPSTREAM:
+        case PCI_EXP_TYPE_DOWNSTREAM:
+        case PCI_EXP_TYPE_PCI_BRIDGE:
+        case PCI_EXP_TYPE_PCIE_BRIDGE:
+        case PCI_EXP_TYPE_ROOT_EC:
+        default:
+            /* exit I/O emulator */
+            PT_LOG("Internal error: Unsupported device/port type[%d]. "
+                "I/O emulator exit.\n", type);
+            exit(1);
+        }
+    }
+    else
+    {
+        /* exit I/O emulator */
+        PT_LOG("Internal error: Unsupported capability version[%d]. "
+            "I/O emulator exit.\n", vers);
+        exit(1);
+    }
+
+    return pcie_size;
 }
 
 /* read byte size emulate register */
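
All of the new helpers above key off the same PCI Express Capabilities register: per the PCIe capability layout, the capability version sits in bits 3:0 (PCI_EXP_FLAGS_VERS) and the device/port type in bits 7:4 (PCI_EXP_FLAGS_TYPE). A standalone sketch of that extraction:

    #include <stdint.h>

    /* Decode the version and device/port type fields from a PCI
     * Express Capabilities register value; the masks mirror the
     * PCI_EXP_FLAGS_VERS/_TYPE definitions used in the code above. */
    static inline void pcie_decode_flags(uint16_t exp_flags,
                                         unsigned *vers, unsigned *type)
    {
        *vers = exp_flags & 0x000f;
        *type = (exp_flags & 0x00f0) >> 4;
    }
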
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pass-through.h
--- a/tools/ioemu/hw/pass-through.h     Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/pass-through.h     Fri Sep 12 14:47:40 2008 +0900
@@ -60,6 +60,21 @@
 #ifndef PCI_MSI_FLAGS_MASK_BIT
 /* interrupt masking & reporting supported */
 #define PCI_MSI_FLAGS_MASK_BIT  0x0100
+#endif
+
+#ifndef PCI_EXP_TYPE_PCIE_BRIDGE
+/* PCI/PCI-X to PCIE Bridge */
+#define PCI_EXP_TYPE_PCIE_BRIDGE 0x8
+#endif
+
+#ifndef PCI_EXP_TYPE_ROOT_INT_EP
+/* Root Complex Integrated Endpoint */
+#define PCI_EXP_TYPE_ROOT_INT_EP 0x9
+#endif
+
+#ifndef PCI_EXP_TYPE_ROOT_EC
+/* Root Complex Event Collector */
+#define PCI_EXP_TYPE_ROOT_EC     0xa
 #endif
 
 #define PT_INVALID_REG          0xFFFFFFFF      /* invalid register value */
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pci.c
--- a/tools/ioemu/hw/pci.c      Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/pci.c      Fri Sep 12 14:47:40 2008 +0900
@@ -45,7 +45,6 @@ static void pci_update_mappings(PCIDevic
 static void pci_update_mappings(PCIDevice *d);
 
 target_phys_addr_t pci_mem_base;
-static int pci_irq_index;
 static PCIBus *first_bus;
 
 PCIBus *pci_register_bus(pci_set_irq_fn set_irq, pci_map_irq_fn map_irq,
@@ -114,9 +113,6 @@ PCIDevice *pci_register_device(PCIBus *b
 {
     PCIDevice *pci_dev;
 
-    if (pci_irq_index >= PCI_DEVICES_MAX)
-        return NULL;
-    
     if (devfn < 0) {
         for(devfn = bus->devfn_min ; devfn < 256; devfn += 8) {
             if ( !bus->devices[devfn] &&
@@ -140,7 +136,6 @@ PCIDevice *pci_register_device(PCIBus *b
         config_write = pci_default_write_config;
     pci_dev->config_read = config_read;
     pci_dev->config_write = config_write;
-    pci_dev->irq_index = pci_irq_index++;
     bus->devices[devfn] = pci_dev;
     return pci_dev;
 }
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pt-msi.c
--- a/tools/ioemu/hw/pt-msi.c   Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/pt-msi.c   Fri Sep 12 14:47:40 2008 +0900
@@ -313,7 +313,7 @@ int pt_msix_init(struct pt_dev *dev, int
 
     table_off = pci_read_long(pd, pos + PCI_MSIX_TABLE);
     bar_index = dev->msix->bar_index = table_off & PCI_MSIX_BIR;
-    table_off &= table_off & ~PCI_MSIX_BIR;
+    table_off = dev->msix->table_off = table_off & ~PCI_MSIX_BIR;
     dev->msix->table_base = dev->pci_dev->base_addr[bar_index];
     PT_LOG("get MSI-X table bar base %llx\n",
            (unsigned long long)dev->msix->table_base);
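
The corrected line also records the masked offset in dev->msix->table_off, which the old statement never updated. The MSI-X table location register packs a BAR indicator in its low bits (PCI_MSIX_BIR) and the table's byte offset in the remainder; a small sketch of the split, with the mask passed in rather than asserting its value:

    #include <stdint.h>

    /* Split an MSI-X table location register into its BAR indicator
     * (low bits, e.g. PCI_MSIX_BIR) and the offset within that BAR. */
    static inline void msix_split_table_reg(uint32_t reg, uint32_t bir_mask,
                                            uint32_t *bir, uint32_t *off)
    {
        *bir = reg & bir_mask;
        *off = reg & ~bir_mask;
    }
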
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/vga.c
--- a/tools/ioemu/hw/vga.c      Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/vga.c      Fri Sep 12 14:47:40 2008 +0900
@@ -2080,7 +2080,13 @@ void xen_vga_vram_map(uint64_t vram_addr
 
     if (copy)
         memcpy(vram, xen_vga_state->vram_ptr, VGA_RAM_SIZE);
-    qemu_free(xen_vga_state->vram_ptr);
+    if (xen_vga_state->vram_mfns) {
+        /* In case this function is called more than once */
+        free(xen_vga_state->vram_mfns);
+        munmap(xen_vga_state->vram_ptr, VGA_RAM_SIZE);
+    } else {
+        qemu_free(xen_vga_state->vram_ptr);
+    }
     xen_vga_state->vram_ptr = vram;
     xen_vga_state->vram_mfns = pfn_list;
 #ifdef CONFIG_STUBDOM
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/xen_machine_fv.c
--- a/tools/ioemu/hw/xen_machine_fv.c   Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/hw/xen_machine_fv.c   Fri Sep 12 14:47:40 2008 +0900
@@ -139,8 +139,10 @@ uint8_t *qemu_map_cache(target_phys_addr
         !test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping))
         qemu_remap_bucket(entry, address_index);
 
-    if (!test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping))
+    if (!test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping)) {
+        last_address_index = ~0UL;
         return NULL;
+    }
 
     last_address_index = address_index;
     last_address_vaddr = entry->vaddr_base;
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/vl.h
--- a/tools/ioemu/vl.h  Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/ioemu/vl.h  Fri Sep 12 14:47:40 2008 +0900
@@ -812,8 +812,6 @@ struct PCIDevice {
     /* do not access the following fields */
     PCIConfigReadFunc *config_read;
     PCIConfigWriteFunc *config_write;
-    /* ??? This is a PC-specific hack, and should be removed.  */
-    int irq_index;
 
     /* Current IRQ levels.  Used internally by the generic PCI code.  */
     int irq_state[4];
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/ia64/xc_ia64_linux_save.c
--- a/tools/libxc/ia64/xc_ia64_linux_save.c     Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/ia64/xc_ia64_linux_save.c     Fri Sep 12 14:47:40 2008 +0900
@@ -53,12 +53,12 @@ static inline void set_bit(int nr, volat
 }
 
 static int
-suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+suspend_and_state(int (*suspend)(void), int xc_handle, int io_fd,
                   int dom, xc_dominfo_t *info)
 {
     int i = 0;
 
-    if (!(*suspend)(dom)) {
+    if (!(*suspend)()) {
         ERROR("Suspend request failed");
         return -1;
     }
@@ -406,7 +406,7 @@ out:
 
 int
 xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-               uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+               uint32_t max_factor, uint32_t flags, int (*suspend)(void),
                int hvm, void *(*init_qemu_maps)(int, unsigned),
                void (*qemu_flip_buffer)(int, int))
 {
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xc_domain_save.c      Fri Sep 12 14:47:40 2008 +0900
@@ -338,72 +338,23 @@ static int analysis_phase(int xc_handle,
 }
 
 
-static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd,
+static int suspend_and_state(int (*suspend)(void), int xc_handle, int io_fd,
                              int dom, xc_dominfo_t *info)
 {
-    int i = 0;
-
-    if ( !(*suspend)(dom) )
+    if ( !(*suspend)() )
     {
         ERROR("Suspend request failed");
         return -1;
     }
 
- retry:
-
-    if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 )
-    {
-        ERROR("Could not get domain info");
+    if ( (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) ||
+         !info->shutdown || (info->shutdown_reason != SHUTDOWN_suspend) )
+    {
+        ERROR("Domain not in suspended state");
         return -1;
     }
 
-    if ( info->dying )
-    {
-        ERROR("domain is dying");
-        return -1;
-    }
-
-    if ( info->crashed )
-    {
-        ERROR("domain has crashed");
-        return -1;
-    }
-
-    if ( info->shutdown )
-    {
-        switch ( info->shutdown_reason )
-        {
-        case SHUTDOWN_poweroff:
-        case SHUTDOWN_reboot:
-            ERROR("domain has shut down");
-            return -1;
-        case SHUTDOWN_suspend:
-            return 0;
-        case SHUTDOWN_crash:
-            ERROR("domain has crashed");
-            return -1;
-        }
-    }
-
-    if ( info->paused )
-    {
-        /* Try unpausing domain, wait, and retest. */
-        xc_domain_unpause( xc_handle, dom );
-        ERROR("Domain was paused. Wait and re-test.");
-        usleep(10000); /* 10ms */
-        goto retry;
-    }
-
-    if ( ++i < 100 )
-    {
-        ERROR("Retry suspend domain");
-        usleep(10000); /* 10ms */
-        goto retry;
-    }
-
-    ERROR("Unable to suspend domain.");
-
-    return -1;
+    return 0;
 }
 
 /*
@@ -796,7 +747,7 @@ static xen_pfn_t *map_and_save_p2m_table
 
 
 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
-                   uint32_t max_factor, uint32_t flags, int (*suspend)(int),
+                   uint32_t max_factor, uint32_t flags, int (*suspend)(void),
                    int hvm, void *(*init_qemu_maps)(int, unsigned), 
                    void (*qemu_flip_buffer)(int, int))
 {
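
With the retry loop gone from suspend_and_state(), the suspend callback itself must not return until the domain has acted on the request, and its new signature takes no arguments. A conforming stub under that assumption (the real implementations appear in xc_save.c later in this patch; both helpers here are hypothetical):

    /* Hypothetical helpers, assumed to exist in the caller's toolstack. */
    int request_guest_suspend(void);
    int wait_for_suspend_ack(void);

    /* Sketch of a suspend callback under the new nullary signature:
     * it returns nonzero only once the domain is actually suspended. */
    static int my_suspend(void)
    {
        if (request_guest_suspend() < 0)
            return 0;
        return wait_for_suspend_ack();
    }
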
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xc_evtchn.c
--- a/tools/libxc/xc_evtchn.c   Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xc_evtchn.c   Fri Sep 12 14:47:40 2008 +0900
@@ -59,17 +59,8 @@ int xc_evtchn_reset(int xc_handle,
     return do_evtchn_op(xc_handle, EVTCHNOP_reset, &arg, sizeof(arg), 0);
 }
 
-int xc_evtchn_status(int xc_handle,
-                     uint32_t dom,
-                     uint32_t port)
+int xc_evtchn_status(int xc_handle, xc_evtchn_status_t *status)
 {
-    int rc;
-    struct evtchn_status arg = { .dom = (domid_t)dom,
-                                 .port = (evtchn_port_t)port };
-
-    rc = do_evtchn_op(xc_handle, EVTCHNOP_status, &arg, sizeof(arg), 1);
-    if ( rc == 0 )
-        rc = arg.status;
-
-    return rc;
+    return do_evtchn_op(xc_handle, EVTCHNOP_status, status,
+                        sizeof(*status), 1);
 }
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xc_private.c
--- a/tools/libxc/xc_private.c  Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xc_private.c  Fri Sep 12 14:47:40 2008 +0900
@@ -307,6 +307,13 @@ int xc_memory_op(int xc_handle,
             goto out1;
         }
         break;
+    case XENMEM_remove_from_physmap:
+        if ( lock_pages(arg, sizeof(struct xen_remove_from_physmap)) )
+        {
+            PERROR("Could not lock");
+            goto out1;
+        }
+        break;
     case XENMEM_current_reservation:
     case XENMEM_maximum_reservation:
     case XENMEM_maximum_gpfn:
@@ -339,6 +346,9 @@ int xc_memory_op(int xc_handle,
         break;
     case XENMEM_add_to_physmap:
         unlock_pages(arg, sizeof(struct xen_add_to_physmap));
+        break;
+    case XENMEM_remove_from_physmap:
+        unlock_pages(arg, sizeof(struct xen_remove_from_physmap));
         break;
     case XENMEM_current_reservation:
     case XENMEM_maximum_reservation:
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xenctrl.h     Fri Sep 12 14:47:40 2008 +0900
@@ -502,9 +502,9 @@ xc_evtchn_alloc_unbound(int xc_handle,
 
 int xc_evtchn_reset(int xc_handle,
                     uint32_t dom);
-int xc_evtchn_status(int xc_handle,
-                     uint32_t dom,
-                     uint32_t port);
+
+typedef struct evtchn_status xc_evtchn_status_t;
+int xc_evtchn_status(int xc_handle, xc_evtchn_status_t *status);
 
 /*
  * Return a handle to the event channel driver, or -1 on failure, in which case
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/libxc/xenguest.h    Fri Sep 12 14:47:40 2008 +0900
@@ -25,7 +25,7 @@
  */
 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                    uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
-                   int (*suspend)(int domid), int hvm,
+                   int (*suspend)(void), int hvm,
                    void *(*init_qemu_maps)(int, unsigned),  /* HVM only */
                    void (*qemu_flip_buffer)(int, int));     /* HVM only */
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/Makefile
--- a/tools/python/Makefile     Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/Makefile     Fri Sep 12 14:47:40 2008 +0900
@@ -1,13 +1,5 @@ XEN_ROOT = ../..
 XEN_ROOT = ../..
 include $(XEN_ROOT)/tools/Rules.mk
-
-XEN_SECURITY_MODULE = dummy
-ifeq ($(FLASK_ENABLE),y)
-XEN_SECURITY_MODULE = flask
-endif
-ifeq ($(ACM_SECURITY),y)
-XEN_SECURITY_MODULE = acm
-endif
 
 .PHONY: all
 all: build
@@ -23,8 +15,8 @@ NLSDIR = /usr/share/locale
 NLSDIR = /usr/share/locale
 
 .PHONY: build buildpy
-buildpy: xsm.py
-       CC="$(CC)" CFLAGS="$(CFLAGS)" XEN_SECURITY_MODULE="$(XEN_SECURITY_MODULE)" python setup.py build
+buildpy: 
+       CC="$(CC)" CFLAGS="$(CFLAGS)" python setup.py build
 
 build: buildpy refresh-pot refresh-po $(CATALOGS)
 
@@ -61,18 +53,6 @@ refresh-po: $(POTFILE)
 %.mo: %.po
        $(MSGFMT) -c -o $@ $<
 
-xsm.py:
-       @(set -e; \
-         echo "XEN_SECURITY_MODULE = \""$(XEN_SECURITY_MODULE)"\""; \
-         echo "from xsm_core import *"; \
-         echo ""; \
-         echo "import xen.util.xsm."$(XEN_SECURITY_MODULE)"."$(XEN_SECURITY_MODULE)" as xsm_module"; \
-         echo ""; \
-         echo "xsm_init(xsm_module)"; \
-         echo "from xen.util.xsm."$(XEN_SECURITY_MODULE)"."$(XEN_SECURITY_MODULE)" import *"; \
-         echo "del xsm_module"; \
-         echo "") >xen/util/xsm/$@
-
 .PHONY: install
 ifndef XEN_PYTHON_NATIVE_INSTALL
 install: LIBPATH=$(shell PYTHONPATH=xen/util python -c "import auxbin; print auxbin.libpath()")
@@ -104,4 +84,4 @@ test:
 
 .PHONY: clean
 clean:
-       rm -rf build *.pyc *.pyo *.o *.a *~ $(CATALOGS) xen/util/xsm/xsm.py xen/util/auxbin.pyc
+       rm -rf build *.pyc *.pyo *.o *.a *~ $(CATALOGS) xen/util/auxbin.pyc
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/util/xsconstants.py
--- a/tools/python/xen/util/xsconstants.py      Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/util/xsconstants.py      Fri Sep 12 14:47:40 2008 +0900
@@ -20,8 +20,10 @@ XS_INST_BOOT = (1 << 0)
 XS_INST_BOOT = (1 << 0)
 XS_INST_LOAD = (1 << 1)
 
-XS_POLICY_NONE  = 0
 XS_POLICY_ACM = (1 << 0)
+XS_POLICY_FLASK = (1 << 1)
+XS_POLICY_DUMMY  = (1 << 2)
+XS_POLICY_USE = 0
 
 # Some internal variables used by the Xen-API
 ACM_LABEL_VM  = (1 << 0)
@@ -107,6 +109,6 @@ ACM_POLICY_ID = 'ACM'
 
 INVALID_POLICY_PREFIX = 'INV_'
 
-INVALID_SSIDREF = 0xFFFFFFFF
+INVALID_SSIDREF = 0xFFFFFFFFL
 
 XS_INACCESSIBLE_LABEL = '__INACCESSIBLE__'
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/util/xsm/flask/flask.py
--- a/tools/python/xen/util/xsm/flask/flask.py  Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/util/xsm/flask/flask.py  Fri Sep 12 14:47:40 2008 +0900
@@ -1,5 +1,6 @@ import sys
 import sys
 from xen.lowlevel import flask
+from xen.util import xsconstants
 from xen.xend import sxp
 
 #Functions exported through XML-RPC
@@ -12,7 +13,7 @@ def err(msg):
     raise XSMError(msg)
 
 def on():
-    return 0 #xsconstants.XS_POLICY_FLASK
+    return xsconstants.XS_POLICY_FLASK
 
 def ssidref2label(ssidref):
     try:
@@ -37,8 +38,9 @@ def set_security_label(policy, label):
     return label
 
 def ssidref2security_label(ssidref):
-    return ssidref2label(ssidref)
+    label = ssidref2label(ssidref)
+    return label
 
 def get_security_label(self, xspol=None):
-    label = self.info.get('security_label', '')
+    label = self.info['security_label']
     return label
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/util/xsm/xsm.py
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/python/xen/util/xsm/xsm.py  Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,20 @@
+import sys
+import string
+from xen.xend import XendOptions
+from xen.util import xsconstants
+from xsm_core import xsm_init
+
+xoptions = XendOptions.instance()
+xsm_module_name = xoptions.get_xsm_module_name()
+
+xsconstants.XS_POLICY_USE = eval("xsconstants.XS_POLICY_" +
+                                 string.upper(xsm_module_name))
+
+xsm_module_path = "xen.util.xsm." + xsm_module_name + "." + xsm_module_name
+xsm_module = __import__(xsm_module_path, globals(), locals(), ['*'])
+
+xsm_init(xsm_module)
+
+for op in dir(xsm_module):
+    if not hasattr(sys.modules[__name__], op):
+        setattr(sys.modules[__name__], op, getattr(xsm_module, op, None))
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/XendConfig.py       Fri Sep 12 14:47:40 2008 +0900
@@ -729,7 +729,7 @@ class XendConfig(dict):
             self.parse_cpuid(cfg, 'cpuid_check')
 
         import xen.util.xsm.xsm as security
-        if security.on() == xsconstants.XS_POLICY_ACM:
+        if security.on() == xsconstants.XS_POLICY_USE:
             from xen.util.acmpolicy import ACM_LABEL_UNLABELED
             if not 'security' in cfg and sxp.child_value(sxp_cfg, 'security'):
                 cfg['security'] = sxp.child_value(sxp_cfg, 'security')
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/XendDomainInfo.py   Fri Sep 12 14:47:40 2008 +0900
@@ -2069,7 +2069,7 @@ class XendDomainInfo:
         balloon.free(2*1024) # 2MB should be plenty
 
         ssidref = 0
-        if security.on() == xsconstants.XS_POLICY_ACM:
+        if security.on() == xsconstants.XS_POLICY_USE:
             ssidref = security.calc_dom_ssidref_from_info(self.info)
             if security.has_authorization(ssidref) == False:
                 raise VmError("VM is not authorized to run.")
@@ -2855,10 +2855,6 @@ class XendDomainInfo:
             info["maxmem_kb"] = XendNode.instance() \
                                 .physinfo_dict()['total_memory'] * 1024
 
-        #ssidref field not used any longer
-        if 'ssidref' in info:
-            info.pop('ssidref')
-
         # make sure state is reset for info
         # TODO: we should eventually get rid of old_dom_states
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/XendOptions.py
--- a/tools/python/xen/xend/XendOptions.py      Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/XendOptions.py      Fri Sep 12 14:47:40 2008 +0900
@@ -131,6 +131,9 @@ class XendOptions:
 
     """Default script to configure a backend network interface"""
     vif_script = osdep.vif_script
+
+    """Default Xen Security Module"""
+    xsm_module_default = 'dummy'
 
     """Default rotation count of qemu-dm log file."""
     qemu_dm_logrotate_count = 10
@@ -427,6 +430,11 @@ class XendOptionsFile(XendOptions):
         return self.get_config_value('xen-api-server',
                                      self.xen_api_server_default)
 
+    def get_xsm_module_name(self):
+        """Get the Xen Security Module name.
+        """
+        return self.get_config_string('xsm_module_name', self.xsm_module_default)
+
 if os.uname()[0] == 'SunOS':
     class XendOptionsSMF(XendOptions):
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/server/blkif.py
--- a/tools/python/xen/xend/server/blkif.py     Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/server/blkif.py     Fri Sep 12 14:47:40 2008 +0900
@@ -78,7 +78,7 @@ class BlkifController(DevController):
         if uuid:
             back['uuid'] = uuid
 
-        if security.on() == xsconstants.XS_POLICY_ACM:
+        if security.on() == xsconstants.XS_POLICY_USE:
             self.do_access_control(config, uname)
 
         (device_path, devid) = blkif.blkdev_name_to_number(dev)
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/server/netif.py
--- a/tools/python/xen/xend/server/netif.py     Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/server/netif.py     Fri Sep 12 14:47:40 2008 +0900
@@ -156,7 +156,7 @@ class NetifController(DevController):
             front = { 'handle' : "%i" % devid,
                       'mac'    : mac }
 
-        if security.on() == xsconstants.XS_POLICY_ACM:
+        if security.on() == xsconstants.XS_POLICY_USE:
             self.do_access_control(config)
 
         return (devid, back, front)
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/server/pciif.py
--- a/tools/python/xen/xend/server/pciif.py     Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xend/server/pciif.py     Fri Sep 12 14:47:40 2008 +0900
@@ -286,7 +286,7 @@ class PciController(DevController):
                     )%(dev.name))
 
         if dev.has_non_page_aligned_bar and arch.type != "ia64":
-            raise VmError("pci: %: non-page-aligned MMIO BAR found." % dev.name)
+            raise VmError("pci: %s: non-page-aligned MMIO BAR found." % dev.name)
 
         self.CheckSiblingDevices(fe_domid, dev)
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xm/create.py     Fri Sep 12 14:47:40 2008 +0900
@@ -566,11 +566,11 @@ gopts.var('hap', val='HAP',
           use="""Hap status (0=hap is disabled;
           1=hap is enabled.""")
 
-gopts.var('cpuid', val="IN[,SIN]:eax=EAX,ebx=EBX,exc=ECX,edx=EDX",
+gopts.var('cpuid', val="IN[,SIN]:eax=EAX,ebx=EBX,ecx=ECX,edx=EDX",
           fn=append_value, default=[],
           use="""Cpuid description.""")
 
-gopts.var('cpuid_check', val="IN[,SIN]:eax=EAX,ebx=EBX,exc=ECX,edx=EDX",
+gopts.var('cpuid_check', val="IN[,SIN]:eax=EAX,ebx=EBX,ecx=ECX,edx=EDX",
           fn=append_value, default=[],
           use="""Cpuid check description.""")
 
@@ -971,7 +971,7 @@ def preprocess_cpuid(vals, attr_name):
                         "of the register %s for input %s\n"
                         % (res['reg'], input) )
                 cpuid[input][res['reg']] = res['val'] # new register
-    setattr(vals, attr_name, cpuid)
+            setattr(vals, attr_name, cpuid)
 
 def preprocess_pci(vals):
     if not vals.pci: return
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xm/main.py
--- a/tools/python/xen/xm/main.py       Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/python/xen/xm/main.py       Fri Sep 12 14:47:40 2008 +0900
@@ -1812,7 +1812,7 @@ def domain_name_to_domid(domain_name):
     else:
         dom = server.xend.domain(domain_name)
         domid = int(sxp.child_value(dom, 'domid', '-1'))
-    return domid
+    return int(domid)
 
 def xm_vncviewer(args):
     autopass = False;
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xcutils/lsevtchn.c
--- a/tools/xcutils/lsevtchn.c  Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xcutils/lsevtchn.c  Fri Sep 12 14:47:40 2008 +0900
@@ -8,49 +8,55 @@
 #include <xenctrl.h>
 #include <xenguest.h>
 
-int
-main(int argc, char **argv)
+int main(int argc, char **argv)
 {
-    int xc_fd;
-    int domid = 0, port = 0, status;
-    const char *msg;
+    int xc_fd, domid, port, rc;
+    xc_evtchn_status_t status;
 
-    if ( argc > 1 )
-        domid = strtol(argv[1], NULL, 10);
+    domid = (argc > 1) ? strtol(argv[1], NULL, 10) : 0;
 
     xc_fd = xc_interface_open();
     if ( xc_fd < 0 )
         errx(1, "failed to open control interface");
 
-    while ( (status = xc_evtchn_status(xc_fd, domid, port)) >= 0 )
+    for ( port = 0; ; port++ )
     {
-        switch ( status )
+        status.dom = domid;
+        status.port = port;
+        rc = xc_evtchn_status(xc_fd, &status);
+        if ( rc < 0 )
+            break;
+
+        if ( status.status == EVTCHNSTAT_closed )
+            continue;
+
+        printf("%4d: VCPU %u: ", port, status.vcpu);
+
+        switch ( status.status )
         {
-        case EVTCHNSTAT_closed:
-            msg = "Channel is not in use.";
-            break;
         case EVTCHNSTAT_unbound:
-            msg = "Channel is waiting interdom connection.";
+            printf("Interdomain (Waiting connection) - Remote Domain %u",
+                   status.u.unbound.dom);
             break;
         case EVTCHNSTAT_interdomain:
-            msg = "Channel is connected to remote domain.";
+            printf("Interdomain (Connected) - Remote Domain %u, Port %u",
+                   status.u.interdomain.dom, status.u.interdomain.port);
             break;
         case EVTCHNSTAT_pirq:
-            msg = "Channel is bound to a phys IRQ line.";
+            printf("Physical IRQ %u", status.u.pirq);
             break;
         case EVTCHNSTAT_virq:
-            msg = "Channel is bound to a virtual IRQ line.";
+            printf("Virtual IRQ %u", status.u.virq);
             break;
         case EVTCHNSTAT_ipi:
-            msg = "Channel is bound to a virtual IPI line.";
+            printf("IPI");
             break;
         default:
-            msg = "Unknown.";
+            printf("Unknown");
             break;
+        }
 
-        }
-        printf("%03d: %d: %s\n", port, status, msg);
-        port++;
+        printf("\n");
     }
 
     xc_interface_close(xc_fd);
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c   Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xcutils/xc_save.c   Fri Sep 12 14:47:40 2008 +0900
@@ -32,7 +32,7 @@ static struct suspendinfo {
  * Issue a suspend request through stdout, and receive the acknowledgement
  * from stdin.  This is handled by XendCheckpoint in the Python layer.
  */
-static int compat_suspend(int domid)
+static int compat_suspend(void)
 {
     char ans[30];
 
@@ -43,16 +43,35 @@ static int compat_suspend(int domid)
             !strncmp(ans, "done\n", 5));
 }
 
-static int suspend_evtchn_release(int xc, int domid)
+static int suspend_evtchn_release(void)
 {
     if (si.suspend_evtchn >= 0) {
-       xc_evtchn_unbind(si.xce, si.suspend_evtchn);
-       si.suspend_evtchn = -1;
+        xc_evtchn_unbind(si.xce, si.suspend_evtchn);
+        si.suspend_evtchn = -1;
     }
     if (si.xce >= 0) {
-       xc_evtchn_close(si.xce);
-       si.xce = -1;
-    }
+        xc_evtchn_close(si.xce);
+        si.xce = -1;
+    }
+
+    return 0;
+}
+
+static int await_suspend(void)
+{
+    int rc;
+
+    do {
+        rc = xc_evtchn_pending(si.xce);
+        if (rc < 0) {
+            warnx("error polling suspend notification channel: %d", rc);
+            return -1;
+        }
+    } while (rc != si.suspend_evtchn);
+
+    /* harmless for one-off suspend */
+    rc = xc_evtchn_unmask(si.xce, si.suspend_evtchn);
+    if (rc < 0)
+        warnx("failed to unmask suspend notification channel: %d", rc);
 
     return 0;
 }
@@ -71,16 +90,16 @@ static int suspend_evtchn_init(int xc, i
 
     xs = xs_daemon_open();
     if (!xs) {
-       errx(1, "failed to get xenstore handle");
-       return -1;
+        warnx("failed to get xenstore handle");
+        return -1;
     }
     sprintf(path, "/local/domain/%d/device/suspend/event-channel", domid);
     portstr = xs_read(xs, XBT_NULL, path, &plen);
     xs_daemon_close(xs);
 
     if (!portstr || !plen) {
-       warnx("could not read suspend event channel");
-       return -1;
+        warnx("could not read suspend event channel");
+        return -1;
     }
 
     port = atoi(portstr);
@@ -88,27 +107,29 @@ static int suspend_evtchn_init(int xc, i
 
     si.xce = xc_evtchn_open();
     if (si.xce < 0) {
-       errx(1, "failed to open event channel handle");
-       goto cleanup;
+        warnx("failed to open event channel handle");
+        goto cleanup;
     }
 
     si.suspend_evtchn = xc_evtchn_bind_interdomain(si.xce, domid, port);
     if (si.suspend_evtchn < 0) {
-       errx(1, "failed to bind suspend event channel: %d",
-            si.suspend_evtchn);
-       goto cleanup;
+        warnx("failed to bind suspend event channel: %d", si.suspend_evtchn);
+        goto cleanup;
     }
 
     rc = xc_domain_subscribe_for_suspend(xc, domid, port);
     if (rc < 0) {
-       errx(1, "failed to subscribe to domain: %d", rc);
-       goto cleanup;
-    }
+        warnx("failed to subscribe to domain: %d", rc);
+        goto cleanup;
+    }
+
+    /* event channel is pending immediately after binding */
+    await_suspend();
 
     return 0;
 
   cleanup:
-    suspend_evtchn_release(xc, domid);
+    suspend_evtchn_release();
 
     return -1;
 }
@@ -116,29 +137,20 @@ static int suspend_evtchn_init(int xc, i
 /**
  * Issue a suspend request to a dedicated event channel in the guest, and
  * receive the acknowledgement from the subscribe event channel. */
-static int evtchn_suspend(int domid)
-{
-    int xcefd;
+static int evtchn_suspend(void)
+{
     int rc;
 
     rc = xc_evtchn_notify(si.xce, si.suspend_evtchn);
     if (rc < 0) {
-       errx(1, "failed to notify suspend request channel: %d", rc);
-       return 0;
-    }
-
-    xcefd = xc_evtchn_fd(si.xce);
-    do {
-      rc = xc_evtchn_pending(si.xce);
-      if (rc < 0) {
-       errx(1, "error polling suspend notification channel: %d", rc);
-       return 0;
-      }
-    } while (rc != si.suspend_evtchn);
-
-    /* harmless for one-off suspend */
-    if (xc_evtchn_unmask(si.xce, si.suspend_evtchn) < 0)
-       errx(1, "failed to unmask suspend notification channel: %d", rc);
+        warnx("failed to notify suspend request channel: %d", rc);
+        return 0;
+    }
+
+    if (await_suspend() < 0) {
+        warnx("suspend failed");
+        return 0;
+    }
 
     /* notify xend that it can do device migration */
     printf("suspended\n");
@@ -147,12 +159,12 @@ static int evtchn_suspend(int domid)
     return 1;
 }
 
-static int suspend(int domid)
+static int suspend(void)
 {
     if (si.suspend_evtchn >= 0)
-       return evtchn_suspend(domid);
-
-    return compat_suspend(domid);
+        return evtchn_suspend();
+
+    return compat_suspend();
 }
 
 /* For HVM guests, there are two sources of dirty pages: the Xen shadow
@@ -195,11 +207,9 @@ static void qemu_flip_buffer(int domid, 
 
     /* Tell qemu that we want it to start writing log-dirty bits to the
      * other buffer */
-    if (!xs_write(xs, XBT_NULL, qemu_next_active_path, &digit, 1)) {
+    if (!xs_write(xs, XBT_NULL, qemu_next_active_path, &digit, 1))
         errx(1, "can't write next-active to store path (%s)\n", 
-              qemu_next_active_path);
-        exit(1);
-    }
+             qemu_next_active_path);
 
     /* Wait a while for qemu to signal that it has switched to the new 
      * active buffer */
@@ -208,10 +218,8 @@ static void qemu_flip_buffer(int domid, 
     tv.tv_usec = 0;
     FD_ZERO(&fdset);
     FD_SET(xs_fileno(xs), &fdset);
-    if ((select(xs_fileno(xs) + 1, &fdset, NULL, NULL, &tv)) != 1) {
+    if ((select(xs_fileno(xs) + 1, &fdset, NULL, NULL, &tv)) != 1)
         errx(1, "timed out waiting for qemu to switch buffers\n");
-        exit(1);
-    }
     watch = xs_read_watch(xs, &len);
     free(watch);
     
@@ -221,7 +229,7 @@ static void qemu_flip_buffer(int domid, 
         goto read_again;
 }
 
-static void * init_qemu_maps(int domid, unsigned int bitmap_size)
+static void *init_qemu_maps(int domid, unsigned int bitmap_size)
 {
     key_t key;
     char key_ascii[17] = {0,};
@@ -293,7 +301,7 @@ main(int argc, char **argv)
     int ret;
 
     if (argc != 6)
-       errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
+        errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
 
     xc_fd = xc_interface_open();
     if (xc_fd < 0)
@@ -305,13 +313,14 @@ main(int argc, char **argv)
     max_f = atoi(argv[4]);
     flags = atoi(argv[5]);
 
-    suspend_evtchn_init(xc_fd, domid);
+    if (suspend_evtchn_init(xc_fd, domid) < 0)
+        warnx("suspend event channel initialization failed, using slow path");
 
     ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
                          &suspend, !!(flags & XCFLAGS_HVM),
                          &init_qemu_maps, &qemu_flip_buffer);
 
-    suspend_evtchn_release(xc_fd, domid);
+    suspend_evtchn_release();
 
     xc_interface_close(xc_fd);
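
Taken together, the xc_save changes collapse the suspend machinery onto the static si state: suspend_evtchn_init() is best-effort at startup, suspend() is handed to xc_domain_save() and takes the event-channel or stdout path depending on whether init succeeded, and suspend_evtchn_release() tears down in every case. A sketch of the call order in main() as implied by the hunks above (error handling elided):

    if ( suspend_evtchn_init(xc_fd, domid) < 0 )   /* leaves si.suspend_evtchn at -1 */
        warnx("suspend event channel initialization failed, using slow path");

    ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags,
                         &suspend,                 /* evtchn_suspend() or compat_suspend() */
                         !!(flags & XCFLAGS_HVM),
                         &init_qemu_maps, &qemu_flip_buffer);

    suspend_evtchn_release();                      /* safe even if init failed */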
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xenstore/xs.c
--- a/tools/xenstore/xs.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xenstore/xs.c       Fri Sep 12 14:47:40 2008 +0900
@@ -795,8 +795,11 @@ char *xs_get_domain_path(struct xs_handl
 
 bool xs_is_domain_introduced(struct xs_handle *h, unsigned int domid)
 {
-       return strcmp("F",
-                     single_with_domid(h, XS_IS_DOMAIN_INTRODUCED, domid));
+       char *domain = single_with_domid(h, XS_IS_DOMAIN_INTRODUCED, domid);
+       int rc = strcmp("F", domain);
+
+       free(domain);
+       return rc;
 }
 
 /* Only useful for DEBUG versions */
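
The xs_is_domain_introduced() hunk above is a leak fix: single_with_domid() returns a malloc'd reply from xenstored, and the old one-liner passed it straight into strcmp() with no way to free it. The pattern, as a hedged sketch (the NULL check is an extra precaution and is not in the hunk itself):

	char *reply = single_with_domid(h, XS_IS_DOMAIN_INTRODUCED, domid);
	int introduced;

	if (!reply)                       /* defensive; the hunk assumes success */
		return false;
	introduced = strcmp("F", reply);  /* anything but "F" means introduced */
	free(reply);                      /* the reply buffer belongs to the caller */
	return introduced;
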
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xentrace/formats
--- a/tools/xentrace/formats    Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xentrace/formats    Fri Sep 12 14:47:40 2008 +0900
@@ -4,56 +4,69 @@ 0x0001f002  CPU%(cpu)d  %(tsc)d (+%(relt
 0x0001f002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  wrap_buffer       0x%(1)08x
 0x0001f003  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  cpu_change        0x%(1)08x
 
-0x0002f001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  sched_add_domain  [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  sched_rem_domain  [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f003  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  domain_sleep      [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f004  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  domain_wake       [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f005  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  do_yield          [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f006  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  do_block          [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
-0x0002f007  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  domain_shutdown          [ domid = 0x%(1)08x, edomid = 0x%(2)08x, reason = 0x%(3)08x ]
-0x0002f008  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  sched_ctl
-0x0002f009  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  sched_adjdom      [ domid = 0x%(1)08x ]
-0x0002f00a  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  __enter_scheduler [ prev<domid:edomid> = 0x%(1)08x : 0x%(2)08x, next<domid:edomid> = 0x%(3)08x : 0x%(4)08x ]
-0x0002f00B  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  s_timer_fn
-0x0002f00c  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  t_timer_fn
-0x0002f00d  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  dom_timer_fn
-0x0002f00e  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  switch_infprev    [ old_domid = 0x%(1)08x, runtime = %(2)d ]
-0x0002f00f  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  switch_infnext    [ new_domid = 0x%(1)08x, time = %(2)d, r_time = %(3)d ]
+0x00021011  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  running_to_runnable [ dom:vcpu = 0x%(1)08x ]
+0x00021021  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  running_to_blocked  [ dom:vcpu = 0x%(1)08x ]
+0x00021031  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  running_to_offline  [ dom:vcpu = 0x%(1)08x ]
+0x00021101  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  runnable_to_running [ dom:vcpu = 0x%(1)08x ]
+0x00021121  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  runnable_to_blocked [ dom:vcpu = 0x%(1)08x ]
+0x00021131  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  runnable_to_offline [ dom:vcpu = 0x%(1)08x ]
+0x00021201  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  blocked_to_running  [ dom:vcpu = 0x%(1)08x ]
+0x00021211  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  blocked_to_runnable [ dom:vcpu = 0x%(1)08x ]
+0x00021231  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  blocked_to_offline  [ dom:vcpu = 0x%(1)08x ]
+0x00021301  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  offline_to_running  [ dom:vcpu = 0x%(1)08x ]
+0x00021311  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  offline_to_runnable [ dom:vcpu = 0x%(1)08x ]
+0x00021321  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  offline_to_blocked  [ dom:vcpu = 0x%(1)08x ]
 
-0x00081001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMENTRY     [ dom:vcpu = 0x%(1)08x ]
-0x00081002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMEXIT      [ dom:vcpu = 0x%(1)08x, exitcode = 0x%(2)08x, rIP  = 0x%(3)08x ]
-0x00081102  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMEXIT      [ dom:vcpu = 0x%(1)08x, exitcode = 0x%(2)08x, rIP  = 0x%(3)016x ]
-0x00082001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)08x ]
-0x00082101  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)016x ]
-0x00082002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_INJECT   [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)08x ]
-0x00082102  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_INJECT   [ dom:vcpu = 0x%(1)08x,  errorcode = 0x%(2)02x, virt = 0x%(3)016x ]
-0x00082003  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INJ_EXC     [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x, errorcode = 0x%(3)04x ]
-0x00082004  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INJ_VIRQ    [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x, fake = %(3)d ]
-0x00082005  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  REINJ_VIRQ  [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x ]
-0x00082006  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  IO_READ     [ dom:vcpu = 0x%(1)08x, port = 0x%(2)04x, size = %(3)d ]
-0x00082007  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  IO_WRITE    [ dom:vcpu = 0x%(1)08x, port = 0x%(2)04x, size = %(3)d ]
-0x00082008  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CR_READ     [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)08x ]
-0x00082108  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CR_READ     [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)016x ]
-0x00082009  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CR_WRITE    [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)08x ]
-0x00082109  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CR_WRITE    [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)016x ]
-0x0008200A  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  DR_READ     [ dom:vcpu = 0x%(1)08x ]
-0x0008200B  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  DR_WRITE    [ dom:vcpu = 0x%(1)08x ]
-0x0008200C  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  MSR_READ    [ dom:vcpu = 0x%(1)08x, MSR# = 0x%(2)08x, value = 0x%(3)016x ]
-0x0008200D  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  MSR_WRITE   [ dom:vcpu = 0x%(1)08x, MSR# = 0x%(2)08x, value = 0x%(3)016x ]
-0x0008200E  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CPUID       [ dom:vcpu = 0x%(1)08x, func = 0x%(2)08x, eax = 0x%(3)08x, ebx = 0x%(4)08x, ecx=0x%(5)08x, edx = 0x%(6)08x ]
-0x0008200F  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INTR        [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x ]
-0x00082010  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  NMI         [ dom:vcpu = 0x%(1)08x ]
-0x00082011  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  SMI         [ dom:vcpu = 0x%(1)08x ]
-0x00082012  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMMCALL     [ dom:vcpu = 0x%(1)08x, func = 0x%(2)08x ]
-0x00082013  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  HLT         [ dom:vcpu = 0x%(1)08x, intpending = %(2)d ]
-0x00082014  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INVLPG      [ dom:vcpu = 0x%(1)08x, is invlpga? = %(2)d, virt = 0x%(3)08x ]
-0x00082114  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INVLPG      [ dom:vcpu = 0x%(1)08x, is invlpga? = %(2)d, virt = 0x%(3)016x ]
-0x00082015  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  MCE         [ dom:vcpu = 0x%(1)08x ]
-0x00082016  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  IO_ASSIST   [ dom:vcpu = 0x%(1)08x, data = 0x%(2)04x ]
-0x00082017  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  MMIO_ASSIST [ dom:vcpu = 0x%(1)08x, data = 0x%(2)04x ]
-0x00082018  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CLTS        [ dom:vcpu = 0x%(1)08x ]
-0x00082019  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  LMSW        [ dom:vcpu = 0x%(1)08x, value = 0x%(2)08x ]
-0x00082119  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  LMSW        [ dom:vcpu = 0x%(1)08x, value = 0x%(2)016x ]
+0x00028001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  sched_add_domain  [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  sched_rem_domain  [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028003  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  domain_sleep      [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028004  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  domain_wake       [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028005  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  do_yield          [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028006  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  do_block          [ domid = 0x%(1)08x, edomid = 0x%(2)08x ]
+0x00028007  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  domain_shutdown          [ domid = 0x%(1)08x, edomid = 0x%(2)08x, reason = 0x%(3)08x ]
+0x00028008  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  sched_ctl
+0x00028009  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  sched_adjdom      [ domid = 0x%(1)08x ]
+0x0002800a  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  __enter_scheduler [ prev<domid:edomid> = 0x%(1)08x : 0x%(2)08x, next<domid:edomid> = 0x%(3)08x : 0x%(4)08x ]
+0x0002800b  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  s_timer_fn
+0x0002800c  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  t_timer_fn
+0x0002800d  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  dom_timer_fn
+0x0002800e  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  switch_infprev    [ old_domid = 0x%(1)08x, runtime = %(2)d ]
+0x0002800f  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  switch_infnext    [ new_domid = 0x%(1)08x, time = %(2)d, r_time = %(3)d ]
+
+0x00081001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMENTRY
+0x00081002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMEXIT      [ exitcode = 0x%(1)08x, rIP  = 0x%(2)08x ]
+0x00081102  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMEXIT      [ exitcode = 0x%(1)08x, rIP  = 0x%(2)016x ]
+0x00082001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ errorcode = 0x%(2)02x, virt = 0x%(1)08x ]
+0x00082101  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_XEN      [ errorcode = 0x%(2)02x, virt = 0x%(1)016x ]
+0x00082002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_INJECT   [ errorcode = 0x%(1)02x, virt = 0x%(2)08x ]
+0x00082102  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  PF_INJECT   [ errorcode = 0x%(1)02x, virt = 0x%(2)016x ]
+0x00082003  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INJ_EXC     [ vector = 0x%(1)02x, errorcode = 0x%(2)04x ]
+0x00082004  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INJ_VIRQ    [ vector = 0x%(1)02x, fake = %(2)d ]
+0x00082005  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  REINJ_VIRQ  [ vector = 0x%(1)02x ]
+0x00082006  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  IO_READ     [ port = 0x%(1)04x, size = %(2)d ]
+0x00082007  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  IO_WRITE    [ port = 0x%(1)04x, size = %(2)d ]
+0x00082008  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CR_READ     [ CR# = %(1)d, value = 0x%(2)08x ]
+0x00082108  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CR_READ     [ CR# = %(1)d, value = 0x%(2)016x ]
+0x00082009  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CR_WRITE    [ CR# = %(1)d, value = 0x%(2)08x ]
+0x00082109  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CR_WRITE    [ CR# = %(1)d, value = 0x%(2)016x ]
+0x0008200A  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  DR_READ
+0x0008200B  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  DR_WRITE
+0x0008200C  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  MSR_READ    [ MSR# = 0x%(1)08x, value = 0x%(2)016x ]
+0x0008200D  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  MSR_WRITE   [ MSR# = 0x%(1)08x, value = 0x%(2)016x ]
+0x0008200E  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CPUID       [ func = 0x%(1)08x, eax = 0x%(2)08x, ebx = 0x%(3)08x, ecx=0x%(4)08x, edx = 0x%(5)08x ]
+0x0008200F  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INTR        [ vector = 0x%(1)02x ]
+0x00082010  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  NMI
+0x00082011  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  SMI
+0x00082012  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  VMMCALL     [ func = 0x%(1)08x ]
+0x00082013  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  HLT         [ intpending = %(1)d ]
+0x00082014  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INVLPG      [ is invlpga? = %(1)d, virt = 0x%(2)08x ]
+0x00082114  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  INVLPG      [ is invlpga? = %(1)d, virt = 0x%(2)016x ]
+0x00082015  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  MCE
+0x00082016  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  IO_ASSIST   [ data = 0x%(1)04x ]
+0x00082017  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  MMIO_ASSIST [ data = 0x%(1)04x ]
+0x00082018  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  CLTS
+0x00082019  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  LMSW        [ value = 0x%(1)08x ]
+0x00082119  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  LMSW        [ value = 0x%(1)016x ]
 
0x0010f001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  page_grant_map      [ domid = %(1)d ]
0x0010f002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  page_grant_unmap    [ domid = %(1)d ]
@@ -65,3 +78,41 @@ 0x0020f103  CPU%(cpu)d  %(tsc)d (+%(relt
0x0020f103  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  trap       [ rip = 0x%(1)016x, trapnr:error = 0x%(2)08x ]
0x0020f004  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  page_fault [ eip = 0x%(1)08x, addr = 0x%(2)08x, error = 0x%(3)08x ]
0x0020f104  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  page_fault [ rip = 0x%(1)16x, addr = 0x%(3)16x, error = 0x%(5)08x ]
+
+0x0020f006  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  emulate_privop      [ eip = 0x%(1)08x ]
+0x0020f106  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  emulate_privop      [ rip = 0x%(1)16x ]
+0x0020f007  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  emulate_4G          [ eip = 0x%(1)08x ]
+0x0020f107  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  emulate_4G          [ rip = 0x%(1)16x ]
+0x0020f00c  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  ptwr_emulation_pae  [ addr = 0x%(2)08x, eip = 0x%(1)08x, npte = 0x%(1)16x ]
+0x0020f10c  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  ptwr_emulation_pae  [ addr = 0x%(2)16x, rip = 0x%(1)16x, npte = 0x%(1)16x ]
+
+0x0040f001  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_not_shadow                 [ gl1e = 0x%(1)16x, va = 0x%(2)08x, flags = 0x%(3)08x ]
+0x0040f101  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_not_shadow                 [ gl1e = 0x%(1)16x, va = 0x%(2)16x, flags = 0x%(3)08x ]
+0x0040f002  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_fast_propagate             [ va = 0x%(1)08x ]
+0x0040f102  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_fast_propagate             [ va = 0x%(1)16x ]
+0x0040f003  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_fast_mmio                  [ va = 0x%(1)08x ]
+0x0040f103  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_fast_mmio                  [ va = 0x%(1)16x ]
+0x0040f004  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_false_fast_path            [ va = 0x%(1)08x ]
+0x0040f104  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_false_fast_path            [ va = 0x%(1)16x ]
+0x0040f005  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_mmio                       [ va = 0x%(1)08x ]
+0x0040f105  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_mmio                       [ va = 0x%(1)16x ]
+0x0040f006  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_fixup                      [ gl1e = 0x%(1)08x, va = 0x%(2)08x, flags = 0x%(3)08x ]
+0x0040f106  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_fixup                      [ gl1e = 0x%(1)16x, va = 0x%(2)16x, flags = 0x%(3)08x ]
+0x0040f007  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_domf_dying                 [ va = 0x%(1)08x ]
+0x0040f107  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_domf_dying                 [ va = 0x%(1)16x ]
+0x0040f008  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate                    [ gl1e = 0x%(1)08x, write_val = 0x%(2)08x, va = 0x%(3)08x, flags = 0x%(4)08x, emulation_count = 0x%(5)08x]
+0x0040f108  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate                    [ gl1e = 0x%(1)16x, write_val = 0x%(2)16x, va = 0x%(3)16x, flags = 0x%(4)08x, emulation_count = 0x%(5)08x]
+0x0040f009  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_unshadow_user      [ va = 0x%(1)08x, gfn = 0x%(2)08x ]
+0x0040f109  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_unshadow_user      [ va = 0x%(1)16x, gfn = 0x%(2)16x ]
+0x0040f00a  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_unshadow_evtinj    [ va = 0x%(1)08x, gfn = 0x%(2)08x ]
+0x0040f10a  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_unshadow_evtinj    [ va = 0x%(1)16x, gfn = 0x%(2)16x ]
+0x0040f00b  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_unshadow_unhandled [ va = 0x%(1)08x, gfn = 0x%(2)08x ]
+0x0040f10b  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_unshadow_unhandled [ va = 0x%(1)16x, gfn = 0x%(2)16x ]
+0x0040f00c  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_wrmap_bf           [ gfn = 0x%(1)08x ]
+0x0040f10c  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_wrmap_bf           [ gfn = 0x%(1)16x ]
+0x0040f00d  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_prealloc_unpin     [ gfn = 0x%(1)08x ]
+0x0040f10d  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_prealloc_unpin     [ gfn = 0x%(1)16x ]
+0x0040f00e  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_resync_full        [ gfn = 0x%(1)08x ]
+0x0040f10e  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_resync_full        [ gfn = 0x%(1)16x ]
+0x0040f00f  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_resync_only        [ gfn = 0x%(1)08x ]
+0x0040f10f  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  shadow_emulate_resync_only        [ gfn = 0x%(1)16x ]
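
Each formats line above is a template consumed by the xentrace_format script: the leading hex value is matched against a record's event ID, and the %(1)..%(7) keys are substituted with the record's data words, with %(cpu) and %(tsc) taken from the record header. As a worked example with hypothetical values, an IO_READ record (ID 0x00082006) captured on CPU 2 with data words 0x03f8 and 1 would render along the lines of:

    0x00082006  CPU2  1234567890 (+     500)  IO_READ     [ port = 0x03f8, size = 1 ]
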
diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xentrace/xentrace.c
--- a/tools/xentrace/xentrace.c Fri Sep 12 14:32:45 2008 +0900
+++ b/tools/xentrace/xentrace.c Fri Sep 12 14:47:40 2008 +0900
@@ -56,6 +56,7 @@ typedef struct settings_st {
     unsigned long tbuf_size;
     unsigned long disk_rsvd;
     unsigned long timeout;
+    unsigned long memory_buffer;
     uint8_t discard:1,
         disable_tracing:1;
 } settings_t;
@@ -67,10 +68,243 @@ static int xc_handle = -1;
 static int xc_handle = -1;
 static int event_fd = -1;
 static int virq_port = -1;
+static int outfd = 1;
 
 static void close_handler(int signal)
 {
     interrupted = 1;
+}
+
+static struct {
+    char * buf;
+    unsigned long prod, cons, size;
+    unsigned long pending_size, pending_prod;
+} membuf = { 0 };
+
+#define MEMBUF_INDEX_RESET_THRESHOLD (1<<29)
+
+/* FIXME -- make a power of 2 so we can mask instead. */
+#define MEMBUF_POINTER(_i) (membuf.buf + ((_i) % membuf.size))
+#define MEMBUF_CONS_INCREMENT(_n)               \
+    do {                                        \
+        membuf.cons += (_n);                    \
+    } while(0)
+#define MEMBUF_PROD_SET(_x)                                             \
+    do {                                                                \
+        if ( (_x) < membuf.prod ) {                                     \
+            fprintf(stderr, "%s: INTERNAL_ERROR: prod %lu, trying to set to 
%lu!\n", \
+                    __func__, membuf.prod, (unsigned long)(_x));        \
+            exit(1);                                                    \
+        }                                                               \
+        membuf.prod = (_x);                                             \
+        if ( (_x) > MEMBUF_INDEX_RESET_THRESHOLD )                      \
+        {                                                               \
+            membuf.prod %= membuf.size;                                 \
+            membuf.cons %= membuf.size;                                 \
+            if( membuf.prod < membuf.cons )                             \
+                membuf.prod += membuf.size;                             \
+        }                                                               \
+    } while(0) 
+
+struct cpu_change_record {
+    uint32_t header;
+    struct {
+        int cpu;
+        unsigned window_size;
+    } data;
+};
+
+#define CPU_CHANGE_HEADER                                           \
+    (TRC_TRACE_CPU_CHANGE                                           \
+     | (((sizeof(struct cpu_change_record)/sizeof(uint32_t)) - 1)   \
+        << TRACE_EXTRA_SHIFT) )
+
+void membuf_alloc(unsigned long size)
+{
+    membuf.buf = malloc(size);
+
+    if(!membuf.buf)
+    {
+        fprintf(stderr, "%s: Couldn't malloc %lu bytes!\n",
+                __func__, size);
+        exit(1);
+    }
+
+    membuf.prod = membuf.cons = 0;
+    membuf.size = size;
+}
+
+/*
+ * Reserve a new window in the buffer.  Move the 'consumer' forward size
+ * bytes, re-adjusting the cpu window sizes as necessary, and insert a
+ * cpu_change record.
+ */
+void membuf_reserve_window(unsigned cpu, unsigned long window_size)
+{
+    struct cpu_change_record *rec;
+    long need_to_consume, free, freed;
+
+    if ( membuf.pending_size > 0 )
+    {
+        fprintf(stderr, "%s: INTERNAL_ERROR: pending_size %lu\n",
+                __func__, membuf.pending_size);
+        exit(1);
+    }
+
+    need_to_consume = window_size + sizeof(*rec);
+
+    if ( window_size > membuf.size )
+    {
+        fprintf(stderr, "%s: reserve size %lu larger than buffer size %lu!\n",
+                __func__, window_size, membuf.size);
+        exit(1);
+    }
+
+    /* Subtract free space already in buffer. */
+    free = membuf.size - (membuf.prod - membuf.cons);
+    if( need_to_consume < free)
+        goto start_window;
+
+    need_to_consume -= free;
+
+    /*
+     * "Free" up full windows until we have enough for this window.
+     * It's a bit wasteful to throw away partial buffers, but the only
+     * other option is to scan through the buffer headers.  Since the
+     * common case is that it's going to be thrown away next anyway, I
+     * think minimizing the overall impact is more important.
+     */
+    do {
+        rec = (struct cpu_change_record *)MEMBUF_POINTER(membuf.cons);
+        if( rec->header != CPU_CHANGE_HEADER )
+        {
+            fprintf(stderr, "%s: INTERNAL ERROR: no cpu_change record at 
consumer!\n",
+                    __func__);
+            exit(EXIT_FAILURE);
+        }
+
+        freed = sizeof(*rec) + rec->data.window_size;
+
+        if ( need_to_consume > 0 )
+        {
+            MEMBUF_CONS_INCREMENT(freed);
+            need_to_consume -= freed;
+        }
+    } while( need_to_consume > 0 );
+
+start_window:
+    /*
+     * Start writing "pending" data.  Update prod once all this data is
+     * written.
+     */
+    membuf.pending_prod = membuf.prod;
+    membuf.pending_size = window_size;
+
+    rec = (struct cpu_change_record *)MEMBUF_POINTER(membuf.pending_prod);
+
+    rec->header = CPU_CHANGE_HEADER;
+    rec->data.cpu = cpu;
+    rec->data.window_size = window_size;
+
+    membuf.pending_prod += sizeof(*rec);
+}
+
+void membuf_write(void *start, unsigned long size) {
+    char * p;
+    unsigned long wsize;
+
+    if( (membuf.size - (membuf.prod - membuf.cons)) < size )
+    {
+        fprintf(stderr, "%s: INTERNAL ERROR: need %lu bytes, only have %lu!\n",
+                __func__, size, membuf.size - (membuf.prod - membuf.cons));
+        exit(1);
+    }
+
+    if( size > membuf.pending_size )
+    {
+        fprintf(stderr, "%s: INTERNAL ERROR: size %lu, pending %lu!\n",
+                __func__, size, membuf.pending_size);
+        exit(1);
+    }
+
+    wsize = size;
+    p = MEMBUF_POINTER(membuf.pending_prod);
+
+    /* If the buffer overlaps the "wrap", do an extra write */
+    if ( p + size > membuf.buf + membuf.size )
+    {
+        int usize = ( membuf.buf + membuf.size ) - p;
+
+        memcpy(p, start, usize);
+
+        start += usize;
+        wsize -= usize;
+        p = membuf.buf;
+    }
+
+    memcpy(p, start, wsize);
+
+    membuf.pending_prod += size;
+    membuf.pending_size -= size;
+
+    if ( membuf.pending_size == 0 )
+    {
+        MEMBUF_PROD_SET(membuf.pending_prod);
+    }
+}
+
+void membuf_dump(void) {
+    /* Dump circular memory buffer */
+    int cons, prod, wsize, written;
+    char * wstart;
+
+    fprintf(stderr, "Dumping memory buffer.\n");
+
+    cons = membuf.cons % membuf.size; 
+    prod = membuf.prod % membuf.size;
+   
+    if(prod > cons)
+    {
+        /* Write in one go */
+        wstart = membuf.buf + cons;
+        wsize = prod - cons;
+
+        written = write(outfd, wstart, wsize);
+        if ( written != wsize )
+        {
+            fprintf(stderr, "Write failed! (size %d, returned %d)\n",
+                    wsize, written);
+            goto fail;
+        }
+    }
+    else
+    {
+        /* Write in two pieces: cons->end, beginning->prod. */
+        wstart = membuf.buf + cons;
+        wsize = membuf.size - cons;
+
+        written = write(outfd, wstart, wsize);
+        if ( written != wsize )
+        {
+            fprintf(stderr, "Write failed! (size %d, returned %d)\n",
+                    wsize, written);
+            goto fail;
+        }
+
+        wstart = membuf.buf;
+        wsize = prod;
+
+        written = write(outfd, wstart, wsize);
+        if ( written != wsize )
+        {
+            fprintf(stderr, "Write failed! (size %d, returned %d)\n",
+                    wsize, written);
+            goto fail;
+        }
+    }
+
+    membuf.cons = membuf.prod = 0;
+    
+    return;
+fail:
+    exit(1);
 }
 
 /**
@@ -85,20 +319,20 @@ static void close_handler(int signal)
  * of the buffer write.
  */
 static void write_buffer(unsigned int cpu, unsigned char *start, int size,
-               int total_size, int outfd)
+                         int total_size)
 {
     struct statvfs stat;
     size_t written = 0;
     
-    if ( opts.disk_rsvd != 0 )
+    if ( opts.memory_buffer == 0 && opts.disk_rsvd != 0 )
     {
         unsigned long long freespace;
 
         /* Check that filesystem has enough space. */
         if ( fstatvfs (outfd, &stat) )
         {
-                fprintf(stderr, "Statfs failed!\n");
-                goto fail;
+            fprintf(stderr, "Statfs failed!\n");
+            goto fail;
         }
 
         freespace = stat.f_frsize * (unsigned long long)stat.f_bfree;
@@ -112,8 +346,8 @@ static void write_buffer(unsigned int cp
 
         if ( freespace <= opts.disk_rsvd )
         {
-                fprintf(stderr, "Disk space limit reached (free space: %lluMB, 
limit: %luMB).\n", freespace, opts.disk_rsvd);
-                exit (EXIT_FAILURE);
+            fprintf(stderr, "Disk space limit reached (free space: %lluMB, 
limit: %luMB).\n", freespace, opts.disk_rsvd);
+            exit (EXIT_FAILURE);
         }
     }
 
@@ -122,40 +356,46 @@ static void write_buffer(unsigned int cp
      * first write. */
     if ( total_size != 0 )
     {
-        struct {
-            uint32_t header;
-            struct {
-                unsigned cpu;
-                unsigned byte_count;
-            } extra;
-        } rec;
-
-        rec.header = TRC_TRACE_CPU_CHANGE
-            | ((sizeof(rec.extra)/sizeof(uint32_t)) << TRACE_EXTRA_SHIFT);
-        rec.extra.cpu = cpu;
-        rec.extra.byte_count = total_size;
-
-        written = write(outfd, &rec, sizeof(rec));
-
-        if ( written != sizeof(rec) )
-        {
-            fprintf(stderr, "Cannot write cpu change (write returned %zd)\n",
-                    written);
+        if ( opts.memory_buffer )
+        {
+            membuf_reserve_window(cpu, total_size);
+        }
+        else
+        {
+            struct cpu_change_record rec;
+
+            rec.header = CPU_CHANGE_HEADER;
+            rec.data.cpu = cpu;
+            rec.data.window_size = total_size;
+
+            written = write(outfd, &rec, sizeof(rec));
+            if ( written != sizeof(rec) )
+            {
+                fprintf(stderr, "Cannot write cpu change (write returned 
%zd)\n",
+                        written);
+                goto fail;
+            }
+        }
+    }
+
+    if ( opts.memory_buffer )
+    {
+        membuf_write(start, size);
+    }
+    else
+    {
+        written = write(outfd, start, size);
+        if ( written != size )
+        {
+            fprintf(stderr, "Write failed! (size %d, returned %zd)\n",
+                    size, written);
             goto fail;
         }
     }
 
-    written = write(outfd, start, size);
-    if ( written != size )
-    {
-        fprintf(stderr, "Write failed! (size %d, returned %zd)\n",
-                size, written);
-        goto fail;
-    }
-
     return;
 
- fail:
+fail:
     PERROR("Failed to write trace data");
     exit(EXIT_FAILURE);
 }
@@ -394,7 +634,7 @@ static void wait_for_event_or_timeout(un
  * monitor_tbufs - monitor the contents of tbufs and output to a file
  * @logfile:       the FILE * representing the file to log to
  */
-static int monitor_tbufs(int outfd)
+static int monitor_tbufs(void)
 {
     int i;
 
@@ -429,9 +669,9 @@ static int monitor_tbufs(int outfd)
             meta[i]->cons = meta[i]->prod;
 
     /* now, scan buffers for events */
-    while ( !interrupted )
-    {
-        for ( i = 0; (i < num) && !interrupted; i++ )
+    while ( 1 )
+    {
+        for ( i = 0; i < num; i++ )
         {
             unsigned long start_offset, end_offset, window_size, cons, prod;
                 
@@ -463,8 +703,7 @@ static int monitor_tbufs(int outfd)
                 /* If window does not wrap, write in one big chunk */
                 write_buffer(i, data[i]+start_offset,
                              window_size,
-                             window_size,
-                             outfd);
+                             window_size);
             }
             else
             {
@@ -474,23 +713,28 @@ static int monitor_tbufs(int outfd)
                  */
                 write_buffer(i, data[i] + start_offset,
                              data_size - start_offset,
-                             window_size,
-                             outfd);
+                             window_size);
                 write_buffer(i, data[i],
                              end_offset,
-                             0,
-                             outfd);
+                             0);
             }
 
             xen_mb(); /* read buffer, then update cons. */
             meta[i]->cons = prod;
-        }
+
+        }
+
+        if ( interrupted )
+            break;
 
         wait_for_event_or_timeout(opts.poll_sleep);
     }
 
-    if(opts.disable_tracing)
+    if ( opts.disable_tracing )
         disable_tbufs();
+
+    if ( opts.memory_buffer )
+        membuf_dump();
 
     /* cleanup */
     free(meta);
@@ -538,6 +782,8 @@ static void usage(void)
 "  -T  --time-interval=s   Run xentrace for s seconds and quit.\n" \
 "  -?, --help              Show this message\n" \
 "  -V, --version           Print program version\n" \
+"  -M, --memory-buffer=b   Copy trace records to a circular memory buffer.\n" \
+"                          Dump to file on exit.\n" \
 "\n" \
 "This tool is used to capture trace buffer data from Xen. The\n" \
 "data is output in a binary format, in the following order:\n" \
@@ -551,6 +797,53 @@ static void usage(void)
     printf("\nReport bugs to %s\n", program_bug_address);
 
     exit(EXIT_FAILURE);
+}
+
+/* convert the argument string pointed to by arg to a long int representation,
+ * including suffixes such as 'M' and 'k'. */
+#define MB (1024*1024)
+#define KB (1024)
+long sargtol(const char *restrict arg, int base)
+{
+    char *endp;
+    long val;
+
+    errno = 0;
+    val = strtol(arg, &endp, base);
+    
+    if ( errno != 0 )
+    {
+        fprintf(stderr, "Invalid option argument: %s\n", arg);
+        fprintf(stderr, "Error: %s\n\n", strerror(errno));
+        usage();
+    }
+    else if (endp == arg)
+    {
+        goto invalid;
+    }
+
+    switch(*endp)
+    {
+    case '\0':
+        break;
+    case 'M':
+        val *= MB;
+        break;
+    case 'K':
+    case 'k':
+        val *= KB;
+        break;
+    default:
+        fprintf(stderr, "Unknown suffix %c\n", *endp);
+        exit(1);
+    }
+
+    return val;
+
+invalid:
+    fprintf(stderr, "Invalid option argument: %s\n\n", arg);
+    usage();
+    return 0;
 }
 
/* convert the argument string pointed to by arg to a long int representation */
@@ -606,6 +899,7 @@ static void parse_args(int argc, char **
         { "trace-buf-size", required_argument, 0, 'S' },
         { "reserve-disk-space", required_argument, 0, 'r' },
         { "time-interval",  required_argument, 0, 'T' },
+        { "memory-buffer",  required_argument, 0, 'M' },
         { "discard-buffers", no_argument,      0, 'D' },
         { "dont-disable-tracing", no_argument, 0, 'x' },
         { "help",           no_argument,       0, '?' },
@@ -613,7 +907,7 @@ static void parse_args(int argc, char **
         { 0, 0, 0, 0 }
     };
 
-    while ( (option = getopt_long(argc, argv, "c:e:s:S:t:?V",
+    while ( (option = getopt_long(argc, argv, "t:s:c:e:S:r:T:M:Dx?V",
                     long_options, NULL)) != -1) 
     {
         switch ( option )
@@ -653,6 +947,10 @@ static void parse_args(int argc, char **
 
         case 'T':
             opts.timeout = argtol(optarg, 0);
+            break;
+
+        case 'M':
+            opts.memory_buffer = sargtol(optarg, 0);
             break;
 
         default:
@@ -674,7 +972,7 @@ static void parse_args(int argc, char **
 
 int main(int argc, char **argv)
 {
-    int outfd = 1, ret;
+    int ret;
     struct sigaction act;
 
     opts.outfile = 0;
@@ -719,6 +1017,9 @@ int main(int argc, char **argv)
         fprintf(stderr, "Cannot output to a TTY, specify a log file.\n");
         exit(EXIT_FAILURE);
     }
+
+    if ( opts.memory_buffer > 0 )
+        membuf_alloc(opts.memory_buffer);
 
     /* ensure that if we get a signal, we'll do cleanup, then exit */
     act.sa_handler = close_handler;
@@ -729,7 +1030,7 @@ int main(int argc, char **argv)
     sigaction(SIGINT,  &act, NULL);
     sigaction(SIGALRM, &act, NULL);
 
-    ret = monitor_tbufs(outfd);
+    ret = monitor_tbufs();
 
     return ret;
 }
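
The core of the new -M mode is the wrap-around copy in membuf_write(): a write that crosses the end of the allocation is split into a tail piece and a head piece. A self-contained sketch of just that mechanic, assuming the caller has already reserved room (the real code layers cpu_change records and a pending window on top of this):

    #include <string.h>

    struct ring { char *buf; unsigned long size, prod; };

    /* Copy len bytes at the producer index, splitting the copy when it
     * crosses the physical end of the buffer.  Indices only ever grow and
     * are reduced modulo size on access, as in MEMBUF_POINTER above. */
    static void ring_put(struct ring *r, const void *src, unsigned long len)
    {
        unsigned long off = r->prod % r->size;
        unsigned long tail = r->size - off;            /* room before the wrap */

        if ( len <= tail )
            memcpy(r->buf + off, src, len);
        else
        {
            memcpy(r->buf + off, src, tail);                      /* up to the end */
            memcpy(r->buf, (const char *)src + tail, len - tail); /* rest at start */
        }
        r->prod += len;
    }
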
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/Makefile
--- a/xen/arch/x86/acpi/Makefile        Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/Makefile        Fri Sep 12 14:47:40 2008 +0900
@@ -1,5 +1,5 @@ subdir-y += cpufreq
 subdir-y += cpufreq
 
 obj-y += boot.o
-obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
+obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o cpuidle_menu.o
 obj-y += pmstat.o
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c      Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/cpu_idle.c      Fri Sep 12 14:47:40 2008 +0900
@@ -39,6 +39,7 @@
 #include <xen/smp.h>
 #include <xen/guest_access.h>
 #include <xen/keyhandler.h>
+#include <xen/cpuidle.h>
 #include <asm/cache.h>
 #include <asm/io.h>
 #include <asm/hpet.h>
@@ -49,12 +50,9 @@
 #define DEBUG_PM_CX
 
 #define US_TO_PM_TIMER_TICKS(t)     ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICKS_TO_US(t)     ((t * 1000) / (PM_TIMER_FREQUENCY / 1000))
 #define C2_OVERHEAD         4   /* 1us (3.579 ticks per us) */
 #define C3_OVERHEAD         4   /* 1us (3.579 ticks per us) */
-
-#define ACPI_PROCESSOR_MAX_POWER        8
-#define ACPI_PROCESSOR_MAX_C2_LATENCY   100
-#define ACPI_PROCESSOR_MAX_C3_LATENCY   1000
 
 static void (*lapic_timer_off)(void);
 static void (*lapic_timer_on)(void);
@@ -65,66 +63,6 @@ static void (*pm_idle_save) (void) __rea
 static void (*pm_idle_save) (void) __read_mostly;
 unsigned int max_cstate __read_mostly = 2;
 integer_param("max_cstate", max_cstate);
-/*
- * bm_history -- bit-mask with a bit per jiffy of bus-master activity
- * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
- * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
- * 100 HZ: 0x0000000F: 4 jiffies = 40ms
- * reduce history for more aggressive entry into C3
- */
-unsigned int bm_history __read_mostly =
-    (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
-integer_param("bm_history", bm_history);
-
-struct acpi_processor_cx;
-
-struct acpi_processor_cx_policy
-{
-    u32 count;
-    struct acpi_processor_cx *state;
-    struct
-    {
-        u32 time;
-        u32 ticks;
-        u32 count;
-        u32 bm;
-    } threshold;
-};
-
-struct acpi_processor_cx
-{
-    u8 valid;
-    u8 type;
-    u32 address;
-    u8 space_id;
-    u32 latency;
-    u32 latency_ticks;
-    u32 power;
-    u32 usage;
-    u64 time;
-    struct acpi_processor_cx_policy promotion;
-    struct acpi_processor_cx_policy demotion;
-};
-
-struct acpi_processor_flags
-{
-    u8 bm_control:1;
-    u8 bm_check:1;
-    u8 has_cst:1;
-    u8 power_setup_done:1;
-    u8 bm_rld_set:1;
-};
-
-struct acpi_processor_power
-{
-    struct acpi_processor_flags flags;
-    struct acpi_processor_cx *state;
-    s_time_t bm_check_timestamp;
-    u32 default_state;
-    u32 bm_activity;
-    u32 count;
-    struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
-};
 
 static struct acpi_processor_power processor_powers[NR_CPUS];
 
@@ -133,26 +71,21 @@ static void print_acpi_power(uint32_t cp
     uint32_t i;
 
     printk("==cpu%d==\n", cpu);
-    printk("active state:\t\tC%d\n", (power->state)?power->state->type:-1);
+    printk("active state:\t\tC%d\n",
+           (power->last_state) ? power->last_state->type : -1);
     printk("max_cstate:\t\tC%d\n", max_cstate);
-    printk("bus master activity:\t%08x\n", power->bm_activity);
     printk("states:\n");
     
     for ( i = 1; i < power->count; i++ )
     {
-        printk((power->states[i].type == power->state->type) ? "   *" : "    
");
+        if ( power->last_state && 
+             power->states[i].type == power->last_state->type )
+            printk("   *");
+        else
+            printk("    ");
         printk("C%d:\t\t", i);
         printk("type[C%d] ", power->states[i].type);
-        if ( power->states[i].promotion.state )
-            printk("promotion[C%d] ", power->states[i].promotion.state->type);
-        else
-            printk("promotion[--] ");
-        if ( power->states[i].demotion.state )
-            printk("demotion[C%d] ", power->states[i].demotion.state->type);
-        else
-            printk("demotion[--] ");
-        printk("latency[%03d]\n ", power->states[i].latency);
-        printk("\t\t\t");
+        printk("latency[%03d] ", power->states[i].latency);
         printk("usage[%08d] ", power->states[i].usage);
         printk("duration[%"PRId64"]\n", power->states[i].time);
     }
@@ -180,48 +113,6 @@ static inline u32 ticks_elapsed(u32 t1, 
         return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
     else
         return ((0xFFFFFFFF - t1) + t2);
-}
-
-static void acpi_processor_power_activate(struct acpi_processor_power *power,
-                                          struct acpi_processor_cx *new)
-{
-    struct acpi_processor_cx *old;
-
-    if ( !power || !new )
-        return;
-
-    old = power->state;
-
-    if ( old )
-        old->promotion.count = 0;
-    new->demotion.count = 0;
-
-    /* Cleanup from old state. */
-    if ( old )
-    {
-        switch ( old->type )
-        {
-        case ACPI_STATE_C3:
-            /* Disable bus master reload */
-            if ( new->type != ACPI_STATE_C3 && power->flags.bm_check )
-                acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
-            break;
-        }
-    }
-
-    /* Prepare to use new state. */
-    switch ( new->type )
-    {
-    case ACPI_STATE_C3:
-        /* Enable bus master reload */
-        if ( old->type != ACPI_STATE_C3 && power->flags.bm_check )
-            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
-        break;
-    }
-
-    power->state = new;
-
-    return;
 }
 
 static void acpi_safe_halt(void)
@@ -263,13 +154,50 @@ static void acpi_idle_do_entry(struct ac
     }
 }
 
-static atomic_t c3_cpu_count;
+static inline void acpi_idle_update_bm_rld(struct acpi_processor_power *power,
+                                           struct acpi_processor_cx *target)
+{
+    if ( !power->flags.bm_check )
+        return;
+
+    if ( power->flags.bm_rld_set && target->type != ACPI_STATE_C3 )
+    {
+        acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
+        power->flags.bm_rld_set = 0;
+    }
+
+    if ( !power->flags.bm_rld_set && target->type == ACPI_STATE_C3 )
+    {
+        acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
+        power->flags.bm_rld_set = 1;
+    }
+}
+
+static int acpi_idle_bm_check(void)
+{
+    u32 bm_status = 0;
+
+    acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
+    if ( bm_status )
+        acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
+    /*
+     * TBD: PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
+     * the true state of bus mastering activity; forcing us to
+     * manually check the BMIDEA bit of each IDE channel.
+     */
+    return bm_status;
+}
+
+static struct {
+    spinlock_t lock;
+    unsigned int count;
+} c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED };
 
 static void acpi_processor_idle(void)
 {
     struct acpi_processor_power *power = NULL;
     struct acpi_processor_cx *cx = NULL;
-    struct acpi_processor_cx *next_state = NULL;
+    int next_state;
     int sleep_ticks = 0;
     u32 t1, t2 = 0;
 
@@ -287,7 +215,16 @@ static void acpi_processor_idle(void)
         return;
     }
 
-    cx = power->state;
+    next_state = cpuidle_current_governor->select(power);
+    if ( next_state > 0 )
+    {
+        cx = &power->states[next_state];
+        if ( power->flags.bm_check && acpi_idle_bm_check()
+             && cx->type == ACPI_STATE_C3 )
+            cx = power->safe_state;
+        if ( cx->type > max_cstate )
+            cx = &power->states[max_cstate];
+    }
     if ( !cx )
     {
         if ( pm_idle_save )
@@ -303,69 +240,14 @@ static void acpi_processor_idle(void)
         return;
     }
 
-    /*
-     * Check BM Activity
-     * -----------------
-     * Check for bus mastering activity (if required), record, and check
-     * for demotion.
-     */
-    if ( power->flags.bm_check )
-    {
-        u32 bm_status = 0;
-        unsigned long diff = (NOW() - power->bm_check_timestamp) >> 23;
-
-        if ( diff > 31 )
-            diff = 31;
-
-        power->bm_activity <<= diff;
-
-        acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
-        if ( bm_status )
-        {
-            power->bm_activity |= 0x1;
-            acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
-        }
-        /*
-         * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
-         * the true state of bus mastering activity; forcing us to
-         * manually check the BMIDEA bit of each IDE channel.
-         */
-        /*else if ( errata.piix4.bmisx )
-        {
-            if ( (inb_p(errata.piix4.bmisx + 0x02) & 0x01)
-                || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01) )
-                pr->power.bm_activity |= 0x1;
-        }*/
-
-        power->bm_check_timestamp = NOW();
-
-        /*
-         * If bus mastering is or was active this jiffy, demote
-         * to avoid a faulty transition.  Note that the processor
-         * won't enter a low-power state during this call (to this
-         * function) but should upon the next.
-         *
-         * TBD: A better policy might be to fallback to the demotion
-         *      state (use it for this quantum only) istead of
-         *      demoting -- and rely on duration as our sole demotion
-         *      qualification.  This may, however, introduce DMA
-         *      issues (e.g. floppy DMA transfer overrun/underrun).
-         */
-        if ( (power->bm_activity & 0x1) && cx->demotion.threshold.bm )
-        {
-            local_irq_enable();
-            next_state = cx->demotion.state;
-            goto end;
-        }
-    }
+    power->last_state = cx;
 
     /*
      * Sleep:
      * ------
      * Invoke the current Cx state to put the processor to sleep.
      */
-    if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 )
-        smp_mb__after_clear_bit();
+    acpi_idle_update_bm_rld(power, cx);
 
     switch ( cx->type )
     {
@@ -399,8 +281,7 @@ static void acpi_processor_idle(void)
         /* Re-enable interrupts */
         local_irq_enable();
         /* Compute time (ticks) that we were actually asleep */
-        sleep_ticks =
-            ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+        sleep_ticks = ticks_elapsed(t1, t2);
         break;
 
     case ACPI_STATE_C3:
@@ -416,8 +297,8 @@ static void acpi_processor_idle(void)
          */
         if ( power->flags.bm_check && power->flags.bm_control )
         {
-            atomic_inc(&c3_cpu_count);
-            if ( atomic_read(&c3_cpu_count) == num_online_cpus() )
+            spin_lock(&c3_cpu_status.lock);
+            if ( ++c3_cpu_status.count == num_online_cpus() )
             {
                 /*
                  * All CPUs are trying to go to C3
@@ -425,6 +306,7 @@ static void acpi_processor_idle(void)
                  */
                 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
             }
+            spin_unlock(&c3_cpu_status.lock);
         }
         else if ( !power->flags.bm_check )
         {
@@ -455,8 +337,10 @@ static void acpi_processor_idle(void)
         if ( power->flags.bm_check && power->flags.bm_control )
         {
             /* Enable bus master arbitration */
-            atomic_dec(&c3_cpu_count);
-            acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+            spin_lock(&c3_cpu_status.lock);
+            if ( c3_cpu_status.count-- == num_online_cpus() )
+                acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
+            spin_unlock(&c3_cpu_status.lock);
         }
 
         /* Re-enable interrupts */
@@ -465,8 +349,6 @@ static void acpi_processor_idle(void)
         lapic_timer_on();
         /* Compute time (ticks) that we were actually asleep */
         sleep_ticks = ticks_elapsed(t1, t2);
-        /* Do not account our idle-switching overhead: */
-        sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
 
         break;
 
@@ -476,163 +358,14 @@ static void acpi_processor_idle(void)
     }
 
     cx->usage++;
-    if ( (cx->type != ACPI_STATE_C1) && (sleep_ticks > 0) )
+    if ( sleep_ticks > 0 )
+    {
+        power->last_residency = PM_TIMER_TICKS_TO_US(sleep_ticks);
         cx->time += sleep_ticks;
-
-    next_state = power->state;
-
-    /*
-     * Promotion?
-     * ----------
-     * Track the number of longs (time asleep is greater than threshold)
-     * and promote when the count threshold is reached.  Note that bus
-     * mastering activity may prevent promotions.
-     * Do not promote above max_cstate.
-     */
-    if ( cx->promotion.state &&
-         ((cx->promotion.state - power->states) <= max_cstate) )
-    {
-        if ( sleep_ticks > cx->promotion.threshold.ticks )
-        {
-            cx->promotion.count++;
-            cx->demotion.count = 0;
-            if ( cx->promotion.count >= cx->promotion.threshold.count )
-            {
-                if ( power->flags.bm_check )
-                {
-                    if ( !(power->bm_activity & cx->promotion.threshold.bm) )
-                    {
-                        next_state = cx->promotion.state;
-                        goto end;
-                    }
-                }
-                else
-                {
-                    next_state = cx->promotion.state;
-                    goto end;
-                }
-            }
-        }
-    }
-
-    /*
-     * Demotion?
-     * ---------
-     * Track the number of shorts (time asleep is less than time threshold)
-     * and demote when the usage threshold is reached.
-     */
-    if ( cx->demotion.state )
-    {
-        if ( sleep_ticks < cx->demotion.threshold.ticks )
-        {
-            cx->demotion.count++;
-            cx->promotion.count = 0;
-            if ( cx->demotion.count >= cx->demotion.threshold.count )
-            {
-                next_state = cx->demotion.state;
-                goto end;
-            }
-        }
-    }
-
-end:
-    /*
-     * Demote if current state exceeds max_cstate
-     */
-    if ( (power->state - power->states) > max_cstate )
-    {
-        if ( cx->demotion.state )
-            next_state = cx->demotion.state;
-    }
-
-    /*
-     * New Cx State?
-     * -------------
-     * If we're going to start using a new Cx state we must clean up
-     * from the previous and prepare to use the new.
-     */
-    if ( next_state != power->state )
-        acpi_processor_power_activate(power, next_state);
-}
-
-static int acpi_processor_set_power_policy(struct acpi_processor_power *power)
-{
-    unsigned int i;
-    unsigned int state_is_set = 0;
-    struct acpi_processor_cx *lower = NULL;
-    struct acpi_processor_cx *higher = NULL;
-    struct acpi_processor_cx *cx;
-
-    if ( !power )
-        return -EINVAL;
-
-    /*
-     * This function sets the default Cx state policy (OS idle handler).
-     * Our scheme is to promote quickly to C2 but more conservatively
-     * to C3.  We're favoring C2  for its characteristics of low latency
-     * (quick response), good power savings, and ability to allow bus
-     * mastering activity.  Note that the Cx state policy is completely
-     * customizable and can be altered dynamically.
-     */
-
-    /* startup state */
-    for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
-    {
-        cx = &power->states[i];
-        if ( !cx->valid )
-            continue;
-
-        if ( !state_is_set )
-            power->state = cx;
-        state_is_set++;
-        break;
-    }
-
-    if ( !state_is_set )
-        return -ENODEV;
-
-    /* demotion */
-    for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ )
-    {
-        cx = &power->states[i];
-        if ( !cx->valid )
-            continue;
-
-        if ( lower )
-        {
-            cx->demotion.state = lower;
-            cx->demotion.threshold.ticks = cx->latency_ticks;
-            cx->demotion.threshold.count = 1;
-            if ( cx->type == ACPI_STATE_C3 )
-                cx->demotion.threshold.bm = bm_history;
-        }
-
-        lower = cx;
-    }
-
-    /* promotion */
-    for ( i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i-- )
-    {
-        cx = &power->states[i];
-        if ( !cx->valid )
-            continue;
-
-        if ( higher )
-        {
-            cx->promotion.state = higher;
-            cx->promotion.threshold.ticks = cx->latency_ticks;
-            if ( cx->type >= ACPI_STATE_C2 )
-                cx->promotion.threshold.count = 4;
-            else
-                cx->promotion.threshold.count = 10;
-            if ( higher->type == ACPI_STATE_C3 )
-                cx->promotion.threshold.bm = bm_history;
-        }
-
-        higher = cx;
-    }
-
-    return 0;
+    }
+
+    if ( cpuidle_current_governor->reflect )
+        cpuidle_current_governor->reflect(power);
 }
 
 static int init_cx_pminfo(struct acpi_processor_power *acpi_power)
@@ -821,6 +554,8 @@ static int check_cx(struct acpi_processo
     return 0;
 }
 
+static unsigned int latency_factor = 2;
+
 static void set_cx(
     struct acpi_processor_power *acpi_power,
     xen_processor_cx_t *xen_cx)
@@ -842,6 +577,9 @@ static void set_cx(
     cx->power    = xen_cx->power;
     
     cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
+    cx->target_residency = cx->latency * latency_factor;
+    if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 )
+        acpi_power->safe_state = cx;
 }
 
 int get_cpu_id(u8 acpi_id)
@@ -936,6 +674,7 @@ long set_cx_pminfo(uint32_t cpu, struct 
 
     init_cx_pminfo(acpi_power);
 
+    acpi_power->cpu = cpu_id;
     acpi_power->flags.bm_check = power->flags.bm_check;
     acpi_power->flags.bm_control = power->flags.bm_control;
     acpi_power->flags.has_cst = power->flags.has_cst;
@@ -950,10 +689,11 @@ long set_cx_pminfo(uint32_t cpu, struct 
         set_cx(acpi_power, &xen_cx);
     }
 
+    if ( cpuidle_current_governor->enable &&
+         cpuidle_current_governor->enable(acpi_power) )
+        return -EFAULT;
+
     /* FIXME: C-state dependency is not supported so far */
-    
-    /* initialize default policy */
-    acpi_processor_set_power_policy(acpi_power);
 
     print_acpi_power(cpu_id, acpi_power);
 
@@ -978,7 +718,7 @@ int pmstat_get_cx_stat(uint32_t cpuid, s
     uint64_t usage;
     int i;
 
-    stat->last = (power->state) ? power->state->type : 0;
+    stat->last = (power->last_state) ? power->last_state->type : 0;
     stat->nr = processor_powers[cpuid].count;
     stat->idle_time = v->runstate.time[RUNSTATE_running];
     if ( v->is_running )
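
The hunks above reduce the idle path to three governor hooks: ->enable() at
C-state registration, ->select() when going idle, ->reflect() on wakeup. A
minimal user-space sketch of that shape (everything except the hook names is
invented for illustration, and the Xen structures are replaced by stand-ins):

    #include <stdio.h>

    struct sketch_power { int last_idx; };

    struct sketch_governor {
        const char *name;
        int  (*enable)(struct sketch_power *p);
        int  (*select)(struct sketch_power *p);
        void (*reflect)(struct sketch_power *p);
    };

    static int  dummy_enable(struct sketch_power *p) { p->last_idx = 0; return 0; }
    static int  dummy_select(struct sketch_power *p) { return 1; /* always pick C1 */ }
    static void dummy_reflect(struct sketch_power *p) { /* residency feedback here */ }

    static struct sketch_governor gov = {
        .name = "dummy", .enable = dummy_enable,
        .select = dummy_select, .reflect = dummy_reflect,
    };

    int main(void)
    {
        struct sketch_power power;

        if ( gov.enable && gov.enable(&power) )  /* mirrors set_cx_pminfo() */
            return 1;
        power.last_idx = gov.select(&power);     /* mirrors the idle loop */
        if ( gov.reflect )
            gov.reflect(&power);
        printf("governor %s chose state %d\n", gov.name, power.last_idx);
        return 0;
    }
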
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpufreq/cpufreq.c
--- a/xen/arch/x86/acpi/cpufreq/cpufreq.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c       Fri Sep 12 14:47:40 2008 +0900
@@ -48,7 +48,7 @@ struct cpufreq_policy xen_px_policy[NR_C
 struct cpufreq_policy xen_px_policy[NR_CPUS];
 
 static cpumask_t *cpufreq_dom_pt;
-static cpumask_t cpufreq_dom_mask;
+static unsigned long *cpufreq_dom_mask;
 static unsigned int cpufreq_dom_max;
 
 enum {
@@ -562,7 +562,8 @@ void cpufreq_dom_exit(void)
 void cpufreq_dom_exit(void)
 {
     cpufreq_dom_max = 0;
-    cpus_clear(cpufreq_dom_mask);
+    if (cpufreq_dom_mask)
+        xfree(cpufreq_dom_mask);
     if (cpufreq_dom_pt)
         xfree(cpufreq_dom_pt);
 }
@@ -572,22 +573,28 @@ int cpufreq_dom_init(void)
     unsigned int i;
 
     cpufreq_dom_max = 0;
-    cpus_clear(cpufreq_dom_mask);
 
     for_each_online_cpu(i) {
-        cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
         if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain)
             cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain;
     }
     cpufreq_dom_max++;
+
+    cpufreq_dom_mask = xmalloc_array(unsigned long,
+                                     BITS_TO_LONGS(cpufreq_dom_max));
+    if (!cpufreq_dom_mask)
+        return -ENOMEM;
+    bitmap_zero(cpufreq_dom_mask, cpufreq_dom_max);
 
     cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max);
     if (!cpufreq_dom_pt)
         return -ENOMEM;
     memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t));
 
-    for_each_online_cpu(i)
+    for_each_online_cpu(i) {
+        __set_bit(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask);
         cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]);
+    }
 
     for_each_online_cpu(i)
         processor_pminfo[i].perf.shared_cpu_map =
@@ -616,10 +623,11 @@ static int cpufreq_cpu_init(void)
 
 int cpufreq_dom_dbs(unsigned int event)
 {
-    int cpu, dom, ret = 0;
-
-    for (dom=0; dom<cpufreq_dom_max; dom++) {
-        if (!cpu_isset(dom, cpufreq_dom_mask))
+    unsigned int cpu, dom;
+    int ret = 0;
+
+    for (dom = 0; dom < cpufreq_dom_max; dom++) {
+        if (!test_bit(dom, cpufreq_dom_mask))
             continue;
         cpu = first_cpu(cpufreq_dom_pt[dom]);
         ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event);
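
The cpufreq.c change swaps a fixed cpumask_t (capped at NR_CPUS bits) for a
bitmap sized at runtime from cpufreq_dom_max. A user-space sketch of the same
allocation arithmetic, with plain calloc standing in for xmalloc_array and
hand-rolled bit helpers standing in for __set_bit/test_bit:

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define BITS_TO_LONGS(n) (((n) + CHAR_BIT * sizeof(long) - 1) / \
                              (CHAR_BIT * sizeof(long)))

    static void set_bit_ul(unsigned int b, unsigned long *map)
    {
        map[b / (CHAR_BIT * sizeof(long))] |= 1UL << (b % (CHAR_BIT * sizeof(long)));
    }

    static int test_bit_ul(unsigned int b, const unsigned long *map)
    {
        return !!(map[b / (CHAR_BIT * sizeof(long))] &
                  (1UL << (b % (CHAR_BIT * sizeof(long)))));
    }

    int main(void)
    {
        unsigned int dom_max = 130;    /* a domain id beyond a small cpumask_t */
        unsigned long *mask = calloc(BITS_TO_LONGS(dom_max), sizeof(long));

        if ( !mask )
            return 1;                  /* -ENOMEM in the hypervisor */
        set_bit_ul(129, mask);
        printf("dom 129: %d, dom 0: %d\n",
               test_bit_ul(129, mask), test_bit_ul(0, mask));
        free(mask);
        return 0;
    }
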
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpufreq/powernow.c
--- a/xen/arch/x86/acpi/cpufreq/powernow.c      Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/cpufreq/powernow.c      Fri Sep 12 14:47:40 2008 +0900
@@ -197,8 +197,8 @@ static int powernow_cpufreq_cpu_init(str
 
     data->max_freq = perf->states[0].core_frequency * 1000;
     /* table init */
-    for (i=0; i<perf->state_count && i<max_hw_pstate; i++) {
-        if (i>0 && perf->states[i].core_frequency >=
+    for (i = 0; i < perf->state_count && i <= max_hw_pstate; i++) {
+        if (i > 0 && perf->states[i].core_frequency >=
             data->freq_table[valid_states-1].frequency / 1000)
             continue;
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpuidle_menu.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/acpi/cpuidle_menu.c  Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,132 @@
+/*
+ * cpuidle_menu - menu governor for cpu idle; the main idea comes from
+ *            Linux's drivers/cpuidle/governors/menu.c
+ *
+ *  Copyright (C) 2006-2007 Adam Belay <abelay@xxxxxxxxxx>
+ *  Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/acpi.h>
+#include <xen/timer.h>
+#include <xen/cpuidle.h>
+
+#define BREAK_FUZZ      4       /* 4 us */
+#define USEC_PER_SEC 1000000
+
+struct menu_device
+{
+    int             last_state_idx;
+    unsigned int    expected_us;
+    unsigned int    predicted_us;
+    unsigned int    last_measured_us;
+    unsigned int    elapsed_us;
+};
+
+static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
+static s_time_t get_sleep_length_ns(void)
+{
+    return per_cpu(timer_deadline, smp_processor_id()) - NOW();
+}
+
+static int menu_select(struct acpi_processor_power *power)
+{
+    struct menu_device *data = &__get_cpu_var(menu_devices);
+    int i;
+
+    /* determine the expected residency time */
+    data->expected_us = (u32) get_sleep_length_ns() / 1000;
+
+    /* find the deepest idle state that satisfies our constraints */
+    for ( i = 1; i < power->count; i++ )
+    {
+        struct acpi_processor_cx *s = &power->states[i];
+
+        if ( s->target_residency > data->expected_us + s->latency )
+            break;
+        if ( s->target_residency > data->predicted_us )
+            break;
+        /* TBD: we need to check the QoS requirement in the future */
+    }
+
+    data->last_state_idx = i - 1;
+    return i - 1;
+}
+
+static void menu_reflect(struct acpi_processor_power *power)
+{
+    struct menu_device *data = &__get_cpu_var(menu_devices);
+    struct acpi_processor_cx *target = &power->states[data->last_state_idx];
+    unsigned int last_residency; 
+    unsigned int measured_us;
+
+    /*
+     * Ugh, this idle state doesn't support residency measurements, so we
+     * are basically lost in the dark.  As a compromise, assume we slept
+     * for one full standard timer tick.  However, be aware that this
+     * could potentially result in a suboptimal state transition.
+     */
+    if ( target->type == ACPI_STATE_C1 )
+        last_residency = USEC_PER_SEC / HZ;
+    else
+        last_residency = power->last_residency;
+
+    measured_us = last_residency + data->elapsed_us;
+
+    /* if wrapping, set to max uint (-1) */
+    measured_us = data->elapsed_us <= measured_us ? measured_us : -1;
+
+    /* Predict time remaining until next break event */
+    data->predicted_us = max(measured_us, data->last_measured_us);
+
+    /* Distinguish between expected & non-expected events */
+    if ( last_residency + BREAK_FUZZ
+         < data->expected_us + target->latency )
+    {
+        data->last_measured_us = measured_us;
+        data->elapsed_us = 0;
+    }
+    else
+        data->elapsed_us = measured_us;
+}
+
+static int menu_enable_device(struct acpi_processor_power *power)
+{
+    struct menu_device *data = &per_cpu(menu_devices, power->cpu);
+
+    memset(data, 0, sizeof(struct menu_device));
+
+    return 0;
+}
+
+static struct cpuidle_governor menu_governor =
+{
+    .name =         "menu",
+    .rating =       20,
+    .enable =       menu_enable_device,
+    .select =       menu_select,
+    .reflect =      menu_reflect,
+};
+
+struct cpuidle_governor *cpuidle_current_governor = &menu_governor;
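
To see what menu_reflect()'s bookkeeping buys: unexpected early wakeups
accumulate into elapsed_us, so predicted_us shrinks and the next select()
picks a shallower state. A self-contained sketch with made-up residency
numbers (max() is open-coded since there is no stdlib equivalent):

    #include <stdio.h>

    #define BREAK_FUZZ 4

    struct menu {
        unsigned int expected_us, predicted_us, last_measured_us, elapsed_us;
    };

    static void reflect(struct menu *m, unsigned int last_residency,
                        unsigned int latency)
    {
        unsigned int measured_us = last_residency + m->elapsed_us;

        if ( measured_us < m->elapsed_us )    /* wrapped: saturate */
            measured_us = -1;
        m->predicted_us = measured_us > m->last_measured_us
                          ? measured_us : m->last_measured_us;
        if ( last_residency + BREAK_FUZZ < m->expected_us + latency )
        {
            m->last_measured_us = measured_us; /* unexpected early wakeup */
            m->elapsed_us = 0;
        }
        else
            m->elapsed_us = measured_us;       /* expected: keep summing */
    }

    int main(void)
    {
        struct menu m = { .expected_us = 1000 };

        reflect(&m, 200, 10);  /* woke well before the timer deadline */
        printf("predicted_us after early wakeup: %u\n", m.predicted_us);
        reflect(&m, 995, 10);  /* wakeup close to the deadline */
        printf("predicted_us after full sleep:   %u\n", m.predicted_us);
        return 0;
    }
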
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/domain.c     Fri Sep 12 14:47:40 2008 +0900
@@ -31,6 +31,7 @@
 #include <xen/compat.h>
 #include <xen/acpi.h>
 #include <xen/pci.h>
+#include <xen/paging.h>
 #include <asm/regs.h>
 #include <asm/mc146818rtc.h>
 #include <asm/system.h>
@@ -40,7 +41,6 @@
 #include <asm/i387.h>
 #include <asm/mpspec.h>
 #include <asm/ldt.h>
-#include <asm/paging.h>
 #include <asm/hypercall.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
@@ -302,7 +302,8 @@ int vcpu_initialise(struct vcpu *v)
     else
     {
         /* PV guests by default have a 100Hz ticker. */
-        v->periodic_period = MILLISECS(10);
+        if ( !is_idle_domain(d) )
+            v->periodic_period = MILLISECS(10);
 
         /* PV guests get an emulated PIT too for video BIOSes to use. */
         if ( !is_idle_domain(d) && (v->vcpu_id == 0) )
@@ -1645,23 +1646,26 @@ static int relinquish_memory(
 
         /*
          * Forcibly invalidate top-most, still valid page tables at this point
-         * to break circular 'linear page table' references. This is okay
-         * because MMU structures are not shared across domains and this domain
-         * is now dead. Thus top-most valid tables are not in use so a non-zero
-         * count means circular reference.
+         * to break circular 'linear page table' references as well as clean up
+         * partially validated pages. This is okay because MMU structures are
+         * not shared across domains and this domain is now dead. Thus top-most
+         * valid tables are not in use so a non-zero count means circular
+         * reference or partially validated.
          */
         y = page->u.inuse.type_info;
         for ( ; ; )
         {
             x = y;
-            if ( likely((x & (PGT_type_mask|PGT_validated)) !=
-                        (type|PGT_validated)) )
+            if ( likely((x & PGT_type_mask) != type) ||
+                 likely(!(x & (PGT_validated|PGT_partial))) )
                 break;
 
-            y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated);
+            y = cmpxchg(&page->u.inuse.type_info, x,
+                        x & ~(PGT_validated|PGT_partial));
             if ( likely(y == x) )
             {
-                free_page_type(page, type);
+                if ( free_page_type(page, x, 0) != 0 )
+                    BUG();
                 break;
             }
         }
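
The relinquish_memory() hunk above keeps the same lock-free shape while
widening the bits it clears. A sketch of that compare-and-swap loop, using
GCC's __sync_val_compare_and_swap in place of Xen's cmpxchg() and invented
flag values:

    #include <stdio.h>

    #define PGT_validated 0x1UL
    #define PGT_partial   0x2UL

    static unsigned long type_info = PGT_validated | PGT_partial | 0x100;

    int main(void)
    {
        unsigned long x, y = type_info;

        for ( ; ; )
        {
            x = y;
            if ( !(x & (PGT_validated | PGT_partial)) )
                break;                 /* nothing left to tear down */
            y = __sync_val_compare_and_swap(&type_info, x,
                                            x & ~(PGT_validated | PGT_partial));
            if ( y == x )
            {
                /* we own the transition; free_page_type() would run here */
                break;
            }
        }
        printf("type_info now %#lx\n", type_info);
        return 0;
    }
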
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/domain_build.c       Fri Sep 12 14:47:40 2008 +0900
@@ -26,6 +26,7 @@
 #include <asm/desc.h>
 #include <asm/i387.h>
 #include <asm/paging.h>
+#include <asm/p2m.h>
 #include <asm/e820.h>
 
 #include <public/version.h>
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c     Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/domctl.c     Fri Sep 12 14:47:40 2008 +0900
@@ -20,7 +20,7 @@
 #include <xen/trace.h>
 #include <xen/console.h>
 #include <xen/iocap.h>
-#include <asm/paging.h>
+#include <xen/paging.h>
 #include <asm/irq.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
@@ -67,14 +67,6 @@ long arch_do_domctl(
         ret = -ESRCH;
         if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) )
             break;
-
-        ret = xsm_ioport_permission(d, fp, 
-                                    domctl->u.ioport_permission.allow_access);
-        if ( ret )
-        {
-            rcu_unlock_domain(d);
-            break;
-        }
 
         if ( np == 0 )
             ret = 0;
@@ -550,6 +542,10 @@ long arch_do_domctl(
         if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
             break;
 
+        ret = xsm_sendtrigger(d);
+        if ( ret )
+            goto sendtrigger_out;
+
         ret = -EINVAL;
         if ( domctl->u.sendtrigger.vcpu >= MAX_VIRT_CPUS )
             goto sendtrigger_out;
@@ -628,6 +624,10 @@ long arch_do_domctl(
         bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff;
         devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff;
 
+        ret = xsm_test_assign_device(domctl->u.assign_device.machine_bdf);
+        if ( ret )
+            break;
+
         if ( device_assigned(bus, devfn) )
         {
             gdprintk(XENLOG_ERR, "XEN_DOMCTL_test_assign_device: "
@@ -655,6 +655,11 @@ long arch_do_domctl(
                 "XEN_DOMCTL_assign_device: get_domain_by_id() failed\n");
             break;
         }
+
+        ret = xsm_assign_device(d, domctl->u.assign_device.machine_bdf);
+        if ( ret )
+            goto assign_device_out;
+
         bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff;
         devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff;
 
@@ -680,6 +685,7 @@ long arch_do_domctl(
                      "assign device (%x:%x:%x) failed\n",
                      bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
 
+    assign_device_out:
         put_domain(d);
     }
     break;
@@ -700,6 +706,11 @@ long arch_do_domctl(
                 "XEN_DOMCTL_deassign_device: get_domain_by_id() failed\n"); 
             break;
         }
+
+        ret = xsm_assign_device(d, domctl->u.assign_device.machine_bdf);
+        if ( ret )
+            goto deassign_device_out;
+
         bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff;
         devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff;
 
@@ -720,6 +731,8 @@ long arch_do_domctl(
         deassign_device(d, bus, devfn);
         gdprintk(XENLOG_INFO, "XEN_DOMCTL_deassign_device: bdf = %x:%x:%x\n",
             bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
+
+    deassign_device_out:
         put_domain(d);
     }
     break;
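
All of the domctl hunks follow one pattern: take the domain reference,
consult the XSM hook before any side effect, and funnel failure through a
single label so the reference is always dropped. A sketch of that shape, with
xsm_check()/op() as stand-ins for the real hooks and -1 standing in for
-EPERM:

    #include <stdio.h>

    static int xsm_check(int allowed) { return allowed ? 0 : -1; }
    static int op(void) { puts("operation performed"); return 0; }

    static int do_domctl(int allowed)
    {
        int ret;
        /* rcu_lock_domain_by_id() would pin the domain here */

        ret = xsm_check(allowed);
        if ( ret )
            goto out;                  /* denied before any side effect */

        ret = op();
     out:
        /* put_domain()/rcu_unlock_domain() would run here */
        return ret;
    }

    int main(void)
    {
        printf("allowed: %d\n", do_domctl(1));
        printf("denied:  %d\n", do_domctl(0));
        return 0;
    }
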
@@ -733,10 +746,17 @@ long arch_do_domctl(
         if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL )
             break;
         bind = &(domctl->u.bind_pt_irq);
+
+        ret = xsm_bind_pt_irq(d, bind);
+        if ( ret )
+            goto bind_out;
+
         if ( iommu_enabled )
             ret = pt_irq_create_bind_vtd(d, bind);
         if ( ret < 0 )
             gdprintk(XENLOG_ERR, "pt_irq_create_bind failed!\n");
+
+    bind_out:
         rcu_unlock_domain(d);
     }
     break;    
@@ -877,11 +897,16 @@ long arch_do_domctl(
         if ( d == NULL )
             break;
 
+        ret = xsm_pin_mem_cacheattr(d);
+        if ( ret )
+            goto pin_out;
+
         ret = hvm_set_mem_pinned_cacheattr(
             d, domctl->u.pin_mem_cacheattr.start,
             domctl->u.pin_mem_cacheattr.end,
             domctl->u.pin_mem_cacheattr.type);
 
+    pin_out:
         rcu_unlock_domain(d);
     }
     break;
@@ -899,6 +924,10 @@ long arch_do_domctl(
         d = rcu_lock_domain_by_id(domctl->domain);
         if ( d == NULL )
             break;
+
+        ret = xsm_ext_vcpucontext(d, domctl->cmd);
+        if ( ret )
+            goto ext_vcpucontext_out;
 
         ret = -ESRCH;
         if ( (evc->vcpu >= MAX_VIRT_CPUS) ||
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hpet.c
--- a/xen/arch/x86/hpet.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hpet.c       Fri Sep 12 14:47:40 2008 +0900
@@ -100,6 +100,13 @@ static int reprogram_hpet_evt_channel(
 
     ch->next_event = expire;
 
+    if ( expire == STIME_MAX )
+    {
+        /* We assume it will take a long time for the timer to wrap. */
+        hpet_write32(0, HPET_T0_CMP);
+        return 0;
+    }
+
     delta = min_t(int64_t, delta, MAX_DELTA_NS);
     delta = max_t(int64_t, delta, MIN_DELTA_NS);
     delta = ns2ticks(delta, ch->shift, ch->mult);
@@ -206,9 +213,11 @@ void hpet_broadcast_enter(void)
 {
     struct hpet_event_channel *ch = &hpet_event;
 
+    spin_lock(&ch->lock);
+
+    disable_APIC_timer();
+
     cpu_set(smp_processor_id(), ch->cpumask);
-
-    spin_lock(&ch->lock);
 
     /* reprogram if current cpu expire time is nearer */
     if ( this_cpu(timer_deadline) < ch->next_event )
@@ -222,8 +231,23 @@ void hpet_broadcast_exit(void)
     struct hpet_event_channel *ch = &hpet_event;
     int cpu = smp_processor_id();
 
+    spin_lock_irq(&ch->lock);
+
     if ( cpu_test_and_clear(cpu, ch->cpumask) )
-        reprogram_timer(per_cpu(timer_deadline, cpu));
+    {
+        /* Cancel any outstanding LAPIC event and re-enable interrupts. */
+        reprogram_timer(0);
+        enable_APIC_timer();
+        
+        /* Reprogram the deadline; trigger timer work now if it has passed. */
+        if ( !reprogram_timer(per_cpu(timer_deadline, cpu)) )
+            raise_softirq(TIMER_SOFTIRQ);
+
+        if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX )
+            reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0);
+    }
+
+    spin_unlock_irq(&ch->lock);
 }
 
 int hpet_broadcast_is_available(void)
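
The reprogram path now treats an expiry of STIME_MAX as "no event" and parks
the comparator rather than programming a bogus deadline; everything else is
clamped into a sane window. A sketch of that clamping with illustrative
MIN/MAX constants (the real values live in hpet.c):

    #include <stdint.h>
    #include <stdio.h>

    #define STIME_MAX    INT64_MAX
    #define MIN_DELTA_NS 100000LL        /* illustrative values only */
    #define MAX_DELTA_NS 10000000000LL

    static int64_t clamp_delta(int64_t expire, int64_t now)
    {
        int64_t delta = expire - now;

        if ( expire == STIME_MAX )
            return -1;                   /* park: timer effectively off */
        if ( delta > MAX_DELTA_NS ) delta = MAX_DELTA_NS;
        if ( delta < MIN_DELTA_NS ) delta = MIN_DELTA_NS;
        return delta;
    }

    int main(void)
    {
        printf("%lld\n", (long long)clamp_delta(STIME_MAX, 0)); /* parked */
        printf("%lld\n", (long long)clamp_delta(50000, 0));     /* clamped up */
        return 0;
    }
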
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/hvm.c    Fri Sep 12 14:47:40 2008 +0900
@@ -31,10 +31,11 @@
 #include <xen/hypercall.h>
 #include <xen/guest_access.h>
 #include <xen/event.h>
+#include <xen/paging.h>
+#include <asm/shadow.h>
 #include <asm/current.h>
 #include <asm/e820.h>
 #include <asm/io.h>
-#include <asm/paging.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -772,7 +773,7 @@ void hvm_hlt(unsigned long rflags)
 
     do_sched_op_compat(SCHEDOP_block, 0);
 
-    HVMTRACE_1D(HLT, curr, /* pending = */ vcpu_runnable(curr));
+    HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr));
 }
 
 void hvm_triple_fault(void)
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/svm/intr.c
--- a/xen/arch/x86/hvm/svm/intr.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/svm/intr.c       Fri Sep 12 14:47:40 2008 +0900
@@ -80,7 +80,7 @@ static void enable_intr_window(struct vc
 
     ASSERT(intack.source != hvm_intsrc_none);
 
-    HVMTRACE_2D(INJ_VIRQ, v, 0x0, /*fake=*/ 1);
+    HVMTRACE_2D(INJ_VIRQ, 0x0, /*fake=*/ 1);
 
     /*
      * Create a dummy virtual interrupt to intercept as soon as the
@@ -199,7 +199,7 @@ asmlinkage void svm_intr_assist(void)
     }
     else
     {
-        HVMTRACE_2D(INJ_VIRQ, v, intack.vector, /*fake=*/ 0);
+        HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
         svm_inject_extint(v, intack.vector);
         pt_intr_post(v, intack);
     }
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/svm/svm.c        Fri Sep 12 14:47:40 2008 +0900
@@ -759,11 +759,11 @@ static void svm_inject_exception(
     if ( trapnr == TRAP_page_fault )
     {
         vmcb->cr2 = curr->arch.hvm_vcpu.guest_cr[2] = cr2;
-        HVMTRACE_LONG_2D(PF_INJECT, curr, errcode, TRC_PAR_LONG(cr2));
+        HVMTRACE_LONG_2D(PF_INJECT, errcode, TRC_PAR_LONG(cr2));
     }
     else
     {
-        HVMTRACE_2D(INJ_EXC, curr, trapnr, errcode);
+        HVMTRACE_2D(INJ_EXC, trapnr, errcode);
     }
 
     if ( (trapnr == TRAP_debug) &&
@@ -919,7 +919,7 @@ static void svm_cpuid_intercept(
             __clear_bit(X86_FEATURE_APIC & 31, edx);
     }
 
-    HVMTRACE_5D (CPUID, v, input, *eax, *ebx, *ecx, *edx);
+    HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
 }
 
 static void svm_vmexit_do_cpuid(struct cpu_user_regs *regs)
@@ -946,7 +946,7 @@ static void svm_vmexit_do_cpuid(struct c
 
 static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
 {
-    HVMTRACE_0D(DR_WRITE, v);
+    HVMTRACE_0D(DR_WRITE);
     __restore_debug_registers(v);
 }
 
@@ -1018,7 +1018,7 @@ static int svm_msr_read_intercept(struct
     regs->edx = msr_content >> 32;
 
  done:
-    HVMTRACE_3D (MSR_READ, v, ecx, regs->eax, regs->edx);
+    HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
     HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
                 ecx, (unsigned long)regs->eax, (unsigned long)regs->edx);
     return X86EMUL_OKAY;
@@ -1037,7 +1037,7 @@ static int svm_msr_write_intercept(struc
 
     msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
 
-    HVMTRACE_3D (MSR_WRITE, v, ecx, regs->eax, regs->edx);
+    HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);
 
     switch ( ecx )
     {
@@ -1168,7 +1168,7 @@ static void svm_invlpg_intercept(unsigne
 static void svm_invlpg_intercept(unsigned long vaddr)
 {
     struct vcpu *curr = current;
-    HVMTRACE_LONG_2D(INVLPG, curr, 0, TRC_PAR_LONG(vaddr));
+    HVMTRACE_LONG_2D(INVLPG, 0, TRC_PAR_LONG(vaddr));
     paging_invlpg(curr, vaddr);
     svm_asid_g_invlpg(curr, vaddr);
 }
@@ -1191,7 +1191,7 @@ asmlinkage void svm_vmexit_handler(struc
 
     exit_reason = vmcb->exitcode;
 
-    HVMTRACE_ND(VMEXIT64, 1/*cycles*/, v, 3, exit_reason,
+    HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
                 (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
                 0, 0, 0);
 
@@ -1216,17 +1216,17 @@ asmlinkage void svm_vmexit_handler(struc
     {
     case VMEXIT_INTR:
         /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
-        HVMTRACE_0D(INTR, v);
+        HVMTRACE_0D(INTR);
         break;
 
     case VMEXIT_NMI:
         /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
-        HVMTRACE_0D(NMI, v);
+        HVMTRACE_0D(NMI);
         break;
 
     case VMEXIT_SMI:
         /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
-        HVMTRACE_0D(SMI, v);
+        HVMTRACE_0D(SMI);
         break;
 
     case VMEXIT_EXCEPTION_DB:
@@ -1261,10 +1261,12 @@ asmlinkage void svm_vmexit_handler(struc
 
         if ( paging_fault(va, regs) )
         {
-            if (hvm_long_mode_enabled(v))
-                HVMTRACE_LONG_2D(PF_XEN, v, regs->error_code, TRC_PAR_LONG(va));
+            if ( trace_will_trace_event(TRC_SHADOW) )
+                break;
+            if ( hvm_long_mode_enabled(v) )
+                HVMTRACE_LONG_2D(PF_XEN, regs->error_code, TRC_PAR_LONG(va));
             else
-                HVMTRACE_2D(PF_XEN, v, regs->error_code, va);
+                HVMTRACE_2D(PF_XEN, regs->error_code, va);
             break;
         }
 
@@ -1274,7 +1276,7 @@ asmlinkage void svm_vmexit_handler(struc
 
     /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
     case VMEXIT_EXCEPTION_MC:
-        HVMTRACE_0D(MCE, v);
+        HVMTRACE_0D(MCE);
         break;
 
     case VMEXIT_VINTR:
@@ -1331,7 +1333,7 @@ asmlinkage void svm_vmexit_handler(struc
     case VMEXIT_VMMCALL:
         if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
             break;
-        HVMTRACE_1D(VMMCALL, v, regs->eax);
+        HVMTRACE_1D(VMMCALL, regs->eax);
         rc = hvm_do_hypercall(regs);
         if ( rc != HVM_HCALL_preempted )
         {
@@ -1406,7 +1408,7 @@ asmlinkage void svm_vmexit_handler(struc
 
 asmlinkage void svm_trace_vmentry(void)
 {
-    HVMTRACE_ND (VMENTRY, 1/*cycles*/, current, 0, 0, 0, 0, 0, 0, 0);
+    HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 }
   
 /*
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/vmx/intr.c
--- a/xen/arch/x86/hvm/vmx/intr.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/intr.c       Fri Sep 12 14:47:40 2008 +0900
@@ -198,7 +198,7 @@ asmlinkage void vmx_intr_assist(void)
     }
     else
     {
-        HVMTRACE_2D(INJ_VIRQ, v, intack.vector, /*fake=*/ 0);
+        HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
         vmx_inject_extint(v, intack.vector);
         pt_intr_post(v, intack);
     }
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Fri Sep 12 14:47:40 2008 +0900
@@ -1114,10 +1114,10 @@ static void __vmx_inject_exception(
     __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
 
     if ( trap == TRAP_page_fault )
-        HVMTRACE_LONG_2D(PF_INJECT, v, error_code,
+        HVMTRACE_LONG_2D(PF_INJECT, error_code,
             TRC_PAR_LONG(v->arch.hvm_vcpu.guest_cr[2]));
     else
-        HVMTRACE_2D(INJ_EXC, v, trap, error_code);
+        HVMTRACE_2D(INJ_EXC, trap, error_code);
 }
 
 void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code)
@@ -1345,7 +1345,7 @@ static void vmx_cpuid_intercept(
             break;
     }
 
-    HVMTRACE_5D (CPUID, current, input, *eax, *ebx, *ecx, *edx);
+    HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
 }
 
 static void vmx_do_cpuid(struct cpu_user_regs *regs)
@@ -1370,7 +1370,7 @@ static void vmx_dr_access(unsigned long 
 {
     struct vcpu *v = current;
 
-    HVMTRACE_0D(DR_WRITE, v);
+    HVMTRACE_0D(DR_WRITE);
 
     if ( !v->arch.hvm_vcpu.flag_dr_dirty )
         __restore_debug_registers(v);
@@ -1383,7 +1383,7 @@ static void vmx_invlpg_intercept(unsigne
 static void vmx_invlpg_intercept(unsigned long vaddr)
 {
     struct vcpu *curr = current;
-    HVMTRACE_LONG_2D(INVLPG, curr, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
+    HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr));
     if ( paging_invlpg(curr, vaddr) )
         vpid_sync_vcpu_gva(curr, vaddr);
 }
@@ -1434,7 +1434,7 @@ static int mov_to_cr(int gp, int cr, str
         goto exit_and_crash;
     }
 
-    HVMTRACE_LONG_2D(CR_WRITE, v, cr, TRC_PAR_LONG(value));
+    HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(value));
 
     HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value);
 
@@ -1505,7 +1505,7 @@ static void mov_from_cr(int cr, int gp, 
         break;
     }
 
-    HVMTRACE_LONG_2D(CR_READ, v, cr, TRC_PAR_LONG(value));
+    HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(value));
 
     HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value);
 }
@@ -1531,13 +1531,13 @@ static int vmx_cr_access(unsigned long e
     case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
         v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS;
         vmx_update_guest_cr(v, 0);
-        HVMTRACE_0D(CLTS, current);
+        HVMTRACE_0D(CLTS);
         break;
     case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
         value = v->arch.hvm_vcpu.guest_cr[0];
         /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */
         value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf);
-        HVMTRACE_LONG_1D(LMSW, current, value);
+        HVMTRACE_LONG_1D(LMSW, value);
         return !hvm_set_cr0(value);
     default:
         BUG();
@@ -1692,7 +1692,7 @@ static int vmx_msr_read_intercept(struct
     regs->edx = (uint32_t)(msr_content >> 32);
 
 done:
-    HVMTRACE_3D (MSR_READ, v, ecx, regs->eax, regs->edx);
+    HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx);
     HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx",
                 ecx, (unsigned long)regs->eax,
                 (unsigned long)regs->edx);
@@ -1803,7 +1803,7 @@ static int vmx_msr_write_intercept(struc
 
     msr_content = (u32)regs->eax | ((u64)regs->edx << 32);
 
-    HVMTRACE_3D (MSR_WRITE, v, ecx, regs->eax, regs->edx);
+    HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx);
 
     switch ( ecx )
     {
@@ -1894,7 +1894,7 @@ static void vmx_do_extint(struct cpu_use
     BUG_ON(!(vector & INTR_INFO_VALID_MASK));
 
     vector &= INTR_INFO_VECTOR_MASK;
-    HVMTRACE_1D(INTR, current, vector);
+    HVMTRACE_1D(INTR, vector);
 
     switch ( vector )
     {
@@ -2010,7 +2010,7 @@ static void vmx_failed_vmentry(unsigned 
         break;
     case EXIT_REASON_MACHINE_CHECK:
         printk("caused by machine check.\n");
-        HVMTRACE_0D(MCE, curr);
+        HVMTRACE_0D(MCE);
         do_machine_check(regs);
         break;
     default:
@@ -2037,7 +2037,7 @@ asmlinkage void vmx_vmexit_handler(struc
 
     exit_reason = __vmread(VM_EXIT_REASON);
 
-    HVMTRACE_ND(VMEXIT64, 1/*cycles*/, v, 3, exit_reason,
+    HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
                 (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
                 0, 0, 0);
 
@@ -2101,7 +2101,8 @@ asmlinkage void vmx_vmexit_handler(struc
              !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) &&
              (vector != TRAP_double_fault) )
             __vmwrite(GUEST_INTERRUPTIBILITY_INFO,
-                    __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI);
+                      __vmread(GUEST_INTERRUPTIBILITY_INFO)
+                      | VMX_INTR_SHADOW_NMI);
 
         perfc_incra(cause_vector, vector);
 
@@ -2128,12 +2129,14 @@ asmlinkage void vmx_vmexit_handler(struc
 
             if ( paging_fault(exit_qualification, regs) )
             {
+                if ( trace_will_trace_event(TRC_SHADOW) )
+                    break;
                 if ( hvm_long_mode_enabled(v) )
-                    HVMTRACE_LONG_2D (PF_XEN, v, regs->error_code,
-                        TRC_PAR_LONG(exit_qualification) );
+                    HVMTRACE_LONG_2D(PF_XEN, regs->error_code,
+                                     TRC_PAR_LONG(exit_qualification) );
                 else
-                    HVMTRACE_2D (PF_XEN, v,
-                        regs->error_code, exit_qualification );
+                    HVMTRACE_2D(PF_XEN,
+                                regs->error_code, exit_qualification );
                 break;
             }
 
@@ -2144,11 +2147,11 @@ asmlinkage void vmx_vmexit_handler(struc
             if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) !=
                  (X86_EVENTTYPE_NMI << 8) )
                 goto exit_and_crash;
-            HVMTRACE_0D(NMI, v);
+            HVMTRACE_0D(NMI);
             do_nmi(regs); /* Real NMI, vector 2: normal processing. */
             break;
         case TRAP_machine_check:
-            HVMTRACE_0D(MCE, v);
+            HVMTRACE_0D(MCE);
             do_machine_check(regs);
             break;
         default:
@@ -2213,7 +2216,7 @@ asmlinkage void vmx_vmexit_handler(struc
     case EXIT_REASON_VMCALL:
     {
         int rc;
-        HVMTRACE_1D(VMMCALL, v, regs->eax);
+        HVMTRACE_1D(VMMCALL, regs->eax);
         inst_len = __get_instruction_length(); /* Safe: VMCALL */
         rc = hvm_do_hypercall(regs);
         if ( rc != HVM_HCALL_preempted )
@@ -2300,7 +2303,7 @@ asmlinkage void vmx_vmexit_handler(struc
 
 asmlinkage void vmx_trace_vmentry(void)
 {
-    HVMTRACE_ND (VMENTRY, 1/*cycles*/, current, 0, 0, 0, 0, 0, 0, 0);
+    HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /*
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/io_apic.c
--- a/xen/arch/x86/io_apic.c    Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/io_apic.c    Fri Sep 12 14:47:40 2008 +0900
@@ -45,23 +45,14 @@ int (*ioapic_renumber_irq)(int ioapic, i
 int (*ioapic_renumber_irq)(int ioapic, int irq);
 atomic_t irq_mis_count;
 
-int msi_enable = 0;
-boolean_param("msi", msi_enable);
-
 int domain_irq_to_vector(struct domain *d, int irq)
 {
-    if ( !msi_enable )
-        return irq_to_vector(irq);
-    else
-        return d->arch.pirq_vector[irq];
+    return d->arch.pirq_vector[irq];
 }
 
 int domain_vector_to_irq(struct domain *d, int vector)
 {
-    if ( !msi_enable )
-        return vector_to_irq(vector);
-    else
-        return d->arch.vector_pirq[vector];
+    return d->arch.vector_pirq[vector];
 }
 
 /* Where if anywhere is the i8259 connect in external int mode */
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/irq.c
--- a/xen/arch/x86/irq.c        Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/irq.c        Fri Sep 12 14:47:40 2008 +0900
@@ -737,9 +737,12 @@ __initcall(setup_dump_irqs);
 
 void fixup_irqs(cpumask_t map)
 {
-    unsigned int irq;
+    unsigned int irq, sp;
     static int warned;
-
+    irq_guest_action_t *action;
+    struct pending_eoi *peoi;
+
+    /* Direct all future interrupts away from this CPU. */
     for ( irq = 0; irq < NR_IRQS; irq++ )
     {
         cpumask_t mask;
@@ -758,8 +761,24 @@ void fixup_irqs(cpumask_t map)
             printk("Cannot set affinity for irq %i\n", irq);
     }
 
+    /* Service any interrupts that beat us in the re-direction race. */
     local_irq_enable();
     mdelay(1);
     local_irq_disable();
+
+    /* Clean up cpu_eoi_map of every interrupt to exclude this CPU. */
+    for ( irq = 0; irq < NR_IRQS; irq++ )
+    {
+        if ( !(irq_desc[irq].status & IRQ_GUEST) )
+            continue;
+        action = (irq_guest_action_t *)irq_desc[irq].action;
+        cpu_clear(smp_processor_id(), action->cpu_eoi_map);
+    }
+
+    /* Flush the interrupt EOI stack. */
+    peoi = this_cpu(pending_eoi);
+    for ( sp = 0; sp < pending_eoi_sp(peoi); sp++ )
+        peoi[sp].ready = 1;
+    flush_ready_eoi(NULL);
 }
 #endif
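
The new tail of fixup_irqs() marks every outstanding pending-EOI entry ready
so one pass can acknowledge them all before the CPU goes offline. A
simplified stand-alone sketch of that stack flush (the structures are
stand-ins for Xen's struct pending_eoi):

    #include <stdio.h>

    struct pending_eoi { int vector; int ready; };

    static struct pending_eoi peoi[8] = { { 33, 0 }, { 34, 0 } };
    static unsigned int peoi_sp = 2;   /* two EOIs outstanding */

    static void flush_ready_eoi(void)
    {
        while ( peoi_sp && peoi[peoi_sp - 1].ready )
        {
            peoi_sp--;
            printf("EOI vector %d\n", peoi[peoi_sp].vector);
        }
    }

    int main(void)
    {
        unsigned int sp;

        for ( sp = 0; sp < peoi_sp; sp++ )  /* force everything ready */
            peoi[sp].ready = 1;
        flush_ready_eoi();
        return 0;
    }
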
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm.c Fri Sep 12 14:47:40 2008 +0900
@@ -507,11 +507,11 @@ static int alloc_segdesc_page(struct pag
             goto fail;
 
     unmap_domain_page(descs);
-    return 1;
+    return 0;
 
  fail:
     unmap_domain_page(descs);
-    return 0;
+    return -EINVAL;
 }
 
 
@@ -565,20 +565,23 @@ static int get_page_from_pagenr(unsigned
 
 static int get_page_and_type_from_pagenr(unsigned long page_nr, 
                                          unsigned long type,
-                                         struct domain *d)
+                                         struct domain *d,
+                                         int preemptible)
 {
     struct page_info *page = mfn_to_page(page_nr);
+    int rc;
 
     if ( unlikely(!get_page_from_pagenr(page_nr, d)) )
-        return 0;
-
-    if ( unlikely(!get_page_type(page, type)) )
-    {
+        return -EINVAL;
+
+    rc = (preemptible ?
+          get_page_type_preemptible(page, type) :
+          (get_page_type(page, type) ? 0 : -EINVAL));
+
+    if ( rc )
         put_page(page);
-        return 0;
-    }
-
-    return 1;
+
+    return rc;
 }
 
 /*
@@ -754,22 +757,22 @@ get_page_from_l2e(
     if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
     {
         MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK);
-        return 0;
-    }
-
-    rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d);
-    if ( unlikely(!rc) )
-        rc = get_l2_linear_pagetable(l2e, pfn, d);
+        return -EINVAL;
+    }
+
+    rc = get_page_and_type_from_pagenr(
+        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0);
+    if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+        rc = 0;
 
     return rc;
 }
 
 
-#if CONFIG_PAGING_LEVELS >= 3
 define_get_linear_pagetable(l3);
 static int
 get_page_from_l3e(
-    l3_pgentry_t l3e, unsigned long pfn, struct domain *d)
+    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible)
 {
     int rc;
 
@@ -779,22 +782,22 @@ get_page_from_l3e(
     if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
     {
         MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d));
-        return 0;
-    }
-
-    rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d);
-    if ( unlikely(!rc) )
-        rc = get_l3_linear_pagetable(l3e, pfn, d);
+        return -EINVAL;
+    }
+
+    rc = get_page_and_type_from_pagenr(
+        l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible);
+    if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) )
+        rc = 0;
 
     return rc;
 }
-#endif /* 3 level */
 
 #if CONFIG_PAGING_LEVELS >= 4
 define_get_linear_pagetable(l4);
 static int
 get_page_from_l4e(
-    l4_pgentry_t l4e, unsigned long pfn, struct domain *d)
+    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible)
 {
     int rc;
 
@@ -804,12 +807,13 @@ get_page_from_l4e(
     if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
     {
         MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK);
-        return 0;
-    }
-
-    rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d);
-    if ( unlikely(!rc) )
-        rc = get_l4_linear_pagetable(l4e, pfn, d);
+        return -EINVAL;
+    }
+
+    rc = get_page_and_type_from_pagenr(
+        l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible);
+    if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
+        rc = 0;
 
     return rc;
 }
@@ -946,29 +950,35 @@ void put_page_from_l1e(l1_pgentry_t l1e,
  * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
  * Note also that this automatically deals correctly with linear p.t.'s.
  */
-static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
+static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
 {
     if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
          (l2e_get_pfn(l2e) != pfn) )
+    {
         put_page_and_type(l2e_get_page(l2e));
-}
-
-
-#if CONFIG_PAGING_LEVELS >= 3
-static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
+        return 0;
+    }
+    return 1;
+}
+
+
+static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+                             int preemptible)
 {
     if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
          (l3e_get_pfn(l3e) != pfn) )
-        put_page_and_type(l3e_get_page(l3e));
-}
-#endif
+        return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible);
+    return 1;
+}
 
 #if CONFIG_PAGING_LEVELS >= 4
-static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
+static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+                             int preemptible)
 {
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
-        put_page_and_type(l4e_get_page(l4e));
+        return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible);
+    return 1;
 }
 #endif
 
@@ -977,7 +987,7 @@ static int alloc_l1_table(struct page_in
     struct domain *d = page_get_owner(page);
     unsigned long  pfn = page_to_mfn(page);
     l1_pgentry_t  *pl1e;
-    int            i;
+    unsigned int   i;
 
     pl1e = map_domain_page(pfn);
 
@@ -991,7 +1001,7 @@ static int alloc_l1_table(struct page_in
     }
 
     unmap_domain_page(pl1e);
-    return 1;
+    return 0;
 
  fail:
     MEM_LOG("Failure in alloc_l1_table: entry %d", i);
@@ -1000,7 +1010,7 @@ static int alloc_l1_table(struct page_in
             put_page_from_l1e(pl1e[i], d);
 
     unmap_domain_page(pl1e);
-    return 0;
+    return -EINVAL;
 }
 
 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
@@ -1128,47 +1138,53 @@ static void pae_flush_pgd(
 # define pae_flush_pgd(mfn, idx, nl3e) ((void)0)
 #endif
 
-static int alloc_l2_table(struct page_info *page, unsigned long type)
+static int alloc_l2_table(struct page_info *page, unsigned long type,
+                          int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long  pfn = page_to_mfn(page);
     l2_pgentry_t  *pl2e;
-    int            i;
+    unsigned int   i;
+    int            rc = 0;
 
     pl2e = map_domain_page(pfn);
 
-    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( !is_guest_l2_slot(d, type, i) )
+    for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
+    {
+        if ( preemptible && i && hypercall_preempt_check() )
+        {
+            page->nr_validated_ptes = i;
+            rc = -EAGAIN;
+            break;
+        }
+
+        if ( !is_guest_l2_slot(d, type, i) ||
+             (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
             continue;
 
-        if ( unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) )
-            goto fail;
-        
+        if ( rc < 0 )
+        {
+            MEM_LOG("Failure in alloc_l2_table: entry %d", i);
+            while ( i-- > 0 )
+                if ( is_guest_l2_slot(d, type, i) )
+                    put_page_from_l2e(pl2e[i], pfn);
+            break;
+        }
+
         adjust_guest_l2e(pl2e[i], d);
     }
 
     unmap_domain_page(pl2e);
-    return 1;
-
- fail:
-    MEM_LOG("Failure in alloc_l2_table: entry %d", i);
-    while ( i-- > 0 )
-        if ( is_guest_l2_slot(d, type, i) )
-            put_page_from_l2e(pl2e[i], pfn);
-
-    unmap_domain_page(pl2e);
-    return 0;
-}
-
-
-#if CONFIG_PAGING_LEVELS >= 3
-static int alloc_l3_table(struct page_info *page)
+    return rc > 0 ? 0 : rc;
+}
+
+static int alloc_l3_table(struct page_info *page, int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long  pfn = page_to_mfn(page);
     l3_pgentry_t  *pl3e;
-    int            i;
+    unsigned int   i;
+    int            rc = 0;
 
 #if CONFIG_PAGING_LEVELS == 3
     /*
@@ -1181,7 +1197,7 @@ static int alloc_l3_table(struct page_in
          d->vcpu[0] && d->vcpu[0]->is_initialised )
     {
         MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn);
-        return 0;
+        return -EINVAL;
     }
 #endif
 
@@ -1197,64 +1213,96 @@ static int alloc_l3_table(struct page_in
     if ( is_pv_32on64_domain(d) )
         memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
 
-    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ )
     {
         if ( is_pv_32bit_domain(d) && (i == 3) )
         {
             if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
-                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ||
-                 !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
-                                                PGT_l2_page_table |
-                                                PGT_pae_xen_l2,
-                                                d) )
-                goto fail;
-        }
-        else if ( !is_guest_l3_slot(i) )
+                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
+                rc = -EINVAL;
+            else
+                rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
+                                                   PGT_l2_page_table |
+                                                   PGT_pae_xen_l2,
+                                                   d, preemptible);
+        }
+        else if ( !is_guest_l3_slot(i) ||
+                  (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 )
             continue;
-        else if ( unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) )
-            goto fail;
+
+        if ( rc == -EAGAIN )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = 1;
+        }
+        else if ( rc == -EINTR && i )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = 0;
+            rc = -EAGAIN;
+        }
+        if ( rc < 0 )
+            break;
 
         adjust_guest_l3e(pl3e[i], d);
     }
 
-    if ( !create_pae_xen_mappings(d, pl3e) )
-        goto fail;
+    if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
+        rc = -EINVAL;
+    if ( rc < 0 && rc != -EAGAIN && rc != -EINTR )
+    {
+        MEM_LOG("Failure in alloc_l3_table: entry %d", i);
+        while ( i-- > 0 )
+        {
+            if ( !is_guest_l3_slot(i) )
+                continue;
+            unadjust_guest_l3e(pl3e[i], d);
+            put_page_from_l3e(pl3e[i], pfn, 0);
+        }
+    }
 
     unmap_domain_page(pl3e);
-    return 1;
-
- fail:
-    MEM_LOG("Failure in alloc_l3_table: entry %d", i);
-    while ( i-- > 0 )
-    {
-        if ( !is_guest_l3_slot(i) )
-            continue;
-        unadjust_guest_l3e(pl3e[i], d);
-        put_page_from_l3e(pl3e[i], pfn);
-    }
-
-    unmap_domain_page(pl3e);
-    return 0;
-}
-#else
-#define alloc_l3_table(page) (0)
-#endif
+    return rc > 0 ? 0 : rc;
+}
 
 #if CONFIG_PAGING_LEVELS >= 4
-static int alloc_l4_table(struct page_info *page)
+static int alloc_l4_table(struct page_info *page, int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long  pfn = page_to_mfn(page);
     l4_pgentry_t  *pl4e = page_to_virt(page);
-    int            i;
-
-    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( !is_guest_l4_slot(d, i) )
+    unsigned int   i;
+    int            rc = 0;
+
+    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ )
+    {
+        if ( !is_guest_l4_slot(d, i) ||
+             (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 )
             continue;
 
-        if ( unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) )
-            goto fail;
+        if ( rc == -EAGAIN )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = 1;
+        }
+        else if ( rc == -EINTR )
+        {
+            if ( i )
+            {
+                page->nr_validated_ptes = i;
+                page->partial_pte = 0;
+                rc = -EAGAIN;
+            }
+        }
+        else if ( rc < 0 )
+        {
+            MEM_LOG("Failure in alloc_l4_table: entry %d", i);
+            while ( i-- > 0 )
+                if ( is_guest_l4_slot(d, i) )
+                    put_page_from_l4e(pl4e[i], pfn, 0);
+        }
+        if ( rc < 0 )
+            return rc;
 
         adjust_guest_l4e(pl4e[i], d);
     }
@@ -1269,18 +1317,10 @@ static int alloc_l4_table(struct page_in
         l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
                       __PAGE_HYPERVISOR);
 
-    return 1;
-
- fail:
-    MEM_LOG("Failure in alloc_l4_table: entry %d", i);
-    while ( i-- > 0 )
-        if ( is_guest_l4_slot(d, i) )
-            put_page_from_l4e(pl4e[i], pfn);
-
-    return 0;
+    return rc > 0 ? 0 : rc;
 }
 #else
-#define alloc_l4_table(page) (0)
+#define alloc_l4_table(page, preemptible) (-EINVAL)
 #endif
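
The common thread in the new alloc_lN paths: progress is parked in
page->nr_validated_ptes on -EAGAIN, so a later hypercall continuation resumes
exactly where validation stopped. A user-space sketch of that resumable loop,
where an artificial preempt check (firing every 100 entries of progress)
stands in for hypercall_preempt_check():

    #include <errno.h>
    #include <stdio.h>

    #define ENTRIES 512

    struct page { unsigned int nr_validated_ptes; };

    static int validate(struct page *pg)
    {
        unsigned int start = pg->nr_validated_ptes, i;

        for ( i = start; i < ENTRIES; i++ )
        {
            if ( i != start && !(i % 100) )  /* fake preempt check */
            {
                pg->nr_validated_ptes = i;   /* park progress */
                return -EAGAIN;
            }
            /* get_page_from_lNe(pl[i], ...) would run here */
        }
        return 0;
    }

    int main(void)
    {
        struct page pg = { 0 };
        int rc, passes = 0;

        while ( (rc = validate(&pg)) == -EAGAIN )
            passes++;                        /* hypercall continuation */
        printf("validated in %d resumptions, rc=%d\n", passes + 1, rc);
        return 0;
    }
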
 
 
@@ -1289,7 +1329,7 @@ static void free_l1_table(struct page_in
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l1_pgentry_t *pl1e;
-    int i;
+    unsigned int  i;
 
     pl1e = map_domain_page(pfn);
 
@@ -1301,74 +1341,114 @@ static void free_l1_table(struct page_in
 }
 
 
-static void free_l2_table(struct page_info *page)
+static int free_l2_table(struct page_info *page, int preemptible)
 {
 #ifdef CONFIG_COMPAT
     struct domain *d = page_get_owner(page);
 #endif
     unsigned long pfn = page_to_mfn(page);
     l2_pgentry_t *pl2e;
-    int i;
+    unsigned int  i = page->nr_validated_ptes - 1;
+    int err = 0;
 
     pl2e = map_domain_page(pfn);
 
-    for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
-            put_page_from_l2e(pl2e[i], pfn);
+    ASSERT(page->nr_validated_ptes);
+    do {
+        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
+             put_page_from_l2e(pl2e[i], pfn) == 0 &&
+             preemptible && i && hypercall_preempt_check() )
+        {
+           page->nr_validated_ptes = i;
+           err = -EAGAIN;
+        }
+    } while ( !err && i-- );
 
     unmap_domain_page(pl2e);
 
-    page->u.inuse.type_info &= ~PGT_pae_xen_l2;
-}
-
-
-#if CONFIG_PAGING_LEVELS >= 3
-
-static void free_l3_table(struct page_info *page)
+    if ( !err )
+        page->u.inuse.type_info &= ~PGT_pae_xen_l2;
+
+    return err;
+}
+
+static int free_l3_table(struct page_info *page, int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l3_pgentry_t *pl3e;
-    int           i;
+    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
+    int rc = 0;
 
 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
     if ( d->arch.relmem == RELMEM_l3 )
-        return;
+        return 0;
 #endif
 
     pl3e = map_domain_page(pfn);
 
-    for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+    do {
         if ( is_guest_l3_slot(i) )
         {
-            put_page_from_l3e(pl3e[i], pfn);
+            rc = put_page_from_l3e(pl3e[i], pfn, preemptible);
+            if ( rc > 0 )
+                continue;
+            if ( rc )
+                break;
             unadjust_guest_l3e(pl3e[i], d);
         }
+    } while ( i-- );
 
     unmap_domain_page(pl3e);
-}
-
-#endif
+
+    if ( rc == -EAGAIN )
+    {
+        page->nr_validated_ptes = i;
+        page->partial_pte = 1;
+    }
+    else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+    {
+        page->nr_validated_ptes = i + 1;
+        page->partial_pte = 0;
+        rc = -EAGAIN;
+    }
+    return rc > 0 ? 0 : rc;
+}
 
 #if CONFIG_PAGING_LEVELS >= 4
-
-static void free_l4_table(struct page_info *page)
+static int free_l4_table(struct page_info *page, int preemptible)
 {
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l4_pgentry_t *pl4e = page_to_virt(page);
-    int           i;
+    unsigned int  i = page->nr_validated_ptes - !page->partial_pte;
+    int rc = 0;
 
 #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION
     if ( d->arch.relmem == RELMEM_l4 )
-        return;
+        return 0;
 #endif
 
-    for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ )
+    do {
         if ( is_guest_l4_slot(d, i) )
-            put_page_from_l4e(pl4e[i], pfn);
-}
-
+            rc = put_page_from_l4e(pl4e[i], pfn, preemptible);
+    } while ( rc >= 0 && i-- );
+
+    if ( rc == -EAGAIN )
+    {
+        page->nr_validated_ptes = i;
+        page->partial_pte = 1;
+    }
+    else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+    {
+        page->nr_validated_ptes = i + 1;
+        page->partial_pte = 0;
+        rc = -EAGAIN;
+    }
+    return rc > 0 ? 0 : rc;
+}
+#else
+#define free_l4_table(page, preemptible) (-EINVAL)
 #endif
 
 static void page_lock(struct page_info *page)
@@ -1560,7 +1640,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
             return rc;
         }
 
-        if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) )
+        if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) )
             return page_unlock(l2pg), 0;
 
         adjust_guest_l2e(nl2e, d);
@@ -1582,25 +1662,24 @@ static int mod_l2_entry(l2_pgentry_t *pl
     put_page_from_l2e(ol2e, pfn);
     return rc;
 }
-
-#if CONFIG_PAGING_LEVELS >= 3
 
 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
 static int mod_l3_entry(l3_pgentry_t *pl3e, 
                         l3_pgentry_t nl3e, 
                         unsigned long pfn,
-                        int preserve_ad)
+                        int preserve_ad,
+                        int preemptible)
 {
     l3_pgentry_t ol3e;
     struct vcpu *curr = current;
     struct domain *d = curr->domain;
     struct page_info *l3pg = mfn_to_page(pfn);
-    int rc = 1;
+    int rc = 0;
 
     if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) )
     {
         MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e);
-        return 0;
+        return -EINVAL;
     }
 
     /*
@@ -1608,12 +1687,12 @@ static int mod_l3_entry(l3_pgentry_t *pl
      * would be a pain to ensure they remain continuously valid throughout.
      */
     if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
-        return 0;
+        return -EINVAL;
 
     page_lock(l3pg);
 
     if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
-        return page_unlock(l3pg), 0;
+        return page_unlock(l3pg), -EFAULT;
 
     if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
     {
@@ -1622,7 +1701,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
             page_unlock(l3pg);
             MEM_LOG("Bad L3 flags %x",
                     l3e_get_flags(nl3e) & l3_disallow_mask(d));
-            return 0;
+            return -EINVAL;
         }
 
         /* Fast path for identical mapping and presence. */
@@ -1631,28 +1710,30 @@ static int mod_l3_entry(l3_pgentry_t *pl
             adjust_guest_l3e(nl3e, d);
             rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad);
             page_unlock(l3pg);
-            return rc;
-        }
-
-        if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) )
-            return page_unlock(l3pg), 0;
+            return rc ? 0 : -EFAULT;
+        }
+
+        rc = get_page_from_l3e(nl3e, pfn, d, preemptible);
+        if ( unlikely(rc < 0) )
+            return page_unlock(l3pg), rc;
+        rc = 0;
 
         adjust_guest_l3e(nl3e, d);
         if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
                                     preserve_ad)) )
         {
             ol3e = nl3e;
-            rc = 0;
+            rc = -EFAULT;
         }
     }
     else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr,
                                      preserve_ad)) )
     {
         page_unlock(l3pg);
-        return 0;
-    }
-
-    if ( likely(rc) )
+        return -EFAULT;
+    }
+
+    if ( likely(rc == 0) )
     {
         if ( !create_pae_xen_mappings(d, pl3e) )
             BUG();
@@ -1661,11 +1742,9 @@ static int mod_l3_entry(l3_pgentry_t *pl
     }
 
     page_unlock(l3pg);
-    put_page_from_l3e(ol3e, pfn);
+    put_page_from_l3e(ol3e, pfn, 0);
     return rc;
 }
-
-#endif
 
 #if CONFIG_PAGING_LEVELS >= 4
 
@@ -1673,24 +1752,25 @@ static int mod_l4_entry(l4_pgentry_t *pl
 static int mod_l4_entry(l4_pgentry_t *pl4e, 
                         l4_pgentry_t nl4e, 
                         unsigned long pfn,
-                        int preserve_ad)
+                        int preserve_ad,
+                        int preemptible)
 {
     struct vcpu *curr = current;
     struct domain *d = curr->domain;
     l4_pgentry_t ol4e;
     struct page_info *l4pg = mfn_to_page(pfn);
-    int rc = 1;
+    int rc = 0;
 
     if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
     {
         MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e);
-        return 0;
+        return -EINVAL;
     }
 
     page_lock(l4pg);
 
     if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
-        return page_unlock(l4pg), 0;
+        return page_unlock(l4pg), -EFAULT;
 
     if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
     {
@@ -1699,7 +1779,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
             page_unlock(l4pg);
             MEM_LOG("Bad L4 flags %x",
                     l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
-            return 0;
+            return -EINVAL;
         }
 
         /* Fast path for identical mapping and presence. */
@@ -1708,29 +1788,31 @@ static int mod_l4_entry(l4_pgentry_t *pl
             adjust_guest_l4e(nl4e, d);
             rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad);
             page_unlock(l4pg);
-            return rc;
-        }
-
-        if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) )
-            return page_unlock(l4pg), 0;
+            return rc ? 0 : -EFAULT;
+        }
+
+        rc = get_page_from_l4e(nl4e, pfn, d, preemptible);
+        if ( unlikely(rc < 0) )
+            return page_unlock(l4pg), rc;
+        rc = 0;
 
         adjust_guest_l4e(nl4e, d);
         if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
                                     preserve_ad)) )
         {
             ol4e = nl4e;
-            rc = 0;
+            rc = -EFAULT;
         }
     }
     else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr,
                                      preserve_ad)) )
     {
         page_unlock(l4pg);
-        return 0;
+        return -EFAULT;
     }
 
     page_unlock(l4pg);
-    put_page_from_l4e(ol4e, pfn);
+    put_page_from_l4e(ol4e, pfn, 0);
     return rc;
 }
 
@@ -1788,9 +1870,11 @@ int get_page(struct page_info *page, str
 }
 
 
-static int alloc_page_type(struct page_info *page, unsigned long type)
+static int alloc_page_type(struct page_info *page, unsigned long type,
+                           int preemptible)
 {
     struct domain *owner = page_get_owner(page);
+    int rc;
 
     /* A page table is dirtied when its type count becomes non-zero. */
     if ( likely(owner != NULL) )
@@ -1799,30 +1883,65 @@ static int alloc_page_type(struct page_i
     switch ( type & PGT_type_mask )
     {
     case PGT_l1_page_table:
-        return alloc_l1_table(page);
+        alloc_l1_table(page);
+        rc = 0;
+        break;
     case PGT_l2_page_table:
-        return alloc_l2_table(page, type);
+        rc = alloc_l2_table(page, type, preemptible);
+        break;
     case PGT_l3_page_table:
-        return alloc_l3_table(page);
+        rc = alloc_l3_table(page, preemptible);
+        break;
     case PGT_l4_page_table:
-        return alloc_l4_table(page);
+        rc = alloc_l4_table(page, preemptible);
+        break;
     case PGT_seg_desc_page:
-        return alloc_segdesc_page(page);
+        rc = alloc_segdesc_page(page);
+        break;
     default:
         printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", 
                type, page->u.inuse.type_info,
                page->count_info);
+        rc = -EINVAL;
         BUG();
     }
 
-    return 0;
-}
-
-
-void free_page_type(struct page_info *page, unsigned long type)
+    /* No need for atomic update of type_info here: no one else updates it. */
+    wmb();
+    if ( rc == -EAGAIN )
+    {
+        page->u.inuse.type_info |= PGT_partial;
+    }
+    else if ( rc == -EINTR )
+    {
+        ASSERT((page->u.inuse.type_info &
+                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
+        page->u.inuse.type_info &= ~PGT_count_mask;
+    }
+    else if ( rc )
+    {
+        ASSERT(rc < 0);
+        MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
+                PRtype_info ": caf=%08x taf=%" PRtype_info,
+                page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
+                type, page->count_info, page->u.inuse.type_info);
+        page->u.inuse.type_info = 0;
+    }
+    else
+    {
+        page->u.inuse.type_info |= PGT_validated;
+    }
+
+    return rc;
+}
+
+
+int free_page_type(struct page_info *page, unsigned long type,
+                   int preemptible)
 {
     struct domain *owner = page_get_owner(page);
     unsigned long gmfn;
+    int rc;
 
     if ( likely(owner != NULL) )
     {
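
The tail of alloc_page_type() above settles the type word according to rc: -EAGAIN marks the page PGT_partial so validation can resume later, -EINTR means preemption hit before any real work so the sole type reference is dropped, any other error scraps the type, and success sets PGT_validated. A standalone sketch (not Xen code; the flag values are invented) of that mapping:

    #include <errno.h>
    #include <stdio.h>

    #define T_COUNT_MASK 0xffu
    #define T_PARTIAL    (1u << 8)
    #define T_VALIDATED  (1u << 9)

    static unsigned settle(unsigned ti, int rc)
    {
        if ( rc == -EAGAIN )
            ti |= T_PARTIAL;        /* resume validation where it stopped */
        else if ( rc == -EINTR )
            ti &= ~T_COUNT_MASK;    /* preempted before real work: drop ref */
        else if ( rc )
            ti = 0;                 /* validation failed: type is scrapped */
        else
            ti |= T_VALIDATED;      /* fully validated, usable as this type */
        return ti;
    }

    int main(void)
    {
        printf("%#x\n", settle(1, 0));         /* 0x201 */
        printf("%#x\n", settle(1, -EAGAIN));   /* 0x101 */
        printf("%#x\n", settle(1, -EINTR));    /* 0 */
        printf("%#x\n", settle(1, -EINVAL));   /* 0 */
        return 0;
    }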
@@ -1842,7 +1961,7 @@ void free_page_type(struct page_info *pa
             paging_mark_dirty(owner, page_to_mfn(page));
 
             if ( shadow_mode_refcounts(owner) )
-                return;
+                return 0;
 
             gmfn = mfn_to_gmfn(owner, page_to_mfn(page));
             ASSERT(VALID_M2P(gmfn));
@@ -1850,42 +1969,80 @@ void free_page_type(struct page_info *pa
         }
     }
 
+    if ( !(type & PGT_partial) )
+    {
+        page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
+        page->partial_pte = 0;
+    }
     switch ( type & PGT_type_mask )
     {
     case PGT_l1_page_table:
         free_l1_table(page);
+        rc = 0;
         break;
-
     case PGT_l2_page_table:
-        free_l2_table(page);
+        rc = free_l2_table(page, preemptible);
         break;
-
-#if CONFIG_PAGING_LEVELS >= 3
     case PGT_l3_page_table:
-        free_l3_table(page);
+#if CONFIG_PAGING_LEVELS == 3
+        if ( !(type & PGT_partial) )
+            page->nr_validated_ptes = L3_PAGETABLE_ENTRIES;
+#endif
+        rc = free_l3_table(page, preemptible);
         break;
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
     case PGT_l4_page_table:
-        free_l4_table(page);
+        rc = free_l4_table(page, preemptible);
         break;
-#endif
-
     default:
-        printk("%s: type %lx pfn %lx\n",__FUNCTION__,
-               type, page_to_mfn(page));
+        MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page));
+        rc = -EINVAL;
         BUG();
     }
-}
-
-
-void put_page_type(struct page_info *page)
+
+    /* No need for atomic update of type_info here: no one else updates it. */
+    if ( rc == 0 )
+    {
+        /*
+         * Record TLB information for flush later. We do not stamp page tables
+         * when running in shadow mode:
+         *  1. Pointless, since it's the shadow pt's which must be tracked.
+         *  2. Shadow mode reuses this field for shadowed page tables to
+         *     store flags info -- we don't want to conflict with that.
+         */
+        if ( !(shadow_mode_enabled(page_get_owner(page)) &&
+               (page->count_info & PGC_page_table)) )
+            page->tlbflush_timestamp = tlbflush_current_time();
+        wmb();
+        page->u.inuse.type_info--;
+    }
+    else if ( rc == -EINTR )
+    {
+        ASSERT(!(page->u.inuse.type_info &
+                 (PGT_count_mask|PGT_validated|PGT_partial)));
+        if ( !(shadow_mode_enabled(page_get_owner(page)) &&
+               (page->count_info & PGC_page_table)) )
+            page->tlbflush_timestamp = tlbflush_current_time();
+        wmb();
+        page->u.inuse.type_info |= PGT_validated;
+    }
+    else
+    {
+        BUG_ON(rc != -EAGAIN);
+        wmb();
+        page->u.inuse.type_info |= PGT_partial;
+    }
+
+    return rc;
+}
+
+
+static int __put_page_type(struct page_info *page,
+                           int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
 
- again:
-    do {
+    for ( ; ; )
+    {
         x  = y;
         nx = x - 1;
 
@@ -1894,21 +2051,19 @@ void put_page_type(struct page_info *pag
         if ( unlikely((nx & PGT_count_mask) == 0) )
         {
             if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
-                 likely(nx & PGT_validated) )
+                 likely(nx & (PGT_validated|PGT_partial)) )
             {
                 /*
                  * Page-table pages must be unvalidated when count is zero. The
                  * 'free' is safe because the refcnt is non-zero and validated
                  * bit is clear => other ops will spin or fail.
                  */
-                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, 
-                                           x & ~PGT_validated)) != x) )
-                    goto again;
+                nx = x & ~(PGT_validated|PGT_partial);
+                if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
+                                           x, nx)) != x) )
+                    continue;
                 /* We cleared the 'valid bit' so we do the clean up. */
-                free_page_type(page, x);
-                /* Carry on, but with the 'valid bit' now clear. */
-                x  &= ~PGT_validated;
-                nx &= ~PGT_validated;
+                return free_page_type(page, x, preemptible);
             }
 
             /*
@@ -1922,25 +2077,33 @@ void put_page_type(struct page_info *pag
                    (page->count_info & PGC_page_table)) )
                 page->tlbflush_timestamp = tlbflush_current_time();
         }
-    }
-    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
-}
-
-
-int get_page_type(struct page_info *page, unsigned long type)
+
+        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+            break;
+
+        if ( preemptible && hypercall_preempt_check() )
+            return -EINTR;
+    }
+
+    return 0;
+}
+
+
+static int __get_page_type(struct page_info *page, unsigned long type,
+                           int preemptible)
 {
     unsigned long nx, x, y = page->u.inuse.type_info;
 
     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
 
- again:
-    do {
+    for ( ; ; )
+    {
         x  = y;
         nx = x + 1;
         if ( unlikely((nx & PGT_count_mask) == 0) )
         {
             MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page));
-            return 0;
+            return -EINVAL;
         }
         else if ( unlikely((x & PGT_count_mask) == 0) )
         {
@@ -1993,28 +2156,43 @@ int get_page_type(struct page_info *page
             /* Don't log failure if it could be a recursive-mapping attempt. */
             if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
                  (type == PGT_l1_page_table) )
-                return 0;
+                return -EINVAL;
             if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
                  (type == PGT_l2_page_table) )
-                return 0;
+                return -EINVAL;
             if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
                  (type == PGT_l3_page_table) )
-                return 0;
+                return -EINVAL;
             MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
                     "for mfn %lx (pfn %lx)",
                     x, type, page_to_mfn(page),
                     get_gpfn_from_mfn(page_to_mfn(page)));
-            return 0;
+            return -EINVAL;
         }
         else if ( unlikely(!(x & PGT_validated)) )
         {
-            /* Someone else is updating validation of this page. Wait... */
-            while ( (y = page->u.inuse.type_info) == x )
-                cpu_relax();
-            goto again;
-        }
-    }
-    while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
+            if ( !(x & PGT_partial) )
+            {
+                /* Someone else is updating validation of this page. Wait... */
+                while ( (y = page->u.inuse.type_info) == x )
+                {
+                    if ( preemptible && hypercall_preempt_check() )
+                        return -EINTR;
+                    cpu_relax();
+                }
+                continue;
+            }
+            /* Type ref count was left at 1 when PGT_partial got set. */
+            ASSERT((x & PGT_count_mask) == 1);
+            nx = x & ~PGT_partial;
+        }
+
+        if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
+            break;
+
+        if ( preemptible && hypercall_preempt_check() )
+            return -EINTR;
+    }
 
     if ( unlikely((x & PGT_type_mask) != type) )
     {
@@ -2032,25 +2210,42 @@ int get_page_type(struct page_info *page
 
     if ( unlikely(!(nx & PGT_validated)) )
     {
-        /* Try to validate page type; drop the new reference on failure. */
-        if ( unlikely(!alloc_page_type(page, type)) )
-        {
-            MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %"
-                    PRtype_info ": caf=%08x taf=%" PRtype_info,
-                    page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)),
-                    type, page->count_info, page->u.inuse.type_info);
-            /* Noone else can get a reference. We hold the only ref. */
-            page->u.inuse.type_info = 0;
-            return 0;
-        }
-
-        /* Noone else is updating simultaneously. */
-        __set_bit(_PGT_validated, &page->u.inuse.type_info);
-    }
-
-    return 1;
-}
-
+        if ( !(x & PGT_partial) )
+        {
+            page->nr_validated_ptes = 0;
+            page->partial_pte = 0;
+        }
+        return alloc_page_type(page, type, preemptible);
+    }
+
+    return 0;
+}
+
+void put_page_type(struct page_info *page)
+{
+    int rc = __put_page_type(page, 0);
+    ASSERT(rc == 0);
+    (void)rc;
+}
+
+int get_page_type(struct page_info *page, unsigned long type)
+{
+    int rc = __get_page_type(page, type, 0);
+    if ( likely(rc == 0) )
+        return 1;
+    ASSERT(rc == -EINVAL);
+    return 0;
+}
+
+int put_page_type_preemptible(struct page_info *page)
+{
+    return __put_page_type(page, 1);
+}
+
+int get_page_type_preemptible(struct page_info *page, unsigned long type)
+{
+    return __get_page_type(page, type, 1);
+}
 
 void cleanup_page_cacheattr(struct page_info *page)
 {
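
__put_page_type()/__get_page_type() above turn the old goto-based cmpxchg retry loops into for(;;) loops with an optional preemption bail-out, and the old non-preemptible entry points become thin wrappers. A minimal standalone sketch of that shape (not Xen code; C11 atomics stand in for Xen's cmpxchg, and preempt_pending stands in for hypercall_preempt_check()):

    #include <errno.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static _Atomic unsigned long type_info;
    static int preempt_pending;

    static int get_type_ref(int preemptible)
    {
        unsigned long x = atomic_load(&type_info), nx;

        for ( ; ; )
        {
            nx = x + 1;                      /* take one type reference */
            if ( atomic_compare_exchange_strong(&type_info, &x, nx) )
                return 0;                    /* on failure, x is reloaded */
            if ( preemptible && preempt_pending )
                return -EINTR;               /* restarted via continuation */
        }
    }

    /* Thin wrapper preserving the old boolean interface. */
    static int get_type(void) { return get_type_ref(0) == 0; }

    int main(void)
    {
        printf("%d %d\n", get_type(), get_type_ref(1));   /* 1 0 */
        return 0;
    }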
@@ -2087,7 +2282,7 @@ int new_guest_cr3(unsigned long mfn)
                     l4e_from_pfn(
                         mfn,
                         (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)),
-                    pagetable_get_pfn(v->arch.guest_table), 0);
+                    pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0;
         if ( unlikely(!okay) )
         {
             MEM_LOG("Error while installing new compat baseptr %lx", mfn);
@@ -2102,7 +2297,7 @@ int new_guest_cr3(unsigned long mfn)
 #endif
     okay = paging_mode_refcounts(d)
         ? get_page_from_pagenr(mfn, d)
-        : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
+        : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0);
     if ( unlikely(!okay) )
     {
         MEM_LOG("Error while installing new baseptr %lx", mfn);
@@ -2276,9 +2471,7 @@ int do_mmuext_op(
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall_create_continuation(
-                __HYPERVISOR_mmuext_op, "hihi",
-                uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+            rc = -EAGAIN;
             break;
         }
 
@@ -2325,10 +2518,14 @@ int do_mmuext_op(
             if ( paging_mode_refcounts(FOREIGNDOM) )
                 break;
 
-            okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM);
+            rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
+            okay = !rc;
             if ( unlikely(!okay) )
             {
-                MEM_LOG("Error while pinning mfn %lx", mfn);
+                if ( rc == -EINTR )
+                    rc = -EAGAIN;
+                else if ( rc != -EAGAIN )
+                    MEM_LOG("Error while pinning mfn %lx", mfn);
                 break;
             }
 
@@ -2373,8 +2570,11 @@ int do_mmuext_op(
             {
                 put_page_and_type(page);
                 put_page(page);
-                /* A page is dirtied when its pin status is cleared. */
-                paging_mark_dirty(d, mfn);
+                if ( !rc )
+                {
+                    /* A page is dirtied when its pin status is cleared. */
+                    paging_mark_dirty(d, mfn);
+                }
             }
             else
             {
@@ -2398,8 +2598,8 @@ int do_mmuext_op(
                 if ( paging_mode_refcounts(d) )
                     okay = get_page_from_pagenr(mfn, d);
                 else
-                    okay = get_page_and_type_from_pagenr(
-                        mfn, PGT_root_page_table, d);
+                    okay = !get_page_and_type_from_pagenr(
+                        mfn, PGT_root_page_table, d, 0);
                 if ( unlikely(!okay) )
                 {
                     MEM_LOG("Error while installing new mfn %lx", mfn);
@@ -2517,6 +2717,11 @@ int do_mmuext_op(
         guest_handle_add_offset(uops, 1);
     }
 
+    if ( rc == -EAGAIN )
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_mmuext_op, "hihi",
+            uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+
     process_deferred_ops();
 
     perfc_add(num_mmuext_ops, i);
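
A standalone sketch (not Xen code; process_one() is hypothetical) of the restructured preemption flow in do_mmuext_op() above: the loop body only records rc = -EAGAIN, and the continuation is created once, after the loop, so preemption reported by deeper callees can reuse the same exit path:

    #include <errno.h>
    #include <stdio.h>

    static unsigned preempt_at = 3;          /* simulated preemption point */

    static int process_one(unsigned i) { (void)i; return 0; }

    static int do_ops(unsigned count, unsigned *done)
    {
        int rc = 0;
        unsigned i;

        for ( i = 0; i < count; i++ )
        {
            if ( i == preempt_at )           /* hypercall_preempt_check() */
            {
                rc = -EAGAIN;
                break;
            }
            if ( (rc = process_one(i)) != 0 )
                break;
        }

        if ( rc == -EAGAIN )
        {
            *done = i;    /* models re-encoding (count - i) | MMU_UPDATE_PREEMPTED */
            rc = 0;       /* a continuation is scheduled, not an error */
        }
        return rc;
    }

    int main(void)
    {
        unsigned done = 0;
        printf("rc=%d done=%u\n", do_ops(10, &done), done);  /* rc=0 done=3 */
        return 0;
    }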
@@ -2576,9 +2781,7 @@ int do_mmu_update(
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall_create_continuation(
-                __HYPERVISOR_mmu_update, "hihi",
-                ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+            rc = -EAGAIN;
             break;
         }
 
@@ -2601,7 +2804,7 @@ int do_mmu_update(
              */
         case MMU_NORMAL_PT_UPDATE:
         case MMU_PT_UPDATE_PRESERVE_AD:
-            rc = xsm_mmu_normal_update(d, req.val);
+            rc = xsm_mmu_normal_update(d, FOREIGNDOM, req.val);
             if ( rc )
                 break;
 
@@ -2653,27 +2856,29 @@ int do_mmu_update(
                                         cmd == MMU_PT_UPDATE_PRESERVE_AD);
                 }
                 break;
-#if CONFIG_PAGING_LEVELS >= 3
                 case PGT_l3_page_table:
                 {
                     l3_pgentry_t l3e = l3e_from_intpte(req.val);
-                    okay = mod_l3_entry(va, l3e, mfn,
-                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
+                    rc = mod_l3_entry(va, l3e, mfn,
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+                    okay = !rc;
                 }
                 break;
-#endif
 #if CONFIG_PAGING_LEVELS >= 4
                 case PGT_l4_page_table:
                 {
                     l4_pgentry_t l4e = l4e_from_intpte(req.val);
-                    okay = mod_l4_entry(va, l4e, mfn,
-                                        cmd == MMU_PT_UPDATE_PRESERVE_AD);
+                    rc = mod_l4_entry(va, l4e, mfn,
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, 1);
+                    okay = !rc;
                 }
                 break;
 #endif
                 }
 
                 put_page_type(page);
+                if ( rc == -EINTR )
+                    rc = -EAGAIN;
             }
             break;
 
@@ -2741,6 +2946,11 @@ int do_mmu_update(
 
         guest_handle_add_offset(ureqs, 1);
     }
+
+    if ( rc == -EAGAIN )
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_mmu_update, "hihi",
+            ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
 
     process_deferred_ops();
 
@@ -3111,7 +3321,7 @@ int do_update_va_mapping(unsigned long v
     if ( unlikely(!access_ok(va, 1) && !paging_mode_external(d)) )
         return -EINVAL;
 
-    rc = xsm_update_va_mapping(d, val);
+    rc = xsm_update_va_mapping(d, FOREIGNDOM, val);
     if ( rc )
         return rc;
 
@@ -3695,9 +3905,8 @@ static int ptwr_emulated_update(
     nl1e = l1e_from_intpte(val);
     if ( unlikely(!get_page_from_l1e(nl1e, d)) )
     {
-        if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) &&
-             (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg &&
-             (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
+        if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) &&
+             !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) )
         {
             /*
              * If this is an upper-half write to a PAE PTE then we assume that
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm/hap/hap.c Fri Sep 12 14:47:40 2008 +0900
@@ -37,6 +37,7 @@
 #include <asm/shared.h>
 #include <asm/hap.h>
 #include <asm/paging.h>
+#include <asm/p2m.h>
 #include <asm/domain.h>
 #include <xen/numa.h>
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c   Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm/shadow/common.c   Fri Sep 12 14:47:40 2008 +0900
@@ -39,6 +39,7 @@
 #include <xen/numa.h>
 #include "private.h"
 
+DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags);
 
 /* Set up the shadow-specific parts of a domain struct at start of day.
  * Called for every domain from arch_domain_create() */
@@ -630,6 +631,8 @@ void oos_fixup_add(struct vcpu *v, mfn_t
 
             if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN )
             {
+                TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT);
+
                 /* Reuse this slot and remove current writable mapping. */
                 sh_remove_write_access_from_sl1p(v, gmfn, 
                                                  oos_fixup[idx].smfn[next],
@@ -645,6 +648,8 @@ void oos_fixup_add(struct vcpu *v, mfn_t
             oos_fixup[idx].smfn[next] = smfn;
             oos_fixup[idx].off[next] = off;
             oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS;
+
+            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD);
             return;
         }
     }
@@ -687,6 +692,16 @@ static int oos_remove_write_access(struc
 }
 
 
+static inline void trace_resync(int event, mfn_t gmfn)
+{
+    if ( tb_init_done )
+    {
+        /* Convert gmfn to gfn */
+        unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
+        __trace_var(event, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
+    }
+}
+
 /* Pull all the entries on an out-of-sync page back into sync. */
 static void _sh_resync(struct vcpu *v, mfn_t gmfn,
                        struct oos_fixup *fixup, mfn_t snp)
@@ -700,8 +715,8 @@ static void _sh_resync(struct vcpu *v, m
              & ~SHF_L1_ANY));
     ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
 
-    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
-                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
 
     /* Need to pull write access so the page *stays* in sync. */
     if ( oos_remove_write_access(v, gmfn, fixup) )
@@ -719,6 +734,7 @@ static void _sh_resync(struct vcpu *v, m
     /* Now we know all the entries are synced, and will stay that way */
     pg->shadow_flags &= ~SHF_out_of_sync;
     perfc_incr(shadow_resync);
+    trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn);
 }
 
 
@@ -930,6 +946,7 @@ void sh_resync_all(struct vcpu *v, int s
                 /* Update the shadows and leave the page OOS. */
                 if ( sh_skip_sync(v, oos[idx]) )
                     continue;
+                trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]);
                 _sh_resync_l1(other, oos[idx], oos_snapshot[idx]);
             }
             else
@@ -945,15 +962,16 @@ void sh_resync_all(struct vcpu *v, int s
     }
 }
 
-/* Allow a shadowed page to go out of sync */
+/* Allow a shadowed page to go out of sync. Unsyncs are traced in
+ * multi.c:sh_page_fault() */
 int sh_unsync(struct vcpu *v, mfn_t gmfn)
 {
     struct page_info *pg;
     
     ASSERT(shadow_locked_by_me(v->domain));
 
-    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
-                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
 
     pg = mfn_to_page(gmfn);
  
@@ -970,6 +988,7 @@ int sh_unsync(struct vcpu *v, mfn_t gmfn
     pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
     oos_hash_add(v, gmfn);
     perfc_incr(shadow_unsync);
+    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC);
     return 1;
 }
 
@@ -1005,6 +1024,7 @@ void shadow_promote(struct vcpu *v, mfn_
 
     ASSERT(!test_bit(type, &page->shadow_flags));
     set_bit(type, &page->shadow_flags);
+    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE);
 }
 
 void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type)
@@ -1027,6 +1047,8 @@ void shadow_demote(struct vcpu *v, mfn_t
 #endif 
         clear_bit(_PGC_page_table, &page->count_info);
     }
+
+    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE);
 }
 
 /**************************************************************************/
@@ -1094,6 +1116,7 @@ sh_validate_guest_entry(struct vcpu *v, 
     ASSERT((page->shadow_flags 
             & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0);
 #endif
+    this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED)); 
 
     return result;
 }
@@ -1295,6 +1318,18 @@ static void shadow_unhook_mappings(struc
     }
 }
 
+static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn)
+{
+    if ( tb_init_done )
+    {
+        /* Convert smfn to gfn */
+        unsigned long gfn;
+        ASSERT(mfn_valid(smfn));
+        gfn = mfn_to_gfn(d, _mfn(mfn_to_shadow_page(smfn)->backpointer));
+        __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/,
+                    sizeof(gfn), (unsigned char*)&gfn);
+    }
+}
 
 /* Make sure there are at least count order-sized pages
  * available in the shadow page pool. */
@@ -1327,6 +1362,7 @@ static void _shadow_prealloc(
         smfn = shadow_page_to_mfn(sp);
 
         /* Unpin this top-level shadow */
+        trace_shadow_prealloc_unpin(d, smfn);
         sh_unpin(v, smfn);
 
         /* See if that freed up enough space */
@@ -1343,6 +1379,7 @@ static void _shadow_prealloc(
         {
             if ( !pagetable_is_null(v2->arch.shadow_table[i]) )
             {
+                TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK);
                 shadow_unhook_mappings(v, 
                                pagetable_get_mfn(v2->arch.shadow_table[i]));
 
@@ -2200,6 +2237,16 @@ void sh_destroy_shadow(struct vcpu *v, m
     }    
 }
 
+static inline void trace_shadow_wrmap_bf(mfn_t gmfn)
+{
+    if ( tb_init_done )
+    {
+        /* Convert gmfn to gfn */
+        unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
+        __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn);
+    }
+}
+
 /**************************************************************************/
 /* Remove all writeable mappings of a guest frame from the shadow tables 
  * Returns non-zero if we need to flush TLBs. 
@@ -2265,6 +2312,8 @@ int sh_remove_write_access(struct vcpu *
          || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
         return 0;
 
+    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP);
+
     perfc_incr(shadow_writeable);
 
     /* If this isn't a "normal" writeable page, the domain is trying to 
@@ -2285,11 +2334,14 @@ int sh_remove_write_access(struct vcpu *
          * and that mapping is likely to be in the current pagetable,
          * in the guest's linear map (on non-HIGHPTE linux and windows)*/
 
-#define GUESS(_a, _h) do {                                                \
+#define GUESS(_a, _h) do {                                              \
             if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \
-                perfc_incr(shadow_writeable_h_ ## _h);                   \
-            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )          \
-                return 1;                                                 \
+                perfc_incr(shadow_writeable_h_ ## _h);                  \
+            if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 )        \
+            {                                                           \
+                TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);   \
+                return 1;                                               \
+            }                                                           \
         } while (0)
 
         if ( level == 0 && fault_addr )
@@ -2377,6 +2429,7 @@ int sh_remove_write_access(struct vcpu *
 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */
     
     /* Brute-force search of all the shadows, by walking the hash */
+    trace_shadow_wrmap_bf(gmfn);
     if ( level == 0 )
         perfc_incr(shadow_writeable_bf_1);
     else
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c    Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm/shadow/multi.c    Fri Sep 12 14:47:40 2008 +0900
@@ -225,6 +225,7 @@ static uint32_t set_ad_bits(void *guest_
 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
 {
     guest_intpte_t old, new;
+    int ret = 0;
 
     old = *(guest_intpte_t *)walk_p;
     new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
@@ -234,10 +235,16 @@ static uint32_t set_ad_bits(void *guest_
          * into the guest table as well.  If the guest table has changed
          * under our feet then leave it alone. */
         *(guest_intpte_t *)walk_p = new;
-        if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
-            return 1;
-    }
-    return 0;
+        if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) 
+            ret = 1;
+
+        /* FIXME -- this code is longer than necessary */
+        if(set_dirty)
+            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD);
+        else
+            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A);
+    }
+    return ret;
 }
 
 /* This validation is called with lock held, and after write permission
@@ -1432,6 +1439,7 @@ static int shadow_set_l1e(struct vcpu *v
     {
         /* About to install a new reference */        
         if ( shadow_mode_refcounts(d) ) {
+            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF);
             if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 ) 
             {
                 /* Doesn't look like a pagetable. */
@@ -1461,6 +1469,7 @@ static int shadow_set_l1e(struct vcpu *v
         {
             shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d);
             shadow_put_page_from_l1e(old_sl1e, d);
+            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF);
         } 
     }
     return flags;
@@ -2896,6 +2905,7 @@ static inline void check_for_early_unsha
     {
         perfc_incr(shadow_early_unshadow);
         sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ );
+        TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW);
     }
     v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn);
 #endif
@@ -3012,6 +3022,132 @@ static void sh_prefetch(struct vcpu *v, 
 
 #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */
 
+#if GUEST_PAGING_LEVELS == 4
+typedef u64 guest_va_t;
+typedef u64 guest_pa_t;
+#elif GUEST_PAGING_LEVELS == 3
+typedef u32 guest_va_t;
+typedef u64 guest_pa_t;
+#else
+typedef u32 guest_va_t;
+typedef u32 guest_pa_t;
+#endif
+
+static inline void trace_shadow_gen(u32 event, guest_va_t va)
+{
+    if ( tb_init_done )
+    {
+        event |= (GUEST_PAGING_LEVELS-2)<<8;
+        __trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va);
+    }
+}
+
+static inline void trace_shadow_fixup(guest_l1e_t gl1e,
+                                      guest_va_t va)
+{
+    if ( tb_init_done )
+    {
+        struct {
+            /* for PAE, guest_l1e may be 64 while guest_va may be 32;
+               so put it first for alignment's sake. */
+            guest_l1e_t gl1e;
+            guest_va_t va;
+            u32 flags;
+        } __attribute__((packed)) d;
+        u32 event;
+
+        event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8);
+
+        d.gl1e = gl1e;
+        d.va = va;
+        d.flags = this_cpu(trace_shadow_path_flags);
+
+        __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
+    }
+}
+                                          
+static inline void trace_not_shadow_fault(guest_l1e_t gl1e,
+                                          guest_va_t va)
+{
+    if ( tb_init_done )
+    {
+        struct {
+            /* for PAE, guest_l1e may be 64 while guest_va may be 32;
+               so put it first for alignment's sake. */
+            guest_l1e_t gl1e;
+            guest_va_t va;
+            u32 flags;
+        } __attribute__((packed)) d;
+        u32 event;
+
+        event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8);
+
+        d.gl1e = gl1e;
+        d.va = va;
+        d.flags = this_cpu(trace_shadow_path_flags);
+
+        __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
+    }
+}
+                                          
+static inline void trace_shadow_emulate_other(u32 event,
+                                                 guest_va_t va,
+                                                 gfn_t gfn)
+{
+    if ( tb_init_done )
+    {
+        struct {
+            /* for PAE, guest_l1e may be 64 while guest_va may be 32;
+               so put it first for alignment's sake. */
+#if GUEST_PAGING_LEVELS == 2
+            u32 gfn;
+#else
+            u64 gfn;
+#endif
+            guest_va_t va;
+        } __attribute__((packed)) d;
+
+        event |= ((GUEST_PAGING_LEVELS-2)<<8);
+
+        d.gfn=gfn_x(gfn);
+        d.va = va;
+
+        __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
+    }
+}
+
+#if GUEST_PAGING_LEVELS == 3
+static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va);
+static DEFINE_PER_CPU(int,trace_extra_emulation_count);
+#endif
+static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val);
+
+static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va)
+{
+    if ( tb_init_done )
+    {
+        struct {
+            /* for PAE, guest_l1e may be 64 while guest_va may be 32;
+               so put it first for alignment's sake. */
+            guest_l1e_t gl1e, write_val;
+            guest_va_t va;
+            unsigned flags:29, emulation_count:3;
+        } __attribute__((packed)) d;
+        u32 event;
+
+        event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8);
+
+        d.gl1e = gl1e;
+        d.write_val.l1 = this_cpu(trace_emulate_write_val);
+        d.va = va;
+#if GUEST_PAGING_LEVELS == 3
+        d.emulation_count = this_cpu(trace_extra_emulation_count);
+#endif
+        d.flags = this_cpu(trace_shadow_path_flags);
+
+        __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d);
+    }
+}
 
 /**************************************************************************/
 /* Entry points into the shadow code */
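
The packed trace records above all place the (possibly 64-bit) guest PTE ahead of the (possibly 32-bit) VA, per the comment about alignment. A standalone illustration (not Xen code; types are stand-ins) of what the ordering buys: packing removes padding either way, but only PTE-first keeps the 64-bit field at a naturally aligned offset within the record:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct pte_first {       /* 64-bit PTE first: record-relative offset 0 */
        uint64_t gl1e;
        uint32_t va;
        uint32_t flags;
    } __attribute__((packed));

    struct va_first {        /* VA first: the 64-bit PTE lands at offset 4 */
        uint32_t va;
        uint64_t gl1e;
        uint32_t flags;
    } __attribute__((packed));

    int main(void)
    {
        printf("sizes: %zu %zu\n", sizeof(struct pte_first),
               sizeof(struct va_first));              /* 16 16 */
        printf("gl1e offsets: %zu %zu\n",
               offsetof(struct pte_first, gl1e),
               offsetof(struct va_first, gl1e));      /* 0 4 */
        return 0;
    }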
@@ -3027,8 +3163,8 @@ static int sh_page_fault(struct vcpu *v,
 {
     struct domain *d = v->domain;
     walk_t gw;
-    gfn_t gfn;
-    mfn_t gmfn, sl1mfn=_mfn(0);
+    gfn_t gfn = _gfn(0);
+    mfn_t gmfn, sl1mfn = _mfn(0);
     shadow_l1e_t sl1e, *ptr_sl1e;
     paddr_t gpa;
     struct sh_emulate_ctxt emul_ctxt;
@@ -3043,7 +3179,7 @@ static int sh_page_fault(struct vcpu *v,
 
     SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n",
                   v->domain->domain_id, v->vcpu_id, va, regs->error_code,
-                  regs->rip);
+                  regs->eip);
 
     perfc_incr(shadow_fault);
 
@@ -3132,6 +3268,7 @@ static int sh_page_fault(struct vcpu *v,
                 reset_early_unshadow(v);
                 perfc_incr(shadow_fault_fast_gnp);
                 SHADOW_PRINTK("fast path not-present\n");
+                trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va);
                 return 0;
             }
             else
@@ -3145,6 +3282,7 @@ static int sh_page_fault(struct vcpu *v,
             perfc_incr(shadow_fault_fast_mmio);
             SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
             reset_early_unshadow(v);
+            trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va);
             return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
                     ? EXCRET_fault_fixed : 0);
         }
@@ -3155,6 +3293,7 @@ static int sh_page_fault(struct vcpu *v,
              * Retry and let the hardware give us the right fault next time. */
             perfc_incr(shadow_fault_fast_fail);
             SHADOW_PRINTK("fast path false alarm!\n");            
+            trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va);
             return EXCRET_fault_fixed;
         }
     }
@@ -3190,7 +3329,7 @@ static int sh_page_fault(struct vcpu *v,
         perfc_incr(shadow_fault_bail_real_fault);
         SHADOW_PRINTK("not a shadow fault\n");
         reset_early_unshadow(v);
-        return 0;
+        goto propagate;
     }
 
     /* It's possible that the guest has put pagetables in memory that it has 
@@ -3200,7 +3339,7 @@ static int sh_page_fault(struct vcpu *v,
     if ( unlikely(d->is_shutting_down) )
     {
         SHADOW_PRINTK("guest is shutting down\n");
-        return 0;
+        goto propagate;
     }
 
     /* What kind of access are we dealing with? */
@@ -3218,7 +3357,7 @@ static int sh_page_fault(struct vcpu *v,
         SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n", 
                       gfn_x(gfn), mfn_x(gmfn));
         reset_early_unshadow(v);
-        return 0;
+        goto propagate;
     }
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
@@ -3229,6 +3368,8 @@ static int sh_page_fault(struct vcpu *v,
 
     shadow_lock(d);
 
+    TRACE_CLEAR_PATH_FLAGS;
+    
     rc = gw_remove_write_accesses(v, va, &gw);
 
     /* First bit set: Removed write access to a page. */
@@ -3281,6 +3422,7 @@ static int sh_page_fault(struct vcpu *v,
          * Get out of the fault handler immediately. */
         ASSERT(d->is_shutting_down);
         shadow_unlock(d);
+        trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va);
         return 0;
     }
 
@@ -3383,6 +3525,7 @@ static int sh_page_fault(struct vcpu *v,
     d->arch.paging.log_dirty.fault_count++;
     reset_early_unshadow(v);
 
+    trace_shadow_fixup(gw.l1e, va);
  done:
     sh_audit_gw(v, &gw);
     SHADOW_PRINTK("fixed\n");
@@ -3405,6 +3548,8 @@ static int sh_page_fault(struct vcpu *v,
                       mfn_x(gmfn));
         perfc_incr(shadow_fault_emulate_failed);
         sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
+        trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER,
+                                      va, gfn);
         goto done;
     }
 
@@ -3421,6 +3566,8 @@ static int sh_page_fault(struct vcpu *v,
     shadow_audit_tables(v);
     shadow_unlock(d);
 
+    this_cpu(trace_emulate_write_val) = 0;
+
 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
  early_emulation:
 #endif
@@ -3446,6 +3593,8 @@ static int sh_page_fault(struct vcpu *v,
                      "injection: cr2=%#lx, mfn=%#lx\n", 
                      va, mfn_x(gmfn));
             sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
+            trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ,
+                                       va, gfn);
             return EXCRET_fault_fixed;
         }
     }
@@ -3478,6 +3627,10 @@ static int sh_page_fault(struct vcpu *v,
          * to support more operations in the emulator.  More likely, 
          * though, this is a hint that this page should not be shadowed. */
         shadow_remove_all_shadows(v, gmfn);
+
+        trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED,
+                                   va, gfn);
+        goto emulate_done;
     }
 
 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
@@ -3504,7 +3657,8 @@ static int sh_page_fault(struct vcpu *v,
 
 #if GUEST_PAGING_LEVELS == 3 /* PAE guest */
     if ( r == X86EMUL_OKAY ) {
-        int i;
+        int i, emulation_count=0;
+        this_cpu(trace_emulate_initial_va) = va;
         /* Emulate up to four extra instructions in the hope of catching 
          * the "second half" of a 64-bit pagetable write. */
         for ( i = 0 ; i < 4 ; i++ )
@@ -3513,10 +3667,12 @@ static int sh_page_fault(struct vcpu *v,
             v->arch.paging.last_write_was_pt = 0;
             r = x86_emulate(&emul_ctxt.ctxt, emul_ops);
             if ( r == X86EMUL_OKAY )
-            {
+            { 
+                emulation_count++;
                 if ( v->arch.paging.last_write_was_pt )
                 {
                     perfc_incr(shadow_em_ex_pt);
+                    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN);
                     break; /* Don't emulate past the other half of the write */
                 }
                 else 
@@ -3525,12 +3681,16 @@ static int sh_page_fault(struct vcpu *v,
             else
             {
                 perfc_incr(shadow_em_ex_fail);
+                TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED);
                 break; /* Don't emulate again if we failed! */
             }
         }
+        this_cpu(trace_extra_emulation_count)=emulation_count;
     }
 #endif /* PAE guest */
 
+    trace_shadow_emulate(gw.l1e, va);
+ emulate_done:
     SHADOW_PRINTK("emulated\n");
     return EXCRET_fault_fixed;
 
@@ -3543,6 +3703,7 @@ static int sh_page_fault(struct vcpu *v,
     shadow_audit_tables(v);
     reset_early_unshadow(v);
     shadow_unlock(d);
+    trace_shadow_gen(TRC_SHADOW_MMIO, va);
     return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT)
             ? EXCRET_fault_fixed : 0);
 
@@ -3552,6 +3713,10 @@ static int sh_page_fault(struct vcpu *v,
     shadow_audit_tables(v);
     reset_early_unshadow(v);
     shadow_unlock(d);
+
+propagate:
+    trace_not_shadow_fault(gw.l1e, va);
+
     return 0;
 }
 
@@ -3990,7 +4155,7 @@ sh_detach_old_tables(struct vcpu *v)
             sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable);
         v->arch.paging.shadow.guest_vtable = NULL;
     }
-#endif
+#endif // !NDEBUG
 
 
     ////
@@ -4446,6 +4611,7 @@ static int sh_guess_wrmap(struct vcpu *v
     sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW);
     r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn);
     ASSERT( !(r & SHADOW_SET_ERROR) );
+    TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND);
     return 1;
 }
 #endif
@@ -4800,7 +4966,7 @@ static void emulate_unmap_dest(struct vc
 
 static int
 sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src,
-                      u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
+                     u32 bytes, struct sh_emulate_ctxt *sh_ctxt)
 {
     void *addr;
 
@@ -4814,6 +4980,22 @@ sh_x86_emulate_write(struct vcpu *v, uns
 
     shadow_lock(v->domain);
     memcpy(addr, src, bytes);
+
+    if ( tb_init_done )
+    {
+#if GUEST_PAGING_LEVELS == 3
+        if ( vaddr == this_cpu(trace_emulate_initial_va) )
+            memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
+        else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) )
+        {
+            TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT);
+            memcpy(&this_cpu(trace_emulate_write_val),
+                   (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE);
+        }
+#else
+        memcpy(&this_cpu(trace_emulate_write_val), src, bytes);
+#endif
+    }
 
     emulate_unmap_dest(v, addr, bytes, sh_ctxt);
     shadow_audit_tables(v);
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/mm/shadow/private.h  Fri Sep 12 14:47:40 2008 +0900
@@ -90,6 +90,43 @@ extern int shadow_audit_enable;
 #define SHADOW_DEBUG_EMULATE           1
 #define SHADOW_DEBUG_P2M               1
 #define SHADOW_DEBUG_LOGDIRTY          0
+
+/******************************************************************************
+ * Tracing
+ */
+DECLARE_PER_CPU(uint32_t,trace_shadow_path_flags);
+
+#define TRACE_SHADOW_PATH_FLAG(_x)                      \
+    do {                                                \
+        this_cpu(trace_shadow_path_flags) |= (1<<(_x));      \
+    } while(0)
+
+#define TRACE_CLEAR_PATH_FLAGS                  \
+    this_cpu(trace_shadow_path_flags) = 0
+
+enum {
+    TRCE_SFLAG_SET_AD,
+    TRCE_SFLAG_SET_A,
+    TRCE_SFLAG_SHADOW_L1_GET_REF,
+    TRCE_SFLAG_SHADOW_L1_PUT_REF,
+    TRCE_SFLAG_L2_PROPAGATE,
+    TRCE_SFLAG_SET_CHANGED,
+    TRCE_SFLAG_SET_FLUSH,
+    TRCE_SFLAG_SET_ERROR,
+    TRCE_SFLAG_DEMOTE,
+    TRCE_SFLAG_PROMOTE,
+    TRCE_SFLAG_WRMAP,
+    TRCE_SFLAG_WRMAP_GUESS_FOUND,
+    TRCE_SFLAG_WRMAP_BRUTE_FORCE,
+    TRCE_SFLAG_EARLY_UNSHADOW,
+    TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN,
+    TRCE_SFLAG_EMULATION_LAST_FAILED,
+    TRCE_SFLAG_EMULATE_FULL_PT,
+    TRCE_SFLAG_PREALLOC_UNHOOK,
+    TRCE_SFLAG_UNSYNC,
+    TRCE_SFLAG_OOS_FIXUP_ADD,
+    TRCE_SFLAG_OOS_FIXUP_EVICT,
+};
 
 /******************************************************************************
  * The shadow lock.
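
TRACE_SHADOW_PATH_FLAG and TRACE_CLEAR_PATH_FLAGS above implement a per-CPU bit accumulator: the fault handler clears it on entry, each interesting event ORs in one enum bit, and the final trace record carries the whole path. A minimal standalone model (not Xen code; flag names are invented):

    #include <stdint.h>
    #include <stdio.h>

    enum { FLAG_SET_AD, FLAG_PROMOTE, FLAG_WRMAP };  /* bit positions */

    static uint32_t path_flags;             /* per-CPU in the real code */

    #define PATH_FLAG(x)   (path_flags |= (1u << (x)))
    #define CLEAR_FLAGS()  (path_flags = 0)

    int main(void)
    {
        CLEAR_FLAGS();              /* TRACE_CLEAR_PATH_FLAGS at fault entry */
        PATH_FLAG(FLAG_PROMOTE);    /* events along the handling path */
        PATH_FLAG(FLAG_WRMAP);
        printf("flags=%#x\n", path_flags);  /* 0x6: bits 1 and 2 set */
        return 0;
    }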
@@ -143,6 +180,12 @@ extern int shadow_audit_enable;
     } while (0)
 
 
+/* Size (in bytes) of a guest PTE */
+#if GUEST_PAGING_LEVELS >= 3
+# define GUEST_PTE_SIZE 8
+#else
+# define GUEST_PTE_SIZE 4
+#endif
 
 /******************************************************************************
  * Auditing routines 
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/physdev.c
--- a/xen/arch/x86/physdev.c    Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/physdev.c    Fri Sep 12 14:47:40 2008 +0900
@@ -58,9 +58,6 @@ static int get_free_pirq(struct domain *
     return i;
 }
 
-/*
- * Caller hold the irq_lock
- */
 static int map_domain_pirq(struct domain *d, int pirq, int vector,
                            struct physdev_map_pirq *map)
 {
@@ -136,13 +133,12 @@ done:
     return ret;
 }
 
-/*
- * The pirq should has been unbound before this call
- */
+/* The pirq should have been unbound before this call. */
 static int unmap_domain_pirq(struct domain *d, int pirq)
 {
-    int ret = 0;
-    int vector;
+    unsigned long flags;
+    irq_desc_t *desc;
+    int vector, ret = 0;
 
     if ( d == NULL || pirq < 0 || pirq >= NR_PIRQS )
         return -EINVAL;
@@ -159,33 +155,29 @@ static int unmap_domain_pirq(struct doma
         gdprintk(XENLOG_G_ERR, "domain %X: pirq %x not mapped still\n",
                  d->domain_id, pirq);
         ret = -EINVAL;
-    }
-    else
-    {
-        unsigned long flags;
-        irq_desc_t *desc;
-
-        desc = &irq_desc[vector];
-        spin_lock_irqsave(&desc->lock, flags);
-        if ( desc->msi_desc )
-            pci_disable_msi(vector);
-
-        if ( desc->handler == &pci_msi_type )
-        {
-            /* MSI is not shared, so should be released already */
-            BUG_ON(desc->status & IRQ_GUEST);
-            irq_desc[vector].handler = &no_irq_type;
-        }
-        spin_unlock_irqrestore(&desc->lock, flags);
-
-        d->arch.pirq_vector[pirq] = d->arch.vector_pirq[vector] = 0;
-    }
+        goto done;
+    }
+
+    desc = &irq_desc[vector];
+    spin_lock_irqsave(&desc->lock, flags);
+    if ( desc->msi_desc )
+        pci_disable_msi(vector);
+
+    if ( desc->handler == &pci_msi_type )
+    {
+        /* MSI is not shared, so it should already have been released. */
+        BUG_ON(desc->status & IRQ_GUEST);
+        irq_desc[vector].handler = &no_irq_type;
+    }
+    spin_unlock_irqrestore(&desc->lock, flags);
+
+    d->arch.pirq_vector[pirq] = d->arch.vector_pirq[vector] = 0;
 
     ret = irq_deny_access(d, pirq);
-
     if ( ret )
         gdprintk(XENLOG_G_ERR, "deny irq %x access failed\n", pirq);
 
+ done:
     return ret;
 }
 
@@ -194,10 +186,6 @@ static int physdev_map_pirq(struct physd
     struct domain *d;
     int vector, pirq, ret = 0;
     unsigned long flags;
-
-    /* if msi_enable is not enabled, map always succeeds */
-    if ( !msi_enable )
-        return 0;
 
     if ( !IS_PRIV(current->domain) )
         return -EPERM;
@@ -308,14 +296,8 @@ static int physdev_unmap_pirq(struct phy
     unsigned long flags;
     int ret;
 
-    if ( !msi_enable )
-        return 0;
-
     if ( !IS_PRIV(current->domain) )
         return -EPERM;
-
-    if ( !unmap )
-        return -EINVAL;
 
     if ( unmap->domid == DOMID_SELF )
         d = rcu_lock_domain(current->domain);
@@ -323,14 +305,12 @@ static int physdev_unmap_pirq(struct phy
         d = rcu_lock_domain_by_id(unmap->domid);
 
     if ( d == NULL )
-    {
-        rcu_unlock_domain(d);
         return -ESRCH;
-    }
 
     spin_lock_irqsave(&d->arch.irq_lock, flags);
     ret = unmap_domain_pirq(d, unmap->pirq);
     spin_unlock_irqrestore(&d->arch.irq_lock, flags);
+
     rcu_unlock_domain(d);
 
     return ret;
@@ -452,20 +432,14 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H
 
         irq = irq_op.irq;
         ret = -EINVAL;
-        if ( ((irq < 0) && (irq != AUTO_ASSIGN)) || (irq >= NR_IRQS) )
+        if ( (irq < 0) || (irq >= NR_IRQS) )
             break;
 
         irq_op.vector = assign_irq_vector(irq);
 
-        ret = 0;
-
-        if ( msi_enable )
-        {
-            spin_lock_irqsave(&dom0->arch.irq_lock, flags);
-            if ( irq != AUTO_ASSIGN )
-                ret = map_domain_pirq(dom0, irq_op.irq, irq_op.vector, NULL);
-            spin_unlock_irqrestore(&dom0->arch.irq_lock, flags);
-        }
+        spin_lock_irqsave(&dom0->arch.irq_lock, flags);
+        ret = map_domain_pirq(dom0, irq_op.irq, irq_op.vector, NULL);
+        spin_unlock_irqrestore(&dom0->arch.irq_lock, flags);
 
         if ( copy_to_guest(arg, &irq_op, 1) != 0 )
             ret = -EFAULT;
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/platform_hypercall.c Fri Sep 12 14:47:40 2008 +0900
@@ -192,6 +192,10 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
     break;
 
     case XENPF_firmware_info:
+        ret = xsm_firmware_info();
+        if ( ret )
+            break;
+
         switch ( op->u.firmware_info.type )
         {
         case XEN_FW_DISK_INFO: {
@@ -280,10 +284,18 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
         break;
 
     case XENPF_enter_acpi_sleep:
+        ret = xsm_acpi_sleep();
+        if ( ret )
+            break;
+
         ret = acpi_enter_sleep(&op->u.enter_acpi_sleep);
         break;
 
     case XENPF_change_freq:
+        ret = xsm_change_freq();
+        if ( ret )
+            break;
+
         ret = -ENOSYS;
         if ( cpufreq_controller != FREQCTL_dom0_kernel )
             break;
@@ -305,6 +317,10 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe
         cpumask_t cpumap;
         XEN_GUEST_HANDLE(uint8) cpumap_bitmap;
         XEN_GUEST_HANDLE(uint64) idletimes;
+
+        ret = xsm_getidletime();
+        if ( ret )
+            break;
 
         ret = -ENOSYS;
         if ( cpufreq_controller != FREQCTL_dom0_kernel )
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/smpboot.c
--- a/xen/arch/x86/smpboot.c    Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/smpboot.c    Fri Sep 12 14:47:40 2008 +0900
@@ -1225,15 +1225,6 @@ int __cpu_disable(void)
        if (cpu == 0)
                return -EBUSY;
 
-       /*
-        * Only S3 is using this path, and thus idle vcpus are running on all
-        * APs when we are called. To support full cpu hotplug, other 
-        * notification mechanisms should be introduced (e.g., migrate vcpus
-        * off this physical cpu before rendezvous point).
-        */
-       if (!is_idle_vcpu(current))
-               return -EINVAL;
-
        local_irq_disable();
        clear_local_APIC();
        /* Allow any queued timer interrupts to get serviced */
@@ -1249,6 +1240,9 @@ int __cpu_disable(void)
        fixup_irqs(map);
        /* It's now safe to remove this processor from the online map */
        cpu_clear(cpu, cpu_online_map);
+
+       cpu_disable_scheduler();
+
        return 0;
 }
 
@@ -1275,28 +1269,6 @@ static int take_cpu_down(void *unused)
     return __cpu_disable();
 }
 
-/* 
- * XXX: One important thing missed here is to migrate vcpus
- * from dead cpu to other online ones and then put whole
- * system into a stop state. It assures a safe environment
- * for a cpu hotplug/remove at normal running state.
- *
- * However for xen PM case, at this point:
- *     -> All other domains should be notified with PM event,
- *        and then in following states:
- *             * Suspend state, or
- *             * Paused state, which is a force step to all
- *               domains if they do nothing to suspend
- *     -> All vcpus of dom0 (except vcpu0) have already beem
- *        hot removed
- * with the net effect that all other cpus only have idle vcpu
- * running. In this special case, we can avoid vcpu migration
- * then and system can be considered in a stop state.
- *
- * So current cpu hotplug is a special version for PM specific
- * usage, and need more effort later for full cpu hotplug.
- * (ktian1)
- */
 int cpu_down(unsigned int cpu)
 {
        int err = 0;
@@ -1304,6 +1276,12 @@ int cpu_down(unsigned int cpu)
        spin_lock(&cpu_add_remove_lock);
        if (num_online_cpus() == 1) {
                err = -EBUSY;
+               goto out;
+       }
+
+       /* Cannot offline the BSP */
+       if (cpu == 0) {
+               err = -EINVAL;
                goto out;
        }
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/time.c       Fri Sep 12 14:47:40 2008 +0900
@@ -993,15 +993,16 @@ static void local_time_calibration(void)
  * All CPUS snapshot their local TSC and extrapolation of system time.
  */
 struct calibration_rendezvous {
+    cpumask_t cpu_calibration_map;
     atomic_t nr_cpus;
     s_time_t master_stime;
 };
 
 static void time_calibration_rendezvous(void *_r)
 {
-    unsigned int total_cpus = num_online_cpus();
     struct cpu_calibration *c = &this_cpu(cpu_calibration);
     struct calibration_rendezvous *r = _r;
+    unsigned int total_cpus = cpus_weight(r->cpu_calibration_map);
 
     if ( smp_processor_id() == 0 )
     {
@@ -1029,11 +1030,13 @@ static void time_calibration(void *unuse
 static void time_calibration(void *unused)
 {
     struct calibration_rendezvous r = {
+        .cpu_calibration_map = cpu_online_map,
         .nr_cpus = ATOMIC_INIT(0)
     };
 
     /* @wait=1 because we must wait for all cpus before freeing @r. */
-    on_each_cpu(time_calibration_rendezvous, &r, 0, 1);
+    on_selected_cpus(r.cpu_calibration_map,
+                     time_calibration_rendezvous, &r, 0, 1);
 }
 
 void init_percpu_time(void)
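
The hunk above makes time_calibration() snapshot cpu_online_map into the rendezvous structure and derive total_cpus from that snapshot, instead of re-reading the live online count in each handler. A standalone sketch (not Xen code) of the idea: every participant counts arrivals against the same frozen mask, so a concurrent CPU offline cannot skew the expected total:

    #include <stdio.h>

    struct rendezvous {
        unsigned long cpu_map;    /* snapshot of the online mask */
        int nr_cpus;
    };

    static int popcount(unsigned long m)
    {
        int n = 0;
        for ( ; m; m &= m - 1 )
            n++;
        return n;
    }

    int main(void)
    {
        unsigned long online = 0xFUL;              /* CPUs 0-3 online */
        struct rendezvous r = { .cpu_map = online, .nr_cpus = 0 };

        online &= ~(1UL << 3);                     /* CPU 3 goes offline */

        /* Participants derive the expected total from the snapshot: */
        printf("expected arrivals: %d\n", popcount(r.cpu_map));  /* 4 */
        return 0;
    }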
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/traps.c      Fri Sep 12 14:47:40 2008 +0900
@@ -47,7 +47,7 @@
 #include <xen/version.h>
 #include <xen/kexec.h>
 #include <xen/trace.h>
-#include <asm/paging.h>
+#include <xen/paging.h>
 #include <asm/system.h>
 #include <asm/io.h>
 #include <asm/atomic.h>
@@ -2116,6 +2116,36 @@ static int emulate_privileged_op(struct 
             if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
                 goto fail;
             break;
+        case MSR_AMD64_NB_CFG:
+            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
+                goto fail;
+            if ( !IS_PRIV(v->domain) )
+                break;
+            if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) ||
+                 (eax != l) ||
+                 ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) )
+                goto invalid;
+            if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 )
+                goto fail;
+            break;
+        case MSR_FAM10H_MMIO_CONF_BASE:
+            if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+                 boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 )
+                goto fail;
+            if ( !IS_PRIV(v->domain) )
+                break;
+            if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) ||
+                 (((((u64)h << 32) | l) ^ res) &
+                  ~((1 << FAM10H_MMIO_CONF_ENABLE_BIT) |
+                    (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
+                     FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+                    ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
+                     FAM10H_MMIO_CONF_BASE_SHIFT))) )
+                goto invalid;
+            if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 )
+                goto fail;
+            break;
         case MSR_IA32_PERF_CTL:
             if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
                 goto fail;
@@ -2124,11 +2154,18 @@ static int emulate_privileged_op(struct 
             if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
                 goto fail;
             break;
+        case MSR_IA32_THERM_CONTROL:
+            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+                goto fail;
+            if ( wrmsr_safe(regs->ecx, eax, edx) != 0 )
+                goto fail;
+            break;
         default:
             if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
                 break;
             if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
                  (eax != l) || (edx != h) )
+        invalid:
                 gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from "
                         "%08x:%08x to %08x:%08x.\n",
                         _p(regs->ecx), h, l, edx, eax);
@@ -2198,6 +2235,12 @@ static int emulate_privileged_op(struct 
             regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
                          MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
                          MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
+            break;
+        case MSR_IA32_THERM_CONTROL:
+            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+                goto fail;
+            if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
+                goto fail;
             break;
         default:
             if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) )
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/domain.c
--- a/xen/common/domain.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/domain.c       Fri Sep 12 14:47:40 2008 +0900
@@ -651,9 +651,11 @@ void vcpu_reset(struct vcpu *v)
 
     set_bit(_VPF_down, &v->pause_flags);
 
+    clear_bit(v->vcpu_id, d->poll_mask);
+    v->poll_evtchn = 0;
+
     v->fpu_initialised = 0;
     v->fpu_dirtied     = 0;
-    v->is_polling      = 0;
     v->is_initialised  = 0;
     v->nmi_pending     = 0;
     v->mce_pending     = 0;
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/domctl.c
--- a/xen/common/domctl.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/domctl.c       Fri Sep 12 14:47:40 2008 +0900
@@ -655,9 +655,6 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         spin_lock(&d->page_alloc_lock);
         if ( new_max >= d->tot_pages )
         {
-            ret = guest_physmap_max_mem_pages(d, new_max);
-            if ( ret != 0 )
-                break;
             d->max_pages = new_max;
             ret = 0;
         }
@@ -729,16 +726,11 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         if ( d == NULL )
             break;
 
-        ret = xsm_irq_permission(d, pirq, op->u.irq_permission.allow_access);
-        if ( ret )
-            goto irq_permission_out;
-        
         if ( op->u.irq_permission.allow_access )
             ret = irq_permit_access(d, pirq);
         else
             ret = irq_deny_access(d, pirq);
 
-    irq_permission_out:
         rcu_unlock_domain(d);
     }
     break;
@@ -757,17 +749,12 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         d = rcu_lock_domain_by_id(op->domain);
         if ( d == NULL )
             break;
-
-        ret = xsm_iomem_permission(d, mfn, op->u.iomem_permission.allow_access);
-        if ( ret )
-            goto iomem_permission_out;
 
         if ( op->u.iomem_permission.allow_access )
             ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
         else
             ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
 
-    iomem_permission_out:
         rcu_unlock_domain(d);
     }
     break;
@@ -813,6 +800,12 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
         {
             put_domain(e);
             goto set_target_out;
+        }
+
+        ret = xsm_set_target(d, e);
+        if ( ret ) {
+            put_domain(e);
+            goto set_target_out;
         }
 
         /* Hold reference on @e until we destroy @d. */
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/event_channel.c
--- a/xen/common/event_channel.c        Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/event_channel.c        Fri Sep 12 14:47:40 2008 +0900
@@ -545,6 +545,7 @@ static int evtchn_set_pending(struct vcp
 static int evtchn_set_pending(struct vcpu *v, int port)
 {
     struct domain *d = v->domain;
+    int vcpuid;
 
     /*
      * The following bit operations must happen in strict order.
@@ -564,15 +565,19 @@ static int evtchn_set_pending(struct vcp
     }
     
     /* Check if some VCPU might be polling for this event. */
-    if ( unlikely(d->is_polling) )
-    {
-        d->is_polling = 0;
-        smp_mb(); /* check vcpu poll-flags /after/ clearing domain poll-flag */
-        for_each_vcpu ( d, v )
+    if ( likely(bitmap_empty(d->poll_mask, MAX_VIRT_CPUS)) )
+        return 0;
+
+    /* Wake any interested (or potentially interested) pollers. */
+    for ( vcpuid = find_first_bit(d->poll_mask, MAX_VIRT_CPUS);
+          vcpuid < MAX_VIRT_CPUS;
+          vcpuid = find_next_bit(d->poll_mask, MAX_VIRT_CPUS, vcpuid+1) )
+    {
+        v = d->vcpu[vcpuid];
+        if ( ((v->poll_evtchn <= 0) || (v->poll_evtchn == port)) &&
+             test_and_clear_bit(vcpuid, d->poll_mask) )
         {
-            if ( !v->is_polling )
-                continue;
-            v->is_polling = 0;
+            v->poll_evtchn = 0;
             vcpu_unblock(v);
         }
     }
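
For illustration (a user-space model, not part of the changeset): the scan
above wakes only VCPUs whose poll_mask bit is set, skipping any VCPU known to
be polling a single, different port. poll_evtchn follows the convention from
the sched.h hunk below (> 0 one port, < 0 several, 0 none):

    #include <stdio.h>

    #define MAX_VIRT_CPUS 8

    struct vcpu { int poll_evtchn; };

    int main(void)
    {
        unsigned long poll_mask = 0;
        struct vcpu vcpus[MAX_VIRT_CPUS] = {{0}};
        int port = 5, vcpuid;

        vcpus[1].poll_evtchn =  5; poll_mask |= 1UL << 1; /* this port  */
        vcpus[3].poll_evtchn =  7; poll_mask |= 1UL << 3; /* other port */
        vcpus[4].poll_evtchn = -1; poll_mask |= 1UL << 4; /* many ports */

        for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
        {
            struct vcpu *v = &vcpus[vcpuid];

            if ( !(poll_mask & (1UL << vcpuid)) )
                continue;
            if ( (v->poll_evtchn <= 0) || (v->poll_evtchn == port) )
            {
                poll_mask &= ~(1UL << vcpuid);
                v->poll_evtchn = 0;
                printf("wake vcpu%d\n", vcpuid); /* vcpu_unblock() in Xen */
            }
        }
        /* Prints "wake vcpu1" and "wake vcpu4"; vcpu3 stays asleep. */
        return 0;
    }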
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/rangeset.c
--- a/xen/common/rangeset.c     Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/rangeset.c     Fri Sep 12 14:47:40 2008 +0900
@@ -10,6 +10,7 @@
 #include <xen/sched.h>
 #include <xen/errno.h>
 #include <xen/rangeset.h>
+#include <xsm/xsm.h>
 
 /* An inclusive range [s,e] and pointer to next range in ascending order. */
 struct range {
@@ -95,6 +96,10 @@ int rangeset_add_range(
 {
     struct range *x, *y;
     int rc = 0;
+
+    rc = xsm_add_range(r->domain, r->name, s, e);
+    if ( rc )
+        return rc;
 
     ASSERT(s <= e);
 
@@ -164,6 +169,10 @@ int rangeset_remove_range(
     struct range *x, *y, *t;
     int rc = 0;
 
+    rc = xsm_remove_range(r->domain, r->name, s, e);
+    if ( rc )
+        return rc;
+
     ASSERT(s <= e);
 
     spin_lock(&r->lock);
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/sched_credit.c
--- a/xen/common/sched_credit.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/sched_credit.c Fri Sep 12 14:47:40 2008 +0900
@@ -1107,6 +1107,10 @@ csched_load_balance(int cpu, struct csch
 
     BUG_ON( cpu != snext->vcpu->processor );
 
+    /* If this CPU is going offline we shouldn't steal work. */
+    if ( unlikely(!cpu_online(cpu)) )
+        goto out;
+
     if ( snext->pri == CSCHED_PRI_IDLE )
         CSCHED_STAT_CRANK(load_balance_idle);
     else if ( snext->pri == CSCHED_PRI_TS_OVER )
@@ -1149,6 +1153,7 @@ csched_load_balance(int cpu, struct csch
             return speer;
     }
 
+ out:
     /* Failed to find more important work elsewhere... */
     __runq_remove(snext);
     return snext;
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/schedule.c
--- a/xen/common/schedule.c     Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/schedule.c     Fri Sep 12 14:47:40 2008 +0900
@@ -63,11 +63,31 @@ static struct scheduler ops;
          (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
           : (typeof(ops.fn(__VA_ARGS__)))0 )
 
+static inline void trace_runstate_change(struct vcpu *v, int new_state)
+{
+    struct { uint32_t vcpu:16, domain:16; } d;
+    uint32_t event;
+
+    if ( likely(!tb_init_done) )
+        return;
+
+    d.vcpu = v->vcpu_id;
+    d.domain = v->domain->domain_id;
+
+    event = TRC_SCHED_RUNSTATE_CHANGE;
+    event |= ( v->runstate.state & 0x3 ) << 8;
+    event |= ( new_state & 0x3 ) << 4;
+
+    __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d);
+}
+
 static inline void vcpu_runstate_change(
     struct vcpu *v, int new_state, s_time_t new_entry_time)
 {
     ASSERT(v->runstate.state != new_state);
     ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));
+
+    trace_runstate_change(v, new_state);
 
     v->runstate.time[v->runstate.state] +=
         new_entry_time - v->runstate.state_entry_time;
@@ -198,6 +218,27 @@ void vcpu_wake(struct vcpu *v)
     TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
 }
 
+void vcpu_unblock(struct vcpu *v)
+{
+    if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
+        return;
+
+    /* Polling period ends when a VCPU is unblocked. */
+    if ( unlikely(v->poll_evtchn != 0) )
+    {
+        v->poll_evtchn = 0;
+        /*
+         * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
+         * this VCPU (and it then going back to sleep on poll_mask).
+         * Test-and-clear is idiomatic and ensures clear_bit not reordered.
+         */
+        if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
+            clear_bit(_VPF_blocked, &v->pause_flags);
+    }
+
+    vcpu_wake(v);
+}
+
 static void vcpu_migrate(struct vcpu *v)
 {
     unsigned long flags;
@@ -247,6 +288,48 @@ void vcpu_force_reschedule(struct vcpu *
     }
 }
 
+/*
+ * This function is used by cpu_hotplug code from stop_machine context.
+ * Hence we can avoid needing to take the schedule_lock.
+ */
+void cpu_disable_scheduler(void)
+{
+    struct domain *d;
+    struct vcpu *v;
+    unsigned int cpu = smp_processor_id();
+
+    for_each_domain ( d )
+    {
+        for_each_vcpu ( d, v )
+        {
+            if ( is_idle_vcpu(v) )
+                continue;
+
+            if ( (cpus_weight(v->cpu_affinity) == 1) &&
+                 cpu_isset(cpu, v->cpu_affinity) )
+            {
+                printk("Breaking vcpu affinity for domain %d vcpu %d\n",
+                        v->domain->domain_id, v->vcpu_id);
+                cpus_setall(v->cpu_affinity);
+            }
+
+            /*
+             * Migrate single-shot timers to CPU0. A new cpu will automatically
+             * be chosen when the timer is next re-set.
+             */
+            if ( v->singleshot_timer.cpu == cpu )
+                migrate_timer(&v->singleshot_timer, 0);
+
+            if ( v->processor == cpu )
+            {
+                set_bit(_VPF_migrating, &v->pause_flags);
+                vcpu_sleep_nosync(v);
+                vcpu_migrate(v);
+            }
+        }
+    }
+}
+
 static int __vcpu_set_affinity(
     struct vcpu *v, cpumask_t *affinity,
     bool_t old_lock_status, bool_t new_lock_status)
@@ -337,7 +420,7 @@ static long do_poll(struct sched_poll *s
     struct vcpu   *v = current;
     struct domain *d = v->domain;
     evtchn_port_t  port;
-    long           rc = 0;
+    long           rc;
     unsigned int   i;
 
     /* Fairly arbitrary limit. */
@@ -348,11 +431,24 @@ static long do_poll(struct sched_poll *s
         return -EFAULT;
 
     set_bit(_VPF_blocked, &v->pause_flags);
-    v->is_polling = 1;
-    d->is_polling = 1;
-
+    v->poll_evtchn = -1;
+    set_bit(v->vcpu_id, d->poll_mask);
+
+#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
     /* Check for events /after/ setting flags: avoids wakeup waiting race. */
-    smp_wmb();
+    smp_mb();
+
+    /*
+     * Someone may have seen we are blocked but not that we are polling, or
+     * vice versa. We are certainly being woken, so clean up and bail. Beyond
+     * this point others can be guaranteed to clean up for us if they wake us.
+     */
+    rc = 0;
+    if ( (v->poll_evtchn == 0) ||
+         !test_bit(_VPF_blocked, &v->pause_flags) ||
+         !test_bit(v->vcpu_id, d->poll_mask) )
+        goto out;
+#endif
 
     for ( i = 0; i < sched_poll->nr_ports; i++ )
     {
@@ -369,6 +465,9 @@ static long do_poll(struct sched_poll *s
             goto out;
     }
 
+    if ( sched_poll->nr_ports == 1 )
+        v->poll_evtchn = port;
+
     if ( sched_poll->timeout != 0 )
         set_timer(&v->poll_timer, sched_poll->timeout);
 
@@ -378,7 +477,8 @@ static long do_poll(struct sched_poll *s
     return 0;
 
  out:
-    v->is_polling = 0;
+    v->poll_evtchn = 0;
+    clear_bit(v->vcpu_id, d->poll_mask);
     clear_bit(_VPF_blocked, &v->pause_flags);
     return rc;
 }
@@ -628,7 +728,9 @@ static void vcpu_periodic_timer_work(str
         return;
 
     periodic_next_event = v->periodic_last_event + v->periodic_period;
-    if ( now > periodic_next_event )
+
+    /* The timer subsystem may call us up to TIME_SLOP ahead of deadline. */
+    if ( (now + TIME_SLOP) > periodic_next_event )
     {
         send_timer_event(v);
         v->periodic_last_event = now;
@@ -758,11 +860,8 @@ static void poll_timer_fn(void *data)
 {
     struct vcpu *v = data;
 
-    if ( !v->is_polling )
-        return;
-
-    v->is_polling = 0;
-    vcpu_unblock(v);
+    if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
+        vcpu_unblock(v);
 }
 
 /* Initialise the data structures. */
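
For illustration (a hypothetical PV-guest sketch, not part of the changeset):
the single-port case that v->poll_evtchn now fast-paths looks like this from
the guest side. HYPERVISOR_sched_op() and set_xen_guest_handle() are the usual
guest wrappers and are assumed here; include paths vary by guest OS (the ABI
lives in public/sched.h):

    static long wait_for_port(evtchn_port_t port, uint64_t deadline_ns)
    {
        struct sched_poll poll = {
            .nr_ports = 1,
            .timeout  = deadline_ns, /* absolute ns since boot; 0 = none */
        };

        set_xen_guest_handle(poll.ports, &port);

        /* Returns 0 once a listed port is pending or the timeout expires. */
        return HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
    }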
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/sysctl.c
--- a/xen/common/sysctl.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/sysctl.c       Fri Sep 12 14:47:40 2008 +0900
@@ -149,6 +149,10 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
         char c;
         uint32_t i;
 
+        ret = xsm_debug_keys();
+        if ( ret )
+            break;
+
         for ( i = 0; i < op->u.debug_keys.nr_keys; i++ )
         {
             if ( copy_from_guest_offset(&c, op->u.debug_keys.keys, i, 1) )
@@ -166,6 +170,10 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
 
         nr_cpus = min_t(uint32_t, op->u.getcpuinfo.max_cpus, NR_CPUS);
 
+        ret = xsm_getcpuinfo();
+        if ( ret )
+            break;
+
         for ( i = 0; i < nr_cpus; i++ )
         {
             /* Assume no holes in idle-vcpu map. */
@@ -188,6 +196,10 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
 
     case XEN_SYSCTL_availheap:
     { 
+        ret = xsm_availheap();
+        if ( ret )
+            break;
+
         op->u.availheap.avail_bytes = avail_domheap_pages_region(
             op->u.availheap.node,
             op->u.availheap.min_bitwidth,
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/trace.c
--- a/xen/common/trace.c        Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/common/trace.c        Fri Sep 12 14:47:40 2008 +0900
@@ -58,6 +58,7 @@ static int t_buf_highwater;
 
 /* Number of records lost due to per-CPU trace buffer being full. */
 static DEFINE_PER_CPU(unsigned long, lost_records);
+static DEFINE_PER_CPU(unsigned long, lost_records_first_tsc);
 
 /* a flag recording whether initialization has been done */
 /* or more properly, if the tbuf subsystem is enabled right now */
@@ -147,6 +148,31 @@ static int tb_set_size(int size)
     return 0;
 }
 
+int trace_will_trace_event(u32 event)
+{
+    if ( !tb_init_done )
+        return 0;
+
+    /*
+     * Copied from __trace_var()
+     */
+    if ( (tb_event_mask & event) == 0 )
+        return 0;
+
+    /* match class */
+    if ( ((tb_event_mask >> TRC_CLS_SHIFT) & (event >> TRC_CLS_SHIFT)) == 0 )
+        return 0;
+
+    /* then match subclass */
+    if ( (((tb_event_mask >> TRC_SUBCLS_SHIFT) & 0xf )
+                & ((event >> TRC_SUBCLS_SHIFT) & 0xf )) == 0 )
+        return 0;
+
+    if ( !cpu_isset(smp_processor_id(), tb_cpu_mask) )
+        return 0;
+
+    return 1;
+}
 
 /**
  * init_trace_bufs - performs initialization of the per-cpu trace buffers.
@@ -354,22 +380,27 @@ static inline int insert_wrap_record(str
                     NULL);
 }
 
-#define LOST_REC_SIZE 8
+#define LOST_REC_SIZE (4 + 8 + 16) /* header + tsc + sizeof(struct ed) */
 
 static inline int insert_lost_records(struct t_buf *buf)
 {
     struct {
         u32 lost_records;
-    } ed;
-
+        u32 did:16, vid:16;
+        u64 first_tsc;
+    } __attribute__((packed)) ed;
+
+    ed.vid = current->vcpu_id;
+    ed.did = current->domain->domain_id;
     ed.lost_records = this_cpu(lost_records);
+    ed.first_tsc = this_cpu(lost_records_first_tsc);
 
     this_cpu(lost_records) = 0;
 
     return __insert_record(buf,
                            TRC_LOST_RECORDS,
                            sizeof(ed),
-                           0 /* !cycles */,
+                           1 /* cycles */,
                            LOST_REC_SIZE,
                            (unsigned char *)&ed);
 }
@@ -401,7 +432,8 @@ void __trace_var(u32 event, int cycles, 
     int extra_word;
     int started_below_highwater;
 
-    ASSERT(tb_init_done);
+    if ( !tb_init_done )
+        return;
 
     /* Convert byte count into word count, rounding up */
     extra_word = (extra / sizeof(u32));
@@ -479,7 +511,8 @@ void __trace_var(u32 event, int cycles, 
     /* Do we have enough space for everything? */
     if ( total_size > bytes_to_tail )
     {
-        this_cpu(lost_records)++;
+        if ( ++this_cpu(lost_records) == 1 )
+            this_cpu(lost_records_first_tsc) = (u64)get_cycles();
         local_irq_restore(flags);
         return;
     }
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/drivers/acpi/hwregs.c
--- a/xen/drivers/acpi/hwregs.c Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/drivers/acpi/hwregs.c Fri Sep 12 14:47:40 2008 +0900
@@ -239,11 +239,13 @@ acpi_status acpi_set_register(u32 regist
 
        case ACPI_REGISTER_PM2_CONTROL:
 
+#if 0 /* Redundant read in original Linux code. */
                status = acpi_hw_register_read(ACPI_REGISTER_PM2_CONTROL,
                                               &register_value);
                if (ACPI_FAILURE(status)) {
                        goto unlock_and_exit;
                }
+#endif
 
                ACPI_DEBUG_PRINT((ACPI_DB_IO,
                                  "PM2 control: Read %X from %8.8X%8.8X\n",
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/drivers/passthrough/iommu.c
--- a/xen/drivers/passthrough/iommu.c   Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/drivers/passthrough/iommu.c   Fri Sep 12 14:47:40 2008 +0900
@@ -33,11 +33,13 @@ int amd_iov_detect(void);
  *   pv                         Enable IOMMU for PV domains
  *   no-pv                      Disable IOMMU for PV domains (default)
  *   force|required             Don't boot unless IOMMU is enabled
+ *   passthrough                Bypass VT-d translation for Dom0
  */
 custom_param("iommu", parse_iommu_param);
 int iommu_enabled = 0;
 int iommu_pv_enabled = 0;
 int force_iommu = 0;
+int iommu_passthrough = 0;
 
 static void __init parse_iommu_param(char *s)
 {
@@ -58,6 +60,8 @@ static void __init parse_iommu_param(cha
             iommu_pv_enabled = 0;
         else if ( !strcmp(s, "force") || !strcmp(s, "required") )
             force_iommu = 1;
+        else if ( !strcmp(s, "passthrough") )
+            iommu_passthrough = 1;
 
         s = ss + 1;
     } while ( ss );
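
For illustration (a user-space model, not part of the changeset): the parser
splits the comma-separated "iommu=" string in place and matches each token, so
e.g. "iommu=pv,passthrough" sets both flags. The enable/disable tokens handled
earlier in the real loop are elided here:

    #include <stdio.h>
    #include <string.h>

    static int iommu_pv_enabled, force_iommu, iommu_passthrough;

    static void parse_iommu_param(char *s)
    {
        char *ss;
        do {
            ss = strchr(s, ',');
            if ( ss )
                *ss = '\0';
            if ( !strcmp(s, "pv") )
                iommu_pv_enabled = 1;
            else if ( !strcmp(s, "no-pv") )
                iommu_pv_enabled = 0;
            else if ( !strcmp(s, "force") || !strcmp(s, "required") )
                force_iommu = 1;
            else if ( !strcmp(s, "passthrough") )
                iommu_passthrough = 1;
            s = ss + 1;
        } while ( ss );
    }

    int main(void)
    {
        char opt[] = "pv,passthrough";
        parse_iommu_param(opt);
        printf("pv=%d force=%d passthrough=%d\n",
               iommu_pv_enabled, force_iommu, iommu_passthrough);
        return 0;
    }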
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c       Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/drivers/passthrough/vtd/iommu.c       Fri Sep 12 14:47:40 2008 +0900
@@ -1090,12 +1090,13 @@ static int domain_context_mapping_one(
     }
 
     spin_lock_irqsave(&iommu->lock, flags);
-
-#ifdef CONTEXT_PASSTHRU
-    if ( ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) )
+    if ( iommu_passthrough &&
+         ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) )
+    {
         context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
+        agaw = level_to_agaw(iommu->nr_pt_levels);
+    }
     else
-#endif
     {
         /* Ensure we have pagetables allocated down to leaf PTE. */
         if ( hd->pgd_maddr == 0 )
@@ -1459,11 +1460,13 @@ int intel_iommu_map_page(
     u64 pg_maddr;
     int pte_present;
 
-#ifdef CONTEXT_PASSTHRU
+    drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
+    iommu = drhd->iommu;
+
     /* do nothing if dom0 and iommu supports pass thru */
-    if ( ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+    if ( iommu_passthrough &&
+         ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
         return 0;
-#endif
 
     pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
     if ( pg_maddr == 0 )
@@ -1500,11 +1503,10 @@ int intel_iommu_unmap_page(struct domain
     drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list);
     iommu = drhd->iommu;
 
-#ifdef CONTEXT_PASSTHRU
     /* do nothing if dom0 and iommu supports pass thru */
-    if ( ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
+    if ( iommu_passthrough &&
+         ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) )
         return 0;
-#endif
 
     dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-ia64/shadow.h
--- a/xen/include/asm-ia64/shadow.h     Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-ia64/shadow.h     Fri Sep 12 14:47:40 2008 +0900
@@ -63,8 +63,6 @@ shadow_mark_page_dirty(struct domain *d,
         return 0;
 }
 
-#define guest_physmap_max_mem_pages(d, n) (0)
-
 #endif // _XEN_SHADOW_H
 
 /*
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/bitops.h
--- a/xen/include/asm-x86/bitops.h      Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/bitops.h      Fri Sep 12 14:47:40 2008 +0900
@@ -116,8 +116,8 @@ static inline void __clear_bit(int nr, v
     __clear_bit(nr, addr);                              \
 })
 
-#define smp_mb__before_clear_bit() barrier()
-#define smp_mb__after_clear_bit()  barrier()
+#define smp_mb__before_clear_bit() ((void)0)
+#define smp_mb__after_clear_bit()  ((void)0)
 
 /**
  * __change_bit - Toggle a bit in memory
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/guest_access.h
--- a/xen/include/asm-x86/guest_access.h        Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/guest_access.h        Fri Sep 12 14:47:40 2008 +0900
@@ -8,7 +8,7 @@
 #define __ASM_X86_GUEST_ACCESS_H__
 
 #include <asm/uaccess.h>
-#include <asm/shadow.h>
+#include <asm/paging.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/guest_access.h>
 
@@ -87,10 +87,10 @@
  * Allows use of faster __copy_* functions.
  */
 #define guest_handle_okay(hnd, nr)                      \
-    (shadow_mode_external(current->domain) ||           \
+    (paging_mode_external(current->domain) ||           \
      array_access_ok((hnd).p, (nr), sizeof(*(hnd).p)))
 #define guest_handle_subrange_okay(hnd, first, last)    \
-    (shadow_mode_external(current->domain) ||           \
+    (paging_mode_external(current->domain) ||           \
      array_access_ok((hnd).p + (first),                 \
                      (last)-(first)+1,                  \
                      sizeof(*(hnd).p)))
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/hvm/trace.h
--- a/xen/include/asm-x86/hvm/trace.h   Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/hvm/trace.h   Fri Sep 12 14:47:40 2008 +0900
@@ -56,16 +56,13 @@
 #define TRC_PAR_LONG(par) (par)
 #endif
 
-#define HVMTRACE_ND(evt, cycles, vcpu, count, d1, d2, d3, d4, d5, d6)   \
+#define HVMTRACE_ND(evt, cycles, count, d1, d2, d3, d4, d5, d6)         \
     do {                                                                \
         if ( unlikely(tb_init_done) && DO_TRC_HVM_ ## evt )             \
         {                                                               \
             struct {                                                    \
-                u32 did:16, vid:16;                                     \
                 u32 d[6];                                               \
             } _d;                                                       \
-            _d.did=(vcpu)->domain->domain_id;                           \
-            _d.vid=(vcpu)->vcpu_id;                                     \
             _d.d[0]=(d1);                                               \
             _d.d[1]=(d2);                                               \
             _d.d[2]=(d3);                                               \
@@ -77,32 +74,32 @@
         }                                                               \
     } while(0)
 
-#define HVMTRACE_6D(evt, vcpu, d1, d2, d3, d4, d5, d6)    \
-                      HVMTRACE_ND(evt, 0, vcpu, 6, d1, d2, d3,  d4, d5, d6)
-#define HVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5)        \
-                      HVMTRACE_ND(evt, 0, vcpu, 5, d1, d2, d3,  d4, d5, 0)
-#define HVMTRACE_4D(evt, vcpu, d1, d2, d3, d4)               \
-                      HVMTRACE_ND(evt, 0, vcpu, 4, d1, d2, d3,  d4, 0, 0)
-#define HVMTRACE_3D(evt, vcpu, d1, d2, d3)                   \
-                      HVMTRACE_ND(evt, 0, vcpu, 3, d1, d2, d3,  0, 0, 0)
-#define HVMTRACE_2D(evt, vcpu, d1, d2)                       \
-                      HVMTRACE_ND(evt, 0, vcpu, 2, d1, d2,  0,  0, 0, 0)
-#define HVMTRACE_1D(evt, vcpu, d1)                           \
-                      HVMTRACE_ND(evt, 0, vcpu, 1, d1,  0,  0,  0, 0, 0)
-#define HVMTRACE_0D(evt, vcpu)                               \
-                      HVMTRACE_ND(evt, 0, vcpu, 0, 0,  0,  0,  0, 0, 0)
+#define HVMTRACE_6D(evt, d1, d2, d3, d4, d5, d6)    \
+                      HVMTRACE_ND(evt, 0, 6, d1, d2, d3,  d4, d5, d6)
+#define HVMTRACE_5D(evt, d1, d2, d3, d4, d5)        \
+                      HVMTRACE_ND(evt, 0, 5, d1, d2, d3,  d4, d5, 0)
+#define HVMTRACE_4D(evt, d1, d2, d3, d4)               \
+                      HVMTRACE_ND(evt, 0, 4, d1, d2, d3,  d4, 0, 0)
+#define HVMTRACE_3D(evt, d1, d2, d3)                   \
+                      HVMTRACE_ND(evt, 0, 3, d1, d2, d3,  0, 0, 0)
+#define HVMTRACE_2D(evt, d1, d2)                       \
+                      HVMTRACE_ND(evt, 0, 2, d1, d2,  0,  0, 0, 0)
+#define HVMTRACE_1D(evt, d1)                           \
+                      HVMTRACE_ND(evt, 0, 1, d1,  0,  0,  0, 0, 0)
+#define HVMTRACE_0D(evt)                               \
+                      HVMTRACE_ND(evt, 0, 0, 0,  0,  0,  0, 0, 0)
 
 
 
 #ifdef __x86_64__
-#define HVMTRACE_LONG_1D(evt, vcpu, d1)                  \
-                   HVMTRACE_2D(evt ## 64, vcpu, (d1) & 0xFFFFFFFF, (d1) >> 32)
-#define HVMTRACE_LONG_2D(evt,vcpu,d1,d2, ...)              \
-                   HVMTRACE_3D(evt ## 64, vcpu, d1, d2)
-#define HVMTRACE_LONG_3D(evt, vcpu, d1, d2, d3, ...)      \
-                   HVMTRACE_4D(evt ## 64, vcpu, d1, d2, d3)
-#define HVMTRACE_LONG_4D(evt, vcpu, d1, d2, d3, d4, ...)  \
-                   HVMTRACE_5D(evt ## 64, vcpu, d1, d2, d3, d4)
+#define HVMTRACE_LONG_1D(evt, d1)                  \
+                   HVMTRACE_2D(evt ## 64, (d1) & 0xFFFFFFFF, (d1) >> 32)
+#define HVMTRACE_LONG_2D(evt, d1, d2, ...)              \
+                   HVMTRACE_3D(evt ## 64, d1, d2)
+#define HVMTRACE_LONG_3D(evt, d1, d2, d3, ...)      \
+                   HVMTRACE_4D(evt ## 64, d1, d2, d3)
+#define HVMTRACE_LONG_4D(evt, d1, d2, d3, d4, ...)  \
+                   HVMTRACE_5D(evt ## 64, d1, d2, d3, d4)
 #else
 #define HVMTRACE_LONG_1D HVMTRACE_1D
 #define HVMTRACE_LONG_2D HVMTRACE_2D
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/io_apic.h
--- a/xen/include/asm-x86/io_apic.h     Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/io_apic.h     Fri Sep 12 14:47:40 2008 +0900
@@ -162,8 +162,6 @@ static inline void io_apic_modify(unsign
 /* 1 if "noapic" boot option passed */
 extern int skip_ioapic_setup;
 
-extern int msi_enable;
-
 /*
  * If we use the IO-APIC for IRQ routing, disable automatic
  * assignment of PCI IRQ's.
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/mm.h  Fri Sep 12 14:47:40 2008 +0900
@@ -57,6 +57,17 @@ struct page_info
          * (except page table pages when the guest is in shadow mode).
          */
         u32 tlbflush_timestamp;
+
+        /*
+         * When PGT_partial is true then this field is valid and indicates
+         * that PTEs in the range [0, @nr_validated_ptes) have been validated.
+         * If @partial_pte is true then the PTE at @nr_validated_ptes has been
+         * partially validated.
+         */
+        struct {
+            u16 nr_validated_ptes;
+            bool_t partial_pte;
+        };
 
         /*
          * Guest pages with a shadow.  This does not conflict with
@@ -86,9 +97,12 @@ struct page_info
  /* PAE only: is this an L2 page directory containing Xen-private mappings? */
 #define _PGT_pae_xen_l2     26
 #define PGT_pae_xen_l2      (1U<<_PGT_pae_xen_l2)
-
- /* 26-bit count of uses of this frame as its current type. */
-#define PGT_count_mask      ((1U<<26)-1)
+/* Has this page been *partially* validated for use as its current type? */
+#define _PGT_partial        25
+#define PGT_partial         (1U<<_PGT_partial)
+
+ /* 25-bit count of uses of this frame as its current type. */
+#define PGT_count_mask      ((1U<<25)-1)
 
  /* Cleared when the owning guest 'frees' this page. */
 #define _PGC_allocated      31
@@ -154,7 +168,8 @@ extern unsigned long total_pages;
 extern unsigned long total_pages;
 void init_frametable(void);
 
-void free_page_type(struct page_info *page, unsigned long type);
+int free_page_type(struct page_info *page, unsigned long type,
+                   int preemptible);
 int _shadow_mode_refcounts(struct domain *d);
 
 void cleanup_page_cacheattr(struct page_info *page);
@@ -165,6 +180,8 @@ int  get_page(struct page_info *page, st
 int  get_page(struct page_info *page, struct domain *domain);
 void put_page_type(struct page_info *page);
 int  get_page_type(struct page_info *page, unsigned long type);
+int  put_page_type_preemptible(struct page_info *page);
+int  get_page_type_preemptible(struct page_info *page, unsigned long type);
 int  get_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d);
 
@@ -174,6 +191,19 @@ static inline void put_page_and_type(str
     put_page(page);
 }
 
+static inline int put_page_and_type_preemptible(struct page_info *page,
+                                                int preemptible)
+{
+    int rc = 0;
+
+    if ( preemptible )
+        rc = put_page_type_preemptible(page);
+    else
+        put_page_type(page);
+    if ( likely(rc == 0) )
+        put_page(page);
+    return rc;
+}
 
 static inline int get_page_and_type(struct page_info *page,
                                     struct domain *domain,
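
For illustration (a hedged sketch, not part of the changeset): a caller that
must drop a type reference to completion would loop while the preemptible path
reports that it stopped early. Treating -EINTR/-EAGAIN as the "stopped early"
codes is an assumption here; in the hypervisor proper the error would instead
be propagated so the hypercall can be continued later:

    static int drop_type_ref_completely(struct page_info *page)
    {
        int rc;

        do {
            rc = put_page_and_type_preemptible(page, 1 /* preemptible */);
        } while ( (rc == -EINTR) || (rc == -EAGAIN) );

        return rc;
    }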
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/msr-index.h
--- a/xen/include/asm-x86/msr-index.h   Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/msr-index.h   Fri Sep 12 14:47:40 2008 +0900
@@ -194,10 +194,22 @@
 #define _K8_VMCR_SVME_DISABLE          4
 #define K8_VMCR_SVME_DISABLE           (1 << _K8_VMCR_SVME_DISABLE)
 
+/* AMD64 MSRs */
+#define MSR_AMD64_NB_CFG               0xc001001f
+#define AMD64_NB_CFG_CF8_EXT_ENABLE_BIT        46
+
 /* AMD Family10h machine check MSRs */
 #define MSR_F10_MC4_MISC1              0xc0000408
 #define MSR_F10_MC4_MISC2              0xc0000409
 #define MSR_F10_MC4_MISC3              0xc000040A
+
+/* Other AMD Fam10h MSRs */
+#define MSR_FAM10H_MMIO_CONF_BASE      0xc0010058
+#define FAM10H_MMIO_CONF_ENABLE_BIT    0
+#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
+#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
+#define FAM10H_MMIO_CONF_BASE_MASK     0xfffffff
+#define FAM10H_MMIO_CONF_BASE_SHIFT    20
 
 /* K6 MSRs */
 #define MSR_K6_EFER                    0xc0000080
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/shadow.h
--- a/xen/include/asm-x86/shadow.h      Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/asm-x86/shadow.h      Fri Sep 12 14:47:40 2008 +0900
@@ -115,8 +115,6 @@ static inline void shadow_remove_all_sha
     sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
 }
 
-#define guest_physmap_max_mem_pages(d, n) (0)
-
 #endif /* _XEN_SHADOW_H */
 
 /*
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/public/trace.h
--- a/xen/include/public/trace.h        Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/public/trace.h        Fri Sep 12 14:47:40 2008 +0900
@@ -37,6 +37,7 @@
 #define TRC_HVM      0x0008f000    /* Xen HVM trace            */
 #define TRC_MEM      0x0010f000    /* Xen memory trace         */
 #define TRC_PV       0x0020f000    /* Xen PV traces            */
+#define TRC_SHADOW   0x0040f000    /* Xen shadow tracing       */
 #define TRC_ALL      0x0ffff000
 #define TRC_HD_TO_EVENT(x) ((x)&0x0fffffff)
 #define TRC_HD_CYCLE_FLAG (1UL<<31)
@@ -50,26 +51,30 @@
 #define TRC_HVM_ENTRYEXIT 0x00081000   /* VMENTRY and #VMEXIT       */
 #define TRC_HVM_HANDLER   0x00082000   /* various HVM handlers      */
 
+#define TRC_SCHED_MIN       0x00021000   /* Just runstate changes */
+#define TRC_SCHED_VERBOSE   0x00028000   /* More inclusive scheduling */
+
 /* Trace events per class */
 #define TRC_LOST_RECORDS        (TRC_GEN + 1)
 #define TRC_TRACE_WRAP_BUFFER  (TRC_GEN + 2)
 #define TRC_TRACE_CPU_CHANGE    (TRC_GEN + 3)
 
-#define TRC_SCHED_DOM_ADD       (TRC_SCHED +  1)
-#define TRC_SCHED_DOM_REM       (TRC_SCHED +  2)
-#define TRC_SCHED_SLEEP         (TRC_SCHED +  3)
-#define TRC_SCHED_WAKE          (TRC_SCHED +  4)
-#define TRC_SCHED_YIELD         (TRC_SCHED +  5)
-#define TRC_SCHED_BLOCK         (TRC_SCHED +  6)
-#define TRC_SCHED_SHUTDOWN      (TRC_SCHED +  7)
-#define TRC_SCHED_CTL           (TRC_SCHED +  8)
-#define TRC_SCHED_ADJDOM        (TRC_SCHED +  9)
-#define TRC_SCHED_SWITCH        (TRC_SCHED + 10)
-#define TRC_SCHED_S_TIMER_FN    (TRC_SCHED + 11)
-#define TRC_SCHED_T_TIMER_FN    (TRC_SCHED + 12)
-#define TRC_SCHED_DOM_TIMER_FN  (TRC_SCHED + 13)
-#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14)
-#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15)
+#define TRC_SCHED_RUNSTATE_CHANGE (TRC_SCHED_MIN + 1)
+#define TRC_SCHED_DOM_ADD        (TRC_SCHED_VERBOSE +  1)
+#define TRC_SCHED_DOM_REM        (TRC_SCHED_VERBOSE +  2)
+#define TRC_SCHED_SLEEP          (TRC_SCHED_VERBOSE +  3)
+#define TRC_SCHED_WAKE           (TRC_SCHED_VERBOSE +  4)
+#define TRC_SCHED_YIELD          (TRC_SCHED_VERBOSE +  5)
+#define TRC_SCHED_BLOCK          (TRC_SCHED_VERBOSE +  6)
+#define TRC_SCHED_SHUTDOWN       (TRC_SCHED_VERBOSE +  7)
+#define TRC_SCHED_CTL            (TRC_SCHED_VERBOSE +  8)
+#define TRC_SCHED_ADJDOM         (TRC_SCHED_VERBOSE +  9)
+#define TRC_SCHED_SWITCH         (TRC_SCHED_VERBOSE + 10)
+#define TRC_SCHED_S_TIMER_FN     (TRC_SCHED_VERBOSE + 11)
+#define TRC_SCHED_T_TIMER_FN     (TRC_SCHED_VERBOSE + 12)
+#define TRC_SCHED_DOM_TIMER_FN   (TRC_SCHED_VERBOSE + 13)
+#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED_VERBOSE + 14)
+#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED_VERBOSE + 15)
 
 #define TRC_MEM_PAGE_GRANT_MAP      (TRC_MEM + 1)
 #define TRC_MEM_PAGE_GRANT_UNMAP    (TRC_MEM + 2)
@@ -88,6 +93,22 @@
 #define TRC_PV_PTWR_EMULATION_PAE    (TRC_PV + 12)
   /* Indicates that addresses in trace record are 64 bits */
 #define TRC_64_FLAG               (0x100) 
+
+#define TRC_SHADOW_NOT_SHADOW                 (TRC_SHADOW +  1)
+#define TRC_SHADOW_FAST_PROPAGATE             (TRC_SHADOW +  2)
+#define TRC_SHADOW_FAST_MMIO                  (TRC_SHADOW +  3)
+#define TRC_SHADOW_FALSE_FAST_PATH            (TRC_SHADOW +  4)
+#define TRC_SHADOW_MMIO                       (TRC_SHADOW +  5)
+#define TRC_SHADOW_FIXUP                      (TRC_SHADOW +  6)
+#define TRC_SHADOW_DOMF_DYING                 (TRC_SHADOW +  7)
+#define TRC_SHADOW_EMULATE                    (TRC_SHADOW +  8)
+#define TRC_SHADOW_EMULATE_UNSHADOW_USER      (TRC_SHADOW +  9)
+#define TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ    (TRC_SHADOW + 10)
+#define TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED (TRC_SHADOW + 11)
+#define TRC_SHADOW_WRMAP_BF                   (TRC_SHADOW + 12)
+#define TRC_SHADOW_PREALLOC_UNPIN             (TRC_SHADOW + 13)
+#define TRC_SHADOW_RESYNC_FULL                (TRC_SHADOW + 14)
+#define TRC_SHADOW_RESYNC_ONLY                (TRC_SHADOW + 15)
 
 /* trace events per subclass */
 #define TRC_HVM_VMENTRY         (TRC_HVM_ENTRYEXIT + 0x01)
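
For illustration (a standalone model, not part of the changeset): the
class/subclass filter added to trace.c above can be checked against the new
TRC_SCHED_MIN / TRC_SCHED_VERBOSE split. TRC_CLS_SHIFT = 16 and
TRC_SUBCLS_SHIFT = 12 follow the usual Xen trace encoding and are assumptions
here:

    #include <stdio.h>
    #include <stdint.h>

    #define TRC_CLS_SHIFT    16
    #define TRC_SUBCLS_SHIFT 12

    #define TRC_SCHED_MIN     0x00021000u
    #define TRC_SCHED_VERBOSE 0x00028000u
    #define TRC_SCHED_RUNSTATE_CHANGE (TRC_SCHED_MIN + 1)
    #define TRC_SCHED_WAKE            (TRC_SCHED_VERBOSE + 4)

    static int event_enabled(uint32_t mask, uint32_t event)
    {
        if ( (mask & event) == 0 )
            return 0;
        if ( ((mask >> TRC_CLS_SHIFT) & (event >> TRC_CLS_SHIFT)) == 0 )
            return 0;
        if ( (((mask >> TRC_SUBCLS_SHIFT) & 0xf) &
              ((event >> TRC_SUBCLS_SHIFT) & 0xf)) == 0 )
            return 0;
        return 1;
    }

    int main(void)
    {
        uint32_t mask = TRC_SCHED_MIN; /* runstate changes only */

        printf("runstate change traced: %d\n",
               event_enabled(mask, TRC_SCHED_RUNSTATE_CHANGE)); /* 1 */
        printf("wakeup traced:          %d\n",
               event_enabled(mask, TRC_SCHED_WAKE));            /* 0 */
        return 0;
    }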
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/cpuidle.h
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/cpuidle.h Fri Sep 12 14:47:40 2008 +0900
@@ -0,0 +1,82 @@
+/*
+ * cpuidle.h - xen idle state module derived from Linux 
+ *
+ * (C) 2007 Venkatesh Pallipadi <venkatesh.pallipadi@xxxxxxxxx>
+ *          Shaohua Li <shaohua.li@xxxxxxxxx>
+ *          Adam Belay <abelay@xxxxxxxxxx>
+ *  Copyright (C) 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#ifndef _XEN_CPUIDLE_H
+#define _XEN_CPUIDLE_H
+
+#define ACPI_PROCESSOR_MAX_POWER        8
+#define CPUIDLE_NAME_LEN                16
+
+struct acpi_processor_cx
+{
+    u8 valid;
+    u8 type;
+    u32 address;
+    u8 space_id;
+    u32 latency;
+    u32 latency_ticks;
+    u32 power;
+    u32 usage;
+    u64 time;
+    u32 target_residency;
+};
+
+struct acpi_processor_flags
+{
+    u8 bm_control:1;
+    u8 bm_check:1;
+    u8 has_cst:1;
+    u8 power_setup_done:1;
+    u8 bm_rld_set:1;
+};
+
+struct acpi_processor_power
+{
+    unsigned int cpu;
+    struct acpi_processor_flags flags;
+    struct acpi_processor_cx *last_state;
+    struct acpi_processor_cx *safe_state;
+    u32 last_residency;
+    void *gdata; /* governor specific data */
+    u32 count;
+    struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
+};
+
+struct cpuidle_governor
+{
+    char                    name[CPUIDLE_NAME_LEN];
+    unsigned int            rating;
+
+    int  (*enable)          (struct acpi_processor_power *dev);
+    void (*disable)         (struct acpi_processor_power *dev);
+
+    int  (*select)          (struct acpi_processor_power *dev);
+    void (*reflect)         (struct acpi_processor_power *dev);
+};
+
+extern struct cpuidle_governor *cpuidle_current_governor;
+
+#endif /* _XEN_CPUIDLE_H */
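
For illustration (a hedged sketch, not part of the changeset): a minimal
governor implementing the interface above might always pick the deepest valid
C-state. The real menu-style governors weigh latency and predicted residency,
and how cpuidle_current_governor gets assigned is elided here:

    static int trivial_select(struct acpi_processor_power *dev)
    {
        unsigned int i, best = 0;

        for ( i = 0; i < dev->count; i++ )
            if ( dev->states[i].valid )
                best = i;
        return best;
    }

    static void trivial_reflect(struct acpi_processor_power *dev)
    {
        /* Nothing learned from the last residency. */
    }

    static struct cpuidle_governor trivial_governor = {
        .name    = "trivial",
        .rating  = 1,
        .select  = trivial_select,
        .reflect = trivial_reflect,
    };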
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/iommu.h
--- a/xen/include/xen/iommu.h   Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/xen/iommu.h   Fri Sep 12 14:47:40 2008 +0900
@@ -31,6 +31,7 @@ extern int iommu_enabled;
 extern int iommu_enabled;
 extern int iommu_pv_enabled;
 extern int force_iommu;
+extern int iommu_passthrough;
 
 #define domain_hvm_iommu(d)     (&d->arch.hvm_domain.hvm_iommu)
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/xen/sched.h   Fri Sep 12 14:47:40 2008 +0900
@@ -106,8 +106,6 @@ struct vcpu
     bool_t           fpu_initialised;
     /* Has the FPU been used since it was last saved? */
     bool_t           fpu_dirtied;
-    /* Is this VCPU polling any event channels (SCHEDOP_poll)? */
-    bool_t           is_polling;
     /* Initialization completed for this VCPU? */
     bool_t           is_initialised;
     /* Currently running on a CPU? */
@@ -133,6 +131,13 @@ struct vcpu
     bool_t           paused_for_shutdown;
     /* VCPU affinity is temporarily locked from controller changes? */
     bool_t           affinity_locked;
+
+    /*
+     * > 0: a single port is being polled;
+     * = 0: nothing is being polled (vcpu should be clear in d->poll_mask);
+     * < 0: multiple ports may be being polled.
+     */
+    int              poll_evtchn;
 
     unsigned long    pause_flags;
     atomic_t         pause_count;
@@ -209,14 +214,15 @@ struct domain
     struct domain   *target;
     /* Is this guest being debugged by dom0? */
     bool_t           debugger_attached;
-    /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
-    bool_t           is_polling;
     /* Is this guest dying (i.e., a zombie)? */
     enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying;
     /* Domain is paused by controller software? */
     bool_t           is_paused_by_controller;
     /* Domain's VCPUs are pinned 1:1 to physical CPUs? */
     bool_t           is_pinned;
+
+    /* Are any VCPUs polling event channels (SCHEDOP_poll)? */
+    DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS);
 
     /* Guest has shut down (inc. reason code)? */
     spinlock_t       shutdown_lock;
@@ -507,6 +513,7 @@ static inline int vcpu_runnable(struct v
              atomic_read(&v->domain->pause_count));
 }
 
+void vcpu_unblock(struct vcpu *v);
 void vcpu_pause(struct vcpu *v);
 void vcpu_pause_nosync(struct vcpu *v);
 void domain_pause(struct domain *d);
@@ -517,17 +524,12 @@ void cpu_init(void);
 void cpu_init(void);
 
 void vcpu_force_reschedule(struct vcpu *v);
+void cpu_disable_scheduler(void);
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity);
 void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity);
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
-
-static inline void vcpu_unblock(struct vcpu *v)
-{
-    if ( test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
-        vcpu_wake(v);
-}
 
 #define IS_PRIV(_d) ((_d)->is_privileged)
 #define IS_PRIV_FOR(_d, _t) (IS_PRIV(_d) || ((_d)->target && (_d)->target == (_t)))
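
For illustration (a hypothetical helper, not part of the changeset): the
tri-state convention documented above reads as follows; the hypervisor
open-codes this test in evtchn_set_pending():

    static inline int vcpu_should_wake_for(const struct vcpu *v, int port)
    {
        /* <= 0 covers "polling several ports" (< 0) and the transient
         * window in do_poll() before poll_evtchn is narrowed to one port. */
        return (v->poll_evtchn <= 0) || (v->poll_evtchn == port);
    }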
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/trace.h
--- a/xen/include/xen/trace.h   Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/xen/trace.h   Fri Sep 12 14:47:40 2008 +0900
@@ -33,6 +33,8 @@ void init_trace_bufs(void);
 
 /* used to retrieve the physical address of the trace buffers */
 int tb_control(struct xen_sysctl_tbuf_op *tbc);
+
+int trace_will_trace_event(u32 event);
 
 void __trace_var(u32 event, int cycles, int extra, unsigned char *extra_data);
 
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xsm/xsm.h
--- a/xen/include/xsm/xsm.h     Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/include/xsm/xsm.h     Fri Sep 12 14:47:40 2008 +0900
@@ -64,16 +64,17 @@ struct xsm_operations {
     int (*getvcpucontext) (struct domain *d);
     int (*getvcpuinfo) (struct domain *d);
     int (*domain_settime) (struct domain *d);
+    int (*set_target) (struct domain *d, struct domain *e);
     int (*tbufcontrol) (void);
     int (*readconsole) (uint32_t clear);
     int (*sched_id) (void);
     int (*setdomainmaxmem) (struct domain *d);
     int (*setdomainhandle) (struct domain *d);
     int (*setdebugging) (struct domain *d);
-    int (*irq_permission) (struct domain *d, uint8_t pirq, uint8_t access);
-    int (*iomem_permission) (struct domain *d, unsigned long mfn, 
-                                                                uint8_t access);
     int (*perfcontrol) (void);
+    int (*debug_keys) (void);
+    int (*getcpuinfo) (void);
+    int (*availheap) (void);
 
     int (*evtchn_unbound) (struct domain *d, struct evtchn *chn, domid_t id2);
     int (*evtchn_interdomain) (struct domain *d1, struct evtchn *chn1,
@@ -106,13 +107,13 @@ struct xsm_operations {
 
     int (*kexec) (void);
     int (*schedop_shutdown) (struct domain *d1, struct domain *d2);
+    int (*add_range) (struct domain *d, char *name, unsigned long s, unsigned long e);
+    int (*remove_range) (struct domain *d, char *name, unsigned long s, unsigned long e);
 
     long (*__do_xsm_op) (XEN_GUEST_HANDLE(xsm_op_t) op);
 
 #ifdef CONFIG_X86
     int (*shadow_control) (struct domain *d, uint32_t op);
-    int (*ioport_permission) (struct domain *d, uint32_t ioport, 
-                                                                uint8_t access);
     int (*getpageframeinfo) (struct page_info *page);
     int (*getmemlist) (struct domain *d);
     int (*hypercall_init) (struct domain *d);
@@ -130,13 +131,26 @@ struct xsm_operations {
     int (*microcode) (void);
     int (*physinfo) (void);
     int (*platform_quirk) (uint32_t);
+    int (*firmware_info) (void);
+    int (*acpi_sleep) (void);
+    int (*change_freq) (void);
+    int (*getidletime) (void);
     int (*machine_memory_map) (void);
     int (*domain_memory_map) (struct domain *d);
-    int (*mmu_normal_update) (struct domain *d, intpte_t fpte);
+    int (*mmu_normal_update) (struct domain *d, struct domain *f, 
+                                                                intpte_t fpte);
     int (*mmu_machphys_update) (struct domain *d, unsigned long mfn);
-    int (*update_va_mapping) (struct domain *d, l1_pgentry_t pte);
+    int (*update_va_mapping) (struct domain *d, struct domain *f, 
+                                                            l1_pgentry_t pte);
     int (*add_to_physmap) (struct domain *d1, struct domain *d2);
     int (*remove_from_physmap) (struct domain *d1, struct domain *d2);
+    int (*sendtrigger) (struct domain *d);
+    int (*test_assign_device) (uint32_t machine_bdf);
+    int (*assign_device) (struct domain *d, uint32_t machine_bdf);
+    int (*deassign_device) (struct domain *d, uint32_t machine_bdf);
+    int (*bind_pt_irq) (struct domain *d, struct xen_domctl_bind_pt_irq *bind);
+    int (*pin_mem_cacheattr) (struct domain *d);
+    int (*ext_vcpucontext) (struct domain *d, uint32_t cmd);
 #endif
 };
 
@@ -215,6 +229,11 @@ static inline int xsm_domain_settime (st
     return xsm_call(domain_settime(d));
 }
 
+static inline int xsm_set_target (struct domain *d, struct domain *e)
+{
+    return xsm_call(set_target(d, e));
+}
+
 static inline int xsm_tbufcontrol (void)
 {
     return xsm_call(tbufcontrol());
@@ -245,21 +264,24 @@ static inline int xsm_setdebugging (stru
     return xsm_call(setdebugging(d));
 }
 
-static inline int xsm_irq_permission (struct domain *d, uint8_t pirq,
-                                                                uint8_t access)
-{
-    return xsm_call(irq_permission(d, pirq, access));
-} 
-
-static inline int xsm_iomem_permission (struct domain *d, unsigned long mfn,
-                                                                uint8_t access)
-{
-    return xsm_call(iomem_permission(d, mfn, access));
-}
-
 static inline int xsm_perfcontrol (void)
 {
     return xsm_call(perfcontrol());
+}
+
+static inline int xsm_debug_keys (void)
+{
+    return xsm_call(debug_keys());
+}
+
+static inline int xsm_availheap (void)
+{
+    return xsm_call(availheap());
+}
+
+static inline int xsm_getcpuinfo (void)
+{
+    return xsm_call(getcpuinfo());
 }
 
 static inline int xsm_evtchn_unbound (struct domain *d1, struct evtchn *chn,
@@ -385,6 +407,18 @@ static inline int xsm_schedop_shutdown (
 static inline int xsm_schedop_shutdown (struct domain *d1, struct domain *d2)
 {
     return xsm_call(schedop_shutdown(d1, d2));
+}
+
+static inline int xsm_add_range (struct domain *d, char *name, unsigned long s,
+                                                                        unsigned long e)
+{
+    return xsm_call(add_range(d, name, s, e));
+}
+ 
+static inline int xsm_remove_range (struct domain *d, char *name, unsigned long s,
+                                                                        unsigned long e)
+{
+    return xsm_call(remove_range(d, name, s, e));
 }
 
 static inline long __do_xsm_op (XEN_GUEST_HANDLE(xsm_op_t) op)
@@ -413,12 +447,6 @@ static inline int xsm_shadow_control (st
     return xsm_call(shadow_control(d, op));
 }
 
-static inline int xsm_ioport_permission (struct domain *d, uint32_t ioport,
-                                                                uint8_t access)
-{
-    return xsm_call(ioport_permission(d, ioport, access));
-}
-
 static inline int xsm_getpageframeinfo (struct page_info *page)
 {
     return xsm_call(getpageframeinfo(page));
@@ -504,6 +532,26 @@ static inline int xsm_platform_quirk (ui
     return xsm_call(platform_quirk(quirk));
 }
 
+static inline int xsm_firmware_info (void)
+{
+    return xsm_call(firmware_info());
+}
+
+static inline int xsm_acpi_sleep (void)
+{
+    return xsm_call(acpi_sleep());
+}
+
+static inline int xsm_change_freq (void)
+{
+    return xsm_call(change_freq());
+}
+
+static inline int xsm_getidletime (void)
+{
+    return xsm_call(getidletime());
+}
+
 static inline int xsm_machine_memory_map(void)
 {
     return xsm_call(machine_memory_map());
@@ -514,9 +562,10 @@ static inline int xsm_domain_memory_map(
     return xsm_call(domain_memory_map(d));
 }
 
-static inline int xsm_mmu_normal_update (struct domain *d, intpte_t fpte)
-{
-    return xsm_call(mmu_normal_update(d, fpte));
+static inline int xsm_mmu_normal_update (struct domain *d, struct domain *f, 
+                                                                intpte_t fpte)
+{
+    return xsm_call(mmu_normal_update(d, f, fpte));
 }
 
 static inline int xsm_mmu_machphys_update (struct domain *d, unsigned long mfn)
@@ -524,9 +573,10 @@ static inline int xsm_mmu_machphys_updat
     return xsm_call(mmu_machphys_update(d, mfn));
 }
 
-static inline int xsm_update_va_mapping(struct domain *d, l1_pgentry_t pte)
-{
-    return xsm_call(update_va_mapping(d, pte));
+static inline int xsm_update_va_mapping(struct domain *d, struct domain *f, 
+                                                            l1_pgentry_t pte)
+{
+    return xsm_call(update_va_mapping(d, f, pte));
 }
 
 static inline int xsm_add_to_physmap(struct domain *d1, struct domain *d2)
@@ -538,6 +588,42 @@ static inline int xsm_remove_from_physma
 {
     return xsm_call(remove_from_physmap(d1, d2));
 }
+
+static inline int xsm_sendtrigger(struct domain *d)
+{
+    return xsm_call(sendtrigger(d));
+}
+
+static inline int xsm_test_assign_device(uint32_t machine_bdf)
+{
+    return xsm_call(test_assign_device(machine_bdf));
+}
+
+static inline int xsm_assign_device(struct domain *d, uint32_t machine_bdf)
+{
+    return xsm_call(assign_device(d, machine_bdf));
+}
+
+static inline int xsm_deassign_device(struct domain *d, uint32_t machine_bdf)
+{
+    return xsm_call(deassign_device(d, machine_bdf));
+}
+
+static inline int xsm_bind_pt_irq(struct domain *d, 
+                                                struct xen_domctl_bind_pt_irq *bind)
+{
+    return xsm_call(bind_pt_irq(d, bind));
+}
+
+static inline int xsm_pin_mem_cacheattr(struct domain *d)
+{
+    return xsm_call(pin_mem_cacheattr(d));
+}
+
+static inline int xsm_ext_vcpucontext(struct domain *d, uint32_t cmd)
+{
+    return xsm_call(ext_vcpucontext(d, cmd));
+}
 #endif /* CONFIG_X86 */
 
 #endif /* __XSM_H */
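
For illustration (a hedged sketch for a hypothetical "frobnicate" hook, not
part of the changeset): every hook added above follows the same three-part
pattern -- an ops slot, an inline call-site wrapper, and a permissive dummy
registered in xen/xsm/dummy.c:

    /* 1) slot in struct xsm_operations: */
    int (*frobnicate) (struct domain *d);

    /* 2) call-site wrapper in xsm.h: */
    static inline int xsm_frobnicate (struct domain *d)
    {
        return xsm_call(frobnicate(d));
    }

    /* 3) default-allow implementation in dummy.c: */
    static int dummy_frobnicate (struct domain *d)
    {
        return 0;
    }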
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/xsm/dummy.c
--- a/xen/xsm/dummy.c   Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/xsm/dummy.c   Fri Sep 12 14:47:40 2008 +0900
@@ -84,6 +84,11 @@ static int dummy_domain_settime (struct 
     return 0;
 }
 
+static int dummy_set_target (struct domain *d, struct domain *e)
+{
+    return 0;
+}
+
 static int dummy_tbufcontrol (void)
 {
     return 0;
@@ -114,18 +119,22 @@ static int dummy_setdebugging (struct do
     return 0;
 }
 
-static int dummy_irq_permission (struct domain *d, uint8_t pirq, uint8_t access)
-{
-    return 0;
-}
-
-static int dummy_iomem_permission (struct domain *d, unsigned long mfn,
-                                                                uint8_t access)
-{
-    return 0;
-}
-
 static int dummy_perfcontrol (void)
+{
+    return 0;
+}
+
+static int dummy_debug_keys (void)
+{
+    return 0;
+}
+
+static int dummy_getcpuinfo (void)
+{
+    return 0;
+}
+
+static int dummy_availheap (void)
 {
     return 0;
 }
@@ -259,18 +268,23 @@ static long dummy___do_xsm_op(XEN_GUEST_
     return -ENOSYS;
 }
 
+static int dummy_add_range (struct domain *d, char *name, unsigned long s, unsigned long e)
+{
+    return 0;
+}
+
+static int dummy_remove_range (struct domain *d, char *name, unsigned long s, 
+                                                                        unsigned long e)
+{
+    return 0;
+}
+
 #ifdef CONFIG_X86
 static int dummy_shadow_control (struct domain *d, uint32_t op)
 {
     return 0;
 }
 
-static int dummy_ioport_permission (struct domain *d, uint32_t ioport, 
-                                                                uint8_t access)
-{
-    return 0;
-}
-
 static int dummy_getpageframeinfo (struct page_info *page)
 {
     return 0;
@@ -356,6 +370,26 @@ static int dummy_platform_quirk (uint32_
     return 0;
 }
 
+static int dummy_firmware_info (void)
+{
+    return 0;
+}
+
+static int dummy_acpi_sleep (void)
+{
+    return 0;
+}
+
+static int dummy_change_freq (void)
+{
+    return 0;
+}
+
+static int dummy_getidletime (void)
+{
+    return 0;
+}
+
 static int dummy_machine_memory_map (void)
 {
     return 0;
@@ -366,7 +400,8 @@ static int dummy_domain_memory_map (stru
     return 0;
 }
 
-static int dummy_mmu_normal_update (struct domain *d, intpte_t fpte)
+static int dummy_mmu_normal_update (struct domain *d, struct domain *f, 
+                                                                intpte_t fpte)
 {
     return 0;
 }
@@ -376,12 +411,48 @@ static int dummy_mmu_machphys_update (st
     return 0;
 }
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog