[Xen-changelog] [xen-unstable] merge with xen-unstable.hg
# HG changeset patch
# User Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
# Date 1221198460 -32400
# Node ID ec8eaab557d867dca3e8cbb3e0384d797929102a
# Parent  4ddd63b4be9be2440d213da60b10c20327e5c515
# Parent  346c073ed6a4f0debca36588039d649e2efd93c3
merge with xen-unstable.hg
---
 .hgignore                                              |    1
 Config.mk                                              |    4
 docs/misc/vtd.txt                                      |   27
 docs/src/user.tex                                      |    4
 stubdom/README                                         |    8
 tools/examples/init.d/xendomains                       |    6
 tools/examples/xend-config.sxp                         |    4
 tools/examples/xmexample.hvm                           |    2
 tools/examples/xmexample.hvm-stubdom                   |    2
 tools/flask/policy/Makefile                            |  234 +++++
 tools/flask/policy/Rules.modular                       |  166 +++
 tools/flask/policy/Rules.monolithic                    |  196 ++++
 tools/flask/policy/policy/constraints                  |   27
 tools/flask/policy/policy/flask/Makefile               |   41
 tools/flask/policy/policy/flask/access_vectors         |  166 +++
 tools/flask/policy/policy/flask/initial_sids           |   17
 tools/flask/policy/policy/flask/mkaccess_vector.sh     |  227 +++++
 tools/flask/policy/policy/flask/mkflask.sh             |   95 ++
 tools/flask/policy/policy/flask/security_classes       |   20
 tools/flask/policy/policy/global_booleans              |    5
 tools/flask/policy/policy/global_tunables              |    6
 tools/flask/policy/policy/mcs                          |  324 +++++++
 tools/flask/policy/policy/mls                          |  354 ++++++++
 tools/flask/policy/policy/modules.conf                 |   21
 tools/flask/policy/policy/modules/xen/xen.if           |    1
 tools/flask/policy/policy/modules/xen/xen.te           |  135 +++
 tools/flask/policy/policy/support/loadable_module.spt  |  166 +++
 tools/flask/policy/policy/support/misc_macros.spt      |   32
 tools/flask/policy/policy/systemuser                   |   19
 tools/flask/policy/policy/users                        |   39
 tools/ioemu/hw/cirrus_vga.c                            |    3
 tools/ioemu/hw/pass-through.c                          |  146 +++
 tools/ioemu/hw/pass-through.h                          |   15
 tools/ioemu/hw/pci.c                                   |    5
 tools/ioemu/hw/pt-msi.c                                |    2
 tools/ioemu/hw/vga.c                                   |    8
 tools/ioemu/hw/xen_machine_fv.c                        |    4
 tools/ioemu/vl.h                                       |    2
 tools/libxc/ia64/xc_ia64_linux_save.c                  |    6
 tools/libxc/xc_domain_save.c                           |   65 -
 tools/libxc/xc_evtchn.c                                |   15
 tools/libxc/xc_private.c                               |   10
 tools/libxc/xenctrl.h                                  |    6
 tools/libxc/xenguest.h                                 |    2
 tools/python/Makefile                                  |   26
 tools/python/xen/util/xsconstants.py                   |    6
 tools/python/xen/util/xsm/flask/flask.py               |    8
 tools/python/xen/util/xsm/xsm.py                       |   20
 tools/python/xen/xend/XendConfig.py                    |    2
 tools/python/xen/xend/XendDomainInfo.py                |    6
 tools/python/xen/xend/XendOptions.py                   |    8
 tools/python/xen/xend/server/blkif.py                  |    2
 tools/python/xen/xend/server/netif.py                  |    2
 tools/python/xen/xend/server/pciif.py                  |    2
 tools/python/xen/xm/create.py                          |    6
 tools/python/xen/xm/main.py                            |    2
 tools/xcutils/lsevtchn.c                               |   48 -
 tools/xcutils/xc_save.c                                |  117 +-
 tools/xenstore/xs.c                                    |    7
 tools/xentrace/formats                                 |  149 ++-
 tools/xentrace/xentrace.c                              |  399 ++++
 xen/arch/x86/acpi/Makefile                             |    2
 xen/arch/x86/acpi/cpu_idle.c                           |  434 -------
 xen/arch/x86/acpi/cpufreq/cpufreq.c                    |   26
 xen/arch/x86/acpi/cpufreq/powernow.c                   |    4
 xen/arch/x86/acpi/cpuidle_menu.c                       |  132 +++
 xen/arch/x86/domain.c                                  |   24
 xen/arch/x86/domain_build.c                            |    1
 xen/arch/x86/domctl.c                                  |   47 -
 xen/arch/x86/hpet.c                                    |   30
 xen/arch/x86/hvm/hvm.c                                 |    5
 xen/arch/x86/hvm/svm/intr.c                            |    4
 xen/arch/x86/hvm/svm/svm.c                             |   36
 xen/arch/x86/hvm/vmx/intr.c                            |    2
 xen/arch/x86/hvm/vmx/vmx.c                             |   49 -
 xen/arch/x86/io_apic.c                                 |   13
 xen/arch/x86/irq.c                                     |   23
 xen/arch/x86/mm.c                                      |  783 +++++++++++-------
 xen/arch/x86/mm/hap/hap.c                              |    1
 xen/arch/x86/mm/shadow/common.c                        |   71 +
 xen/arch/x86/mm/shadow/multi.c                         |  210 ++++
 xen/arch/x86/mm/shadow/private.h                       |   43
 xen/arch/x86/physdev.c                                 |   80 -
 xen/arch/x86/platform_hypercall.c                      |   16
 xen/arch/x86/smpboot.c                                 |   40
 xen/arch/x86/time.c                                    |    7
 xen/arch/x86/traps.c                                   |   45 +
 xen/common/domain.c                                    |    4
 xen/common/domctl.c                                    |   19
 xen/common/event_channel.c                             |   21
 xen/common/rangeset.c                                  |    9
 xen/common/sched_credit.c                              |    5
 xen/common/schedule.c                                  |  123 ++
 xen/common/sysctl.c                                    |   12
 xen/common/trace.c                                     |   45 -
 xen/drivers/acpi/hwregs.c                              |    2
 xen/drivers/passthrough/iommu.c                        |    4
 xen/drivers/passthrough/vtd/iommu.c                    |   22
 xen/include/asm-ia64/shadow.h                          |    2
 xen/include/asm-x86/bitops.h                           |    4
 xen/include/asm-x86/guest_access.h                     |    6
 xen/include/asm-x86/hvm/trace.h                        |   49 -
 xen/include/asm-x86/io_apic.h                          |    2
 xen/include/asm-x86/mm.h                               |   38
 xen/include/asm-x86/msr-index.h                        |   12
 xen/include/asm-x86/shadow.h                           |    2
 xen/include/public/trace.h                             |   51 -
 xen/include/xen/cpuidle.h                              |   82 +
 xen/include/xen/iommu.h                                |    1
 xen/include/xen/sched.h                                |   22
 xen/include/xen/trace.h                                |    2
 xen/include/xsm/xsm.h                                  |  148 ++-
 xen/xsm/dummy.c                                        |  130 ++
 xen/xsm/flask/hooks.c                                  |  318 ++++++-
 xen/xsm/flask/include/av_perm_to_string.h              |   21
 xen/xsm/flask/include/av_permissions.h                 |   63 -
 xen/xsm/flask/include/flask.h                          |   11
 xen/xsm/flask/include/initial_sid_to_string.h          |    3
 xen/xsm/flask/include/security.h                       |    6
 xen/xsm/flask/ss/policydb.h                            |   13
 xen/xsm/flask/ss/services.c                            |   40
 121 files changed, 5439 insertions(+), 1429 deletions(-)

diff -r 4ddd63b4be9b -r ec8eaab557d8 .hgignore
--- a/.hgignore	Fri Sep 12 14:32:45 2008 +0900
+++ b/.hgignore	Fri Sep 12 14:47:40 2008 +0900
@@ -185,7 +185,6 @@
 ^tools/misc/xenperf$
 ^tools/pygrub/build/.*$
 ^tools/python/build/.*$
-^tools/python/xen/util/xsm/xsm\.py$
 ^tools/security/secpol_tool$
 ^tools/security/xen/.*$
 ^tools/security/xensec_tool$
diff -r 4ddd63b4be9b -r ec8eaab557d8 Config.mk
--- a/Config.mk	Fri Sep 12 14:32:45 2008 +0900
+++ b/Config.mk	Fri Sep 12 14:47:40 2008 +0900
@@ -86,11 +86,7 @@ QEMU_REMOTE=http://xenbits.xensource.com
 # Mercurial in-tree version, or a local directory, or a git URL.
 # CONFIG_QEMU ?= ioemu
 # CONFIG_QEMU ?= ../qemu-xen.git
-ifeq ($(XEN_TARGET_ARCH),ia64)
-CONFIG_QEMU ?= ioemu
-else
 CONFIG_QEMU ?= $(QEMU_REMOTE)
-endif
 
 # Optional components
 XENSTAT_XENTOP ?= y
diff -r 4ddd63b4be9b -r ec8eaab557d8 docs/misc/vtd.txt
--- a/docs/misc/vtd.txt	Fri Sep 12 14:32:45 2008 +0900
+++ b/docs/misc/vtd.txt	Fri Sep 12 14:47:40 2008 +0900
@@ -1,8 +1,9 @@ Title   : How to do PCI Passthrough with
 Title   : How to do PCI Passthrough with VT-d
 Authors : Allen Kay    <allen.m.kay@xxxxxxxxx>
           Weidong Han  <weidong.han@xxxxxxxxx>
+          Yuji Shimada <shimada-yxb@xxxxxxxxxxxxxxx>
 Created : October-24-2007
-Updated : August-06-2008
+Updated : September-09-2008
 
 How to turn on VT-d in Xen
 --------------------------
@@ -106,3 +107,27 @@ http://h10010.www1.hp.com/wwpc/us/en/en/
 
 For more information, pls refer to
 http://wiki.xensource.com/xenwiki/VTdHowTo.
+
+Assigning devices to HVM domains
+--------------------------------
+
+Most device types such as NIC, HBA, EHCI and UHCI can be assigned to
+an HVM domain.
+
+But some devices have design features which make them unsuitable for
+assignment to an HVM domain. Examples include:
+
+ * Device has an internal resource, such as private memory, which is
+   mapped to memory address space with BAR (Base Address Register).
+ * Driver submits command with a pointer to a buffer within internal
+   resource. Device decodes the pointer (address), and accesses to the
+   buffer.
+
+In an HVM domain, the BAR is virtualized, and host-BAR value and
+guest-BAR value are different. The addresses of internal resource from
+device's view and driver's view are different. Similarly, the
+addresses of buffer within internal resource from device's view and
+driver's view are different. As a result, device can't access to the
+buffer specified by driver.
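
To make the mismatch just described concrete, here is a self-contained C
sketch; the BAR values and device-memory size are purely hypothetical and
are not part of the patch:

    /* Hypothetical illustration of the host-BAR/guest-BAR mismatch. */
    #include <stdint.h>
    #include <stdio.h>

    #define HOST_BAR   0xf0000000u /* where device memory really lives   */
    #define GUEST_BAR  0xc0000000u /* virtualized BAR the HVM guest sees */
    #define DEV_MEM_SZ 0x10000u    /* assumed size of the device memory  */

    int main(void)
    {
        uint32_t offset = 0x1000;                /* buffer in device memory */
        uint32_t guest_ptr = GUEST_BAR + offset; /* pointer the guest driver
                                                  * writes into a command   */

        /* The device decodes the pointer against its real (host) BAR, so
         * the guest-relative address misses the device's own resource:   */
        if (guest_ptr < HOST_BAR || guest_ptr >= HOST_BAR + DEV_MEM_SZ)
            printf("0x%08x does not map into device memory\n",
                   (unsigned)guest_ptr);
        return 0;
    }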
+ +Such devices assigned to HVM domain currently do not work. diff -r 4ddd63b4be9b -r ec8eaab557d8 docs/src/user.tex --- a/docs/src/user.tex Fri Sep 12 14:32:45 2008 +0900 +++ b/docs/src/user.tex Fri Sep 12 14:47:40 2008 +0900 @@ -4252,7 +4252,7 @@ directory of the Xen source distribution \section{Online References} The official Xen web site can be found at: -\begin{quote} {\tt http://www.xensource.com} +\begin{quote} {\tt http://www.xen.org} \end{quote} This contains links to the latest versions of all online @@ -4282,7 +4282,7 @@ mailing lists and subscription informati Subscribe at: \\ {\small {\tt http://lists.xensource.com/xen-announce}} \item[xen-changelog@xxxxxxxxxxxxxxxxxxx] Changelog feed - from the unstable and 2.0 trees - developer oriented. Subscribe at: \\ + from the unstable and 3.x trees - developer oriented. Subscribe at: \\ {\small {\tt http://lists.xensource.com/xen-changelog}} \end{description} diff -r 4ddd63b4be9b -r ec8eaab557d8 stubdom/README --- a/stubdom/README Fri Sep 12 14:32:45 2008 +0900 +++ b/stubdom/README Fri Sep 12 14:47:40 2008 +0900 @@ -27,7 +27,7 @@ device_model = '/usr/lib/xen/bin/stubdom - disable anything related to dom0, like pty serial assignments -Create /etc/xen/stubdom-hvmconfig (where "hvmconfig" is the name of your HVM +Create /etc/xen/hvmconfig-dm (where "hvmconfig" is the name of your HVM guest) with kernel = "/usr/lib/xen/boot/ioemu-stubdom.gz" @@ -52,7 +52,7 @@ vnc = 0 vnc = 0 sdl = 0 - - In stubdom-hvmconfig, set an sdl vfb: + - In hvmconfig-dm, set an sdl vfb: vfb = [ 'type=sdl' ] @@ -65,7 +65,7 @@ vnc = 1 vnc = 1 vnclisten = "172.30.206.1" - - In stubdom-hvmconfig, fill the reserved vif with the same IP, for instance: + - In hvmconfig-dm, fill the reserved vif with the same IP, for instance: vif = [ 'ip=172.30.206.1', 'ip=10.0.1.1,mac=aa:00:00:12:23:34'] @@ -76,7 +76,7 @@ vnc = 0 vnc = 0 sdl = 0 - - In stubdom-hvmconfig, set a vnc vfb: + - In hvmconfig-dm, set a vnc vfb: vfb = [ 'type=vnc' ] diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/init.d/xendomains --- a/tools/examples/init.d/xendomains Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/examples/init.d/xendomains Fri Sep 12 14:47:40 2008 +0900 @@ -327,15 +327,17 @@ stop() if test $id = 0; then continue; fi echo -n " $name" if test "$XENDOMAINS_AUTO_ONLY" = "true"; then - case $name in + eval " + case \"\$name\" in ($NAMES) # nothing ;; (*) - echo -n "(skip)" + echo -n '(skip)' continue ;; esac + " fi # XENDOMAINS_SYSRQ chould be something like just "s" # or "s e i u" or even "s e s i u o" diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/xend-config.sxp --- a/tools/examples/xend-config.sxp Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/examples/xend-config.sxp Fri Sep 12 14:47:40 2008 +0900 @@ -14,6 +14,10 @@ #(logfile /var/log/xen/xend.log) #(loglevel DEBUG) +# Uncomment the line below. Set the value to flask, acm, or dummy to +# select a security module. + +#(xsm_module_name dummy) # The Xen-API server configuration. # diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/xmexample.hvm --- a/tools/examples/xmexample.hvm Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/examples/xmexample.hvm Fri Sep 12 14:47:40 2008 +0900 @@ -220,7 +220,7 @@ serial='pty' # Configure guest CPUID responses: # #cpuid=[ '1:ecx=xxxxxxxxxxx00xxxxxxxxxxxxxxxxxxx, -# eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ] +# eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ] # - Unset the SSE4 features (CPUID.1[ECX][20-19]) # - Default behaviour for all other bits in ECX And EAX registers. 
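
A note on the tools/examples/init.d/xendomains hunk above: the case
statement is wrapped in eval so that the '|' separators in the expanded
$NAMES pattern list act as real case alternation rather than literal
characters. A standalone sketch of the idiom, with an assumed NAMES value
(the init script builds the real list elsewhere):

    #!/bin/sh
    NAMES='web01|db01'            # assumed '|'-separated pattern list
    for name in web01 db01 other; do
        eval "
        case \"\$name\" in
            ($NAMES) echo \"\$name: managed\" ;;
            (*)      echo \"\$name: (skip)\" ;;
        esac
        "
    done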
# diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/examples/xmexample.hvm-stubdom --- a/tools/examples/xmexample.hvm-stubdom Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/examples/xmexample.hvm-stubdom Fri Sep 12 14:47:40 2008 +0900 @@ -236,7 +236,7 @@ stdvga=0 # Configure guest CPUID responses: # #cpuid=[ '1:ecx=xxxxxxxxxxx00xxxxxxxxxxxxxxxxxxx, -# eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ] +# eax=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ] # - Unset the SSE4 features (CPUID.1[ECX][20-19]) # - Default behaviour for all other bits in ECX And EAX registers. # diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/Makefile Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,234 @@ +# +# Makefile for the security policy. +# +# Targets: +# +# install - compile and install the policy configuration, and context files. +# load - compile, install, and load the policy configuration. +# reload - compile, install, and load/reload the policy configuration. +# policy - compile the policy configuration locally for testing/development. +# +# The default target is 'policy'. +# + +######################################## +# +# Configurable portions of the Makefile +# + +# Policy version +# By default, checkpolicy will create the highest +# version policy it supports. Setting this will +# override the version. +OUTPUT_POLICY = 20 + +# Policy Type +# strict, targeted, +# strict-mls, targeted-mls, +# strict-mcs, targeted-mcs +TYPE = strict + +# Policy Name +# If set, this will be used as the policy +# name. Otherwise the policy type will be +# used for the name. +NAME = xenrefpolicy + +# Distribution +# Some distributions have portions of policy +# for programs or configurations specific to the +# distribution. Setting this will enable options +# for the distribution. +# redhat, gentoo, debian, and suse are current options. +# Fedora users should enable redhat. +#DISTRO = + +# Build monolithic policy. Putting n here +# will build a loadable module policy. +MONOLITHIC=y + +# Uncomment this to disable command echoing +#QUIET:=@ + +######################################## +# +# NO OPTIONS BELOW HERE +# + +# executable paths +PREFIX := /usr +BINDIR := $(PREFIX)/bin +SBINDIR := $(PREFIX)/sbin +CHECKPOLICY := $(BINDIR)/checkpolicy +CHECKMODULE := $(BINDIR)/checkmodule +SEMOD_PKG := $(BINDIR)/semodule_package +LOADPOLICY := $(SBINDIR)/flask-loadpolicy + +CFLAGS := -Wall + +# policy source layout +POLDIR := policy +MODDIR := $(POLDIR)/modules +FLASKDIR := $(POLDIR)/flask +SECCLASS := $(FLASKDIR)/security_classes +ISIDS := $(FLASKDIR)/initial_sids +AVS := $(FLASKDIR)/access_vectors + +#policy building support tools +SUPPORT := support +FCSORT := tmp/fc_sort + +# config file paths +GLOBALTUN := $(POLDIR)/global_tunables +GLOBALBOOL := $(POLDIR)/global_booleans +MOD_CONF := $(POLDIR)/modules.conf +TUNABLES := $(POLDIR)/tunables.conf +BOOLEANS := $(POLDIR)/booleans.conf + +# install paths +TOPDIR = $(DESTDIR)/etc/xen/ +INSTALLDIR = $(TOPDIR)/$(NAME) +SRCPATH = $(INSTALLDIR)/src +USERPATH = $(INSTALLDIR)/users +CONTEXTPATH = $(INSTALLDIR)/contexts + +# enable MLS if requested. +ifneq ($(findstring -mls,$(TYPE)),) + override M4PARAM += -D enable_mls + CHECKPOLICY += -M + CHECKMODULE += -M +endif + +# enable MLS if MCS requested. +ifneq ($(findstring -mcs,$(TYPE)),) + override M4PARAM += -D enable_mcs + CHECKPOLICY += -M + CHECKMODULE += -M +endif + +# compile targeted policy if requested. 
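
Taking the configuration block above together with the targets documented
at the top of this Makefile, a typical invocation looks like this (a
sketch; it assumes checkpolicy and the FLASK userland tools are installed):

    # build the default strict, monolithic policy locally
    make -C tools/flask/policy

    # build an MCS variant, then compile, install and load it
    make -C tools/flask/policy TYPE=strict-mcs load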
+ifneq ($(findstring targeted,$(TYPE)),) + override M4PARAM += -D targeted_policy +endif + +# enable distribution-specific policy +ifneq ($(DISTRO),) + override M4PARAM += -D distro_$(DISTRO) +endif + +ifneq ($(OUTPUT_POLICY),) + CHECKPOLICY += -c $(OUTPUT_POLICY) +endif + +ifeq ($(NAME),) + NAME := $(TYPE) +endif + +# determine the policy version and current kernel version if possible +PV := $(shell $(CHECKPOLICY) -V |cut -f 1 -d ' ') +KV := $(shell cat /selinux/policyvers) + +# dont print version warnings if we are unable to determine +# the currently running kernel's policy version +ifeq ($(KV),) + KV := $(PV) +endif + +FC := file_contexts +POLVER := policy.$(PV) + +M4SUPPORT = $(wildcard $(POLDIR)/support/*.spt) + +APPCONF := config/appconfig-$(TYPE) +APPDIR := $(CONTEXTPATH) +APPFILES := $(INSTALLDIR)/booleans +CONTEXTFILES += $(wildcard $(APPCONF)/*_context*) $(APPCONF)/media +USER_FILES := $(POLDIR)/systemuser $(POLDIR)/users + +ALL_LAYERS := $(filter-out $(MODDIR)/CVS,$(shell find $(wildcard $(MODDIR)/*) -maxdepth 0 -type d)) + +GENERATED_TE := $(basename $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.te.in))) +GENERATED_IF := $(basename $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.if.in))) +GENERATED_FC := $(basename $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.fc.in))) + +# sort here since it removes duplicates, which can happen +# when a generated file is already generated +DETECTED_MODS := $(sort $(foreach dir,$(ALL_LAYERS),$(wildcard $(dir)/*.te)) $(GENERATED_TE)) + +# modules.conf setting for base module +MODBASE := base + +# modules.conf setting for module +MODMOD := module + +# extract settings from modules.conf +BASE_MODS := $(foreach mod,$(shell awk '/^[[:blank:]]*[[:alpha:]]/{ if ($$3 == "$(MODBASE)") print $$1 }' $(MOD_CONF) 2> /dev/null),$(subst ./,,$(shell find -iname $(mod).te))) +MOD_MODS := $(foreach mod,$(shell awk '/^[[:blank:]]*[[:alpha:]]/{ if ($$3 == "$(MODMOD)") print $$1 }' $(MOD_CONF) 2> /dev/null),$(subst ./,,$(shell find -iname $(mod).te))) + +HOMEDIR_TEMPLATE = tmp/homedir_template + +######################################## +# +# Load appropriate rules +# + +ifeq ($(MONOLITHIC),y) + include Rules.monolithic +else + include Rules.modular +endif + +######################################## +# +# Create config files +# +conf: $(MOD_CONF) $(BOOLEANS) $(GENERATED_TE) $(GENERATED_IF) $(GENERATED_FC) + +$(MOD_CONF) $(BOOLEANS): $(POLXML) + @echo "Updating $(MOD_CONF) and $(BOOLEANS)" + $(QUIET) cd $(DOCS) && ../$(GENDOC) -t ../$(BOOLEANS) -m ../$(MOD_CONF) -x ../$(POLXML) + +######################################## +# +# Appconfig files +# +install-appconfig: $(APPFILES) + +$(INSTALLDIR)/booleans: $(BOOLEANS) + @mkdir -p $(INSTALLDIR) + $(QUIET) egrep '^[[:blank:]]*[[:alpha:]]' $(BOOLEANS) \ + | sed -e 's/false/0/g' -e 's/true/1/g' > tmp/booleans + $(QUIET) install -m 644 tmp/booleans $@ + +######################################## +# +# Install policy sources +# +install-src: + rm -rf $(SRCPATH)/policy.old + -mv $(SRCPATH)/policy $(SRCPATH)/policy.old + mkdir -p $(SRCPATH)/policy + cp -R . 
$(SRCPATH)/policy + +######################################## +# +# Clean everything +# +bare: clean + rm -f $(POLXML) + rm -f $(SUPPORT)/*.pyc + rm -f $(FCSORT) + rm -f $(MOD_CONF) + rm -f $(BOOLEANS) + rm -fR $(HTMLDIR) +ifneq ($(GENERATED_TE),) + rm -f $(GENERATED_TE) +endif +ifneq ($(GENERATED_IF),) + rm -f $(GENERATED_IF) +endif +ifneq ($(GENERATED_FC),) + rm -f $(GENERATED_FC) +endif + +.PHONY: install-src install-appconfig conf html bare diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/Rules.modular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/Rules.modular Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,166 @@ +######################################## +# +# Rules and Targets for building modular policies +# + +ALL_MODULES := $(filter $(BASE_MODS) $(MOD_MODS),$(DETECTED_MODS)) +ALL_INTERFACES := $(ALL_MODULES:.te=.if) + +BASE_PKG := base.pp +BASE_FC := base.fc + +BASE_SECTIONS := tmp/pre_te_files.conf tmp/generated_definitions.conf tmp/all_interfaces.conf tmp/all_attrs_types.conf $(GLOBALBOOL) $(GLOBALTUN) tmp/only_te_rules.conf tmp/all_post.conf + +BASE_PRE_TE_FILES := $(SECCLASS) $(ISIDS) $(AVS) $(M4SUPPORT) $(POLDIR)/mls $(POLDIR)/mcs +BASE_TE_FILES := $(BASE_MODS) +BASE_POST_TE_FILES := $(POLDIR)/systemuser $(POLDIR)/constraints +BASE_FC_FILES := $(BASE_MODS:.te=.fc) + +MOD_MODULES := $(MOD_MODS:.te=.mod) +MOD_PKGS := $(notdir $(MOD_MODS:.te=.pp)) + +# search layer dirs for source files +vpath %.te $(ALL_LAYERS) +vpath %.if $(ALL_LAYERS) +vpath %.fc $(ALL_LAYERS) + +######################################## +# +# default action: create all module packages +# +default: base + +base: $(BASE_PKG) + +modules: $(MOD_PKGS) + +#policy: $(POLVER) +#install: $(LOADPATH) $(FCPATH) $(APPFILES) $(USERPATH)/local.users +#load: tmp/load + +######################################## +# +# Create a base module package +# +$(BASE_PKG): tmp/base.mod $(BASE_FC) + @echo "Creating $(NAME) base module package" + $(QUIET) $(SEMOD_PKG) $@ $^ + +######################################## +# +# Compile a base module +# +tmp/base.mod: base.conf + @echo "Compiling $(NAME) base module" + $(QUIET) $(CHECKMODULE) $^ -o $@ + +######################################## +# +# Construct a base module policy.conf +# +base.conf: $(BASE_SECTIONS) + @echo "Creating $(NAME) base module policy.conf" +# checkpolicy can use the #line directives provided by -s for error reporting: + $(QUIET) m4 -D self_contained_policy $(M4PARAM) -s $^ > tmp/$@.tmp + $(QUIET) sed -e /^portcon/d -e /^nodecon/d -e /^netifcon/d < tmp/$@.tmp > $@ +# the ordering of these ocontexts matters: + $(QUIET) grep ^portcon tmp/$@.tmp >> $@ || true + $(QUIET) grep ^netifcon tmp/$@.tmp >> $@ || true + $(QUIET) grep ^nodecon tmp/$@.tmp >> $@ || true + +tmp/pre_te_files.conf: $(BASE_PRE_TE_FILES) + @test -d tmp || mkdir -p tmp + $(QUIET) cat $^ > $@ + +tmp/generated_definitions.conf: $(ALL_LAYERS) $(BASE_TE_FILES) + @test -d tmp || mkdir -p tmp +# define all available object classes + $(QUIET) $(GENPERM) $(AVS) $(SECCLASS) > $@ +# per-userdomain templates + $(QUIET) echo "define(\`per_userdomain_templates',\`" >> $@ + $(QUIET) for i in $(patsubst %.te,%,$(notdir $(ALL_MODULES))); do \ + echo "ifdef(\`""$$i""_per_userdomain_template',\`""$$i""_per_userdomain_template("'$$*'")')" \ + >> $@ ;\ + done + $(QUIET) echo "')" >> $@ +# define foo.te + $(QUIET) for i in $(notdir $(BASE_TE_FILES)); do \ + echo "define(\`$$i')" >> $@ ;\ + done + $(QUIET) $(SETTUN) $(BOOLEANS) >> $@ + +tmp/all_interfaces.conf: $(M4SUPPORT) $(ALL_INTERFACES) +ifeq 
($(ALL_INTERFACES),) + $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf") +endif + @test -d tmp || mkdir -p tmp + $(QUIET) m4 $^ | sed -e s/dollarsstar/\$$\*/g > $@ + +tmp/all_te_files.conf: $(BASE_TE_FILES) +ifeq ($(BASE_TE_FILES),) + $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf") +endif + @test -d tmp || mkdir -p tmp + $(QUIET) cat $^ > $@ + +tmp/post_te_files.conf: $(BASE_POST_TE_FILES) + @test -d tmp || mkdir -p tmp + $(QUIET) cat $^ > $@ + +# extract attributes and put them first. extract post te stuff +# like genfscon and put last. portcon, nodecon, and netifcon +# is delayed since they are generated by m4 +tmp/all_attrs_types.conf tmp/only_te_rules.conf tmp/all_post.conf: tmp/all_te_files.conf tmp/post_te_files.conf + $(QUIET) grep ^attribute tmp/all_te_files.conf > tmp/all_attrs_types.conf || true + $(QUIET) grep '^type ' tmp/all_te_files.conf >> tmp/all_attrs_types.conf + $(QUIET) cat tmp/post_te_files.conf > tmp/all_post.conf + $(QUIET) grep '^sid ' tmp/all_te_files.conf >> tmp/all_post.conf || true + $(QUIET) egrep '^fs_use_(xattr|task|trans)' tmp/all_te_files.conf >> tmp/all_post.conf || true + $(QUIET) grep ^genfscon tmp/all_te_files.conf >> tmp/all_post.conf || true + $(QUIET) sed -r -e /^attribute/d -e '/^type /d' -e /^genfscon/d \ + -e '/^sid /d' -e '/^fs_use_(xattr|task|trans)/d' \ + < tmp/all_te_files.conf > tmp/only_te_rules.conf + +######################################## +# +# Construct base module file contexts +# +$(BASE_FC): $(M4SUPPORT) tmp/generated_definitions.conf $(BASE_FC_FILES) $(FCSORT) +ifeq ($(BASE_FC_FILES),) + $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf") +endif + @echo "Creating $(NAME) base module file contexts." + @test -d tmp || mkdir -p tmp + $(QUIET) m4 $(M4PARAM) $(M4SUPPORT) tmp/generated_definitions.conf $(BASE_FC_FILES) > tmp/$@.tmp + $(QUIET) grep -e HOME -e ROLE tmp/$@.tmp > $(HOMEDIR_TEMPLATE) + $(QUIET) sed -i -e /HOME/d -e /ROLE/d tmp/$@.tmp + $(QUIET) $(FCSORT) tmp/$@.tmp $@ + +######################################## +# +# Build module packages +# +tmp/%.mod: $(M4SUPPORT) tmp/generated_definitions.conf tmp/all_interfaces.conf %.te + @if test -z "$(filter $^,$(MOD_MODS))"; then \ + echo "The $(notdir $(basename $@)) module is not configured to be compiled as a lodable module." 
;\ + false ;\ + fi + @echo "Compliling $(NAME) $(@F) module" + $(QUIET) m4 $(M4PARAM) -s $^ > $(@:.mod=.tmp) + $(QUIET) $(CHECKMODULE) -m $(@:.mod=.tmp) -o $@ + +%.pp: tmp/%.mod %.fc + @echo "Creating $(NAME) $(@F) policy package" + $(QUIET) $(SEMOD_PKG) $@ $^ + +######################################## +# +# Clean the sources +# +clean: + rm -fR tmp + rm -f base.conf + rm -f *.pp + rm -f $(BASE_FC) + +.PHONY: default base modules clean diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/Rules.monolithic --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/Rules.monolithic Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,196 @@ +######################################## +# +# Rules and Targets for building monolithic policies +# + +# install paths +POLICYPATH = $(INSTALLDIR)/policy +LOADPATH = $(POLICYPATH)/$(POLVER) +FCPATH = $(CONTEXTPATH)/files/file_contexts +HOMEDIRPATH = $(CONTEXTPATH)/files/homedir_template + +# for monolithic policy use all base and module to create policy +ENABLEMOD := $(BASE_MODS) $(MOD_MODS) + +ALL_MODULES := $(filter $(ENABLEMOD),$(DETECTED_MODS)) + +ALL_INTERFACES := $(ALL_MODULES:.te=.if) +ALL_TE_FILES := $(ALL_MODULES) +ALL_FC_FILES := $(ALL_MODULES:.te=.fc) + +PRE_TE_FILES := $(SECCLASS) $(ISIDS) $(AVS) $(M4SUPPORT) $(POLDIR)/mls $(POLDIR)/mcs +POST_TE_FILES := $(POLDIR)/systemuser $(POLDIR)/users $(POLDIR)/constraints + +POLICY_SECTIONS := tmp/pre_te_files.conf tmp/generated_definitions.conf tmp/all_interfaces.conf tmp/all_attrs_types.conf $(GLOBALBOOL) $(GLOBALTUN) tmp/only_te_rules.conf tmp/all_post.conf + +######################################## +# +# default action: build policy locally +# +default: policy + +policy: $(POLVER) + +install: $(LOADPATH) $(FCPATH) $(APPFILES) $(USERPATH)/local.users + +load: tmp/load + +######################################## +# +# Build a binary policy locally +# +$(POLVER): policy.conf + @echo "Compiling $(NAME) $(POLVER)" +ifneq ($(PV),$(KV)) + @echo + @echo "WARNING: Policy version mismatch! Is your OUTPUT_POLICY set correctly?" + @echo +endif + $(QUIET) $(CHECKPOLICY) $^ -o $@ + +######################################## +# +# Install a binary policy +# +$(LOADPATH): policy.conf + @mkdir -p $(POLICYPATH) + @echo "Compiling and installing $(NAME) $(LOADPATH)" +ifneq ($(PV),$(KV)) + @echo + @echo "WARNING: Policy version mismatch! Is your OUTPUT_POLICY set correctly?" 
+ @echo +endif + $(QUIET) $(CHECKPOLICY) $^ -o $@ + +######################################## +# +# Load the binary policy +# +reload tmp/load: $(LOADPATH) $(FCPATH) + @echo "Loading $(NAME) $(LOADPATH)" + $(QUIET) $(LOADPOLICY) -q $(LOADPATH) + @touch tmp/load + +######################################## +# +# Construct a monolithic policy.conf +# +policy.conf: $(POLICY_SECTIONS) + @echo "Creating $(NAME) policy.conf" +# checkpolicy can use the #line directives provided by -s for error reporting: + $(QUIET) m4 -D self_contained_policy $(M4PARAM) -s $^ > tmp/$@.tmp + $(QUIET) sed -e /^portcon/d -e /^nodecon/d -e /^netifcon/d < tmp/$@.tmp > $@ + +tmp/pre_te_files.conf: $(PRE_TE_FILES) + @test -d tmp || mkdir -p tmp + $(QUIET) cat $^ > $@ + +tmp/generated_definitions.conf: $(ALL_LAYERS) $(ALL_TE_FILES) +# per-userdomain templates: + @test -d tmp || mkdir -p tmp + $(QUIET) echo "define(\`per_userdomain_templates',\`" > $@ + $(QUIET) for i in $(patsubst %.te,%,$(notdir $(ALL_MODULES))); do \ + echo "ifdef(\`""$$i""_per_userdomain_template',\`""$$i""_per_userdomain_template("'$$*'")')" \ + >> $@ ;\ + done + $(QUIET) echo "')" >> $@ +# define foo.te + $(QUIET) for i in $(notdir $(ALL_MODULES)); do \ + echo "define(\`$$i')" >> $@ ;\ + done +# $(QUIET) $(SETTUN) $(BOOLEANS) >> $@ + +tmp/all_interfaces.conf: $(M4SUPPORT) $(ALL_INTERFACES) +ifeq ($(ALL_INTERFACES),) + $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf") +endif + @test -d tmp || mkdir -p tmp + $(QUIET) m4 $^ | sed -e s/dollarsstar/\$$\*/g > $@ + +tmp/all_te_files.conf: $(ALL_TE_FILES) +ifeq ($(ALL_TE_FILES),) + $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf") +endif + @test -d tmp || mkdir -p tmp + $(QUIET) cat $^ > $@ + +tmp/post_te_files.conf: $(POST_TE_FILES) + @test -d tmp || mkdir -p tmp + $(QUIET) cat $^ > $@ + +# extract attributes and put them first. extract post te stuff +# like genfscon and put last. portcon, nodecon, and netifcon +# is delayed since they are generated by m4 +tmp/all_attrs_types.conf tmp/only_te_rules.conf tmp/all_post.conf: tmp/all_te_files.conf tmp/post_te_files.conf + $(QUIET) grep ^attribute tmp/all_te_files.conf > tmp/all_attrs_types.conf || true + $(QUIET) grep '^type ' tmp/all_te_files.conf >> tmp/all_attrs_types.conf + $(QUIET) cat tmp/post_te_files.conf > tmp/all_post.conf + $(QUIET) grep '^sid ' tmp/all_te_files.conf >> tmp/all_post.conf || true + $(QUIET) egrep '^fs_use_(xattr|task|trans)' tmp/all_te_files.conf >> tmp/all_post.conf || true + $(QUIET) grep ^genfscon tmp/all_te_files.conf >> tmp/all_post.conf || true + $(QUIET) sed -r -e /^attribute/d -e '/^type /d' -e /^genfscon/d \ + -e '/^sid /d' -e '/^fs_use_(xattr|task|trans)/d' \ + < tmp/all_te_files.conf > tmp/only_te_rules.conf + +######################################## +# +# Remove the dontaudit rules from the policy.conf +# +enableaudit: policy.conf + @test -d tmp || mkdir -p tmp + @echo "Removing dontaudit rules from policy.conf" + $(QUIET) grep -v dontaudit policy.conf > tmp/policy.audit + $(QUIET) mv tmp/policy.audit policy.conf + +######################################## +# +# Construct file_contexts +# +$(FC): $(M4SUPPORT) tmp/generated_definitions.conf $(ALL_FC_FILES) +ifeq ($(ALL_FC_FILES),) + $(error No enabled modules! $(notdir $(MOD_CONF)) may need to be generated by using "make conf") +endif + @echo "Creating $(NAME) file_contexts." 
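
Hand-expanding the monolithic rules above with the Makefile defaults
(NAME = xenrefpolicy, OUTPUT_POLICY = 20) gives roughly this sequence; a
sketch, with the m4 assembly of the policy sections elided:

    # compile the assembled policy.conf (-c 20 comes from OUTPUT_POLICY)
    checkpolicy -c 20 policy.conf -o policy.20
    # the install path is $(DESTDIR)/etc/xen/xenrefpolicy/policy/policy.20;
    # the reload target then loads it into the hypervisor:
    flask-loadpolicy -q /etc/xen/xenrefpolicy/policy/policy.20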
+ @test -d tmp || mkdir -p tmp + $(QUIET) m4 $(M4PARAM) $(M4SUPPORT) tmp/generated_definitions.conf $(ALL_FC_FILES) > tmp/$@.tmp +# $(QUIET) grep -e HOME -e ROLE tmp/$@.tmp > $(HOMEDIR_TEMPLATE) +# $(QUIET) sed -i -e /HOME/d -e /ROLE/d tmp/$@.tmp +# $(QUIET) $(FCSORT) tmp/$@.tmp $@ + $(QUIET) touch $(HOMEDIR_TEMPLATE) + $(QUIET) touch $@ + +######################################## +# +# Install file_contexts +# +$(FCPATH): $(FC) $(LOADPATH) $(USERPATH)/system.users + @echo "Validating $(NAME) file_contexts." +# $(QUIET) $(SETFILES) -q -c $(LOADPATH) $(FC) + @echo "Installing file_contexts." + @mkdir -p $(CONTEXTPATH)/files + $(QUIET) install -m 644 $(FC) $(FCPATH) + $(QUIET) install -m 644 $(HOMEDIR_TEMPLATE) $(HOMEDIRPATH) +# $(QUIET) $(GENHOMEDIRCON) -d $(TOPDIR) -t $(NAME) $(USEPWD) + +######################################## +# +# Run policy source checks +# +check: policy.conf $(FC) + $(SECHECK) -s --profile=development --policy=policy.conf --fcfile=$(FC) > $@.res + +longcheck: policy.conf $(FC) + $(SECHECK) -s --profile=all --policy=policy.conf --fcfile=$(FC) > $@.res + +######################################## +# +# Clean the sources +# +clean: + rm -fR tmp + rm -f policy.conf + rm -f policy.$(PV) + rm -f $(FC) + rm -f *.res + +.PHONY: default policy install load reload enableaudit checklabels restorelabels relabel check longcheck clean diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/constraints --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/constraints Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,27 @@ + +# +# Define the constraints +# +# constrain class_set perm_set expression ; +# +# expression : ( expression ) +# | not expression +# | expression and expression +# | expression or expression +# | u1 op u2 +# | r1 role_op r2 +# | t1 op t2 +# | u1 op names +# | u2 op names +# | r1 op names +# | r2 op names +# | t1 op names +# | t2 op names +# +# op : == | != +# role_op : == | != | eq | dom | domby | incomp +# +# names : name | { name_list } +# name_list : name | name_list name +# + diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/flask/Makefile Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,41 @@ +# flask needs to know where to export the libselinux headers. +LIBSEL ?= ../../libselinux + +# flask needs to know where to export the kernel headers. 
+LINUXDIR ?= ../../../linux-2.6 + +AWK = awk + +CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ + else if [ -x /bin/bash ]; then echo /bin/bash; \ + else echo sh; fi ; fi) + +FLASK_H_DEPEND = security_classes initial_sids +AV_H_DEPEND = access_vectors + +FLASK_H_FILES = class_to_string.h flask.h initial_sid_to_string.h +AV_H_FILES = av_inherit.h common_perm_to_string.h av_perm_to_string.h av_permissions.h +ALL_H_FILES = $(FLASK_H_FILES) $(AV_H_FILES) + +all: $(ALL_H_FILES) + +$(FLASK_H_FILES): $(FLASK_H_DEPEND) + $(CONFIG_SHELL) mkflask.sh $(AWK) $(FLASK_H_DEPEND) + +$(AV_H_FILES): $(AV_H_DEPEND) + $(CONFIG_SHELL) mkaccess_vector.sh $(AWK) $(AV_H_DEPEND) + +tolib: all + install -m 644 flask.h av_permissions.h $(LIBSEL)/include/selinux + install -m 644 class_to_string.h av_inherit.h common_perm_to_string.h av_perm_to_string.h $(LIBSEL)/src + +tokern: all + install -m 644 $(ALL_H_FILES) $(LINUXDIR)/security/selinux/include + +install: all + +relabel: + +clean: + rm -f $(FLASK_H_FILES) + rm -f $(AV_H_FILES) diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/access_vectors --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/flask/access_vectors Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,166 @@ +# +# Define common prefixes for access vectors +# +# common common_name { permission_name ... } + +# +# Define a common prefix for file access vectors. +# + + +# +# Define the access vectors. +# +# class class_name [ inherits common_name ] { permission_name ... } + + +# +# Define the access vector interpretation for file-related objects. +# + +class xen +{ + scheduler + settime + tbufcontrol + readconsole + clearconsole + perfcontrol + mtrr_add + mtrr_del + mtrr_read + microcode + physinfo + quirk + writeconsole + readapic + writeapic + privprofile + nonprivprofile + kexec + firmware + sleep + frequency + getidle + debug + getcpuinfo + heap +} + +class domain +{ + setvcpucontext + pause + unpause + resume + create + transition + max_vcpus + destroy + setvcpuaffinity + getvcpuaffinity + scheduler + getdomaininfo + getvcpuinfo + getvcpucontext + setdomainmaxmem + setdomainhandle + setdebugging + hypercall + settime + set_target + shutdown + setaddrsize + getaddrsize + trigger + getextvcpucontext + setextvcpucontext +} + +class hvm +{ + sethvmc + gethvmc + setparam + getparam + pcilevel + irqlevel + pciroute + bind_irq + cacheattr +} + +class event +{ + bind + send + status + notify + create + vector + reset +} + +class grant +{ + map_read + map_write + unmap + transfer + setup + copy + query +} + +class mmu +{ + map_read + map_write + pageinfo + pagelist + adjust + stat + translategp + updatemp + physmap + pinpage + mfnlist + memorymap +} + +class shadow +{ + disable + enable + logdirty +} + +class resource +{ + add + remove + use + add_irq + remove_irq + add_ioport + remove_ioport + add_iomem + remove_iomem + stat_device + add_device + remove_device +} + +class security +{ + compute_av + compute_create + compute_member + check_context + load_policy + compute_relabel + compute_user + setenforce + setbool + setsecparam +} diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/initial_sids --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/flask/initial_sids Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,17 @@ +# FLASK + +# +# Define initial security identifiers +# +sid xen +sid dom0 +sid domU +sid domio +sid domxen +sid unlabeled +sid security +sid ioport +sid iomem +sid pirq +sid device +# FLASK diff -r 
4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/mkaccess_vector.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/flask/mkaccess_vector.sh Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,227 @@ +#!/bin/sh - +# + +# FLASK + +set -e + +awk=$1 +shift + +# output files +av_permissions="av_permissions.h" +av_inherit="av_inherit.h" +common_perm_to_string="common_perm_to_string.h" +av_perm_to_string="av_perm_to_string.h" + +cat $* | $awk " +BEGIN { + outfile = \"$av_permissions\" + inheritfile = \"$av_inherit\" + cpermfile = \"$common_perm_to_string\" + avpermfile = \"$av_perm_to_string\" + "' + nextstate = "COMMON_OR_AV"; + printf("/* This file is automatically generated. Do not edit. */\n") > outfile; + printf("/* This file is automatically generated. Do not edit. */\n") > inheritfile; + printf("/* This file is automatically generated. Do not edit. */\n") > cpermfile; + printf("/* This file is automatically generated. Do not edit. */\n") > avpermfile; +; + } +/^[ \t]*#/ { + next; + } +$1 == "common" { + if (nextstate != "COMMON_OR_AV") + { + printf("Parse error: Unexpected COMMON definition on line %d\n", NR); + next; + } + + if ($2 in common_defined) + { + printf("Duplicate COMMON definition for %s on line %d.\n", $2, NR); + next; + } + common_defined[$2] = 1; + + tclass = $2; + common_name = $2; + permission = 1; + + printf("TB_(common_%s_perm_to_string)\n", $2) > cpermfile; + + nextstate = "COMMON-OPENBRACKET"; + next; + } +$1 == "class" { + if (nextstate != "COMMON_OR_AV" && + nextstate != "CLASS_OR_CLASS-OPENBRACKET") + { + printf("Parse error: Unexpected class definition on line %d\n", NR); + next; + } + + tclass = $2; + + if (tclass in av_defined) + { + printf("Duplicate access vector definition for %s on line %d\n", tclass, NR); + next; + } + av_defined[tclass] = 1; + + inherits = ""; + permission = 1; + + nextstate = "INHERITS_OR_CLASS-OPENBRACKET"; + next; + } +$1 == "inherits" { + if (nextstate != "INHERITS_OR_CLASS-OPENBRACKET") + { + printf("Parse error: Unexpected INHERITS definition on line %d\n", NR); + next; + } + + if (!($2 in common_defined)) + { + printf("COMMON %s is not defined (line %d).\n", $2, NR); + next; + } + + inherits = $2; + permission = common_base[$2]; + + for (combined in common_perms) + { + split(combined,separate, SUBSEP); + if (separate[1] == inherits) + { + inherited_perms[common_perms[combined]] = separate[2]; + } + } + + j = 1; + for (i in inherited_perms) { + ind[j] = i + 0; + j++; + } + n = asort(ind); + for (i = 1; i <= n; i++) { + perm = inherited_perms[ind[i]]; + printf("#define %s__%s", toupper(tclass), toupper(perm)) > outfile; + spaces = 40 - (length(perm) + length(tclass)); + if (spaces < 1) + spaces = 1; + for (j = 0; j < spaces; j++) + printf(" ") > outfile; + printf("0x%08xUL\n", ind[i]) > outfile; + } + printf("\n") > outfile; + for (i in ind) delete ind[i]; + for (i in inherited_perms) delete inherited_perms[i]; + + printf(" S_(SECCLASS_%s, %s, 0x%08xUL)\n", toupper(tclass), inherits, permission) > inheritfile; + + nextstate = "CLASS_OR_CLASS-OPENBRACKET"; + next; + } +$1 == "{" { + if (nextstate != "INHERITS_OR_CLASS-OPENBRACKET" && + nextstate != "CLASS_OR_CLASS-OPENBRACKET" && + nextstate != "COMMON-OPENBRACKET") + { + printf("Parse error: Unexpected { on line %d\n", NR); + next; + } + + if (nextstate == "INHERITS_OR_CLASS-OPENBRACKET") + nextstate = "CLASS-CLOSEBRACKET"; + + if (nextstate == "CLASS_OR_CLASS-OPENBRACKET") + nextstate = "CLASS-CLOSEBRACKET"; + + if (nextstate == "COMMON-OPENBRACKET") + 
nextstate = "COMMON-CLOSEBRACKET"; + } +/[a-z][a-z_]*/ { + if (nextstate != "COMMON-CLOSEBRACKET" && + nextstate != "CLASS-CLOSEBRACKET") + { + printf("Parse error: Unexpected symbol %s on line %d\n", $1, NR); + next; + } + + if (nextstate == "COMMON-CLOSEBRACKET") + { + if ((common_name,$1) in common_perms) + { + printf("Duplicate permission %s for common %s on line %d.\n", $1, common_name, NR); + next; + } + + common_perms[common_name,$1] = permission; + + printf("#define COMMON_%s__%s", toupper(common_name), toupper($1)) > outfile; + + printf(" S_(\"%s\")\n", $1) > cpermfile; + } + else + { + if ((tclass,$1) in av_perms) + { + printf("Duplicate permission %s for %s on line %d.\n", $1, tclass, NR); + next; + } + + av_perms[tclass,$1] = permission; + + if (inherits != "") + { + if ((inherits,$1) in common_perms) + { + printf("Permission %s in %s on line %d conflicts with common permission.\n", $1, tclass, inherits, NR); + next; + } + } + + printf("#define %s__%s", toupper(tclass), toupper($1)) > outfile; + + printf(" S_(SECCLASS_%s, %s__%s, \"%s\")\n", toupper(tclass), toupper(tclass), toupper($1), $1) > avpermfile; + } + + spaces = 40 - (length($1) + length(tclass)); + if (spaces < 1) + spaces = 1; + + for (i = 0; i < spaces; i++) + printf(" ") > outfile; + printf("0x%08xUL\n", permission) > outfile; + permission = permission * 2; + } +$1 == "}" { + if (nextstate != "CLASS-CLOSEBRACKET" && + nextstate != "COMMON-CLOSEBRACKET") + { + printf("Parse error: Unexpected } on line %d\n", NR); + next; + } + + if (nextstate == "COMMON-CLOSEBRACKET") + { + common_base[common_name] = permission; + printf("TE_(common_%s_perm_to_string)\n\n", common_name) > cpermfile; + } + + printf("\n") > outfile; + + nextstate = "COMMON_OR_AV"; + } +END { + if (nextstate != "COMMON_OR_AV" && nextstate != "CLASS_OR_CLASS-OPENBRACKET") + printf("Parse error: Unexpected end of file\n"); + + }' + +# FLASK diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/mkflask.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/flask/mkflask.sh Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,95 @@ +#!/bin/sh - +# + +# FLASK + +set -e + +awk=$1 +shift 1 + +# output file +output_file="flask.h" +debug_file="class_to_string.h" +debug_file2="initial_sid_to_string.h" + +cat $* | $awk " +BEGIN { + outfile = \"$output_file\" + debugfile = \"$debug_file\" + debugfile2 = \"$debug_file2\" + "' + nextstate = "CLASS"; + + printf("/* This file is automatically generated. Do not edit. */\n") > outfile; + + printf("#ifndef _SELINUX_FLASK_H_\n") > outfile; + printf("#define _SELINUX_FLASK_H_\n") > outfile; + printf("\n/*\n * Security object class definitions\n */\n") > outfile; + printf("/* This file is automatically generated. Do not edit. */\n") > debugfile; + printf("/*\n * Security object class definitions\n */\n") > debugfile; + printf(" S_(\"null\")\n") > debugfile; + printf("/* This file is automatically generated. Do not edit. 
*/\n") > debugfile2; + printf("static char *initial_sid_to_string[] =\n{\n") > debugfile2; + printf(" \"null\",\n") > debugfile2; + } +/^[ \t]*#/ { + next; + } +$1 == "class" { + if (nextstate != "CLASS") + { + printf("Parse error: Unexpected class definition on line %d\n", NR); + next; + } + + if ($2 in class_found) + { + printf("Duplicate class definition for %s on line %d.\n", $2, NR); + next; + } + class_found[$2] = 1; + + class_value++; + + printf("#define SECCLASS_%s", toupper($2)) > outfile; + for (i = 0; i < 40 - length($2); i++) + printf(" ") > outfile; + printf("%d\n", class_value) > outfile; + + printf(" S_(\"%s\")\n", $2) > debugfile; + } +$1 == "sid" { + if (nextstate == "CLASS") + { + nextstate = "SID"; + printf("\n/*\n * Security identifier indices for initial entities\n */\n") > outfile; + } + + if ($2 in sid_found) + { + printf("Duplicate SID definition for %s on line %d.\n", $2, NR); + next; + } + sid_found[$2] = 1; + sid_value++; + + printf("#define SECINITSID_%s", toupper($2)) > outfile; + for (i = 0; i < 37 - length($2); i++) + printf(" ") > outfile; + printf("%d\n", sid_value) > outfile; + printf(" \"%s\",\n", $2) > debugfile2; + } +END { + if (nextstate != "SID") + printf("Parse error: Unexpected end of file\n"); + + printf("\n#define SECINITSID_NUM") > outfile; + for (i = 0; i < 34; i++) + printf(" ") > outfile; + printf("%d\n", sid_value) > outfile; + printf("\n#endif\n") > outfile; + printf("};\n\n") > debugfile2; + }' + +# FLASK diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/flask/security_classes --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/flask/security_classes Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,20 @@ +# FLASK + +# +# Define the security object classes +# + +# Classes marked as userspace are classes +# for userspace object managers + +class xen +class domain +class hvm +class mmu +class resource +class shadow +class event +class grant +class security + +# FLASK diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/global_booleans --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/global_booleans Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,5 @@ +# +# This file is for the declaration of global booleans. +# To change the default value at build time, the booleans.conf +# file should be used. +# diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/global_tunables --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/global_tunables Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,6 @@ +# +# This file is for the declaration of global tunables. +# To change the default value at build time, the booleans.conf +# file should be used. +# + diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/mcs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/mcs Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,324 @@ +ifdef(`enable_mcs',` +# +# Define sensitivities +# +# Each sensitivity has a name and zero or more aliases. +# +# MCS is single-sensitivity. +# +sensitivity s0; + +# +# Define the ordering of the sensitivity levels (least to greatest) +# +dominance { s0 } + + +# +# Define the categories +# +# Each category has a name and zero or more aliases. 
+# +category c0; +category c1; +category c2; +category c3; +category c4; +category c5; +category c6; +category c7; +category c8; +category c9; +category c10; +category c11; +category c12; +category c13; +category c14; +category c15; +category c16; +category c17; +category c18; +category c19; +category c20; +category c21; +category c22; +category c23; +category c24; +category c25; +category c26; +category c27; +category c28; +category c29; +category c30; +category c31; +category c32; +category c33; +category c34; +category c35; +category c36; +category c37; +category c38; +category c39; +category c40; +category c41; +category c42; +category c43; +category c44; +category c45; +category c46; +category c47; +category c48; +category c49; +category c50; +category c51; +category c52; +category c53; +category c54; +category c55; +category c56; +category c57; +category c58; +category c59; +category c60; +category c61; +category c62; +category c63; +category c64; +category c65; +category c66; +category c67; +category c68; +category c69; +category c70; +category c71; +category c72; +category c73; +category c74; +category c75; +category c76; +category c77; +category c78; +category c79; +category c80; +category c81; +category c82; +category c83; +category c84; +category c85; +category c86; +category c87; +category c88; +category c89; +category c90; +category c91; +category c92; +category c93; +category c94; +category c95; +category c96; +category c97; +category c98; +category c99; +category c100; +category c101; +category c102; +category c103; +category c104; +category c105; +category c106; +category c107; +category c108; +category c109; +category c110; +category c111; +category c112; +category c113; +category c114; +category c115; +category c116; +category c117; +category c118; +category c119; +category c120; +category c121; +category c122; +category c123; +category c124; +category c125; +category c126; +category c127; +category c128; +category c129; +category c130; +category c131; +category c132; +category c133; +category c134; +category c135; +category c136; +category c137; +category c138; +category c139; +category c140; +category c141; +category c142; +category c143; +category c144; +category c145; +category c146; +category c147; +category c148; +category c149; +category c150; +category c151; +category c152; +category c153; +category c154; +category c155; +category c156; +category c157; +category c158; +category c159; +category c160; +category c161; +category c162; +category c163; +category c164; +category c165; +category c166; +category c167; +category c168; +category c169; +category c170; +category c171; +category c172; +category c173; +category c174; +category c175; +category c176; +category c177; +category c178; +category c179; +category c180; +category c181; +category c182; +category c183; +category c184; +category c185; +category c186; +category c187; +category c188; +category c189; +category c190; +category c191; +category c192; +category c193; +category c194; +category c195; +category c196; +category c197; +category c198; +category c199; +category c200; +category c201; +category c202; +category c203; +category c204; +category c205; +category c206; +category c207; +category c208; +category c209; +category c210; +category c211; +category c212; +category c213; +category c214; +category c215; +category c216; +category c217; +category c218; +category c219; +category c220; +category c221; +category c222; +category c223; +category c224; +category c225; +category c226; +category c227; +category 
c228; +category c229; +category c230; +category c231; +category c232; +category c233; +category c234; +category c235; +category c236; +category c237; +category c238; +category c239; +category c240; +category c241; +category c242; +category c243; +category c244; +category c245; +category c246; +category c247; +category c248; +category c249; +category c250; +category c251; +category c252; +category c253; +category c254; +category c255; + + +# +# Each MCS level specifies a sensitivity and zero or more categories which may +# be associated with that sensitivity. +# +level s0:c0.c255; + +# +# Define the MCS policy +# +# mlsconstrain class_set perm_set expression ; +# +# mlsvalidatetrans class_set expression ; +# +# expression : ( expression ) +# | not expression +# | expression and expression +# | expression or expression +# | u1 op u2 +# | r1 role_mls_op r2 +# | t1 op t2 +# | l1 role_mls_op l2 +# | l1 role_mls_op h2 +# | h1 role_mls_op l2 +# | h1 role_mls_op h2 +# | l1 role_mls_op h1 +# | l2 role_mls_op h2 +# | u1 op names +# | u2 op names +# | r1 op names +# | r2 op names +# | t1 op names +# | t2 op names +# | u3 op names (NOTE: this is only available for mlsvalidatetrans) +# | r3 op names (NOTE: this is only available for mlsvalidatetrans) +# | t3 op names (NOTE: this is only available for mlsvalidatetrans) +# +# op : == | != +# role_mls_op : == | != | eq | dom | domby | incomp +# +# names : name | { name_list } +# name_list : name | name_list name +# + + +') dnl end enable_mcs diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/mls --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/mls Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,354 @@ + +ifdef(`enable_mls',` +# +# Define sensitivities +# +# Each sensitivity has a name and zero or more aliases. +# +sensitivity s0; +sensitivity s1; +sensitivity s2; +sensitivity s3; +sensitivity s4; +sensitivity s5; +sensitivity s6; +sensitivity s7; +sensitivity s8; +sensitivity s9; +sensitivity s10; +sensitivity s11; +sensitivity s12; +sensitivity s13; +sensitivity s14; +sensitivity s15; + +# +# Define the ordering of the sensitivity levels (least to greatest) +# +dominance { s0 s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 } + + +# +# Define the categories +# +# Each category has a name and zero or more aliases. 
+# +category c0; +category c1; +category c2; +category c3; +category c4; +category c5; +category c6; +category c7; +category c8; +category c9; +category c10; +category c11; +category c12; +category c13; +category c14; +category c15; +category c16; +category c17; +category c18; +category c19; +category c20; +category c21; +category c22; +category c23; +category c24; +category c25; +category c26; +category c27; +category c28; +category c29; +category c30; +category c31; +category c32; +category c33; +category c34; +category c35; +category c36; +category c37; +category c38; +category c39; +category c40; +category c41; +category c42; +category c43; +category c44; +category c45; +category c46; +category c47; +category c48; +category c49; +category c50; +category c51; +category c52; +category c53; +category c54; +category c55; +category c56; +category c57; +category c58; +category c59; +category c60; +category c61; +category c62; +category c63; +category c64; +category c65; +category c66; +category c67; +category c68; +category c69; +category c70; +category c71; +category c72; +category c73; +category c74; +category c75; +category c76; +category c77; +category c78; +category c79; +category c80; +category c81; +category c82; +category c83; +category c84; +category c85; +category c86; +category c87; +category c88; +category c89; +category c90; +category c91; +category c92; +category c93; +category c94; +category c95; +category c96; +category c97; +category c98; +category c99; +category c100; +category c101; +category c102; +category c103; +category c104; +category c105; +category c106; +category c107; +category c108; +category c109; +category c110; +category c111; +category c112; +category c113; +category c114; +category c115; +category c116; +category c117; +category c118; +category c119; +category c120; +category c121; +category c122; +category c123; +category c124; +category c125; +category c126; +category c127; +category c128; +category c129; +category c130; +category c131; +category c132; +category c133; +category c134; +category c135; +category c136; +category c137; +category c138; +category c139; +category c140; +category c141; +category c142; +category c143; +category c144; +category c145; +category c146; +category c147; +category c148; +category c149; +category c150; +category c151; +category c152; +category c153; +category c154; +category c155; +category c156; +category c157; +category c158; +category c159; +category c160; +category c161; +category c162; +category c163; +category c164; +category c165; +category c166; +category c167; +category c168; +category c169; +category c170; +category c171; +category c172; +category c173; +category c174; +category c175; +category c176; +category c177; +category c178; +category c179; +category c180; +category c181; +category c182; +category c183; +category c184; +category c185; +category c186; +category c187; +category c188; +category c189; +category c190; +category c191; +category c192; +category c193; +category c194; +category c195; +category c196; +category c197; +category c198; +category c199; +category c200; +category c201; +category c202; +category c203; +category c204; +category c205; +category c206; +category c207; +category c208; +category c209; +category c210; +category c211; +category c212; +category c213; +category c214; +category c215; +category c216; +category c217; +category c218; +category c219; +category c220; +category c221; +category c222; +category c223; +category c224; +category c225; +category c226; +category c227; +category 
c228; +category c229; +category c230; +category c231; +category c232; +category c233; +category c234; +category c235; +category c236; +category c237; +category c238; +category c239; +category c240; +category c241; +category c242; +category c243; +category c244; +category c245; +category c246; +category c247; +category c248; +category c249; +category c250; +category c251; +category c252; +category c253; +category c254; +category c255; + + +# +# Each MLS level specifies a sensitivity and zero or more categories which may +# be associated with that sensitivity. +# +level s0:c0.c255; +level s1:c0.c255; +level s2:c0.c255; +level s3:c0.c255; +level s4:c0.c255; +level s5:c0.c255; +level s6:c0.c255; +level s7:c0.c255; +level s8:c0.c255; +level s9:c0.c255; +level s10:c0.c255; +level s11:c0.c255; +level s12:c0.c255; +level s13:c0.c255; +level s14:c0.c255; +level s15:c0.c255; + + +# +# Define the MLS policy +# +# mlsconstrain class_set perm_set expression ; +# +# mlsvalidatetrans class_set expression ; +# +# expression : ( expression ) +# | not expression +# | expression and expression +# | expression or expression +# | u1 op u2 +# | r1 role_mls_op r2 +# | t1 op t2 +# | l1 role_mls_op l2 +# | l1 role_mls_op h2 +# | h1 role_mls_op l2 +# | h1 role_mls_op h2 +# | l1 role_mls_op h1 +# | l2 role_mls_op h2 +# | u1 op names +# | u2 op names +# | r1 op names +# | r2 op names +# | t1 op names +# | t2 op names +# | u3 op names (NOTE: this is only available for mlsvalidatetrans) +# | r3 op names (NOTE: this is only available for mlsvalidatetrans) +# | t3 op names (NOTE: this is only available for mlsvalidatetrans) +# +# op : == | != +# role_mls_op : == | != | eq | dom | domby | incomp +# +# names : name | { name_list } +# name_list : name | name_list name +# + + +') dnl end enable_mls diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/modules.conf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/modules.conf Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,21 @@ +# +# This file contains a listing of available modules. +# To prevent a module from being used in policy +# creation, set the module name to "off". +# +# For monolithic policies, modules set to "base" and "module" +# will be built into the policy. +# +# For modular policies, modules set to "base" will be +# included in the base module. "module" will be compiled +# as individual loadable modules. +# + +# Layer: xen +# Module: xen +# Required in base +# +# Policy for xen. 
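
With the MLS/MCS definitions above in place, a labeled security context
carries a level after the type. Purely illustrative examples, not taken
from the policy:

    # MCS: sensitivity s0 plus a category set
    system_u:system_r:domU_t:s0:c1,c7
    # MLS: a low-high range built from the levels defined above
    system_u:system_r:domU_t:s2:c0.c15-s5:c0.c127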
+# +xen = base + diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/modules/xen/xen.if --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/modules/xen/xen.if Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,1 @@ +# diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/modules/xen/xen.te --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/modules/xen/xen.te Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,135 @@ +attribute xen_type; +attribute domain_type; +attribute resource_type; +attribute event_type; + +type xen_t, xen_type, domain_type; + +type dom0_t, domain_type; + +type domio_t, domain_type; + +type domxen_t, domain_type; + +type unlabeled_t, domain_type; + +type security_t, domain_type; + +type pirq_t, resource_type; +type ioport_t, resource_type; +type iomem_t, resource_type; +type device_t, resource_type; + +################################################################################ +# +# create_domain(priv_dom, domain, channel) +# +################################################################################ +define(`create_domain', ` + type $2, domain_type; + allow $1 $2:domain {create max_vcpus setdomainmaxmem + setaddrsize getdomaininfo hypercall + setvcpucontext scheduler unpause + getvcpuinfo getaddrsize getvcpuaffinity}; + allow $1 $2:shadow {enable}; + allow $1 $2:mmu {map_read map_write memorymap adjust pinpage}; + allow $2 $2:mmu {map_read map_write pinpage}; + allow $2 domio_t:mmu {map_read}; + allow $2 $2:grant {query setup}; + allow $1 $2:grant {map_read unmap}; + allow $1 $3:event {create}; +') + +################################################################################ +# +# manage_domain(priv_dom, domain) +# +################################################################################ +define(`manage_domain', ` + allow $1 $2:domain {pause destroy}; +') + +################################################################################ +# +# create_channel(caller, peer, channel) +# +################################################################################ +define(`create_channel', ` + type $3, event_type; + type_transition $1 $2:event $3; + allow $1 $3:event {create}; + allow $3 $2:event {bind}; +') + +################################################################################ +# +# Boot the hypervisor and dom0 +# +################################################################################ +allow dom0_t xen_t:xen {kexec readapic writeapic mtrr_read mtrr_add mtrr_del +scheduler physinfo heap quirk readconsole writeconsole settime microcode}; + +allow dom0_t domio_t:mmu {map_read map_write}; +allow dom0_t iomem_t:mmu {map_read map_write}; +allow dom0_t pirq_t:event {vector}; +allow dom0_t xen_t:mmu {memorymap}; + +allow dom0_t dom0_t:mmu {pinpage map_read map_write adjust}; +allow dom0_t dom0_t:grant {query setup}; +allow dom0_t dom0_t:domain {scheduler getdomaininfo getvcpuinfo getvcpuaffinity}; + +allow xen_t dom0_t:domain {create}; +allow xen_t dom0_t:resource {add remove}; +allow xen_t ioport_t:resource {add_ioport remove_ioport}; +allow dom0_t ioport_t:resource {use}; +allow xen_t iomem_t:resource {add_iomem remove_iomem}; +allow dom0_t iomem_t:resource {use}; +allow xen_t pirq_t:resource {add_irq remove_irq}; +allow dom0_t pirq_t:resource {use}; + +allow dom0_t security_t:security {compute_av compute_create compute_member +check_context load_policy compute_relabel compute_user setenforce setbool +setsecparam}; + +create_channel(dom0_t, dom0_t, evchn0-0_t) +allow dom0_t 
evchn0-0_t:event {send}; + +################################################################################ +# +# Create and manage a domU w/ dom0 IO +# +################################################################################ +create_domain(dom0_t, domU_t, evchnU-0_t) + +create_channel(domU_t, domU_t, evchnU-U_t) +allow domU_t evchnU-U_t:event {send}; + +create_channel(dom0_t, domU_t, evchn0-U_t) +allow dom0_t evchn0-U_t:event {send}; + +create_channel(domU_t, dom0_t, evchnU-0_t) +allow domU_t evchnU-0_t:event {send}; + +manage_domain(dom0_t, domU_t) + +################################################################################ +# +# +# +################################################################################ +sid xen gen_context(system_u:system_r:xen_t,s0) +sid dom0 gen_context(system_u:system_r:dom0_t,s0) +sid domU gen_context(system_u:system_r:domU_t,s0) +sid domxen gen_context(system_u:system_r:domxen_t,s0) +sid domio gen_context(system_u:system_r:domio_t,s0) +sid unlabeled gen_context(system_u:system_r:unlabeled_t,s0) +sid security gen_context(system_u:system_r:security_t,s0) +sid pirq gen_context(system_u:object_r:pirq_t,s0) +sid iomem gen_context(system_u:object_r:iomem_t,s0) +sid ioport gen_context(system_u:object_r:ioport_t,s0) +sid device gen_context(system_u:object_r:device_t,s0) + +role system_r types { xen_type domain_type }; +role user_r types { xen_type domain_type }; +role sysadm_r types { xen_type domain_type }; +role staff_r types { xen_type domain_type }; diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/support/loadable_module.spt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/support/loadable_module.spt Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,166 @@ +######################################## +# +# Macros for switching between source policy +# and loadable policy module support +# + +############################## +# +# For adding the module statement +# +define(`policy_module',` + ifdef(`self_contained_policy',`',` + module $1 $2; + + require { + role system_r; + all_kernel_class_perms + } + ') +') + +############################## +# +# For use in interfaces, to optionally insert a require block +# +define(`gen_require',` + ifdef(`self_contained_policy',`',` + define(`in_gen_require_block') + require { + $1 + } + undefine(`in_gen_require_block') + ') +') + +############################## +# +# In the future interfaces should be in loadable modules +# +# template(name,rules) +# +define(`template',` + `define(`$1',` +##### begin $1(dollarsstar) + $2 +##### end $1(dollarsstar) + '') +') + +# helper function, since m4 wont expand macros +# if a line is a comment (#): +define(`policy_m4_comment',`dnl +##### $2 depth: $1 +')dnl + +############################## +# +# In the future interfaces should be in loadable modules +# +# interface(name,rules) +# +define(`interface',` + `define(`$1',` + + define(`policy_temp',incr(policy_call_depth)) + pushdef(`policy_call_depth',policy_temp) + undefine(`policy_temp') + + policy_m4_comment(policy_call_depth,begin `$1'(dollarsstar)) + + $2 + + define(`policy_temp',decr(policy_call_depth)) + pushdef(`policy_call_depth',policy_temp) + undefine(`policy_temp') + + policy_m4_comment(policy_call_depth,end `$1'(dollarsstar)) + + '') +') + +define(`policy_call_depth',0) + +############################## +# +# Optional policy handling +# +define(`optional_policy',` + ifdef(`self_contained_policy',` + ifdef(`$1',`$2',`$3') + ',` + optional { + $2 + ifelse(`$3',`',`',` + } else { + 
$3 + ') + } + ') +') + +############################## +# +# Determine if we should use the default +# tunable value as specified by the policy +# or if the override value should be used +# +define(`dflt_or_overr',`ifdef(`$1',$1,$2)') + +############################## +# +# Extract booleans out of an expression. +# This needs to be reworked so expressions +# with parentheses can work. + +define(`declare_required_symbols',` +ifelse(regexp($1, `\w'), -1, `', `dnl +bool regexp($1, `\(\w+\)', `\1'); +declare_required_symbols(regexp($1, `\w+\(.*\)', `\1'))dnl +') dnl +') + +############################## +# +# Tunable declaration +# +define(`gen_tunable',` + ifdef(`self_contained_policy',` + bool $1 dflt_or_overr(`$1'_conf,$2); + ',` + # loadable module tunable + # declaration will go here + # instead of bool when + # loadable modules support + # tunables + bool $1 dflt_or_overr(`$1'_conf,$2); + ') +') + +############################## +# +# Tunable policy handling +# +define(`tunable_policy',` + ifdef(`self_contained_policy',` + if (`$1') { + $2 + } else { + $3 + } + ',` + # structure for tunables + # will go here instead of a + # conditional when loadable + # modules support tunables + gen_require(` + declare_required_symbols(`$1') + ') + + if (`$1') { + $2 + } else { + $3 + } + ') +') diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/support/misc_macros.spt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/support/misc_macros.spt Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,32 @@ + +######################################## +# +# Helper macros +# + +# +# shiftn(num,list...) +# +# shift the list num times +# +define(`shiftn',`ifelse($1,0,`shift($*)',`shiftn(decr($1),shift(shift($*)))')') + +######################################## +# +# gen_user(username, role_set, mls_defaultlevel, mls_range, [mcs_categories]) +# +define(`gen_user',`user $1 roles { $2 }`'ifdef(`enable_mls', ` level $3 range $4')`'ifdef(`enable_mcs',` level s0 range s0`'ifelse(`$5',,,` - s0:$5')');') + +######################################## +# +# gen_context(context,mls_sensitivity,[mcs_categories]) +# +define(`gen_context',`$1`'ifdef(`enable_mls',`:$2')`'ifdef(`enable_mcs',`:s0`'ifelse(`$3',,,`:$3')')') dnl + +######################################## +# +# gen_bool(name,default_value) +# +define(`gen_bool',` + bool $1 dflt_or_overr(`$1'_conf,$2); +') diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/systemuser --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/systemuser Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,19 @@ +################################## +# +# System User configuration. +# + +# +# gen_user(username, role_set, mls_defaultlevel, mls_range, [mcs_categories]) +# + +# +# system_u is the user identity for system processes and objects. +# There should be no corresponding Unix user identity for system, +# and a user process should never be assigned the system user +# identity. +# +gen_user(system_u, system_r, s0, s0 - s9:c0.c127, c0.c127) + +# Normal users should not be added to this file, +# but instead added to the users file. diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/flask/policy/policy/users --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/flask/policy/policy/users Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,39 @@ + +################################## +# +# Core User configuration. 
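+# +# As a sketch of how these entries expand: with enable_mls defined (and +# enable_mcs not), the gen_user macro from misc_macros.spt turns +# gen_user(user_u, user_r, s0, s0 - s9:c0.c127) +# into roughly +# user user_u roles { user_r } level s0 range s0 - s9:c0.c127; +#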
+# + +# +# gen_user(username, role_set, mls_defaultlevel, mls_range, [mcs_categories]) +# + +# +# user_u is a generic user identity for Linux users who have no +# SELinux user identity defined. The modified daemons will use +# this user identity in the security context if there is no matching +# SELinux user identity for a Linux user. If you do not want to +# permit any access to such users, then remove this entry. +# +ifdef(`targeted_policy',` +gen_user(user_u, user_r sysadm_r system_r, s0, s0 - s9:c0.c127) +',` +gen_user(user_u, user_r, s0, s0 - s9:c0.c127) +') + +# +# The following users correspond to Unix identities. +# These identities are typically assigned as the user attribute +# when login starts the user shell. Users with access to the sysadm_r +# role should use the staff_r role instead of the user_r role when +# not in the sysadm_r. +# +ifdef(`targeted_policy',` + gen_user(root, user_r sysadm_r system_r, s0, s0 - s9:c0.c127, c0.c127) +',` + ifdef(`direct_sysadm_daemon',` + gen_user(root, sysadm_r staff_r system_r, s0, s0 - s9:c0.c127, c0.c127) + ',` + gen_user(root, sysadm_r staff_r, s0, s0 - s9:c0.c127, c0.c127) + ') +') diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/cirrus_vga.c --- a/tools/ioemu/hw/cirrus_vga.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/ioemu/hw/cirrus_vga.c Fri Sep 12 14:47:40 2008 +0900 @@ -2554,6 +2554,9 @@ static void set_vram_mapping(CirrusVGASt fprintf(logfile,"mapping vram to %lx - %lx\n", begin, end); + if (!s->vram_mfns) + return; + xatp.domid = domid; xatp.space = XENMAPSPACE_mfn; diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pass-through.c --- a/tools/ioemu/hw/pass-through.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/ioemu/hw/pass-through.c Fri Sep 12 14:47:40 2008 +0900 @@ -57,6 +57,10 @@ static uint32_t pt_irqpin_reg_init(struc struct pt_reg_info_tbl *reg, uint32_t real_offset); static uint32_t pt_bar_reg_init(struct pt_dev *ptdev, struct pt_reg_info_tbl *reg, uint32_t real_offset); +static uint32_t pt_linkctrl_reg_init(struct pt_dev *ptdev, + struct pt_reg_info_tbl *reg, uint32_t real_offset); +static uint32_t pt_devctrl2_reg_init(struct pt_dev *ptdev, + struct pt_reg_info_tbl *reg, uint32_t real_offset); static uint32_t pt_linkctrl2_reg_init(struct pt_dev *ptdev, struct pt_reg_info_tbl *reg, uint32_t real_offset); static uint32_t pt_msgctrl_reg_init(struct pt_dev *ptdev, @@ -76,6 +80,8 @@ static uint8_t pt_msix_size_init(struct static uint8_t pt_msix_size_init(struct pt_dev *ptdev, struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset); static uint8_t pt_vendor_size_init(struct pt_dev *ptdev, + struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset); +static uint8_t pt_pcie_size_init(struct pt_dev *ptdev, struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset); static int pt_byte_reg_read(struct pt_dev *ptdev, struct pt_reg_tbl *cfg_entry, @@ -438,7 +444,7 @@ static struct pt_reg_info_tbl pt_emu_reg .init_val = 0x0000, .ro_mask = 0x0000, .emu_mask = 0xFFFF, - .init = pt_common_reg_init, + .init = pt_linkctrl_reg_init, .u.w.read = pt_word_reg_read, .u.w.write = pt_linkctrl_reg_write, }, @@ -449,7 +455,7 @@ static struct pt_reg_info_tbl pt_emu_reg .init_val = 0x0000, .ro_mask = 0x0000, .emu_mask = 0xFFFF, - .init = pt_common_reg_init, + .init = pt_devctrl2_reg_init, .u.w.read = pt_word_reg_read, .u.w.write = pt_devctrl2_reg_write, }, @@ -666,8 +672,8 @@ static const struct pt_reg_grp_info_tbl { .grp_id = PCI_CAP_ID_EXP, .grp_type = GRP_TYPE_EMU, - .grp_size = 0x3C, - .size_init = pt_reg_grp_size_init, + .grp_size = 0xFF, + 
.size_init = pt_pcie_size_init, .emu_reg_tbl= pt_emu_reg_pcie_tbl, }, /* MSI-X Capability Structure reg group */ @@ -1869,12 +1875,57 @@ static uint32_t pt_bar_reg_init(struct p return reg_field; } +/* initialize Link Control register */ +static uint32_t pt_linkctrl_reg_init(struct pt_dev *ptdev, + struct pt_reg_info_tbl *reg, uint32_t real_offset) +{ + uint8_t cap_ver = 0; + uint8_t dev_type = 0; + + cap_ver = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] & + (uint8_t)PCI_EXP_FLAGS_VERS); + dev_type = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] & + (uint8_t)PCI_EXP_FLAGS_TYPE) >> 4; + + /* no need to initialize in case of Root Complex Integrated Endpoint + * with cap_ver 1.x + */ + if ((dev_type == PCI_EXP_TYPE_ROOT_INT_EP) && (cap_ver == 1)) + return PT_INVALID_REG; + + return reg->init_val; +} + +/* initialize Device Control 2 register */ +static uint32_t pt_devctrl2_reg_init(struct pt_dev *ptdev, + struct pt_reg_info_tbl *reg, uint32_t real_offset) +{ + uint8_t cap_ver = 0; + + cap_ver = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] & + (uint8_t)PCI_EXP_FLAGS_VERS); + + /* no need to initialize in case of cap_ver 1.x */ + if (cap_ver == 1) + return PT_INVALID_REG; + + return reg->init_val; +} + /* initialize Link Control 2 register */ static uint32_t pt_linkctrl2_reg_init(struct pt_dev *ptdev, struct pt_reg_info_tbl *reg, uint32_t real_offset) { int reg_field = 0; - + uint8_t cap_ver = 0; + + cap_ver = (ptdev->dev.config[(real_offset - reg->offset) + PCI_EXP_FLAGS] & + (uint8_t)PCI_EXP_FLAGS_VERS); + + /* no need to initialize in case of cap_ver 1.x */ + if (cap_ver == 1) + return PT_INVALID_REG; + /* set Supported Link Speed */ reg_field |= (0x0F & @@ -2034,6 +2085,91 @@ static uint8_t pt_vendor_size_init(struc struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset) { return ptdev->dev.config[base_offset + 0x02]; +} + +/* get PCI Express Capability Structure register group size */ +static uint8_t pt_pcie_size_init(struct pt_dev *ptdev, + struct pt_reg_grp_info_tbl *grp_reg, uint32_t base_offset) +{ + PCIDevice *d = &ptdev->dev; + uint16_t exp_flag = 0; + uint16_t type = 0; + uint16_t vers = 0; + uint8_t pcie_size = 0; + + exp_flag = *((uint16_t*)(d->config + (base_offset + PCI_EXP_FLAGS))); + type = (exp_flag & PCI_EXP_FLAGS_TYPE) >> 4; + vers = (exp_flag & PCI_EXP_FLAGS_VERS); + + /* calculate size depend on capability version and device/port type */ + /* in case of PCI Express Base Specification Rev 1.x */ + if (vers == 1) + { + /* The PCI Express Capabilities, Device Capabilities, and Device + * Status/Control registers are required for all PCI Express devices. + * The Link Capabilities and Link Status/Control are required for all + * Endpoints that are not Root Complex Integrated Endpoints. Endpoints + * are not required to implement registers other than those listed + * above and terminate the capability structure. + */ + switch (type) { + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_LEG_END: + pcie_size = 0x14; + break; + case PCI_EXP_TYPE_ROOT_INT_EP: + /* has no link */ + pcie_size = 0x0C; + break; + /* only EndPoint passthrough is supported */ + case PCI_EXP_TYPE_ROOT_PORT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_DOWNSTREAM: + case PCI_EXP_TYPE_PCI_BRIDGE: + case PCI_EXP_TYPE_PCIE_BRIDGE: + case PCI_EXP_TYPE_ROOT_EC: + default: + /* exit I/O emulator */ + PT_LOG("Internal error: Unsupported device/port type[%d]. 
" + "I/O emulator exit.\n", type); + exit(1); + } + } + /* in case of PCI Express Base Specification Rev 2.0 */ + else if (vers == 2) + { + switch (type) { + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_LEG_END: + case PCI_EXP_TYPE_ROOT_INT_EP: + /* For Functions that do not implement the registers, + * these spaces must be hardwired to 0b. + */ + pcie_size = 0x3C; + break; + /* only EndPoint passthrough is supported */ + case PCI_EXP_TYPE_ROOT_PORT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_DOWNSTREAM: + case PCI_EXP_TYPE_PCI_BRIDGE: + case PCI_EXP_TYPE_PCIE_BRIDGE: + case PCI_EXP_TYPE_ROOT_EC: + default: + /* exit I/O emulator */ + PT_LOG("Internal error: Unsupported device/port type[%d]. " + "I/O emulator exit.\n", type); + exit(1); + } + } + else + { + /* exit I/O emulator */ + PT_LOG("Internal error: Unsupported capability version[%d]. " + "I/O emulator exit.\n", vers); + exit(1); + } + + return pcie_size; } /* read byte size emulate register */ diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pass-through.h --- a/tools/ioemu/hw/pass-through.h Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/ioemu/hw/pass-through.h Fri Sep 12 14:47:40 2008 +0900 @@ -60,6 +60,21 @@ #ifndef PCI_MSI_FLAGS_MASK_BIT /* interrupt masking & reporting supported */ #define PCI_MSI_FLAGS_MASK_BIT 0x0100 +#endif + +#ifndef PCI_EXP_TYPE_PCIE_BRIDGE +/* PCI/PCI-X to PCIE Bridge */ +#define PCI_EXP_TYPE_PCIE_BRIDGE 0x8 +#endif + +#ifndef PCI_EXP_TYPE_ROOT_INT_EP +/* Root Complex Integrated Endpoint */ +#define PCI_EXP_TYPE_ROOT_INT_EP 0x9 +#endif + +#ifndef PCI_EXP_TYPE_ROOT_EC +/* Root Complex Event Collector */ +#define PCI_EXP_TYPE_ROOT_EC 0xa #endif #define PT_INVALID_REG 0xFFFFFFFF /* invalid register value */ diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pci.c --- a/tools/ioemu/hw/pci.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/ioemu/hw/pci.c Fri Sep 12 14:47:40 2008 +0900 @@ -45,7 +45,6 @@ static void pci_update_mappings(PCIDevic static void pci_update_mappings(PCIDevice *d); target_phys_addr_t pci_mem_base; -static int pci_irq_index; static PCIBus *first_bus; PCIBus *pci_register_bus(pci_set_irq_fn set_irq, pci_map_irq_fn map_irq, @@ -114,9 +113,6 @@ PCIDevice *pci_register_device(PCIBus *b { PCIDevice *pci_dev; - if (pci_irq_index >= PCI_DEVICES_MAX) - return NULL; - if (devfn < 0) { for(devfn = bus->devfn_min ; devfn < 256; devfn += 8) { if ( !bus->devices[devfn] && @@ -140,7 +136,6 @@ PCIDevice *pci_register_device(PCIBus *b config_write = pci_default_write_config; pci_dev->config_read = config_read; pci_dev->config_write = config_write; - pci_dev->irq_index = pci_irq_index++; bus->devices[devfn] = pci_dev; return pci_dev; } diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/pt-msi.c --- a/tools/ioemu/hw/pt-msi.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/ioemu/hw/pt-msi.c Fri Sep 12 14:47:40 2008 +0900 @@ -313,7 +313,7 @@ int pt_msix_init(struct pt_dev *dev, int table_off = pci_read_long(pd, pos + PCI_MSIX_TABLE); bar_index = dev->msix->bar_index = table_off & PCI_MSIX_BIR; - table_off &= table_off & ~PCI_MSIX_BIR; + table_off = dev->msix->table_off = table_off & ~PCI_MSIX_BIR; dev->msix->table_base = dev->pci_dev->base_addr[bar_index]; PT_LOG("get MSI-X table bar base %llx\n", (unsigned long long)dev->msix->table_base); diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/vga.c --- a/tools/ioemu/hw/vga.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/ioemu/hw/vga.c Fri Sep 12 14:47:40 2008 +0900 @@ -2080,7 +2080,13 @@ void xen_vga_vram_map(uint64_t vram_addr if (copy) memcpy(vram, 
xen_vga_state->vram_ptr, VGA_RAM_SIZE); - qemu_free(xen_vga_state->vram_ptr); + if (xen_vga_state->vram_mfns) { + /* In case this function is called more than once */ + free(xen_vga_state->vram_mfns); + munmap(xen_vga_state->vram_ptr, VGA_RAM_SIZE); + } else { + qemu_free(xen_vga_state->vram_ptr); + } xen_vga_state->vram_ptr = vram; xen_vga_state->vram_mfns = pfn_list; #ifdef CONFIG_STUBDOM diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/hw/xen_machine_fv.c --- a/tools/ioemu/hw/xen_machine_fv.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/ioemu/hw/xen_machine_fv.c Fri Sep 12 14:47:40 2008 +0900 @@ -139,8 +139,10 @@ uint8_t *qemu_map_cache(target_phys_addr !test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping)) qemu_remap_bucket(entry, address_index); - if (!test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping)) + if (!test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping)) { + last_address_index = ~0UL; return NULL; + } last_address_index = address_index; last_address_vaddr = entry->vaddr_base; diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/ioemu/vl.h --- a/tools/ioemu/vl.h Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/ioemu/vl.h Fri Sep 12 14:47:40 2008 +0900 @@ -812,8 +812,6 @@ struct PCIDevice { /* do not access the following fields */ PCIConfigReadFunc *config_read; PCIConfigWriteFunc *config_write; - /* ??? This is a PC-specific hack, and should be removed. */ - int irq_index; /* Current IRQ levels. Used internally by the generic PCI code. */ int irq_state[4]; diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/ia64/xc_ia64_linux_save.c --- a/tools/libxc/ia64/xc_ia64_linux_save.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/libxc/ia64/xc_ia64_linux_save.c Fri Sep 12 14:47:40 2008 +0900 @@ -53,12 +53,12 @@ static inline void set_bit(int nr, volat } static int -suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd, +suspend_and_state(int (*suspend)(void), int xc_handle, int io_fd, int dom, xc_dominfo_t *info) { int i = 0; - if (!(*suspend)(dom)) { + if (!(*suspend)()) { ERROR("Suspend request failed"); return -1; } @@ -406,7 +406,7 @@ out: int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, - uint32_t max_factor, uint32_t flags, int (*suspend)(int), + uint32_t max_factor, uint32_t flags, int (*suspend)(void), int hvm, void *(*init_qemu_maps)(int, unsigned), void (*qemu_flip_buffer)(int, int)) { diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xc_domain_save.c --- a/tools/libxc/xc_domain_save.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/libxc/xc_domain_save.c Fri Sep 12 14:47:40 2008 +0900 @@ -338,72 +338,23 @@ static int analysis_phase(int xc_handle, } -static int suspend_and_state(int (*suspend)(int), int xc_handle, int io_fd, +static int suspend_and_state(int (*suspend)(void), int xc_handle, int io_fd, int dom, xc_dominfo_t *info) { - int i = 0; - - if ( !(*suspend)(dom) ) + if ( !(*suspend)() ) { ERROR("Suspend request failed"); return -1; } - retry: - - if ( xc_domain_getinfo(xc_handle, dom, 1, info) != 1 ) - { - ERROR("Could not get domain info"); + if ( (xc_domain_getinfo(xc_handle, dom, 1, info) != 1) || + !info->shutdown || (info->shutdown_reason != SHUTDOWN_suspend) ) + { + ERROR("Domain not in suspended state"); return -1; } - if ( info->dying ) - { - ERROR("domain is dying"); - return -1; - } - - if ( info->crashed ) - { - ERROR("domain has crashed"); - return -1; - } - - if ( info->shutdown ) - { - switch ( info->shutdown_reason ) - { - case SHUTDOWN_poweroff: - case SHUTDOWN_reboot: - ERROR("domain has shut down"); - return -1; 
- case SHUTDOWN_suspend: - return 0; - case SHUTDOWN_crash: - ERROR("domain has crashed"); - return -1; - } - } - - if ( info->paused ) - { - /* Try unpausing domain, wait, and retest. */ - xc_domain_unpause( xc_handle, dom ); - ERROR("Domain was paused. Wait and re-test."); - usleep(10000); /* 10ms */ - goto retry; - } - - if ( ++i < 100 ) - { - ERROR("Retry suspend domain"); - usleep(10000); /* 10ms */ - goto retry; - } - - ERROR("Unable to suspend domain."); - - return -1; + return 0; } /* @@ -796,7 +747,7 @@ static xen_pfn_t *map_and_save_p2m_table int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, - uint32_t max_factor, uint32_t flags, int (*suspend)(int), + uint32_t max_factor, uint32_t flags, int (*suspend)(void), int hvm, void *(*init_qemu_maps)(int, unsigned), void (*qemu_flip_buffer)(int, int)) { diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xc_evtchn.c --- a/tools/libxc/xc_evtchn.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/libxc/xc_evtchn.c Fri Sep 12 14:47:40 2008 +0900 @@ -59,17 +59,8 @@ int xc_evtchn_reset(int xc_handle, return do_evtchn_op(xc_handle, EVTCHNOP_reset, &arg, sizeof(arg), 0); } -int xc_evtchn_status(int xc_handle, - uint32_t dom, - uint32_t port) +int xc_evtchn_status(int xc_handle, xc_evtchn_status_t *status) { - int rc; - struct evtchn_status arg = { .dom = (domid_t)dom, - .port = (evtchn_port_t)port }; - - rc = do_evtchn_op(xc_handle, EVTCHNOP_status, &arg, sizeof(arg), 1); - if ( rc == 0 ) - rc = arg.status; - - return rc; + return do_evtchn_op(xc_handle, EVTCHNOP_status, status, + sizeof(*status), 1); } diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xc_private.c --- a/tools/libxc/xc_private.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/libxc/xc_private.c Fri Sep 12 14:47:40 2008 +0900 @@ -307,6 +307,13 @@ int xc_memory_op(int xc_handle, goto out1; } break; + case XENMEM_remove_from_physmap: + if ( lock_pages(arg, sizeof(struct xen_remove_from_physmap)) ) + { + PERROR("Could not lock"); + goto out1; + } + break; case XENMEM_current_reservation: case XENMEM_maximum_reservation: case XENMEM_maximum_gpfn: @@ -339,6 +346,9 @@ int xc_memory_op(int xc_handle, break; case XENMEM_add_to_physmap: unlock_pages(arg, sizeof(struct xen_add_to_physmap)); + break; + case XENMEM_remove_from_physmap: + unlock_pages(arg, sizeof(struct xen_remove_from_physmap)); break; case XENMEM_current_reservation: case XENMEM_maximum_reservation: diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/libxc/xenctrl.h Fri Sep 12 14:47:40 2008 +0900 @@ -502,9 +502,9 @@ xc_evtchn_alloc_unbound(int xc_handle, int xc_evtchn_reset(int xc_handle, uint32_t dom); -int xc_evtchn_status(int xc_handle, - uint32_t dom, - uint32_t port); + +typedef struct evtchn_status xc_evtchn_status_t; +int xc_evtchn_status(int xc_handle, xc_evtchn_status_t *status); /* * Return a handle to the event channel driver, or -1 on failure, in which case diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/libxc/xenguest.h --- a/tools/libxc/xenguest.h Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/libxc/xenguest.h Fri Sep 12 14:47:40 2008 +0900 @@ -25,7 +25,7 @@ */ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters, uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */, - int (*suspend)(int domid), int hvm, + int (*suspend)(void), int hvm, void *(*init_qemu_maps)(int, unsigned), /* HVM only */ void (*qemu_flip_buffer)(int, int)); /* HVM only */ diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/Makefile --- 
a/tools/python/Makefile Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/Makefile Fri Sep 12 14:47:40 2008 +0900 @@ -1,13 +1,5 @@ XEN_ROOT = ../.. XEN_ROOT = ../.. include $(XEN_ROOT)/tools/Rules.mk - -XEN_SECURITY_MODULE = dummy -ifeq ($(FLASK_ENABLE),y) -XEN_SECURITY_MODULE = flask -endif -ifeq ($(ACM_SECURITY),y) -XEN_SECURITY_MODULE = acm -endif .PHONY: all all: build @@ -23,8 +15,8 @@ NLSDIR = /usr/share/locale NLSDIR = /usr/share/locale .PHONY: build buildpy -buildpy: xsm.py - CC="$(CC)" CFLAGS="$(CFLAGS)" XEN_SECURITY_MODULE="$(XEN_SECURITY_MODULE)" python setup.py build +buildpy: + CC="$(CC)" CFLAGS="$(CFLAGS)" python setup.py build build: buildpy refresh-pot refresh-po $(CATALOGS) @@ -61,18 +53,6 @@ refresh-po: $(POTFILE) %.mo: %.po $(MSGFMT) -c -o $@ $< -xsm.py: - @(set -e; \ - echo "XEN_SECURITY_MODULE = \""$(XEN_SECURITY_MODULE)"\""; \ - echo "from xsm_core import *"; \ - echo ""; \ - echo "import xen.util.xsm."$(XEN_SECURITY_MODULE)"."$(XEN_SECURITY_MODULE)" as xsm_module"; \ - echo ""; \ - echo "xsm_init(xsm_module)"; \ - echo "from xen.util.xsm."$(XEN_SECURITY_MODULE)"."$(XEN_SECURITY_MODULE)" import *"; \ - echo "del xsm_module"; \ - echo "") >xen/util/xsm/$@ - .PHONY: install ifndef XEN_PYTHON_NATIVE_INSTALL install: LIBPATH=$(shell PYTHONPATH=xen/util python -c "import auxbin; print auxbin.libpath()") @@ -104,4 +84,4 @@ test: .PHONY: clean clean: - rm -rf build *.pyc *.pyo *.o *.a *~ $(CATALOGS) xen/util/xsm/xsm.py xen/util/auxbin.pyc + rm -rf build *.pyc *.pyo *.o *.a *~ $(CATALOGS) xen/util/auxbin.pyc diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/util/xsconstants.py --- a/tools/python/xen/util/xsconstants.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/util/xsconstants.py Fri Sep 12 14:47:40 2008 +0900 @@ -20,8 +20,10 @@ XS_INST_BOOT = (1 << 0) XS_INST_BOOT = (1 << 0) XS_INST_LOAD = (1 << 1) -XS_POLICY_NONE = 0 XS_POLICY_ACM = (1 << 0) +XS_POLICY_FLASK = (1 << 1) +XS_POLICY_DUMMY = (1 << 2) +XS_POLICY_USE = 0 # Some internal variables used by the Xen-API ACM_LABEL_VM = (1 << 0) @@ -107,6 +109,6 @@ ACM_POLICY_ID = 'ACM' INVALID_POLICY_PREFIX = 'INV_' -INVALID_SSIDREF = 0xFFFFFFFF +INVALID_SSIDREF = 0xFFFFFFFFL XS_INACCESSIBLE_LABEL = '__INACCESSIBLE__' diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/util/xsm/flask/flask.py --- a/tools/python/xen/util/xsm/flask/flask.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/util/xsm/flask/flask.py Fri Sep 12 14:47:40 2008 +0900 @@ -1,5 +1,6 @@ import sys import sys from xen.lowlevel import flask +from xen.util import xsconstants from xen.xend import sxp #Functions exported through XML-RPC @@ -12,7 +13,7 @@ def err(msg): raise XSMError(msg) def on(): - return 0 #xsconstants.XS_POLICY_FLASK + return xsconstants.XS_POLICY_FLASK def ssidref2label(ssidref): try: @@ -37,8 +38,9 @@ def set_security_label(policy, label): return label def ssidref2security_label(ssidref): - return ssidref2label(ssidref) + label = ssidref2label(ssidref) + return label def get_security_label(self, xspol=None): - label = self.info.get('security_label', '') + label = self.info['security_label'] return label diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/util/xsm/xsm.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/python/xen/util/xsm/xsm.py Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,20 @@ +import sys +import string +from xen.xend import XendOptions +from xen.util import xsconstants +from xsm_core import xsm_init + +xoptions = XendOptions.instance() +xsm_module_name = xoptions.get_xsm_module_name() + 
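+# For example, a configured name of 'flask' selects xsconstants.XS_POLICY_FLASK +# and imports xen.util.xsm.flask.flask as the backing module below.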
+xsconstants.XS_POLICY_USE = eval("xsconstants.XS_POLICY_" + + string.upper(xsm_module_name)) + +xsm_module_path = "xen.util.xsm." + xsm_module_name + "." + xsm_module_name +xsm_module = __import__(xsm_module_path, globals(), locals(), ['*']) + +xsm_init(xsm_module) + +for op in dir(xsm_module): + if not hasattr(sys.modules[__name__], op): + setattr(sys.modules[__name__], op, getattr(xsm_module, op, None)) diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/XendConfig.py --- a/tools/python/xen/xend/XendConfig.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/xend/XendConfig.py Fri Sep 12 14:47:40 2008 +0900 @@ -729,7 +729,7 @@ class XendConfig(dict): self.parse_cpuid(cfg, 'cpuid_check') import xen.util.xsm.xsm as security - if security.on() == xsconstants.XS_POLICY_ACM: + if security.on() == xsconstants.XS_POLICY_USE: from xen.util.acmpolicy import ACM_LABEL_UNLABELED if not 'security' in cfg and sxp.child_value(sxp_cfg, 'security'): cfg['security'] = sxp.child_value(sxp_cfg, 'security') diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/xend/XendDomainInfo.py Fri Sep 12 14:47:40 2008 +0900 @@ -2069,7 +2069,7 @@ class XendDomainInfo: balloon.free(2*1024) # 2MB should be plenty ssidref = 0 - if security.on() == xsconstants.XS_POLICY_ACM: + if security.on() == xsconstants.XS_POLICY_USE: ssidref = security.calc_dom_ssidref_from_info(self.info) if security.has_authorization(ssidref) == False: raise VmError("VM is not authorized to run.") @@ -2855,10 +2855,6 @@ class XendDomainInfo: info["maxmem_kb"] = XendNode.instance() \ .physinfo_dict()['total_memory'] * 1024 - #ssidref field not used any longer - if 'ssidref' in info: - info.pop('ssidref') - # make sure state is reset for info # TODO: we should eventually get rid of old_dom_states diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/XendOptions.py --- a/tools/python/xen/xend/XendOptions.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/xend/XendOptions.py Fri Sep 12 14:47:40 2008 +0900 @@ -131,6 +131,9 @@ class XendOptions: """Default script to configure a backend network interface""" vif_script = osdep.vif_script + + """Default Xen Security Module""" + xsm_module_default = 'dummy' """Default rotation count of qemu-dm log file.""" qemu_dm_logrotate_count = 10 @@ -427,6 +430,11 @@ class XendOptionsFile(XendOptions): return self.get_config_value('xen-api-server', self.xen_api_server_default) + def get_xsm_module_name(self): + """Get the Xen Security Module name. 
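+ + The name is read from the 'xsm_module_name' entry in the xend + configuration file; xsm_module_default ('dummy') is used when the + entry is absent.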
+ """ + return self.get_config_string('xsm_module_name', self.xsm_module_default) + if os.uname()[0] == 'SunOS': class XendOptionsSMF(XendOptions): diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/server/blkif.py --- a/tools/python/xen/xend/server/blkif.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/xend/server/blkif.py Fri Sep 12 14:47:40 2008 +0900 @@ -78,7 +78,7 @@ class BlkifController(DevController): if uuid: back['uuid'] = uuid - if security.on() == xsconstants.XS_POLICY_ACM: + if security.on() == xsconstants.XS_POLICY_USE: self.do_access_control(config, uname) (device_path, devid) = blkif.blkdev_name_to_number(dev) diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/server/netif.py --- a/tools/python/xen/xend/server/netif.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/xend/server/netif.py Fri Sep 12 14:47:40 2008 +0900 @@ -156,7 +156,7 @@ class NetifController(DevController): front = { 'handle' : "%i" % devid, 'mac' : mac } - if security.on() == xsconstants.XS_POLICY_ACM: + if security.on() == xsconstants.XS_POLICY_USE: self.do_access_control(config) return (devid, back, front) diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xend/server/pciif.py --- a/tools/python/xen/xend/server/pciif.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/xend/server/pciif.py Fri Sep 12 14:47:40 2008 +0900 @@ -286,7 +286,7 @@ class PciController(DevController): )%(dev.name)) if dev.has_non_page_aligned_bar and arch.type != "ia64": - raise VmError("pci: %: non-page-aligned MMIO BAR found." % dev.name) + raise VmError("pci: %s: non-page-aligned MMIO BAR found." % dev.name) self.CheckSiblingDevices(fe_domid, dev) diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/xm/create.py Fri Sep 12 14:47:40 2008 +0900 @@ -566,11 +566,11 @@ gopts.var('hap', val='HAP', use="""Hap status (0=hap is disabled; 1=hap is enabled.""") -gopts.var('cpuid', val="IN[,SIN]:eax=EAX,ebx=EBX,exc=ECX,edx=EDX", +gopts.var('cpuid', val="IN[,SIN]:eax=EAX,ebx=EBX,ecx=ECX,edx=EDX", fn=append_value, default=[], use="""Cpuid description.""") -gopts.var('cpuid_check', val="IN[,SIN]:eax=EAX,ebx=EBX,exc=ECX,edx=EDX", +gopts.var('cpuid_check', val="IN[,SIN]:eax=EAX,ebx=EBX,ecx=ECX,edx=EDX", fn=append_value, default=[], use="""Cpuid check description.""") @@ -971,7 +971,7 @@ def preprocess_cpuid(vals, attr_name): "of the register %s for input %s\n" % (res['reg'], input) ) cpuid[input][res['reg']] = res['val'] # new register - setattr(vals, attr_name, cpuid) + setattr(vals, attr_name, cpuid) def preprocess_pci(vals): if not vals.pci: return diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/python/xen/xm/main.py Fri Sep 12 14:47:40 2008 +0900 @@ -1812,7 +1812,7 @@ def domain_name_to_domid(domain_name): else: dom = server.xend.domain(domain_name) domid = int(sxp.child_value(dom, 'domid', '-1')) - return domid + return int(domid) def xm_vncviewer(args): autopass = False; diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xcutils/lsevtchn.c --- a/tools/xcutils/lsevtchn.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/xcutils/lsevtchn.c Fri Sep 12 14:47:40 2008 +0900 @@ -8,49 +8,55 @@ #include <xenctrl.h> #include <xenguest.h> -int -main(int argc, char **argv) +int main(int argc, char **argv) { - int xc_fd; - int domid = 0, port = 0, status; - const char *msg; + int xc_fd, domid, port, rc; + xc_evtchn_status_t status; - 
if ( argc > 1 ) - domid = strtol(argv[1], NULL, 10); + domid = (argc > 1) ? strtol(argv[1], NULL, 10) : 0; xc_fd = xc_interface_open(); if ( xc_fd < 0 ) errx(1, "failed to open control interface"); - while ( (status = xc_evtchn_status(xc_fd, domid, port)) >= 0 ) + for ( port = 0; ; port++ ) { - switch ( status ) + status.dom = domid; + status.port = port; + rc = xc_evtchn_status(xc_fd, &status); + if ( rc < 0 ) + break; + + if ( status.status == EVTCHNSTAT_closed ) + continue; + + printf("%4d: VCPU %u: ", port, status.vcpu); + + switch ( status.status ) { - case EVTCHNSTAT_closed: - msg = "Channel is not in use."; - break; case EVTCHNSTAT_unbound: - msg = "Channel is waiting interdom connection."; + printf("Interdomain (Waiting connection) - Remote Domain %u", + status.u.unbound.dom); break; case EVTCHNSTAT_interdomain: - msg = "Channel is connected to remote domain."; + printf("Interdomain (Connected) - Remote Domain %u, Port %u", + status.u.interdomain.dom, status.u.interdomain.port); break; case EVTCHNSTAT_pirq: - msg = "Channel is bound to a phys IRQ line."; + printf("Physical IRQ %u", status.u.pirq); break; case EVTCHNSTAT_virq: - msg = "Channel is bound to a virtual IRQ line."; + printf("Virtual IRQ %u", status.u.virq); break; case EVTCHNSTAT_ipi: - msg = "Channel is bound to a virtual IPI line."; + printf("IPI"); break; default: - msg = "Unknown."; + printf("Unknown"); break; + } - } - printf("%03d: %d: %s\n", port, status, msg); - port++; + printf("\n"); } xc_interface_close(xc_fd); diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xcutils/xc_save.c --- a/tools/xcutils/xc_save.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/xcutils/xc_save.c Fri Sep 12 14:47:40 2008 +0900 @@ -32,7 +32,7 @@ static struct suspendinfo { * Issue a suspend request through stdout, and receive the acknowledgement * from stdin. This is handled by XendCheckpoint in the Python layer. 
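 * * In outline: a suspend request line is written to stdout and the reply * line "done\n" is expected back on stdin before success is reported.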
*/ -static int compat_suspend(int domid) +static int compat_suspend(void) { char ans[30]; @@ -43,16 +43,35 @@ static int compat_suspend(int domid) !strncmp(ans, "done\n", 5)); } -static int suspend_evtchn_release(int xc, int domid) +static int suspend_evtchn_release(void) { if (si.suspend_evtchn >= 0) { - xc_evtchn_unbind(si.xce, si.suspend_evtchn); - si.suspend_evtchn = -1; + xc_evtchn_unbind(si.xce, si.suspend_evtchn); + si.suspend_evtchn = -1; } if (si.xce >= 0) { - xc_evtchn_close(si.xce); - si.xce = -1; - } + xc_evtchn_close(si.xce); + si.xce = -1; + } + + return 0; +} + +static int await_suspend(void) +{ + int rc; + + do { + rc = xc_evtchn_pending(si.xce); + if (rc < 0) { + warnx("error polling suspend notification channel: %d", rc); + return -1; + } + } while (rc != si.suspend_evtchn); + + /* harmless for one-off suspend */ + if (xc_evtchn_unmask(si.xce, si.suspend_evtchn) < 0) + warnx("failed to unmask suspend notification channel: %d", rc); return 0; } @@ -71,16 +90,16 @@ static int suspend_evtchn_init(int xc, i xs = xs_daemon_open(); if (!xs) { - errx(1, "failed to get xenstore handle"); - return -1; + warnx("failed to get xenstore handle"); + return -1; } sprintf(path, "/local/domain/%d/device/suspend/event-channel", domid); portstr = xs_read(xs, XBT_NULL, path, &plen); xs_daemon_close(xs); if (!portstr || !plen) { - warnx("could not read suspend event channel"); - return -1; + warnx("could not read suspend event channel"); + return -1; } port = atoi(portstr); @@ -88,27 +107,29 @@ static int suspend_evtchn_init(int xc, i si.xce = xc_evtchn_open(); if (si.xce < 0) { - errx(1, "failed to open event channel handle"); - goto cleanup; + warnx("failed to open event channel handle"); + goto cleanup; } si.suspend_evtchn = xc_evtchn_bind_interdomain(si.xce, domid, port); if (si.suspend_evtchn < 0) { - errx(1, "failed to bind suspend event channel: %d", - si.suspend_evtchn); - goto cleanup; + warnx("failed to bind suspend event channel: %d", si.suspend_evtchn); + goto cleanup; } rc = xc_domain_subscribe_for_suspend(xc, domid, port); if (rc < 0) { - errx(1, "failed to subscribe to domain: %d", rc); - goto cleanup; - } + warnx("failed to subscribe to domain: %d", rc); + goto cleanup; + } + + /* event channel is pending immediately after binding */ + await_suspend(); return 0; cleanup: - suspend_evtchn_release(xc, domid); + suspend_evtchn_release(); return -1; } @@ -116,29 +137,20 @@ static int suspend_evtchn_init(int xc, i /** * Issue a suspend request to a dedicated event channel in the guest, and * receive the acknowledgement from the subscribe event channel. 
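 * * Concretely: xc_evtchn_notify() kicks si.suspend_evtchn, and * await_suspend() then polls xc_evtchn_pending() until that port fires.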
*/ -static int evtchn_suspend(int domid) -{ - int xcefd; +static int evtchn_suspend(void) +{ int rc; rc = xc_evtchn_notify(si.xce, si.suspend_evtchn); if (rc < 0) { - errx(1, "failed to notify suspend request channel: %d", rc); - return 0; - } - - xcefd = xc_evtchn_fd(si.xce); - do { - rc = xc_evtchn_pending(si.xce); - if (rc < 0) { - errx(1, "error polling suspend notification channel: %d", rc); - return 0; - } - } while (rc != si.suspend_evtchn); - - /* harmless for one-off suspend */ - if (xc_evtchn_unmask(si.xce, si.suspend_evtchn) < 0) - errx(1, "failed to unmask suspend notification channel: %d", rc); + warnx("failed to notify suspend request channel: %d", rc); + return 0; + } + + if (await_suspend() < 0) { + warnx("suspend failed"); + return 0; + } /* notify xend that it can do device migration */ printf("suspended\n"); @@ -147,12 +159,12 @@ static int evtchn_suspend(int domid) return 1; } -static int suspend(int domid) +static int suspend(void) { if (si.suspend_evtchn >= 0) - return evtchn_suspend(domid); - - return compat_suspend(domid); + return evtchn_suspend(); + + return compat_suspend(); } /* For HVM guests, there are two sources of dirty pages: the Xen shadow @@ -195,11 +207,9 @@ static void qemu_flip_buffer(int domid, /* Tell qemu that we want it to start writing log-dirty bits to the * other buffer */ - if (!xs_write(xs, XBT_NULL, qemu_next_active_path, &digit, 1)) { + if (!xs_write(xs, XBT_NULL, qemu_next_active_path, &digit, 1)) errx(1, "can't write next-active to store path (%s)\n", - qemu_next_active_path); - exit(1); - } + qemu_next_active_path); /* Wait a while for qemu to signal that it has switched to the new * active buffer */ @@ -208,10 +218,8 @@ static void qemu_flip_buffer(int domid, tv.tv_usec = 0; FD_ZERO(&fdset); FD_SET(xs_fileno(xs), &fdset); - if ((select(xs_fileno(xs) + 1, &fdset, NULL, NULL, &tv)) != 1) { + if ((select(xs_fileno(xs) + 1, &fdset, NULL, NULL, &tv)) != 1) errx(1, "timed out waiting for qemu to switch buffers\n"); - exit(1); - } watch = xs_read_watch(xs, &len); free(watch); @@ -221,7 +229,7 @@ static void qemu_flip_buffer(int domid, goto read_again; } -static void * init_qemu_maps(int domid, unsigned int bitmap_size) +static void *init_qemu_maps(int domid, unsigned int bitmap_size) { key_t key; char key_ascii[17] = {0,}; @@ -293,7 +301,7 @@ main(int argc, char **argv) int ret; if (argc != 6) - errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]); + errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]); xc_fd = xc_interface_open(); if (xc_fd < 0) @@ -305,13 +313,14 @@ main(int argc, char **argv) max_f = atoi(argv[4]); flags = atoi(argv[5]); - suspend_evtchn_init(xc_fd, domid); + if (suspend_evtchn_init(xc_fd, domid) < 0) + warnx("suspend event channel initialization failed, using slow path"); ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags, &suspend, !!(flags & XCFLAGS_HVM), &init_qemu_maps, &qemu_flip_buffer); - suspend_evtchn_release(xc_fd, domid); + suspend_evtchn_release(); xc_interface_close(xc_fd); diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xenstore/xs.c --- a/tools/xenstore/xs.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/xenstore/xs.c Fri Sep 12 14:47:40 2008 +0900 @@ -795,8 +795,11 @@ char *xs_get_domain_path(struct xs_handl bool xs_is_domain_introduced(struct xs_handle *h, unsigned int domid) { - return strcmp("F", - single_with_domid(h, XS_IS_DOMAIN_INTRODUCED, domid)); + char *domain = single_with_domid(h, XS_IS_DOMAIN_INTRODUCED, domid); + int rc = strcmp("F", domain); + + free(domain); + return rc; } /* 
Only useful for DEBUG versions */ diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xentrace/formats --- a/tools/xentrace/formats Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/xentrace/formats Fri Sep 12 14:47:40 2008 +0900 @@ -4,56 +4,69 @@ 0x0001f002 CPU%(cpu)d %(tsc)d (+%(relt 0x0001f002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) wrap_buffer 0x%(1)08x 0x0001f003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) cpu_change 0x%(1)08x -0x0002f001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_add_domain [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] -0x0002f002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_rem_domain [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] -0x0002f003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_sleep [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] -0x0002f004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_wake [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] -0x0002f005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) do_yield [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] -0x0002f006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) do_block [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] -0x0002f007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_shutdown [ domid = 0x%(1)08x, edomid = 0x%(2)08x, reason = 0x%(3)08x ] -0x0002f008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_ctl -0x0002f009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_adjdom [ domid = 0x%(1)08x ] -0x0002f00a CPU%(cpu)d %(tsc)d (+%(reltsc)8d) __enter_scheduler [ prev<domid:edomid> = 0x%(1)08x : 0x%(2)08x, next<domid:edomid> = 0x%(3)08x : 0x%(4)08x ] -0x0002f00B CPU%(cpu)d %(tsc)d (+%(reltsc)8d) s_timer_fn -0x0002f00c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) t_timer_fn -0x0002f00d CPU%(cpu)d %(tsc)d (+%(reltsc)8d) dom_timer_fn -0x0002f00e CPU%(cpu)d %(tsc)d (+%(reltsc)8d) switch_infprev [ old_domid = 0x%(1)08x, runtime = %(2)d ] -0x0002f00f CPU%(cpu)d %(tsc)d (+%(reltsc)8d) switch_infnext [ new_domid = 0x%(1)08x, time = %(2)d, r_time = %(3)d ] +0x00021011 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) running_to_runnable [ dom:vcpu = 0x%(1)08x ] +0x00021021 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) running_to_blocked [ dom:vcpu = 0x%(1)08x ] +0x00021031 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) running_to_offline [ dom:vcpu = 0x%(1)08x ] +0x00021101 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) runnable_to_running [ dom:vcpu = 0x%(1)08x ] +0x00021121 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) runnable_to_blocked [ dom:vcpu = 0x%(1)08x ] +0x00021131 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) runnable_to_offline [ dom:vcpu = 0x%(1)08x ] +0x00021201 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) blocked_to_running [ dom:vcpu = 0x%(1)08x ] +0x00021211 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) blocked_to_runnable [ dom:vcpu = 0x%(1)08x ] +0x00021231 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) blocked_to_offline [ dom:vcpu = 0x%(1)08x ] +0x00021301 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) offline_to_running [ dom:vcpu = 0x%(1)08x ] +0x00021311 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) offline_to_runnable [ dom:vcpu = 0x%(1)08x ] +0x00021321 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) offline_to_blocked [ dom:vcpu = 0x%(1)08x ] -0x00081001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMENTRY [ dom:vcpu = 0x%(1)08x ] -0x00081002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMEXIT [ dom:vcpu = 0x%(1)08x, exitcode = 0x%(2)08x, rIP = 0x%(3)08x ] -0x00081102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMEXIT [ dom:vcpu = 0x%(1)08x, exitcode = 0x%(2)08x, rIP = 0x%(3)016x ] -0x00082001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_XEN [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)08x ] -0x00082101 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_XEN [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)016x ] -0x00082002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_INJECT [ dom:vcpu = 0x%(1)08x, errorcode = 
0x%(2)02x, virt = 0x%(3)08x ] -0x00082102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_INJECT [ dom:vcpu = 0x%(1)08x, errorcode = 0x%(2)02x, virt = 0x%(3)016x ] -0x00082003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INJ_EXC [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x, errorcode = 0x%(3)04x ] -0x00082004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INJ_VIRQ [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x, fake = %(3)d ] -0x00082005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) REINJ_VIRQ [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x ] -0x00082006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_READ [ dom:vcpu = 0x%(1)08x, port = 0x%(2)04x, size = %(3)d ] -0x00082007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_WRITE [ dom:vcpu = 0x%(1)08x, port = 0x%(2)04x, size = %(3)d ] -0x00082008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_READ [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)08x ] -0x00082108 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_READ [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)016x ] -0x00082009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_WRITE [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)08x ] -0x00082109 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_WRITE [ dom:vcpu = 0x%(1)08x, CR# = %(2)d, value = 0x%(3)016x ] -0x0008200A CPU%(cpu)d %(tsc)d (+%(reltsc)8d) DR_READ [ dom:vcpu = 0x%(1)08x ] -0x0008200B CPU%(cpu)d %(tsc)d (+%(reltsc)8d) DR_WRITE [ dom:vcpu = 0x%(1)08x ] -0x0008200C CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MSR_READ [ dom:vcpu = 0x%(1)08x, MSR# = 0x%(2)08x, value = 0x%(3)016x ] -0x0008200D CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MSR_WRITE [ dom:vcpu = 0x%(1)08x, MSR# = 0x%(2)08x, value = 0x%(3)016x ] -0x0008200E CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CPUID [ dom:vcpu = 0x%(1)08x, func = 0x%(2)08x, eax = 0x%(3)08x, ebx = 0x%(4)08x, ecx=0x%(5)08x, edx = 0x%(6)08x ] -0x0008200F CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INTR [ dom:vcpu = 0x%(1)08x, vector = 0x%(2)02x ] -0x00082010 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) NMI [ dom:vcpu = 0x%(1)08x ] -0x00082011 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) SMI [ dom:vcpu = 0x%(1)08x ] -0x00082012 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMMCALL [ dom:vcpu = 0x%(1)08x, func = 0x%(2)08x ] -0x00082013 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) HLT [ dom:vcpu = 0x%(1)08x, intpending = %(2)d ] -0x00082014 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [ dom:vcpu = 0x%(1)08x, is invlpga? = %(2)d, virt = 0x%(3)08x ] -0x00082114 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [ dom:vcpu = 0x%(1)08x, is invlpga? 
= %(2)d, virt = 0x%(3)016x ] -0x00082015 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MCE [ dom:vcpu = 0x%(1)08x ] -0x00082016 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_ASSIST [ dom:vcpu = 0x%(1)08x, data = 0x%(2)04x ] -0x00082017 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MMIO_ASSIST [ dom:vcpu = 0x%(1)08x, data = 0x%(2)04x ] -0x00082018 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CLTS [ dom:vcpu = 0x%(1)08x ] -0x00082019 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ dom:vcpu = 0x%(1)08x, value = 0x%(2)08x ] -0x00082119 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ dom:vcpu = 0x%(1)08x, value = 0x%(2)016x ] +0x00028001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_add_domain [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] +0x00028002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_rem_domain [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] +0x00028003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_sleep [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] +0x00028004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_wake [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] +0x00028005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) do_yield [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] +0x00028006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) do_block [ domid = 0x%(1)08x, edomid = 0x%(2)08x ] +0x00028007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) domain_shutdown [ domid = 0x%(1)08x, edomid = 0x%(2)08x, reason = 0x%(3)08x ] +0x00028008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_ctl +0x00028009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) sched_adjdom [ domid = 0x%(1)08x ] +0x0002800a CPU%(cpu)d %(tsc)d (+%(reltsc)8d) __enter_scheduler [ prev<domid:edomid> = 0x%(1)08x : 0x%(2)08x, next<domid:edomid> = 0x%(3)08x : 0x%(4)08x ] +0x0002800b CPU%(cpu)d %(tsc)d (+%(reltsc)8d) s_timer_fn +0x0002800c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) t_timer_fn +0x0002800d CPU%(cpu)d %(tsc)d (+%(reltsc)8d) dom_timer_fn +0x0002800e CPU%(cpu)d %(tsc)d (+%(reltsc)8d) switch_infprev [ old_domid = 0x%(1)08x, runtime = %(2)d ] +0x0002800f CPU%(cpu)d %(tsc)d (+%(reltsc)8d) switch_infnext [ new_domid = 0x%(1)08x, time = %(2)d, r_time = %(3)d ] + +0x00081001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMENTRY +0x00081002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMEXIT [ exitcode = 0x%(1)08x, rIP = 0x%(2)08x ] +0x00081102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMEXIT [ exitcode = 0x%(1)08x, rIP = 0x%(2)016x ] +0x00082001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_XEN [ errorcode = 0x%(2)02x, virt = 0x%(1)08x ] +0x00082101 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_XEN [ errorcode = 0x%(2)02x, virt = 0x%(1)016x ] +0x00082002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_INJECT [ errorcode = 0x%(1)02x, virt = 0x%(2)08x ] +0x00082102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) PF_INJECT [ errorcode = 0x%(1)02x, virt = 0x%(2)016x ] +0x00082003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INJ_EXC [ vector = 0x%(1)02x, errorcode = 0x%(2)04x ] +0x00082004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INJ_VIRQ [ vector = 0x%(1)02x, fake = %(2)d ] +0x00082005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) REINJ_VIRQ [ vector = 0x%(1)02x ] +0x00082006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_READ [ port = 0x%(1)04x, size = %(2)d ] +0x00082007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_WRITE [ port = 0x%(1)04x, size = %(2)d ] +0x00082008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_READ [ CR# = %(1)d, value = 0x%(2)08x ] +0x00082108 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_READ [ CR# = %(1)d, value = 0x%(2)016x ] +0x00082009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_WRITE [ CR# = %(1)d, value = 0x%(2)08x ] +0x00082109 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CR_WRITE [ CR# = %(1)d, value = 0x%(2)016x ] +0x0008200A CPU%(cpu)d %(tsc)d (+%(reltsc)8d) DR_READ +0x0008200B CPU%(cpu)d %(tsc)d (+%(reltsc)8d) DR_WRITE 
+0x0008200C CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MSR_READ [ MSR# = 0x%(1)08x, value = 0x%(2)016x ] +0x0008200D CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MSR_WRITE [ MSR# = 0x%(1)08x, value = 0x%(2)016x ] +0x0008200E CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CPUID [ func = 0x%(1)08x, eax = 0x%(2)08x, ebx = 0x%(3)08x, ecx=0x%(4)08x, edx = 0x%(5)08x ] +0x0008200F CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INTR [ vector = 0x%(1)02x ] +0x00082010 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) NMI +0x00082011 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) SMI +0x00082012 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) VMMCALL [ func = 0x%(1)08x ] +0x00082013 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) HLT [ intpending = %(1)d ] +0x00082014 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [ is invlpga? = %(1)d, virt = 0x%(2)08x ] +0x00082114 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) INVLPG [ is invlpga? = %(1)d, virt = 0x%(2)016x ] +0x00082015 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MCE +0x00082016 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) IO_ASSIST [ data = 0x%(1)04x ] +0x00082017 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) MMIO_ASSIST [ data = 0x%(1)04x ] +0x00082018 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) CLTS +0x00082019 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ value = 0x%(1)08x ] +0x00082119 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) LMSW [ value = 0x%(1)016x ] 0x0010f001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) page_grant_map [ domid = %(1)d ] 0x0010f002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) page_grant_unmap [ domid = %(1)d ] @@ -65,3 +78,41 @@ 0x0020f103 CPU%(cpu)d %(tsc)d (+%(relt 0x0020f103 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) trap [ rip = 0x%(1)016x, trapnr:error = 0x%(2)08x ] 0x0020f004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) page_fault [ eip = 0x%(1)08x, addr = 0x%(2)08x, error = 0x%(3)08x ] 0x0020f104 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) page_fault [ rip = 0x%(1)16x, addr = 0x%(3)16x, error = 0x%(5)08x ] + +0x0020f006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) emulate_privop [ eip = 0x%(1)08x ] +0x0020f106 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) emulate_privop [ rip = 0x%(1)16x ] +0x0020f007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) emulate_4G [ eip = 0x%(1)08x ] +0x0020f107 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) emulate_4G [ rip = 0x%(1)16x ] +0x0020f00c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) ptwr_emulation_pae [ addr = 0x%(2)08x, eip = 0x%(1)08x, npte = 0x%(1)16x ] +0x0020f10c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) ptwr_emulation_pae [ addr = 0x%(2)16x, rip = 0x%(1)16x, npte = 0x%(1)16x ] + +0x0040f001 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_not_shadow [ gl1e = 0x%(1)16x, va = 0x%(2)08x, flags = 0x%(3)08x ] +0x0040f101 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_not_shadow [ gl1e = 0x%(1)16x, va = 0x%(2)16x, flags = 0x%(3)08x ] +0x0040f002 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fast_propagate [ va = 0x%(1)08x ] +0x0040f102 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fast_propagate [ va = 0x%(1)16x ] +0x0040f003 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fast_mmio [ va = 0x%(1)08x ] +0x0040f103 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fast_mmio [ va = 0x%(1)16x ] +0x0040f004 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_false_fast_path [ va = 0x%(1)08x ] +0x0040f104 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_false_fast_path [ va = 0x%(1)16x ] +0x0040f005 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_mmio [ va = 0x%(1)08x ] +0x0040f105 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_mmio [ va = 0x%(1)16x ] +0x0040f006 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fixup [ gl1e = 0x%(1)08x, va = 0x%(2)08x, flags = 0x%(3)08x ] +0x0040f106 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_fixup [ gl1e = 0x%(1)16x, va = 0x%(2)16x, flags = 0x%(3)08x ] +0x0040f007 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_domf_dying [ 
va = 0x%(1)08x ] +0x0040f107 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_domf_dying [ va = 0x%(1)16x ] +0x0040f008 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate [ gl1e = 0x%(1)08x, write_val = 0x%(2)08x, va = 0x%(3)08x, flags = 0x%(4)08x, emulation_count = 0x%(5)08x] +0x0040f108 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate [ gl1e = 0x%(1)16x, write_val = 0x%(2)16x, va = 0x%(3)16x, flags = 0x%(4)08x, emulation_count = 0x%(5)08x] +0x0040f009 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_user [ va = 0x%(1)08x, gfn = 0x%(2)08x ] +0x0040f109 CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_user [ va = 0x%(1)16x, gfn = 0x%(2)16x ] +0x0040f00a CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_evtinj [ va = 0x%(1)08x, gfn = 0x%(2)08x ] +0x0040f10a CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_evtinj [ va = 0x%(1)16x, gfn = 0x%(2)16x ] +0x0040f00b CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_unhandled [ va = 0x%(1)08x, gfn = 0x%(2)08x ] +0x0040f10b CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_unshadow_unhandled [ va = 0x%(1)16x, gfn = 0x%(2)16x ] +0x0040f00c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_wrmap_bf [ gfn = 0x%(1)08x ] +0x0040f10c CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_wrmap_bf [ gfn = 0x%(1)16x ] +0x0040f00d CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_prealloc_unpin [ gfn = 0x%(1)08x ] +0x0040f10d CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_prealloc_unpin [ gfn = 0x%(1)16x ] +0x0040f00e CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_resync_full [ gfn = 0x%(1)08x ] +0x0040f10e CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_resync_full [ gfn = 0x%(1)16x ] +0x0040f00f CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_resync_only [ gfn = 0x%(1)08x ] +0x0040f10f CPU%(cpu)d %(tsc)d (+%(reltsc)8d) shadow_emulate_resync_only [ gfn = 0x%(1)16x ] diff -r 4ddd63b4be9b -r ec8eaab557d8 tools/xentrace/xentrace.c --- a/tools/xentrace/xentrace.c Fri Sep 12 14:32:45 2008 +0900 +++ b/tools/xentrace/xentrace.c Fri Sep 12 14:47:40 2008 +0900 @@ -56,6 +56,7 @@ typedef struct settings_st { unsigned long tbuf_size; unsigned long disk_rsvd; unsigned long timeout; + unsigned long memory_buffer; uint8_t discard:1, disable_tracing:1; } settings_t; @@ -67,10 +68,243 @@ static int xc_handle = -1; static int xc_handle = -1; static int event_fd = -1; static int virq_port = -1; +static int outfd = 1; static void close_handler(int signal) { interrupted = 1; +} + +static struct { + char * buf; + unsigned long prod, cons, size; + unsigned long pending_size, pending_prod; +} membuf = { 0 }; + +#define MEMBUF_INDEX_RESET_THRESHOLD (1<<29) + +/* FIXME -- make a power of 2 so we can mask instead. 
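+ * For instance, once membuf.size is a power of two, the modulo in + * MEMBUF_POINTER() below could become + * (membuf.buf + ((_i) & (membuf.size - 1))), trading a division for an AND.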
 */
+#define MEMBUF_POINTER(_i) (membuf.buf + ((_i) % membuf.size))
+#define MEMBUF_CONS_INCREMENT(_n) \
+    do { \
+        membuf.cons += (_n); \
+    } while(0)
+#define MEMBUF_PROD_SET(_x) \
+    do { \
+        if ( (_x) < membuf.prod ) { \
+            fprintf(stderr, "%s: INTERNAL_ERROR: prod %lu, trying to set to %lu!\n", \
+                    __func__, membuf.prod, (unsigned long)(_x)); \
+            exit(1); \
+        } \
+        membuf.prod = (_x); \
+        if ( (_x) > MEMBUF_INDEX_RESET_THRESHOLD ) \
+        { \
+            membuf.prod %= membuf.size; \
+            membuf.cons %= membuf.size; \
+            if( membuf.prod < membuf.cons ) \
+                membuf.prod += membuf.size; \
+        } \
+    } while(0)
+
+struct cpu_change_record {
+    uint32_t header;
+    struct {
+        int cpu;
+        unsigned window_size;
+    } data;
+};
+
+#define CPU_CHANGE_HEADER \
+    (TRC_TRACE_CPU_CHANGE \
+     | (((sizeof(struct cpu_change_record)/sizeof(uint32_t)) - 1) \
+        << TRACE_EXTRA_SHIFT) )
+
+void membuf_alloc(unsigned long size)
+{
+    membuf.buf = malloc(size);
+
+    if(!membuf.buf)
+    {
+        fprintf(stderr, "%s: Couldn't malloc %lu bytes!\n",
+                __func__, size);
+        exit(1);
+    }
+
+    membuf.prod = membuf.cons = 0;
+    membuf.size = size;
+}
+
+/*
+ * Reserve a new window in the buffer.  Move the 'consumer' forward size
+ * bytes, re-adjusting the cpu window sizes as necessary, and insert a
+ * cpu_change record.
+ */
+void membuf_reserve_window(unsigned cpu, unsigned long window_size)
+{
+    struct cpu_change_record *rec;
+    long need_to_consume, free, freed;
+
+    if ( membuf.pending_size > 0 )
+    {
+        fprintf(stderr, "%s: INTERNAL_ERROR: pending_size %lu\n",
+                __func__, membuf.pending_size);
+        exit(1);
+    }
+
+    need_to_consume = window_size + sizeof(*rec);
+
+    if ( window_size > membuf.size )
+    {
+        fprintf(stderr, "%s: reserve size %lu larger than buffer size %lu!\n",
+                __func__, window_size, membuf.size);
+        exit(1);
+    }
+
+    /* Subtract free space already in buffer. */
+    free = membuf.size - (membuf.prod - membuf.cons);
+    if( need_to_consume < free)
+        goto start_window;
+
+    need_to_consume -= free;
+
+    /*
+     * "Free" up full windows until we have enough for this window.
+     * It's a bit wasteful to throw away partial buffers, but the only
+     * other option is to scan through the buffer headers.  Since the
+     * common case is that it's going to be thrown away next anyway, I
+     * think minimizing the overall impact is more important.
+     */
+    do {
+        rec = (struct cpu_change_record *)MEMBUF_POINTER(membuf.cons);
+        if( rec->header != CPU_CHANGE_HEADER )
+        {
+            fprintf(stderr, "%s: INTERNAL ERROR: no cpu_change record at consumer!\n",
+                    __func__);
+            exit(EXIT_FAILURE);
+        }
+
+        freed = sizeof(*rec) + rec->data.window_size;
+
+        if ( need_to_consume > 0 )
+        {
+            MEMBUF_CONS_INCREMENT(freed);
+            need_to_consume -= freed;
+        }
+    } while( need_to_consume > 0 );
+
+start_window:
+    /*
+     * Start writing "pending" data.  Update prod once all this data is
+     * written.
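
The reserve/discard policy above boils down to a length-prefixed ring with monotonic producer/consumer counters: indices only grow, the pointer is taken modulo the buffer size, and whole old windows are dropped from the consumer end to make room. A minimal standalone sketch of the same idea; all names are invented for the illustration, and a plain byte string stands in for real trace windows:

    /* ring_demo.c -- toy rendition of xentrace's reserve/discard ring.
     * Illustrative only; the record layout is invented for the demo. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static struct {
        char *buf;
        unsigned long prod, cons, size;  /* monotonic; index = value % size */
    } ring;

    static char *ring_ptr(unsigned long i) { return ring.buf + (i % ring.size); }

    /* Copy bytes in one at a time so the modulo arithmetic handles the wrap. */
    static void ring_put(const void *data, unsigned long len)
    {
        const char *p = data;
        for ( unsigned long i = 0; i < len; i++ )
            *ring_ptr(ring.prod + i) = p[i];
        ring.prod += len;
    }

    /* Make room for a length-prefixed record by discarding whole old
     * records from the consumer end, the same policy as
     * membuf_reserve_window() above. */
    static void ring_reserve(unsigned long payload)
    {
        unsigned long need = sizeof(unsigned long) + payload;
        while ( ring.size - (ring.prod - ring.cons) < need )
        {
            unsigned long len;
            for ( unsigned long i = 0; i < sizeof(len); i++ )
                ((char *)&len)[i] = *ring_ptr(ring.cons + i);
            ring.cons += sizeof(len) + len;   /* drop one whole record */
        }
    }

    int main(void)
    {
        ring.size = 64;
        ring.buf = malloc(ring.size);
        if ( !ring.buf )
            return 1;

        for ( int n = 0; n < 100; n++ )   /* overwrite the buffer many times */
        {
            char rec[24];
            unsigned long len = sizeof(rec);
            snprintf(rec, sizeof(rec), "record %03d", n);
            ring_reserve(len);
            ring_put(&len, sizeof(len));
            ring_put(rec, len);
        }
        /* Only the newest records survive, oldest first from ring.cons. */
        printf("bytes retained: %lu\n", ring.prod - ring.cons);
        return 0;
    }

In the patch this ring backs the new -M/--memory-buffer option (invoked with e.g. something like "xentrace -M 32M out.trace"); membuf_write() below handles the wrap with split memcpy calls instead of the byte loop, and membuf_dump() flushes the ring to the output file when xentrace exits.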
+ */ + membuf.pending_prod = membuf.prod; + membuf.pending_size = window_size; + + rec = (struct cpu_change_record *)MEMBUF_POINTER(membuf.pending_prod); + + rec->header = CPU_CHANGE_HEADER; + rec->data.cpu = cpu; + rec->data.window_size = window_size; + + membuf.pending_prod += sizeof(*rec); +} + +void membuf_write(void *start, unsigned long size) { + char * p; + unsigned long wsize; + + if( (membuf.size - (membuf.prod - membuf.cons)) < size ) + { + fprintf(stderr, "%s: INTERNAL ERROR: need %lu bytes, only have %lu!\n", + __func__, size, membuf.prod - membuf.cons); + exit(1); + } + + if( size > membuf.pending_size ) + { + fprintf(stderr, "%s: INTERNAL ERROR: size %lu, pending %lu!\n", + __func__, size, membuf.pending_size); + exit(1); + } + + wsize = size; + p = MEMBUF_POINTER(membuf.pending_prod); + + /* If the buffer overlaps the "wrap", do an extra write */ + if ( p + size > membuf.buf + membuf.size ) + { + int usize = ( membuf.buf + membuf.size ) - p; + + memcpy(p, start, usize); + + start += usize; + wsize -= usize; + p = membuf.buf; + } + + memcpy(p, start, wsize); + + membuf.pending_prod += size; + membuf.pending_size -= size; + + if ( membuf.pending_size == 0 ) + { + MEMBUF_PROD_SET(membuf.pending_prod); + } +} + +void membuf_dump(void) { + /* Dump circular memory buffer */ + int cons, prod, wsize, written; + char * wstart; + + fprintf(stderr, "Dumping memory buffer.\n"); + + cons = membuf.cons % membuf.size; + prod = membuf.prod % membuf.size; + + if(prod > cons) + { + /* Write in one go */ + wstart = membuf.buf + cons; + wsize = prod - cons; + + written = write(outfd, wstart, wsize); + if ( written != wsize ) + goto fail; + } + else + { + /* Write in two pieces: cons->end, beginning->prod. */ + wstart = membuf.buf + cons; + wsize = membuf.size - cons; + + written = write(outfd, wstart, wsize); + if ( written != wsize ) + { + fprintf(stderr, "Write failed! (size %d, returned %d)\n", + wsize, written); + goto fail; + } + + wstart = membuf.buf; + wsize = prod; + + written = write(outfd, wstart, wsize); + if ( written != wsize ) + { + fprintf(stderr, "Write failed! (size %d, returned %d)\n", + wsize, written); + goto fail; + } + } + + membuf.cons = membuf.prod = 0; + + return; +fail: + exit(1); + return; } /** @@ -85,20 +319,20 @@ static void close_handler(int signal) * of the buffer write. */ static void write_buffer(unsigned int cpu, unsigned char *start, int size, - int total_size, int outfd) + int total_size) { struct statvfs stat; size_t written = 0; - if ( opts.disk_rsvd != 0 ) + if ( opts.memory_buffer == 0 && opts.disk_rsvd != 0 ) { unsigned long long freespace; /* Check that filesystem has enough space. */ if ( fstatvfs (outfd, &stat) ) { - fprintf(stderr, "Statfs failed!\n"); - goto fail; + fprintf(stderr, "Statfs failed!\n"); + goto fail; } freespace = stat.f_frsize * (unsigned long long)stat.f_bfree; @@ -112,8 +346,8 @@ static void write_buffer(unsigned int cp if ( freespace <= opts.disk_rsvd ) { - fprintf(stderr, "Disk space limit reached (free space: %lluMB, limit: %luMB).\n", freespace, opts.disk_rsvd); - exit (EXIT_FAILURE); + fprintf(stderr, "Disk space limit reached (free space: %lluMB, limit: %luMB).\n", freespace, opts.disk_rsvd); + exit (EXIT_FAILURE); } } @@ -122,40 +356,46 @@ static void write_buffer(unsigned int cp * first write. 
*/ if ( total_size != 0 ) { - struct { - uint32_t header; - struct { - unsigned cpu; - unsigned byte_count; - } extra; - } rec; - - rec.header = TRC_TRACE_CPU_CHANGE - | ((sizeof(rec.extra)/sizeof(uint32_t)) << TRACE_EXTRA_SHIFT); - rec.extra.cpu = cpu; - rec.extra.byte_count = total_size; - - written = write(outfd, &rec, sizeof(rec)); - - if ( written != sizeof(rec) ) - { - fprintf(stderr, "Cannot write cpu change (write returned %zd)\n", - written); + if ( opts.memory_buffer ) + { + membuf_reserve_window(cpu, total_size); + } + else + { + struct cpu_change_record rec; + + rec.header = CPU_CHANGE_HEADER; + rec.data.cpu = cpu; + rec.data.window_size = total_size; + + written = write(outfd, &rec, sizeof(rec)); + if ( written != sizeof(rec) ) + { + fprintf(stderr, "Cannot write cpu change (write returned %zd)\n", + written); + goto fail; + } + } + } + + if ( opts.memory_buffer ) + { + membuf_write(start, size); + } + else + { + written = write(outfd, start, size); + if ( written != size ) + { + fprintf(stderr, "Write failed! (size %d, returned %zd)\n", + size, written); goto fail; } } - written = write(outfd, start, size); - if ( written != size ) - { - fprintf(stderr, "Write failed! (size %d, returned %zd)\n", - size, written); - goto fail; - } - return; - fail: +fail: PERROR("Failed to write trace data"); exit(EXIT_FAILURE); } @@ -394,7 +634,7 @@ static void wait_for_event_or_timeout(un * monitor_tbufs - monitor the contents of tbufs and output to a file * @logfile: the FILE * representing the file to log to */ -static int monitor_tbufs(int outfd) +static int monitor_tbufs(void) { int i; @@ -429,9 +669,9 @@ static int monitor_tbufs(int outfd) meta[i]->cons = meta[i]->prod; /* now, scan buffers for events */ - while ( !interrupted ) - { - for ( i = 0; (i < num) && !interrupted; i++ ) + while ( 1 ) + { + for ( i = 0; i < num; i++ ) { unsigned long start_offset, end_offset, window_size, cons, prod; @@ -463,8 +703,7 @@ static int monitor_tbufs(int outfd) /* If window does not wrap, write in one big chunk */ write_buffer(i, data[i]+start_offset, window_size, - window_size, - outfd); + window_size); } else { @@ -474,23 +713,28 @@ static int monitor_tbufs(int outfd) */ write_buffer(i, data[i] + start_offset, data_size - start_offset, - window_size, - outfd); + window_size); write_buffer(i, data[i], end_offset, - 0, - outfd); + 0); } xen_mb(); /* read buffer, then update cons. */ meta[i]->cons = prod; - } + + } + + if ( interrupted ) + break; wait_for_event_or_timeout(opts.poll_sleep); } - if(opts.disable_tracing) + if ( opts.disable_tracing ) disable_tbufs(); + + if ( opts.memory_buffer ) + membuf_dump(); /* cleanup */ free(meta); @@ -538,6 +782,8 @@ static void usage(void) " -T --time-interval=s Run xentrace for s seconds and quit.\n" \ " -?, --help Show this message\n" \ " -V, --version Print program version\n" \ +" -M, --memory-buffer=b Copy trace records to a circular memory buffer.\n" \ +" Dump to file on exit.\n" \ "\n" \ "This tool is used to capture trace buffer data from Xen. The\n" \ "data is output in a binary format, in the following order:\n" \ @@ -551,6 +797,53 @@ static void usage(void) printf("\nReport bugs to %s\n", program_bug_address); exit(EXIT_FAILURE); +} + +/* convert the argument string pointed to by arg to a long int representation, + * including suffixes such as 'M' and 'k'. 
 */
+#define MB (1024*1024)
+#define KB (1024)
+long sargtol(const char *restrict arg, int base)
+{
+    char *endp;
+    long val;
+
+    errno = 0;
+    val = strtol(arg, &endp, base);
+
+    if ( errno != 0 )
+    {
+        fprintf(stderr, "Invalid option argument: %s\n", arg);
+        fprintf(stderr, "Error: %s\n\n", strerror(errno));
+        usage();
+    }
+    else if (endp == arg)
+    {
+        goto invalid;
+    }
+
+    switch(*endp)
+    {
+    case '\0':
+        break;
+    case 'M':
+        val *= MB;
+        break;
+    case 'K':
+    case 'k':
+        val *= KB;
+        break;
+    default:
+        fprintf(stderr, "Unknown suffix %c\n", *endp);
+        exit(1);
+    }
+
+    return val;
+
+invalid:
+    fprintf(stderr, "Invalid option argument: %s\n\n", arg);
+    usage();
+    return 0;
 }
 
 /* convert the argument string pointed to by arg to a long int representation */
@@ -606,6 +899,7 @@ static void parse_args(int argc, char **
     { "trace-buf-size",     required_argument, 0, 'S' },
     { "reserve-disk-space", required_argument, 0, 'r' },
     { "time-interval",      required_argument, 0, 'T' },
+    { "memory-buffer",      required_argument, 0, 'M' },
     { "discard-buffers",    no_argument,       0, 'D' },
     { "dont-disable-tracing", no_argument,     0, 'x' },
     { "help",               no_argument,       0, '?' },
@@ -613,7 +907,7 @@
     { 0, 0, 0, 0 }
 };
 
-    while ( (option = getopt_long(argc, argv, "c:e:s:S:t:?V",
+    while ( (option = getopt_long(argc, argv, "t:s:c:e:S:r:T:M:Dx?V",
                                   long_options, NULL)) != -1)
     {
         switch ( option )
@@ -653,6 +947,10 @@
         case 'T':
            opts.timeout = argtol(optarg, 0);
+            break;
+
+        case 'M':
+            opts.memory_buffer = sargtol(optarg, 0);
            break;
 
        default:
@@ -674,7 +972,7 @@
 
 int main(int argc, char **argv)
 {
-    int outfd = 1, ret;
+    int ret;
     struct sigaction act;
 
     opts.outfile = 0;
@@ -719,6 +1017,9 @@ int main(int argc, char **argv)
         fprintf(stderr, "Cannot output to a TTY, specify a log file.\n");
         exit(EXIT_FAILURE);
     }
+
+    if ( opts.memory_buffer > 0 )
+        membuf_alloc(opts.memory_buffer);
 
     /* ensure that if we get a signal, we'll do cleanup, then exit */
     act.sa_handler = close_handler;
@@ -729,7 +1030,7 @@
     sigaction(SIGINT,  &act, NULL);
     sigaction(SIGALRM, &act, NULL);
 
-    ret = monitor_tbufs(outfd);
+    ret = monitor_tbufs();
 
     return ret;
 }
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/Makefile
--- a/xen/arch/x86/acpi/Makefile	Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/Makefile	Fri Sep 12 14:47:40 2008 +0900
@@ -1,5 +1,5 @@ subdir-y += cpufreq
 subdir-y += cpufreq
 obj-y += boot.o
-obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o
+obj-y += power.o suspend.o wakeup_prot.o cpu_idle.o cpuidle_menu.o
 obj-y += pmstat.o
diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpu_idle.c
--- a/xen/arch/x86/acpi/cpu_idle.c	Fri Sep 12 14:32:45 2008 +0900
+++ b/xen/arch/x86/acpi/cpu_idle.c	Fri Sep 12 14:47:40 2008 +0900
@@ -39,6 +39,7 @@
 #include <xen/smp.h>
 #include <xen/guest_access.h>
 #include <xen/keyhandler.h>
+#include <xen/cpuidle.h>
 #include <asm/cache.h>
 #include <asm/io.h>
 #include <asm/hpet.h>
@@ -49,12 +50,9 @@
 #define DEBUG_PM_CX
 
 #define US_TO_PM_TIMER_TICKS(t)     ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICKS_TO_US(t)     ((t * 1000) / (PM_TIMER_FREQUENCY / 1000))
 #define C2_OVERHEAD         4   /* 1us (3.579 ticks per us) */
 #define C3_OVERHEAD         4   /* 1us (3.579 ticks per us) */
-
-#define ACPI_PROCESSOR_MAX_POWER        8
-#define ACPI_PROCESSOR_MAX_C2_LATENCY   100
-#define ACPI_PROCESSOR_MAX_C3_LATENCY   1000
 
 static void (*lapic_timer_off)(void);
 static void (*lapic_timer_on)(void);
 
@@ -65,66 
+63,6 @@ static void (*pm_idle_save) (void) __rea static void (*pm_idle_save) (void) __read_mostly; unsigned int max_cstate __read_mostly = 2; integer_param("max_cstate", max_cstate); -/* - * bm_history -- bit-mask with a bit per jiffy of bus-master activity - * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms - * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms - * 100 HZ: 0x0000000F: 4 jiffies = 40ms - * reduce history for more aggressive entry into C3 - */ -unsigned int bm_history __read_mostly = - (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1)); -integer_param("bm_history", bm_history); - -struct acpi_processor_cx; - -struct acpi_processor_cx_policy -{ - u32 count; - struct acpi_processor_cx *state; - struct - { - u32 time; - u32 ticks; - u32 count; - u32 bm; - } threshold; -}; - -struct acpi_processor_cx -{ - u8 valid; - u8 type; - u32 address; - u8 space_id; - u32 latency; - u32 latency_ticks; - u32 power; - u32 usage; - u64 time; - struct acpi_processor_cx_policy promotion; - struct acpi_processor_cx_policy demotion; -}; - -struct acpi_processor_flags -{ - u8 bm_control:1; - u8 bm_check:1; - u8 has_cst:1; - u8 power_setup_done:1; - u8 bm_rld_set:1; -}; - -struct acpi_processor_power -{ - struct acpi_processor_flags flags; - struct acpi_processor_cx *state; - s_time_t bm_check_timestamp; - u32 default_state; - u32 bm_activity; - u32 count; - struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER]; -}; static struct acpi_processor_power processor_powers[NR_CPUS]; @@ -133,26 +71,21 @@ static void print_acpi_power(uint32_t cp uint32_t i; printk("==cpu%d==\n", cpu); - printk("active state:\t\tC%d\n", (power->state)?power->state->type:-1); + printk("active state:\t\tC%d\n", + (power->last_state) ? power->last_state->type : -1); printk("max_cstate:\t\tC%d\n", max_cstate); - printk("bus master activity:\t%08x\n", power->bm_activity); printk("states:\n"); for ( i = 1; i < power->count; i++ ) { - printk((power->states[i].type == power->state->type) ? " *" : " "); + if ( power->last_state && + power->states[i].type == power->last_state->type ) + printk(" *"); + else + printk(" "); printk("C%d:\t\t", i); printk("type[C%d] ", power->states[i].type); - if ( power->states[i].promotion.state ) - printk("promotion[C%d] ", power->states[i].promotion.state->type); - else - printk("promotion[--] "); - if ( power->states[i].demotion.state ) - printk("demotion[C%d] ", power->states[i].demotion.state->type); - else - printk("demotion[--] "); - printk("latency[%03d]\n ", power->states[i].latency); - printk("\t\t\t"); + printk("latency[%03d] ", power->states[i].latency); printk("usage[%08d] ", power->states[i].usage); printk("duration[%"PRId64"]\n", power->states[i].time); } @@ -180,48 +113,6 @@ static inline u32 ticks_elapsed(u32 t1, return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF); else return ((0xFFFFFFFF - t1) + t2); -} - -static void acpi_processor_power_activate(struct acpi_processor_power *power, - struct acpi_processor_cx *new) -{ - struct acpi_processor_cx *old; - - if ( !power || !new ) - return; - - old = power->state; - - if ( old ) - old->promotion.count = 0; - new->demotion.count = 0; - - /* Cleanup from old state. */ - if ( old ) - { - switch ( old->type ) - { - case ACPI_STATE_C3: - /* Disable bus master reload */ - if ( new->type != ACPI_STATE_C3 && power->flags.bm_check ) - acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0); - break; - } - } - - /* Prepare to use new state. 
*/ - switch ( new->type ) - { - case ACPI_STATE_C3: - /* Enable bus master reload */ - if ( old->type != ACPI_STATE_C3 && power->flags.bm_check ) - acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1); - break; - } - - power->state = new; - - return; } static void acpi_safe_halt(void) @@ -263,13 +154,50 @@ static void acpi_idle_do_entry(struct ac } } -static atomic_t c3_cpu_count; +static inline void acpi_idle_update_bm_rld(struct acpi_processor_power *power, + struct acpi_processor_cx *target) +{ + if ( !power->flags.bm_check ) + return; + + if ( power->flags.bm_rld_set && target->type != ACPI_STATE_C3 ) + { + acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0); + power->flags.bm_rld_set = 0; + } + + if ( !power->flags.bm_rld_set && target->type == ACPI_STATE_C3 ) + { + acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1); + power->flags.bm_rld_set = 1; + } +} + +static int acpi_idle_bm_check(void) +{ + u32 bm_status = 0; + + acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); + if ( bm_status ) + acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); + /* + * TBD: PIIX4 Erratum #18: Note that BM_STS doesn't always reflect + * the true state of bus mastering activity; forcing us to + * manually check the BMIDEA bit of each IDE channel. + */ + return bm_status; +} + +static struct { + spinlock_t lock; + unsigned int count; +} c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED }; static void acpi_processor_idle(void) { struct acpi_processor_power *power = NULL; struct acpi_processor_cx *cx = NULL; - struct acpi_processor_cx *next_state = NULL; + int next_state; int sleep_ticks = 0; u32 t1, t2 = 0; @@ -287,7 +215,16 @@ static void acpi_processor_idle(void) return; } - cx = power->state; + next_state = cpuidle_current_governor->select(power); + if ( next_state > 0 ) + { + cx = &power->states[next_state]; + if ( power->flags.bm_check && acpi_idle_bm_check() + && cx->type == ACPI_STATE_C3 ) + cx = power->safe_state; + if ( cx->type > max_cstate ) + cx = &power->states[max_cstate]; + } if ( !cx ) { if ( pm_idle_save ) @@ -303,69 +240,14 @@ static void acpi_processor_idle(void) return; } - /* - * Check BM Activity - * ----------------- - * Check for bus mastering activity (if required), record, and check - * for demotion. - */ - if ( power->flags.bm_check ) - { - u32 bm_status = 0; - unsigned long diff = (NOW() - power->bm_check_timestamp) >> 23; - - if ( diff > 31 ) - diff = 31; - - power->bm_activity <<= diff; - - acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status); - if ( bm_status ) - { - power->bm_activity |= 0x1; - acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1); - } - /* - * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect - * the true state of bus mastering activity; forcing us to - * manually check the BMIDEA bit of each IDE channel. - */ - /*else if ( errata.piix4.bmisx ) - { - if ( (inb_p(errata.piix4.bmisx + 0x02) & 0x01) - || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01) ) - pr->power.bm_activity |= 0x1; - }*/ - - power->bm_check_timestamp = NOW(); - - /* - * If bus mastering is or was active this jiffy, demote - * to avoid a faulty transition. Note that the processor - * won't enter a low-power state during this call (to this - * function) but should upon the next. - * - * TBD: A better policy might be to fallback to the demotion - * state (use it for this quantum only) istead of - * demoting -- and rely on duration as our sole demotion - * qualification. This may, however, introduce DMA - * issues (e.g. floppy DMA transfer overrun/underrun). 
- */ - if ( (power->bm_activity & 0x1) && cx->demotion.threshold.bm ) - { - local_irq_enable(); - next_state = cx->demotion.state; - goto end; - } - } + power->last_state = cx; /* * Sleep: * ------ * Invoke the current Cx state to put the processor to sleep. */ - if ( cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3 ) - smp_mb__after_clear_bit(); + acpi_idle_update_bm_rld(power, cx); switch ( cx->type ) { @@ -399,8 +281,7 @@ static void acpi_processor_idle(void) /* Re-enable interrupts */ local_irq_enable(); /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; + sleep_ticks = ticks_elapsed(t1, t2); break; case ACPI_STATE_C3: @@ -416,8 +297,8 @@ static void acpi_processor_idle(void) */ if ( power->flags.bm_check && power->flags.bm_control ) { - atomic_inc(&c3_cpu_count); - if ( atomic_read(&c3_cpu_count) == num_online_cpus() ) + spin_lock(&c3_cpu_status.lock); + if ( ++c3_cpu_status.count == num_online_cpus() ) { /* * All CPUs are trying to go to C3 @@ -425,6 +306,7 @@ static void acpi_processor_idle(void) */ acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1); } + spin_unlock(&c3_cpu_status.lock); } else if ( !power->flags.bm_check ) { @@ -455,8 +337,10 @@ static void acpi_processor_idle(void) if ( power->flags.bm_check && power->flags.bm_control ) { /* Enable bus master arbitration */ - atomic_dec(&c3_cpu_count); - acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); + spin_lock(&c3_cpu_status.lock); + if ( c3_cpu_status.count-- == num_online_cpus() ) + acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0); + spin_unlock(&c3_cpu_status.lock); } /* Re-enable interrupts */ @@ -465,8 +349,6 @@ static void acpi_processor_idle(void) lapic_timer_on(); /* Compute time (ticks) that we were actually asleep */ sleep_ticks = ticks_elapsed(t1, t2); - /* Do not account our idle-switching overhead: */ - sleep_ticks -= cx->latency_ticks + C3_OVERHEAD; break; @@ -476,163 +358,14 @@ static void acpi_processor_idle(void) } cx->usage++; - if ( (cx->type != ACPI_STATE_C1) && (sleep_ticks > 0) ) + if ( sleep_ticks > 0 ) + { + power->last_residency = PM_TIMER_TICKS_TO_US(sleep_ticks); cx->time += sleep_ticks; - - next_state = power->state; - - /* - * Promotion? - * ---------- - * Track the number of longs (time asleep is greater than threshold) - * and promote when the count threshold is reached. Note that bus - * mastering activity may prevent promotions. - * Do not promote above max_cstate. - */ - if ( cx->promotion.state && - ((cx->promotion.state - power->states) <= max_cstate) ) - { - if ( sleep_ticks > cx->promotion.threshold.ticks ) - { - cx->promotion.count++; - cx->demotion.count = 0; - if ( cx->promotion.count >= cx->promotion.threshold.count ) - { - if ( power->flags.bm_check ) - { - if ( !(power->bm_activity & cx->promotion.threshold.bm) ) - { - next_state = cx->promotion.state; - goto end; - } - } - else - { - next_state = cx->promotion.state; - goto end; - } - } - } - } - - /* - * Demotion? - * --------- - * Track the number of shorts (time asleep is less than time threshold) - * and demote when the usage threshold is reached. 
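
The c3_cpu_status accounting above replaces the old bare atomic counter so that checking the count and toggling bus-master arbitration happen under a single lock: the last CPU to enter C3 disables arbitration, and the first CPU to leave re-enables it. The shape of that rendezvous as a user-space sketch, with pthreads standing in for Xen's spinlocks, printf for the ARB_DIS register writes, and all names invented:

    /* gate_demo.c -- toy "last one in, first one out" rendezvous. */
    #include <pthread.h>
    #include <stdio.h>

    #define NCPUS 4

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_barrier_t barrier;
    static unsigned int count;

    static void enter_c3(int cpu)
    {
        pthread_mutex_lock(&lock);
        if ( ++count == NCPUS )   /* last CPU in */
            printf("cpu%d: all in C3, disable bus-master arbitration\n", cpu);
        pthread_mutex_unlock(&lock);
    }

    static void exit_c3(int cpu)
    {
        pthread_mutex_lock(&lock);
        if ( count-- == NCPUS )   /* first CPU out */
            printf("cpu%d: leaving C3, re-enable arbitration\n", cpu);
        pthread_mutex_unlock(&lock);
    }

    static void *cpu_thread(void *arg)
    {
        int cpu = (int)(long)arg;
        enter_c3(cpu);
        pthread_barrier_wait(&barrier);  /* hold everyone "in C3" briefly */
        exit_c3(cpu);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[NCPUS];
        pthread_barrier_init(&barrier, NULL, NCPUS);
        for ( long i = 0; i < NCPUS; i++ )
            pthread_create(&t[i], NULL, cpu_thread, (void *)i);
        for ( int i = 0; i < NCPUS; i++ )
            pthread_join(t[i], NULL);
        return 0;
    }

Because the test and the update sit in one critical section, the "disable" and "re-enable" actions each fire exactly once per rendezvous, which the old atomic_inc/atomic_read pair could not guarantee.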
- */ - if ( cx->demotion.state ) - { - if ( sleep_ticks < cx->demotion.threshold.ticks ) - { - cx->demotion.count++; - cx->promotion.count = 0; - if ( cx->demotion.count >= cx->demotion.threshold.count ) - { - next_state = cx->demotion.state; - goto end; - } - } - } - -end: - /* - * Demote if current state exceeds max_cstate - */ - if ( (power->state - power->states) > max_cstate ) - { - if ( cx->demotion.state ) - next_state = cx->demotion.state; - } - - /* - * New Cx State? - * ------------- - * If we're going to start using a new Cx state we must clean up - * from the previous and prepare to use the new. - */ - if ( next_state != power->state ) - acpi_processor_power_activate(power, next_state); -} - -static int acpi_processor_set_power_policy(struct acpi_processor_power *power) -{ - unsigned int i; - unsigned int state_is_set = 0; - struct acpi_processor_cx *lower = NULL; - struct acpi_processor_cx *higher = NULL; - struct acpi_processor_cx *cx; - - if ( !power ) - return -EINVAL; - - /* - * This function sets the default Cx state policy (OS idle handler). - * Our scheme is to promote quickly to C2 but more conservatively - * to C3. We're favoring C2 for its characteristics of low latency - * (quick response), good power savings, and ability to allow bus - * mastering activity. Note that the Cx state policy is completely - * customizable and can be altered dynamically. - */ - - /* startup state */ - for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ ) - { - cx = &power->states[i]; - if ( !cx->valid ) - continue; - - if ( !state_is_set ) - power->state = cx; - state_is_set++; - break; - } - - if ( !state_is_set ) - return -ENODEV; - - /* demotion */ - for ( i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++ ) - { - cx = &power->states[i]; - if ( !cx->valid ) - continue; - - if ( lower ) - { - cx->demotion.state = lower; - cx->demotion.threshold.ticks = cx->latency_ticks; - cx->demotion.threshold.count = 1; - if ( cx->type == ACPI_STATE_C3 ) - cx->demotion.threshold.bm = bm_history; - } - - lower = cx; - } - - /* promotion */ - for ( i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i-- ) - { - cx = &power->states[i]; - if ( !cx->valid ) - continue; - - if ( higher ) - { - cx->promotion.state = higher; - cx->promotion.threshold.ticks = cx->latency_ticks; - if ( cx->type >= ACPI_STATE_C2 ) - cx->promotion.threshold.count = 4; - else - cx->promotion.threshold.count = 10; - if ( higher->type == ACPI_STATE_C3 ) - cx->promotion.threshold.bm = bm_history; - } - - higher = cx; - } - - return 0; + } + + if ( cpuidle_current_governor->reflect ) + cpuidle_current_governor->reflect(power); } static int init_cx_pminfo(struct acpi_processor_power *acpi_power) @@ -821,6 +554,8 @@ static int check_cx(struct acpi_processo return 0; } +static unsigned int latency_factor = 2; + static void set_cx( struct acpi_processor_power *acpi_power, xen_processor_cx_t *xen_cx) @@ -842,6 +577,9 @@ static void set_cx( cx->power = xen_cx->power; cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency); + cx->target_residency = cx->latency * latency_factor; + if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 ) + acpi_power->safe_state = cx; } int get_cpu_id(u8 acpi_id) @@ -936,6 +674,7 @@ long set_cx_pminfo(uint32_t cpu, struct init_cx_pminfo(acpi_power); + acpi_power->cpu = cpu_id; acpi_power->flags.bm_check = power->flags.bm_check; acpi_power->flags.bm_control = power->flags.bm_control; acpi_power->flags.has_cst = power->flags.has_cst; @@ -950,10 +689,11 @@ long set_cx_pminfo(uint32_t cpu, struct set_cx(acpi_power, &xen_cx); } + if 
( cpuidle_current_governor->enable && + cpuidle_current_governor->enable(acpi_power) ) + return -EFAULT; + /* FIXME: C-state dependency is not supported by far */ - - /* initialize default policy */ - acpi_processor_set_power_policy(acpi_power); print_acpi_power(cpu_id, acpi_power); @@ -978,7 +718,7 @@ int pmstat_get_cx_stat(uint32_t cpuid, s uint64_t usage; int i; - stat->last = (power->state) ? power->state->type : 0; + stat->last = (power->last_state) ? power->last_state->type : 0; stat->nr = processor_powers[cpuid].count; stat->idle_time = v->runstate.time[RUNSTATE_running]; if ( v->is_running ) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpufreq/cpufreq.c --- a/xen/arch/x86/acpi/cpufreq/cpufreq.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/acpi/cpufreq/cpufreq.c Fri Sep 12 14:47:40 2008 +0900 @@ -48,7 +48,7 @@ struct cpufreq_policy xen_px_policy[NR_C struct cpufreq_policy xen_px_policy[NR_CPUS]; static cpumask_t *cpufreq_dom_pt; -static cpumask_t cpufreq_dom_mask; +static unsigned long *cpufreq_dom_mask; static unsigned int cpufreq_dom_max; enum { @@ -562,7 +562,8 @@ void cpufreq_dom_exit(void) void cpufreq_dom_exit(void) { cpufreq_dom_max = 0; - cpus_clear(cpufreq_dom_mask); + if (cpufreq_dom_mask) + xfree(cpufreq_dom_mask); if (cpufreq_dom_pt) xfree(cpufreq_dom_pt); } @@ -572,22 +573,28 @@ int cpufreq_dom_init(void) unsigned int i; cpufreq_dom_max = 0; - cpus_clear(cpufreq_dom_mask); for_each_online_cpu(i) { - cpu_set(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask); if (cpufreq_dom_max < processor_pminfo[i].perf.domain_info.domain) cpufreq_dom_max = processor_pminfo[i].perf.domain_info.domain; } cpufreq_dom_max++; + + cpufreq_dom_mask = xmalloc_array(unsigned long, + BITS_TO_LONGS(cpufreq_dom_max)); + if (!cpufreq_dom_mask) + return -ENOMEM; + bitmap_zero(cpufreq_dom_mask, cpufreq_dom_max); cpufreq_dom_pt = xmalloc_array(cpumask_t, cpufreq_dom_max); if (!cpufreq_dom_pt) return -ENOMEM; memset(cpufreq_dom_pt, 0, cpufreq_dom_max * sizeof(cpumask_t)); - for_each_online_cpu(i) + for_each_online_cpu(i) { + __set_bit(processor_pminfo[i].perf.domain_info.domain, cpufreq_dom_mask); cpu_set(i, cpufreq_dom_pt[processor_pminfo[i].perf.domain_info.domain]); + } for_each_online_cpu(i) processor_pminfo[i].perf.shared_cpu_map = @@ -616,10 +623,11 @@ static int cpufreq_cpu_init(void) int cpufreq_dom_dbs(unsigned int event) { - int cpu, dom, ret = 0; - - for (dom=0; dom<cpufreq_dom_max; dom++) { - if (!cpu_isset(dom, cpufreq_dom_mask)) + unsigned int cpu, dom; + int ret = 0; + + for (dom = 0; dom < cpufreq_dom_max; dom++) { + if (!test_bit(dom, cpufreq_dom_mask)) continue; cpu = first_cpu(cpufreq_dom_pt[dom]); ret = cpufreq_governor_dbs(&xen_px_policy[cpu], event); diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpufreq/powernow.c --- a/xen/arch/x86/acpi/cpufreq/powernow.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/acpi/cpufreq/powernow.c Fri Sep 12 14:47:40 2008 +0900 @@ -197,8 +197,8 @@ static int powernow_cpufreq_cpu_init(str data->max_freq = perf->states[0].core_frequency * 1000; /* table init */ - for (i=0; i<perf->state_count && i<max_hw_pstate; i++) { - if (i>0 && perf->states[i].core_frequency >= + for (i = 0; i < perf->state_count && i <= max_hw_pstate; i++) { + if (i > 0 && perf->states[i].core_frequency >= data->freq_table[valid_states-1].frequency / 1000) continue; diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/acpi/cpuidle_menu.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/acpi/cpuidle_menu.c Fri Sep 12 14:47:40 2008 
+0900
@@ -0,0 +1,132 @@
+/*
+ * cpuidle_menu - menu governor for cpu idle, main idea comes from Linux
+ *                drivers/cpuidle/governors/menu.c
+ *
+ * Copyright (C) 2006-2007 Adam Belay <abelay@xxxxxxxxxx>
+ * Copyright (C) 2007, 2008 Intel Corporation
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <xen/acpi.h>
+#include <xen/timer.h>
+#include <xen/cpuidle.h>
+
+#define BREAK_FUZZ      4       /* 4 us */
+#define USEC_PER_SEC 1000000
+
+struct menu_device
+{
+    int             last_state_idx;
+    unsigned int    expected_us;
+    unsigned int    predicted_us;
+    unsigned int    last_measured_us;
+    unsigned int    elapsed_us;
+};
+
+static DEFINE_PER_CPU(struct menu_device, menu_devices);
+
+static s_time_t get_sleep_length_ns(void)
+{
+    return per_cpu(timer_deadline, smp_processor_id()) - NOW();
+}
+
+static int menu_select(struct acpi_processor_power *power)
+{
+    struct menu_device *data = &__get_cpu_var(menu_devices);
+    int i;
+
+    /* determine the expected residency time */
+    data->expected_us = (u32) get_sleep_length_ns() / 1000;
+
+    /* find the deepest idle state that satisfies our constraints */
+    for ( i = 1; i < power->count; i++ )
+    {
+        struct acpi_processor_cx *s = &power->states[i];
+
+        if ( s->target_residency > data->expected_us + s->latency )
+            break;
+        if ( s->target_residency > data->predicted_us )
+            break;
+        /* TBD: we need to check the QoS requirement in future */
+    }
+
+    data->last_state_idx = i - 1;
+    return i - 1;
+}
+
+static void menu_reflect(struct acpi_processor_power *power)
+{
+    struct menu_device *data = &__get_cpu_var(menu_devices);
+    struct acpi_processor_cx *target = &power->states[data->last_state_idx];
+    unsigned int last_residency;
+    unsigned int measured_us;
+
+    /*
+     * Ugh, this idle state doesn't support residency measurements, so we
+     * are basically lost in the dark.  As a compromise, assume we slept
+     * for one full standard timer tick.  However, be aware that this
+     * could potentially result in a suboptimal state transition.
+     */
+    if ( target->type == ACPI_STATE_C1 )
+        last_residency = USEC_PER_SEC / HZ;
+    else
+        last_residency = power->last_residency;
+
+    measured_us = last_residency + data->elapsed_us;
+
+    /* if wrapping, set to max uint (-1) */
+    measured_us = data->elapsed_us <= measured_us ?
measured_us : -1; + + /* Predict time remaining until next break event */ + data->predicted_us = max(measured_us, data->last_measured_us); + + /* Distinguish between expected & non-expected events */ + if ( last_residency + BREAK_FUZZ + < data->expected_us + target->latency ) + { + data->last_measured_us = measured_us; + data->elapsed_us = 0; + } + else + data->elapsed_us = measured_us; +} + +static int menu_enable_device(struct acpi_processor_power *power) +{ + struct menu_device *data = &per_cpu(menu_devices, power->cpu); + + memset(data, 0, sizeof(struct menu_device)); + + return 0; +} + +static struct cpuidle_governor menu_governor = +{ + .name = "menu", + .rating = 20, + .enable = menu_enable_device, + .select = menu_select, + .reflect = menu_reflect, +}; + +struct cpuidle_governor *cpuidle_current_governor = &menu_governor; diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/domain.c Fri Sep 12 14:47:40 2008 +0900 @@ -31,6 +31,7 @@ #include <xen/compat.h> #include <xen/acpi.h> #include <xen/pci.h> +#include <xen/paging.h> #include <asm/regs.h> #include <asm/mc146818rtc.h> #include <asm/system.h> @@ -40,7 +41,6 @@ #include <asm/i387.h> #include <asm/mpspec.h> #include <asm/ldt.h> -#include <asm/paging.h> #include <asm/hypercall.h> #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> @@ -302,7 +302,8 @@ int vcpu_initialise(struct vcpu *v) else { /* PV guests by default have a 100Hz ticker. */ - v->periodic_period = MILLISECS(10); + if ( !is_idle_domain(d) ) + v->periodic_period = MILLISECS(10); /* PV guests get an emulated PIT too for video BIOSes to use. */ if ( !is_idle_domain(d) && (v->vcpu_id == 0) ) @@ -1645,23 +1646,26 @@ static int relinquish_memory( /* * Forcibly invalidate top-most, still valid page tables at this point - * to break circular 'linear page table' references. This is okay - * because MMU structures are not shared across domains and this domain - * is now dead. Thus top-most valid tables are not in use so a non-zero - * count means circular reference. + * to break circular 'linear page table' references as well as clean up + * partially validated pages. This is okay because MMU structures are + * not shared across domains and this domain is now dead. Thus top-most + * valid tables are not in use so a non-zero count means circular + * reference or partially validated. 
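
The loop that follows is the classic compare-and-swap retry idiom: snapshot the word, compute the new value, and retry if another CPU changed the word in between. A minimal user-space rendition, using GCC's __sync builtin in place of Xen's cmpxchg() and invented flag names in place of the real PGT_* bits:

    /* cas_demo.c -- toy model of the type_info update loop. */
    #include <stdio.h>

    #define F_VALIDATED 0x1u
    #define F_PARTIAL   0x2u   /* invented stand-ins for PGT_* bits */

    static unsigned int type_info = F_VALIDATED | 0x100;

    /* Atomically clear both validation bits, retrying if the word
     * changed between our read and our swap (i.e. a racing CPU won). */
    static unsigned int clear_validation(void)
    {
        unsigned int x, y = type_info;

        for ( ; ; )
        {
            x = y;
            if ( !(x & (F_VALIDATED | F_PARTIAL)) )
                break;              /* nothing left to clear */
            y = __sync_val_compare_and_swap(&type_info, x,
                                            x & ~(F_VALIDATED | F_PARTIAL));
            if ( y == x )
                break;              /* swap succeeded */
        }
        return x;
    }

    int main(void)
    {
        unsigned int old = clear_validation();
        printf("before: %#x, after: %#x\n", old, type_info);
        return 0;
    }

The retry is safe here precisely because, as the comment above notes, the domain is dead and the structures are private, so the loop terminates quickly in practice.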
*/ y = page->u.inuse.type_info; for ( ; ; ) { x = y; - if ( likely((x & (PGT_type_mask|PGT_validated)) != - (type|PGT_validated)) ) + if ( likely((x & PGT_type_mask) != type) || + likely(!(x & (PGT_validated|PGT_partial))) ) break; - y = cmpxchg(&page->u.inuse.type_info, x, x & ~PGT_validated); + y = cmpxchg(&page->u.inuse.type_info, x, + x & ~(PGT_validated|PGT_partial)); if ( likely(y == x) ) { - free_page_type(page, type); + if ( free_page_type(page, x, 0) != 0 ) + BUG(); break; } } diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/domain_build.c Fri Sep 12 14:47:40 2008 +0900 @@ -26,6 +26,7 @@ #include <asm/desc.h> #include <asm/i387.h> #include <asm/paging.h> +#include <asm/p2m.h> #include <asm/e820.h> #include <public/version.h> diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/domctl.c --- a/xen/arch/x86/domctl.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/domctl.c Fri Sep 12 14:47:40 2008 +0900 @@ -20,7 +20,7 @@ #include <xen/trace.h> #include <xen/console.h> #include <xen/iocap.h> -#include <asm/paging.h> +#include <xen/paging.h> #include <asm/irq.h> #include <asm/hvm/hvm.h> #include <asm/hvm/support.h> @@ -67,14 +67,6 @@ long arch_do_domctl( ret = -ESRCH; if ( unlikely((d = rcu_lock_domain_by_id(domctl->domain)) == NULL) ) break; - - ret = xsm_ioport_permission(d, fp, - domctl->u.ioport_permission.allow_access); - if ( ret ) - { - rcu_unlock_domain(d); - break; - } if ( np == 0 ) ret = 0; @@ -550,6 +542,10 @@ long arch_do_domctl( if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) break; + ret = xsm_sendtrigger(d); + if ( ret ) + goto sendtrigger_out; + ret = -EINVAL; if ( domctl->u.sendtrigger.vcpu >= MAX_VIRT_CPUS ) goto sendtrigger_out; @@ -628,6 +624,10 @@ long arch_do_domctl( bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff; devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff; + ret = xsm_test_assign_device(domctl->u.assign_device.machine_bdf); + if ( ret ) + break; + if ( device_assigned(bus, devfn) ) { gdprintk(XENLOG_ERR, "XEN_DOMCTL_test_assign_device: " @@ -655,6 +655,11 @@ long arch_do_domctl( "XEN_DOMCTL_assign_device: get_domain_by_id() failed\n"); break; } + + ret = xsm_assign_device(d, domctl->u.assign_device.machine_bdf); + if ( ret ) + goto assign_device_out; + bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff; devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff; @@ -680,6 +685,7 @@ long arch_do_domctl( "assign device (%x:%x:%x) failed\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + assign_device_out: put_domain(d); } break; @@ -700,6 +706,11 @@ long arch_do_domctl( "XEN_DOMCTL_deassign_device: get_domain_by_id() failed\n"); break; } + + ret = xsm_assign_device(d, domctl->u.assign_device.machine_bdf); + if ( ret ) + goto deassign_device_out; + bus = (domctl->u.assign_device.machine_bdf >> 16) & 0xff; devfn = (domctl->u.assign_device.machine_bdf >> 8) & 0xff; @@ -720,6 +731,8 @@ long arch_do_domctl( deassign_device(d, bus, devfn); gdprintk(XENLOG_INFO, "XEN_DOMCTL_deassign_device: bdf = %x:%x:%x\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + + deassign_device_out: put_domain(d); } break; @@ -733,10 +746,17 @@ long arch_do_domctl( if ( (d = rcu_lock_domain_by_id(domctl->domain)) == NULL ) break; bind = &(domctl->u.bind_pt_irq); + + ret = xsm_bind_pt_irq(d, bind); + if ( ret ) + goto bind_out; + if ( iommu_enabled ) ret = pt_irq_create_bind_vtd(d, bind); if ( ret < 0 ) gdprintk(XENLOG_ERR, "pt_irq_create_bind failed!\n"); + + 
bind_out: rcu_unlock_domain(d); } break; @@ -877,11 +897,16 @@ long arch_do_domctl( if ( d == NULL ) break; + ret = xsm_pin_mem_cacheattr(d); + if ( ret ) + goto pin_out; + ret = hvm_set_mem_pinned_cacheattr( d, domctl->u.pin_mem_cacheattr.start, domctl->u.pin_mem_cacheattr.end, domctl->u.pin_mem_cacheattr.type); + pin_out: rcu_unlock_domain(d); } break; @@ -899,6 +924,10 @@ long arch_do_domctl( d = rcu_lock_domain_by_id(domctl->domain); if ( d == NULL ) break; + + ret = xsm_ext_vcpucontext(d, domctl->cmd); + if ( ret ) + goto ext_vcpucontext_out; ret = -ESRCH; if ( (evc->vcpu >= MAX_VIRT_CPUS) || diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hpet.c --- a/xen/arch/x86/hpet.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/hpet.c Fri Sep 12 14:47:40 2008 +0900 @@ -100,6 +100,13 @@ static int reprogram_hpet_evt_channel( ch->next_event = expire; + if ( expire == STIME_MAX ) + { + /* We assume it will take a long time for the timer to wrap. */ + hpet_write32(0, HPET_T0_CMP); + return 0; + } + delta = min_t(int64_t, delta, MAX_DELTA_NS); delta = max_t(int64_t, delta, MIN_DELTA_NS); delta = ns2ticks(delta, ch->shift, ch->mult); @@ -206,9 +213,11 @@ void hpet_broadcast_enter(void) { struct hpet_event_channel *ch = &hpet_event; + spin_lock(&ch->lock); + + disable_APIC_timer(); + cpu_set(smp_processor_id(), ch->cpumask); - - spin_lock(&ch->lock); /* reprogram if current cpu expire time is nearer */ if ( this_cpu(timer_deadline) < ch->next_event ) @@ -222,8 +231,23 @@ void hpet_broadcast_exit(void) struct hpet_event_channel *ch = &hpet_event; int cpu = smp_processor_id(); + spin_lock_irq(&ch->lock); + if ( cpu_test_and_clear(cpu, ch->cpumask) ) - reprogram_timer(per_cpu(timer_deadline, cpu)); + { + /* Cancel any outstanding LAPIC event and re-enable interrupts. */ + reprogram_timer(0); + enable_APIC_timer(); + + /* Reprogram the deadline; trigger timer work now if it has passed. 
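
The guard used here, try to arm the timer and fall back to raising the softirq by hand when the deadline has already slipped into the past, looks like this in isolation. The names below are invented stand-ins for Xen's reprogram_timer()/raise_softirq(), not the real interfaces:

    /* deadline_demo.c -- sketch of "reprogram or run the work now". */
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static long long now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    /* Returns false when the deadline is already in the past, i.e. the
     * hardware timer can no longer be armed for it. */
    static bool arm_timer(long long deadline)
    {
        return deadline > now_ns();
    }

    static void raise_timer_work(void)
    {
        printf("deadline already passed: run timer work immediately\n");
    }

    int main(void)
    {
        long long deadline = now_ns() - 1;   /* simulate a missed deadline */
        if ( !arm_timer(deadline) )
            raise_timer_work();              /* the patch's fallback path */
        return 0;
    }

Without the fallback, a CPU whose deadline expired while it was parked on the broadcast HPET channel would sleep until the next unrelated interrupt.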
*/ + if ( !reprogram_timer(per_cpu(timer_deadline, cpu)) ) + raise_softirq(TIMER_SOFTIRQ); + + if ( cpus_empty(ch->cpumask) && ch->next_event != STIME_MAX ) + reprogram_hpet_evt_channel(ch, STIME_MAX, 0, 0); + } + + spin_unlock_irq(&ch->lock); } int hpet_broadcast_is_available(void) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/hvm/hvm.c Fri Sep 12 14:47:40 2008 +0900 @@ -31,10 +31,11 @@ #include <xen/hypercall.h> #include <xen/guest_access.h> #include <xen/event.h> +#include <xen/paging.h> +#include <asm/shadow.h> #include <asm/current.h> #include <asm/e820.h> #include <asm/io.h> -#include <asm/paging.h> #include <asm/regs.h> #include <asm/cpufeature.h> #include <asm/processor.h> @@ -772,7 +773,7 @@ void hvm_hlt(unsigned long rflags) do_sched_op_compat(SCHEDOP_block, 0); - HVMTRACE_1D(HLT, curr, /* pending = */ vcpu_runnable(curr)); + HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr)); } void hvm_triple_fault(void) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/svm/intr.c --- a/xen/arch/x86/hvm/svm/intr.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/hvm/svm/intr.c Fri Sep 12 14:47:40 2008 +0900 @@ -80,7 +80,7 @@ static void enable_intr_window(struct vc ASSERT(intack.source != hvm_intsrc_none); - HVMTRACE_2D(INJ_VIRQ, v, 0x0, /*fake=*/ 1); + HVMTRACE_2D(INJ_VIRQ, 0x0, /*fake=*/ 1); /* * Create a dummy virtual interrupt to intercept as soon as the @@ -199,7 +199,7 @@ asmlinkage void svm_intr_assist(void) } else { - HVMTRACE_2D(INJ_VIRQ, v, intack.vector, /*fake=*/ 0); + HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0); svm_inject_extint(v, intack.vector); pt_intr_post(v, intack); } diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/svm/svm.c --- a/xen/arch/x86/hvm/svm/svm.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/hvm/svm/svm.c Fri Sep 12 14:47:40 2008 +0900 @@ -759,11 +759,11 @@ static void svm_inject_exception( if ( trapnr == TRAP_page_fault ) { vmcb->cr2 = curr->arch.hvm_vcpu.guest_cr[2] = cr2; - HVMTRACE_LONG_2D(PF_INJECT, curr, errcode, TRC_PAR_LONG(cr2)); + HVMTRACE_LONG_2D(PF_INJECT, errcode, TRC_PAR_LONG(cr2)); } else { - HVMTRACE_2D(INJ_EXC, curr, trapnr, errcode); + HVMTRACE_2D(INJ_EXC, trapnr, errcode); } if ( (trapnr == TRAP_debug) && @@ -919,7 +919,7 @@ static void svm_cpuid_intercept( __clear_bit(X86_FEATURE_APIC & 31, edx); } - HVMTRACE_5D (CPUID, v, input, *eax, *ebx, *ecx, *edx); + HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx); } static void svm_vmexit_do_cpuid(struct cpu_user_regs *regs) @@ -946,7 +946,7 @@ static void svm_vmexit_do_cpuid(struct c static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs) { - HVMTRACE_0D(DR_WRITE, v); + HVMTRACE_0D(DR_WRITE); __restore_debug_registers(v); } @@ -1018,7 +1018,7 @@ static int svm_msr_read_intercept(struct regs->edx = msr_content >> 32; done: - HVMTRACE_3D (MSR_READ, v, ecx, regs->eax, regs->edx); + HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx); HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx", ecx, (unsigned long)regs->eax, (unsigned long)regs->edx); return X86EMUL_OKAY; @@ -1037,7 +1037,7 @@ static int svm_msr_write_intercept(struc msr_content = (u32)regs->eax | ((u64)regs->edx << 32); - HVMTRACE_3D (MSR_WRITE, v, ecx, regs->eax, regs->edx); + HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx); switch ( ecx ) { @@ -1168,7 +1168,7 @@ static void svm_invlpg_intercept(unsigne static void svm_invlpg_intercept(unsigned long vaddr) { struct vcpu *curr = current; - 
HVMTRACE_LONG_2D(INVLPG, curr, 0, TRC_PAR_LONG(vaddr)); + HVMTRACE_LONG_2D(INVLPG, 0, TRC_PAR_LONG(vaddr)); paging_invlpg(curr, vaddr); svm_asid_g_invlpg(curr, vaddr); } @@ -1191,7 +1191,7 @@ asmlinkage void svm_vmexit_handler(struc exit_reason = vmcb->exitcode; - HVMTRACE_ND(VMEXIT64, 1/*cycles*/, v, 3, exit_reason, + HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason, (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32), 0, 0, 0); @@ -1216,17 +1216,17 @@ asmlinkage void svm_vmexit_handler(struc { case VMEXIT_INTR: /* Asynchronous event, handled when we STGI'd after the VMEXIT. */ - HVMTRACE_0D(INTR, v); + HVMTRACE_0D(INTR); break; case VMEXIT_NMI: /* Asynchronous event, handled when we STGI'd after the VMEXIT. */ - HVMTRACE_0D(NMI, v); + HVMTRACE_0D(NMI); break; case VMEXIT_SMI: /* Asynchronous event, handled when we STGI'd after the VMEXIT. */ - HVMTRACE_0D(SMI, v); + HVMTRACE_0D(SMI); break; case VMEXIT_EXCEPTION_DB: @@ -1261,10 +1261,12 @@ asmlinkage void svm_vmexit_handler(struc if ( paging_fault(va, regs) ) { - if (hvm_long_mode_enabled(v)) - HVMTRACE_LONG_2D(PF_XEN, v, regs->error_code, TRC_PAR_LONG(va)); + if ( trace_will_trace_event(TRC_SHADOW) ) + break; + if ( hvm_long_mode_enabled(v) ) + HVMTRACE_LONG_2D(PF_XEN, regs->error_code, TRC_PAR_LONG(va)); else - HVMTRACE_2D(PF_XEN, v, regs->error_code, va); + HVMTRACE_2D(PF_XEN, regs->error_code, va); break; } @@ -1274,7 +1276,7 @@ asmlinkage void svm_vmexit_handler(struc /* Asynchronous event, handled when we STGI'd after the VMEXIT. */ case VMEXIT_EXCEPTION_MC: - HVMTRACE_0D(MCE, v); + HVMTRACE_0D(MCE); break; case VMEXIT_VINTR: @@ -1331,7 +1333,7 @@ asmlinkage void svm_vmexit_handler(struc case VMEXIT_VMMCALL: if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 ) break; - HVMTRACE_1D(VMMCALL, v, regs->eax); + HVMTRACE_1D(VMMCALL, regs->eax); rc = hvm_do_hypercall(regs); if ( rc != HVM_HCALL_preempted ) { @@ -1406,7 +1408,7 @@ asmlinkage void svm_vmexit_handler(struc asmlinkage void svm_trace_vmentry(void) { - HVMTRACE_ND (VMENTRY, 1/*cycles*/, current, 0, 0, 0, 0, 0, 0, 0); + HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0); } /* diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/vmx/intr.c --- a/xen/arch/x86/hvm/vmx/intr.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/hvm/vmx/intr.c Fri Sep 12 14:47:40 2008 +0900 @@ -198,7 +198,7 @@ asmlinkage void vmx_intr_assist(void) } else { - HVMTRACE_2D(INJ_VIRQ, v, intack.vector, /*fake=*/ 0); + HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0); vmx_inject_extint(v, intack.vector); pt_intr_post(v, intack); } diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/hvm/vmx/vmx.c --- a/xen/arch/x86/hvm/vmx/vmx.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/hvm/vmx/vmx.c Fri Sep 12 14:47:40 2008 +0900 @@ -1114,10 +1114,10 @@ static void __vmx_inject_exception( __vmwrite(VM_ENTRY_INTR_INFO, intr_fields); if ( trap == TRAP_page_fault ) - HVMTRACE_LONG_2D(PF_INJECT, v, error_code, + HVMTRACE_LONG_2D(PF_INJECT, error_code, TRC_PAR_LONG(v->arch.hvm_vcpu.guest_cr[2])); else - HVMTRACE_2D(INJ_EXC, v, trap, error_code); + HVMTRACE_2D(INJ_EXC, trap, error_code); } void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code) @@ -1345,7 +1345,7 @@ static void vmx_cpuid_intercept( break; } - HVMTRACE_5D (CPUID, current, input, *eax, *ebx, *ecx, *edx); + HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx); } static void vmx_do_cpuid(struct cpu_user_regs *regs) @@ -1370,7 +1370,7 @@ static void vmx_dr_access(unsigned long { struct vcpu *v = current; - 
HVMTRACE_0D(DR_WRITE, v); + HVMTRACE_0D(DR_WRITE); if ( !v->arch.hvm_vcpu.flag_dr_dirty ) __restore_debug_registers(v); @@ -1383,7 +1383,7 @@ static void vmx_invlpg_intercept(unsigne static void vmx_invlpg_intercept(unsigned long vaddr) { struct vcpu *curr = current; - HVMTRACE_LONG_2D(INVLPG, curr, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr)); + HVMTRACE_LONG_2D(INVLPG, /*invlpga=*/ 0, TRC_PAR_LONG(vaddr)); if ( paging_invlpg(curr, vaddr) ) vpid_sync_vcpu_gva(curr, vaddr); } @@ -1434,7 +1434,7 @@ static int mov_to_cr(int gp, int cr, str goto exit_and_crash; } - HVMTRACE_LONG_2D(CR_WRITE, v, cr, TRC_PAR_LONG(value)); + HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(value)); HVM_DBG_LOG(DBG_LEVEL_1, "CR%d, value = %lx", cr, value); @@ -1505,7 +1505,7 @@ static void mov_from_cr(int cr, int gp, break; } - HVMTRACE_LONG_2D(CR_READ, v, cr, TRC_PAR_LONG(value)); + HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(value)); HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%d, value = %lx", cr, value); } @@ -1531,13 +1531,13 @@ static int vmx_cr_access(unsigned long e case VMX_CONTROL_REG_ACCESS_TYPE_CLTS: v->arch.hvm_vcpu.guest_cr[0] &= ~X86_CR0_TS; vmx_update_guest_cr(v, 0); - HVMTRACE_0D(CLTS, current); + HVMTRACE_0D(CLTS); break; case VMX_CONTROL_REG_ACCESS_TYPE_LMSW: value = v->arch.hvm_vcpu.guest_cr[0]; /* LMSW can: (1) set bits 0-3; (2) clear bits 1-3. */ value = (value & ~0xe) | ((exit_qualification >> 16) & 0xf); - HVMTRACE_LONG_1D(LMSW, current, value); + HVMTRACE_LONG_1D(LMSW, value); return !hvm_set_cr0(value); default: BUG(); @@ -1692,7 +1692,7 @@ static int vmx_msr_read_intercept(struct regs->edx = (uint32_t)(msr_content >> 32); done: - HVMTRACE_3D (MSR_READ, v, ecx, regs->eax, regs->edx); + HVMTRACE_3D (MSR_READ, ecx, regs->eax, regs->edx); HVM_DBG_LOG(DBG_LEVEL_1, "returns: ecx=%x, eax=%lx, edx=%lx", ecx, (unsigned long)regs->eax, (unsigned long)regs->edx); @@ -1803,7 +1803,7 @@ static int vmx_msr_write_intercept(struc msr_content = (u32)regs->eax | ((u64)regs->edx << 32); - HVMTRACE_3D (MSR_WRITE, v, ecx, regs->eax, regs->edx); + HVMTRACE_3D (MSR_WRITE, ecx, regs->eax, regs->edx); switch ( ecx ) { @@ -1894,7 +1894,7 @@ static void vmx_do_extint(struct cpu_use BUG_ON(!(vector & INTR_INFO_VALID_MASK)); vector &= INTR_INFO_VECTOR_MASK; - HVMTRACE_1D(INTR, current, vector); + HVMTRACE_1D(INTR, vector); switch ( vector ) { @@ -2010,7 +2010,7 @@ static void vmx_failed_vmentry(unsigned break; case EXIT_REASON_MACHINE_CHECK: printk("caused by machine check.\n"); - HVMTRACE_0D(MCE, curr); + HVMTRACE_0D(MCE); do_machine_check(regs); break; default: @@ -2037,7 +2037,7 @@ asmlinkage void vmx_vmexit_handler(struc exit_reason = __vmread(VM_EXIT_REASON); - HVMTRACE_ND(VMEXIT64, 1/*cycles*/, v, 3, exit_reason, + HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason, (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32), 0, 0, 0); @@ -2101,7 +2101,8 @@ asmlinkage void vmx_vmexit_handler(struc !(__vmread(IDT_VECTORING_INFO) & INTR_INFO_VALID_MASK) && (vector != TRAP_double_fault) ) __vmwrite(GUEST_INTERRUPTIBILITY_INFO, - __vmread(GUEST_INTERRUPTIBILITY_INFO)|VMX_INTR_SHADOW_NMI); + __vmread(GUEST_INTERRUPTIBILITY_INFO) + | VMX_INTR_SHADOW_NMI); perfc_incra(cause_vector, vector); @@ -2128,12 +2129,14 @@ asmlinkage void vmx_vmexit_handler(struc if ( paging_fault(exit_qualification, regs) ) { + if ( trace_will_trace_event(TRC_SHADOW) ) + break; if ( hvm_long_mode_enabled(v) ) - HVMTRACE_LONG_2D (PF_XEN, v, regs->error_code, - TRC_PAR_LONG(exit_qualification) ); + HVMTRACE_LONG_2D(PF_XEN, regs->error_code, + 
TRC_PAR_LONG(exit_qualification) ); else - HVMTRACE_2D (PF_XEN, v, - regs->error_code, exit_qualification ); + HVMTRACE_2D(PF_XEN, + regs->error_code, exit_qualification ); break; } @@ -2144,11 +2147,11 @@ asmlinkage void vmx_vmexit_handler(struc if ( (intr_info & INTR_INFO_INTR_TYPE_MASK) != (X86_EVENTTYPE_NMI << 8) ) goto exit_and_crash; - HVMTRACE_0D(NMI, v); + HVMTRACE_0D(NMI); do_nmi(regs); /* Real NMI, vector 2: normal processing. */ break; case TRAP_machine_check: - HVMTRACE_0D(MCE, v); + HVMTRACE_0D(MCE); do_machine_check(regs); break; default: @@ -2213,7 +2216,7 @@ asmlinkage void vmx_vmexit_handler(struc case EXIT_REASON_VMCALL: { int rc; - HVMTRACE_1D(VMMCALL, v, regs->eax); + HVMTRACE_1D(VMMCALL, regs->eax); inst_len = __get_instruction_length(); /* Safe: VMCALL */ rc = hvm_do_hypercall(regs); if ( rc != HVM_HCALL_preempted ) @@ -2300,7 +2303,7 @@ asmlinkage void vmx_vmexit_handler(struc asmlinkage void vmx_trace_vmentry(void) { - HVMTRACE_ND (VMENTRY, 1/*cycles*/, current, 0, 0, 0, 0, 0, 0, 0); + HVMTRACE_ND (VMENTRY, 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0); } /* diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/io_apic.c --- a/xen/arch/x86/io_apic.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/io_apic.c Fri Sep 12 14:47:40 2008 +0900 @@ -45,23 +45,14 @@ int (*ioapic_renumber_irq)(int ioapic, i int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; -int msi_enable = 0; -boolean_param("msi", msi_enable); - int domain_irq_to_vector(struct domain *d, int irq) { - if ( !msi_enable ) - return irq_to_vector(irq); - else - return d->arch.pirq_vector[irq]; + return d->arch.pirq_vector[irq]; } int domain_vector_to_irq(struct domain *d, int vector) { - if ( !msi_enable ) - return vector_to_irq(vector); - else - return d->arch.vector_pirq[vector]; + return d->arch.vector_pirq[vector]; } /* Where if anywhere is the i8259 connect in external int mode */ diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/irq.c --- a/xen/arch/x86/irq.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/irq.c Fri Sep 12 14:47:40 2008 +0900 @@ -737,9 +737,12 @@ __initcall(setup_dump_irqs); void fixup_irqs(cpumask_t map) { - unsigned int irq; + unsigned int irq, sp; static int warned; - + irq_guest_action_t *action; + struct pending_eoi *peoi; + + /* Direct all future interrupts away from this CPU. */ for ( irq = 0; irq < NR_IRQS; irq++ ) { cpumask_t mask; @@ -758,8 +761,24 @@ void fixup_irqs(cpumask_t map) printk("Cannot set affinity for irq %i\n", irq); } + /* Service any interrupts that beat us in the re-direction race. */ local_irq_enable(); mdelay(1); local_irq_disable(); + + /* Clean up cpu_eoi_map of every interrupt to exclude this CPU. */ + for ( irq = 0; irq < NR_IRQS; irq++ ) + { + if ( !(irq_desc[irq].status & IRQ_GUEST) ) + continue; + action = (irq_guest_action_t *)irq_desc[irq].action; + cpu_clear(smp_processor_id(), action->cpu_eoi_map); + } + + /* Flush the interrupt EOI stack. 
*/ + peoi = this_cpu(pending_eoi); + for ( sp = 0; sp < pending_eoi_sp(peoi); sp++ ) + peoi[sp].ready = 1; + flush_ready_eoi(NULL); } #endif diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/mm.c Fri Sep 12 14:47:40 2008 +0900 @@ -507,11 +507,11 @@ static int alloc_segdesc_page(struct pag goto fail; unmap_domain_page(descs); - return 1; + return 0; fail: unmap_domain_page(descs); - return 0; + return -EINVAL; } @@ -565,20 +565,23 @@ static int get_page_from_pagenr(unsigned static int get_page_and_type_from_pagenr(unsigned long page_nr, unsigned long type, - struct domain *d) + struct domain *d, + int preemptible) { struct page_info *page = mfn_to_page(page_nr); + int rc; if ( unlikely(!get_page_from_pagenr(page_nr, d)) ) - return 0; - - if ( unlikely(!get_page_type(page, type)) ) - { + return -EINVAL; + + rc = (preemptible ? + get_page_type_preemptible(page, type) : + (get_page_type(page, type) ? 0 : -EINVAL)); + + if ( rc ) put_page(page); - return 0; - } - - return 1; + + return rc; } /* @@ -754,22 +757,22 @@ get_page_from_l2e( if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) { MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK); - return 0; - } - - rc = get_page_and_type_from_pagenr(l2e_get_pfn(l2e), PGT_l1_page_table, d); - if ( unlikely(!rc) ) - rc = get_l2_linear_pagetable(l2e, pfn, d); + return -EINVAL; + } + + rc = get_page_and_type_from_pagenr( + l2e_get_pfn(l2e), PGT_l1_page_table, d, 0); + if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) ) + rc = 0; return rc; } -#if CONFIG_PAGING_LEVELS >= 3 define_get_linear_pagetable(l3); static int get_page_from_l3e( - l3_pgentry_t l3e, unsigned long pfn, struct domain *d) + l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int preemptible) { int rc; @@ -779,22 +782,22 @@ get_page_from_l3e( if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) ) { MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & l3_disallow_mask(d)); - return 0; - } - - rc = get_page_and_type_from_pagenr(l3e_get_pfn(l3e), PGT_l2_page_table, d); - if ( unlikely(!rc) ) - rc = get_l3_linear_pagetable(l3e, pfn, d); + return -EINVAL; + } + + rc = get_page_and_type_from_pagenr( + l3e_get_pfn(l3e), PGT_l2_page_table, d, preemptible); + if ( unlikely(rc == -EINVAL) && get_l3_linear_pagetable(l3e, pfn, d) ) + rc = 0; return rc; } -#endif /* 3 level */ #if CONFIG_PAGING_LEVELS >= 4 define_get_linear_pagetable(l4); static int get_page_from_l4e( - l4_pgentry_t l4e, unsigned long pfn, struct domain *d) + l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int preemptible) { int rc; @@ -804,12 +807,13 @@ get_page_from_l4e( if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) { MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK); - return 0; - } - - rc = get_page_and_type_from_pagenr(l4e_get_pfn(l4e), PGT_l3_page_table, d); - if ( unlikely(!rc) ) - rc = get_l4_linear_pagetable(l4e, pfn, d); + return -EINVAL; + } + + rc = get_page_and_type_from_pagenr( + l4e_get_pfn(l4e), PGT_l3_page_table, d, preemptible); + if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) ) + rc = 0; return rc; } @@ -946,29 +950,35 @@ void put_page_from_l1e(l1_pgentry_t l1e, * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. * Note also that this automatically deals correctly with linear p.t.'s. 
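
A recurring subtlety in these mm.c changes is the move from boolean results to an errno-style convention: 0 for success, -EINVAL for a hard failure, -EAGAIN or -EINTR when a preemptible operation must be resumed, and, for the put_* helpers above, a positive return meaning the entry was not present so no reference was touched. A toy illustration of how a caller treats the positive case (invented helper, not the patch's code):

    /* rc_demo.c -- toy model of the mixed return-code convention. */
    #include <errno.h>
    #include <stdio.h>

    /* Invented helper following the convention:
     *   1       -> entry not present, no reference taken or dropped
     *   0       -> reference dropped
     *   -EAGAIN -> preempted, caller must come back and finish */
    static int put_entry(int present, int preempted)
    {
        if ( !present )
            return 1;
        if ( preempted )
            return -EAGAIN;
        return 0;
    }

    int main(void)
    {
        int rc = put_entry(1, 0);

        /* Only negative values mean "work left to do"; both 0 and 1
         * mean the entry needs no further attention. */
        if ( rc >= 0 )
            printf("done (rc=%d)\n", rc);
        else
            printf("must continue later (rc=%d)\n", rc);
        return 0;
    }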
*/ -static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) +static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn) { if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) && (l2e_get_pfn(l2e) != pfn) ) + { put_page_and_type(l2e_get_page(l2e)); -} - - -#if CONFIG_PAGING_LEVELS >= 3 -static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn) + return 0; + } + return 1; +} + + +static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn, + int preemptible) { if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && (l3e_get_pfn(l3e) != pfn) ) - put_page_and_type(l3e_get_page(l3e)); -} -#endif + return put_page_and_type_preemptible(l3e_get_page(l3e), preemptible); + return 1; +} #if CONFIG_PAGING_LEVELS >= 4 -static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn) +static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn, + int preemptible) { if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && (l4e_get_pfn(l4e) != pfn) ) - put_page_and_type(l4e_get_page(l4e)); + return put_page_and_type_preemptible(l4e_get_page(l4e), preemptible); + return 1; } #endif @@ -977,7 +987,7 @@ static int alloc_l1_table(struct page_in struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l1_pgentry_t *pl1e; - int i; + unsigned int i; pl1e = map_domain_page(pfn); @@ -991,7 +1001,7 @@ static int alloc_l1_table(struct page_in } unmap_domain_page(pl1e); - return 1; + return 0; fail: MEM_LOG("Failure in alloc_l1_table: entry %d", i); @@ -1000,7 +1010,7 @@ static int alloc_l1_table(struct page_in put_page_from_l1e(pl1e[i], d); unmap_domain_page(pl1e); - return 0; + return -EINVAL; } static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e) @@ -1128,47 +1138,53 @@ static void pae_flush_pgd( # define pae_flush_pgd(mfn, idx, nl3e) ((void)0) #endif -static int alloc_l2_table(struct page_info *page, unsigned long type) +static int alloc_l2_table(struct page_info *page, unsigned long type, + int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l2_pgentry_t *pl2e; - int i; + unsigned int i; + int rc = 0; pl2e = map_domain_page(pfn); - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - { - if ( !is_guest_l2_slot(d, type, i) ) + for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ ) + { + if ( preemptible && i && hypercall_preempt_check() ) + { + page->nr_validated_ptes = i; + rc = -EAGAIN; + break; + } + + if ( !is_guest_l2_slot(d, type, i) || + (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 ) continue; - if ( unlikely(!get_page_from_l2e(pl2e[i], pfn, d)) ) - goto fail; - + if ( rc < 0 ) + { + MEM_LOG("Failure in alloc_l2_table: entry %d", i); + while ( i-- > 0 ) + if ( is_guest_l2_slot(d, type, i) ) + put_page_from_l2e(pl2e[i], pfn); + break; + } + adjust_guest_l2e(pl2e[i], d); } unmap_domain_page(pl2e); - return 1; - - fail: - MEM_LOG("Failure in alloc_l2_table: entry %d", i); - while ( i-- > 0 ) - if ( is_guest_l2_slot(d, type, i) ) - put_page_from_l2e(pl2e[i], pfn); - - unmap_domain_page(pl2e); - return 0; -} - - -#if CONFIG_PAGING_LEVELS >= 3 -static int alloc_l3_table(struct page_info *page) + return rc > 0 ? 
0 : rc; +} + +static int alloc_l3_table(struct page_info *page, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l3_pgentry_t *pl3e; - int i; + unsigned int i; + int rc = 0; #if CONFIG_PAGING_LEVELS == 3 /* @@ -1181,7 +1197,7 @@ static int alloc_l3_table(struct page_in d->vcpu[0] && d->vcpu[0]->is_initialised ) { MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn); - return 0; + return -EINVAL; } #endif @@ -1197,64 +1213,96 @@ static int alloc_l3_table(struct page_in if ( is_pv_32on64_domain(d) ) memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e)); - for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) + for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES; i++ ) { if ( is_pv_32bit_domain(d) && (i == 3) ) { if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) || - (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) || - !get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]), - PGT_l2_page_table | - PGT_pae_xen_l2, - d) ) - goto fail; - } - else if ( !is_guest_l3_slot(i) ) + (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) ) + rc = -EINVAL; + else + rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]), + PGT_l2_page_table | + PGT_pae_xen_l2, + d, preemptible); + } + else if ( !is_guest_l3_slot(i) || + (rc = get_page_from_l3e(pl3e[i], pfn, d, preemptible)) > 0 ) continue; - else if ( unlikely(!get_page_from_l3e(pl3e[i], pfn, d)) ) - goto fail; + + if ( rc == -EAGAIN ) + { + page->nr_validated_ptes = i; + page->partial_pte = 1; + } + else if ( rc == -EINTR && i ) + { + page->nr_validated_ptes = i; + page->partial_pte = 0; + rc = -EAGAIN; + } + if ( rc < 0 ) + break; adjust_guest_l3e(pl3e[i], d); } - if ( !create_pae_xen_mappings(d, pl3e) ) - goto fail; + if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) ) + rc = -EINVAL; + if ( rc < 0 && rc != -EAGAIN && rc != -EINTR ) + { + MEM_LOG("Failure in alloc_l3_table: entry %d", i); + while ( i-- > 0 ) + { + if ( !is_guest_l3_slot(i) ) + continue; + unadjust_guest_l3e(pl3e[i], d); + put_page_from_l3e(pl3e[i], pfn, 0); + } + } unmap_domain_page(pl3e); - return 1; - - fail: - MEM_LOG("Failure in alloc_l3_table: entry %d", i); - while ( i-- > 0 ) - { - if ( !is_guest_l3_slot(i) ) - continue; - unadjust_guest_l3e(pl3e[i], d); - put_page_from_l3e(pl3e[i], pfn); - } - - unmap_domain_page(pl3e); - return 0; -} -#else -#define alloc_l3_table(page) (0) -#endif + return rc > 0 ? 
0 : rc; +} #if CONFIG_PAGING_LEVELS >= 4 -static int alloc_l4_table(struct page_info *page) +static int alloc_l4_table(struct page_info *page, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l4_pgentry_t *pl4e = page_to_virt(page); - int i; - - for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) - { - if ( !is_guest_l4_slot(d, i) ) + unsigned int i; + int rc = 0; + + for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES; i++ ) + { + if ( !is_guest_l4_slot(d, i) || + (rc = get_page_from_l4e(pl4e[i], pfn, d, preemptible)) > 0 ) continue; - if ( unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) ) - goto fail; + if ( rc == -EAGAIN ) + { + page->nr_validated_ptes = i; + page->partial_pte = 1; + } + else if ( rc == -EINTR ) + { + if ( i ) + { + page->nr_validated_ptes = i; + page->partial_pte = 0; + rc = -EAGAIN; + } + } + else if ( rc < 0 ) + { + MEM_LOG("Failure in alloc_l4_table: entry %d", i); + while ( i-- > 0 ) + if ( is_guest_l4_slot(d, i) ) + put_page_from_l4e(pl4e[i], pfn, 0); + } + if ( rc < 0 ) + return rc; adjust_guest_l4e(pl4e[i], d); } @@ -1269,18 +1317,10 @@ static int alloc_l4_table(struct page_in l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR); - return 1; - - fail: - MEM_LOG("Failure in alloc_l4_table: entry %d", i); - while ( i-- > 0 ) - if ( is_guest_l4_slot(d, i) ) - put_page_from_l4e(pl4e[i], pfn); - - return 0; + return rc > 0 ? 0 : rc; } #else -#define alloc_l4_table(page) (0) +#define alloc_l4_table(page, preemptible) (-EINVAL) #endif @@ -1289,7 +1329,7 @@ static void free_l1_table(struct page_in struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l1_pgentry_t *pl1e; - int i; + unsigned int i; pl1e = map_domain_page(pfn); @@ -1301,74 +1341,114 @@ static void free_l1_table(struct page_in } -static void free_l2_table(struct page_info *page) +static int free_l2_table(struct page_info *page, int preemptible) { #ifdef CONFIG_COMPAT struct domain *d = page_get_owner(page); #endif unsigned long pfn = page_to_mfn(page); l2_pgentry_t *pl2e; - int i; + unsigned int i = page->nr_validated_ptes - 1; + int err = 0; pl2e = map_domain_page(pfn); - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) - if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) ) - put_page_from_l2e(pl2e[i], pfn); + ASSERT(page->nr_validated_ptes); + do { + if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) && + put_page_from_l2e(pl2e[i], pfn) == 0 && + preemptible && i && hypercall_preempt_check() ) + { + page->nr_validated_ptes = i; + err = -EAGAIN; + } + } while ( !err && i-- ); unmap_domain_page(pl2e); - page->u.inuse.type_info &= ~PGT_pae_xen_l2; -} - - -#if CONFIG_PAGING_LEVELS >= 3 - -static void free_l3_table(struct page_info *page) + if ( !err ) + page->u.inuse.type_info &= ~PGT_pae_xen_l2; + + return err; +} + +static int free_l3_table(struct page_info *page, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l3_pgentry_t *pl3e; - int i; + unsigned int i = page->nr_validated_ptes - !page->partial_pte; + int rc = 0; #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION if ( d->arch.relmem == RELMEM_l3 ) - return; + return 0; #endif pl3e = map_domain_page(pfn); - for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) + do { if ( is_guest_l3_slot(i) ) { - put_page_from_l3e(pl3e[i], pfn); + rc = put_page_from_l3e(pl3e[i], pfn, preemptible); + if ( rc > 0 ) + continue; + if ( rc ) + break; unadjust_guest_l3e(pl3e[i], d); } + } while ( i-- ); unmap_domain_page(pl3e); -} - -#endif + + if ( rc 
== -EAGAIN ) + { + page->nr_validated_ptes = i; + page->partial_pte = 1; + } + else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; + page->partial_pte = 0; + rc = -EAGAIN; + } + return rc > 0 ? 0 : rc; +} #if CONFIG_PAGING_LEVELS >= 4 - -static void free_l4_table(struct page_info *page) +static int free_l4_table(struct page_info *page, int preemptible) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_mfn(page); l4_pgentry_t *pl4e = page_to_virt(page); - int i; + unsigned int i = page->nr_validated_ptes - !page->partial_pte; + int rc = 0; #ifdef DOMAIN_DESTRUCT_AVOID_RECURSION if ( d->arch.relmem == RELMEM_l4 ) - return; + return 0; #endif - for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) + do { if ( is_guest_l4_slot(d, i) ) - put_page_from_l4e(pl4e[i], pfn); -} - + rc = put_page_from_l4e(pl4e[i], pfn, preemptible); + } while ( rc >= 0 && i-- ); + + if ( rc == -EAGAIN ) + { + page->nr_validated_ptes = i; + page->partial_pte = 1; + } + else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 ) + { + page->nr_validated_ptes = i + 1; + page->partial_pte = 0; + rc = -EAGAIN; + } + return rc > 0 ? 0 : rc; +} +#else +#define free_l4_table(page, preemptible) (-EINVAL) #endif static void page_lock(struct page_info *page) @@ -1560,7 +1640,7 @@ static int mod_l2_entry(l2_pgentry_t *pl return rc; } - if ( unlikely(!get_page_from_l2e(nl2e, pfn, d)) ) + if ( unlikely(get_page_from_l2e(nl2e, pfn, d) < 0) ) return page_unlock(l2pg), 0; adjust_guest_l2e(nl2e, d); @@ -1582,25 +1662,24 @@ static int mod_l2_entry(l2_pgentry_t *pl put_page_from_l2e(ol2e, pfn); return rc; } - -#if CONFIG_PAGING_LEVELS >= 3 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */ static int mod_l3_entry(l3_pgentry_t *pl3e, l3_pgentry_t nl3e, unsigned long pfn, - int preserve_ad) + int preserve_ad, + int preemptible) { l3_pgentry_t ol3e; struct vcpu *curr = current; struct domain *d = curr->domain; struct page_info *l3pg = mfn_to_page(pfn); - int rc = 1; + int rc = 0; if ( unlikely(!is_guest_l3_slot(pgentry_ptr_to_slot(pl3e))) ) { MEM_LOG("Illegal L3 update attempt in Xen-private area %p", pl3e); - return 0; + return -EINVAL; } /* @@ -1608,12 +1687,12 @@ static int mod_l3_entry(l3_pgentry_t *pl * would be a pain to ensure they remain continuously valid throughout. */ if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) ) - return 0; + return -EINVAL; page_lock(l3pg); if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) ) - return page_unlock(l3pg), 0; + return page_unlock(l3pg), -EFAULT; if ( l3e_get_flags(nl3e) & _PAGE_PRESENT ) { @@ -1622,7 +1701,7 @@ static int mod_l3_entry(l3_pgentry_t *pl page_unlock(l3pg); MEM_LOG("Bad L3 flags %x", l3e_get_flags(nl3e) & l3_disallow_mask(d)); - return 0; + return -EINVAL; } /* Fast path for identical mapping and presence. */ @@ -1631,28 +1710,30 @@ static int mod_l3_entry(l3_pgentry_t *pl adjust_guest_l3e(nl3e, d); rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad); page_unlock(l3pg); - return rc; - } - - if ( unlikely(!get_page_from_l3e(nl3e, pfn, d)) ) - return page_unlock(l3pg), 0; + return rc ? 
0 : -EFAULT; + } + + rc = get_page_from_l3e(nl3e, pfn, d, preemptible); + if ( unlikely(rc < 0) ) + return page_unlock(l3pg), rc; + rc = 0; adjust_guest_l3e(nl3e, d); if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad)) ) { ol3e = nl3e; - rc = 0; + rc = -EFAULT; } } else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, curr, preserve_ad)) ) { page_unlock(l3pg); - return 0; - } - - if ( likely(rc) ) + return -EFAULT; + } + + if ( likely(rc == 0) ) { if ( !create_pae_xen_mappings(d, pl3e) ) BUG(); @@ -1661,11 +1742,9 @@ static int mod_l3_entry(l3_pgentry_t *pl } page_unlock(l3pg); - put_page_from_l3e(ol3e, pfn); + put_page_from_l3e(ol3e, pfn, 0); return rc; } - -#endif #if CONFIG_PAGING_LEVELS >= 4 @@ -1673,24 +1752,25 @@ static int mod_l4_entry(l4_pgentry_t *pl static int mod_l4_entry(l4_pgentry_t *pl4e, l4_pgentry_t nl4e, unsigned long pfn, - int preserve_ad) + int preserve_ad, + int preemptible) { struct vcpu *curr = current; struct domain *d = curr->domain; l4_pgentry_t ol4e; struct page_info *l4pg = mfn_to_page(pfn); - int rc = 1; + int rc = 0; if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) ) { MEM_LOG("Illegal L4 update attempt in Xen-private area %p", pl4e); - return 0; + return -EINVAL; } page_lock(l4pg); if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) ) - return page_unlock(l4pg), 0; + return page_unlock(l4pg), -EFAULT; if ( l4e_get_flags(nl4e) & _PAGE_PRESENT ) { @@ -1699,7 +1779,7 @@ static int mod_l4_entry(l4_pgentry_t *pl page_unlock(l4pg); MEM_LOG("Bad L4 flags %x", l4e_get_flags(nl4e) & L4_DISALLOW_MASK); - return 0; + return -EINVAL; } /* Fast path for identical mapping and presence. */ @@ -1708,29 +1788,31 @@ static int mod_l4_entry(l4_pgentry_t *pl adjust_guest_l4e(nl4e, d); rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad); page_unlock(l4pg); - return rc; - } - - if ( unlikely(!get_page_from_l4e(nl4e, pfn, d)) ) - return page_unlock(l4pg), 0; + return rc ? 0 : -EFAULT; + } + + rc = get_page_from_l4e(nl4e, pfn, d, preemptible); + if ( unlikely(rc < 0) ) + return page_unlock(l4pg), rc; + rc = 0; adjust_guest_l4e(nl4e, d); if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad)) ) { ol4e = nl4e; - rc = 0; + rc = -EFAULT; } } else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, curr, preserve_ad)) ) { page_unlock(l4pg); - return 0; + return -EFAULT; } page_unlock(l4pg); - put_page_from_l4e(ol4e, pfn); + put_page_from_l4e(ol4e, pfn, 0); return rc; } @@ -1788,9 +1870,11 @@ int get_page(struct page_info *page, str } -static int alloc_page_type(struct page_info *page, unsigned long type) +static int alloc_page_type(struct page_info *page, unsigned long type, + int preemptible) { struct domain *owner = page_get_owner(page); + int rc; /* A page table is dirtied when its type count becomes non-zero. 
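 *
 * Validation below may now span several hypercalls.  Progress lives in
 * the page itself: page->nr_validated_ptes records how far the
 * per-level loop got, and page->partial_pte marks an entry whose type
 * reference was taken but whose own validation is still unfinished.
 * The rc bookkeeping at the end of this function is a small state
 * machine; condensed sketch (fields and flags as in this patch):
 *
 *     if ( rc == 0 )
 *         page->u.inuse.type_info |= PGT_validated;   // fully done
 *     else if ( rc == -EAGAIN )
 *         page->u.inuse.type_info |= PGT_partial;     // resume later
 *     else if ( rc == -EINTR )
 *         page->u.inuse.type_info &= ~PGT_count_mask; // nothing done,
 *                                                     // drop type ref
 *     else
 *         page->u.inuse.type_info = 0;                // hard failure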
*/ if ( likely(owner != NULL) ) @@ -1799,30 +1883,65 @@ static int alloc_page_type(struct page_i switch ( type & PGT_type_mask ) { case PGT_l1_page_table: - return alloc_l1_table(page); + alloc_l1_table(page); + rc = 0; + break; case PGT_l2_page_table: - return alloc_l2_table(page, type); + rc = alloc_l2_table(page, type, preemptible); + break; case PGT_l3_page_table: - return alloc_l3_table(page); + rc = alloc_l3_table(page, preemptible); + break; case PGT_l4_page_table: - return alloc_l4_table(page); + rc = alloc_l4_table(page, preemptible); + break; case PGT_seg_desc_page: - return alloc_segdesc_page(page); + rc = alloc_segdesc_page(page); + break; default: printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", type, page->u.inuse.type_info, page->count_info); + rc = -EINVAL; BUG(); } - return 0; -} - - -void free_page_type(struct page_info *page, unsigned long type) + /* No need for atomic update of type_info here: noone else updates it. */ + wmb(); + if ( rc == -EAGAIN ) + { + page->u.inuse.type_info |= PGT_partial; + } + else if ( rc == -EINTR ) + { + ASSERT((page->u.inuse.type_info & + (PGT_count_mask|PGT_validated|PGT_partial)) == 1); + page->u.inuse.type_info &= ~PGT_count_mask; + } + else if ( rc ) + { + ASSERT(rc < 0); + MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %" + PRtype_info ": caf=%08x taf=%" PRtype_info, + page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)), + type, page->count_info, page->u.inuse.type_info); + page->u.inuse.type_info = 0; + } + else + { + page->u.inuse.type_info |= PGT_validated; + } + + return rc; +} + + +int free_page_type(struct page_info *page, unsigned long type, + int preemptible) { struct domain *owner = page_get_owner(page); unsigned long gmfn; + int rc; if ( likely(owner != NULL) ) { @@ -1842,7 +1961,7 @@ void free_page_type(struct page_info *pa paging_mark_dirty(owner, page_to_mfn(page)); if ( shadow_mode_refcounts(owner) ) - return; + return 0; gmfn = mfn_to_gmfn(owner, page_to_mfn(page)); ASSERT(VALID_M2P(gmfn)); @@ -1850,42 +1969,80 @@ void free_page_type(struct page_info *pa } } + if ( !(type & PGT_partial) ) + { + page->nr_validated_ptes = 1U << PAGETABLE_ORDER; + page->partial_pte = 0; + } switch ( type & PGT_type_mask ) { case PGT_l1_page_table: free_l1_table(page); + rc = 0; break; - case PGT_l2_page_table: - free_l2_table(page); + rc = free_l2_table(page, preemptible); break; - -#if CONFIG_PAGING_LEVELS >= 3 case PGT_l3_page_table: - free_l3_table(page); +#if CONFIG_PAGING_LEVELS == 3 + if ( !(type & PGT_partial) ) + page->nr_validated_ptes = L3_PAGETABLE_ENTRIES; +#endif + rc = free_l3_table(page, preemptible); break; -#endif - -#if CONFIG_PAGING_LEVELS >= 4 case PGT_l4_page_table: - free_l4_table(page); + rc = free_l4_table(page, preemptible); break; -#endif - default: - printk("%s: type %lx pfn %lx\n",__FUNCTION__, - type, page_to_mfn(page)); + MEM_LOG("type %lx pfn %lx\n", type, page_to_mfn(page)); + rc = -EINVAL; BUG(); } -} - - -void put_page_type(struct page_info *page) + + /* No need for atomic update of type_info here: noone else updates it. */ + if ( rc == 0 ) + { + /* + * Record TLB information for flush later. We do not stamp page tables + * when running in shadow mode: + * 1. Pointless, since it's the shadow pt's which must be tracked. + * 2. Shadow mode reuses this field for shadowed page tables to + * store flags info -- we don't want to conflict with that. 
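 *
 * The branches just below mirror the allocation side: on clean
 * completion the TLB stamp is taken and the type count finally drops;
 * on -EINTR the teardown never really started, so PGT_validated is put
 * back and the page looks untouched; on -EAGAIN the half-freed state
 * is advertised via PGT_partial so a later put can resume.  A
 * hypothetical caller-side retry loop over the preemptible entry point
 * (the function is real, the loop is illustrative; a hypercall
 * continuation is the preferred way to bounce out):
 *
 *     int rc;
 *     do {
 *         rc = put_page_type_preemptible(page);
 *     } while ( rc == -EINTR || rc == -EAGAIN );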
+ */ + if ( !(shadow_mode_enabled(page_get_owner(page)) && + (page->count_info & PGC_page_table)) ) + page->tlbflush_timestamp = tlbflush_current_time(); + wmb(); + page->u.inuse.type_info--; + } + else if ( rc == -EINTR ) + { + ASSERT(!(page->u.inuse.type_info & + (PGT_count_mask|PGT_validated|PGT_partial))); + if ( !(shadow_mode_enabled(page_get_owner(page)) && + (page->count_info & PGC_page_table)) ) + page->tlbflush_timestamp = tlbflush_current_time(); + wmb(); + page->u.inuse.type_info |= PGT_validated; + } + else + { + BUG_ON(rc != -EAGAIN); + wmb(); + page->u.inuse.type_info |= PGT_partial; + } + + return rc; +} + + +static int __put_page_type(struct page_info *page, + int preemptible) { unsigned long nx, x, y = page->u.inuse.type_info; - again: - do { + for ( ; ; ) + { x = y; nx = x - 1; @@ -1894,21 +2051,19 @@ void put_page_type(struct page_info *pag if ( unlikely((nx & PGT_count_mask) == 0) ) { if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) && - likely(nx & PGT_validated) ) + likely(nx & (PGT_validated|PGT_partial)) ) { /* * Page-table pages must be unvalidated when count is zero. The * 'free' is safe because the refcnt is non-zero and validated * bit is clear => other ops will spin or fail. */ - if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, - x & ~PGT_validated)) != x) ) - goto again; + nx = x & ~(PGT_validated|PGT_partial); + if ( unlikely((y = cmpxchg(&page->u.inuse.type_info, + x, nx)) != x) ) + continue; /* We cleared the 'valid bit' so we do the clean up. */ - free_page_type(page, x); - /* Carry on, but with the 'valid bit' now clear. */ - x &= ~PGT_validated; - nx &= ~PGT_validated; + return free_page_type(page, x, preemptible); } /* @@ -1922,25 +2077,33 @@ void put_page_type(struct page_info *pag (page->count_info & PGC_page_table)) ) page->tlbflush_timestamp = tlbflush_current_time(); } - } - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); -} - - -int get_page_type(struct page_info *page, unsigned long type) + + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) + break; + + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; + } + + return 0; +} + + +static int __get_page_type(struct page_info *page, unsigned long type, + int preemptible) { unsigned long nx, x, y = page->u.inuse.type_info; ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); - again: - do { + for ( ; ; ) + { x = y; nx = x + 1; if ( unlikely((nx & PGT_count_mask) == 0) ) { MEM_LOG("Type count overflow on pfn %lx", page_to_mfn(page)); - return 0; + return -EINVAL; } else if ( unlikely((x & PGT_count_mask) == 0) ) { @@ -1993,28 +2156,43 @@ int get_page_type(struct page_info *page /* Don't log failure if it could be a recursive-mapping attempt. */ if ( ((x & PGT_type_mask) == PGT_l2_page_table) && (type == PGT_l1_page_table) ) - return 0; + return -EINVAL; if ( ((x & PGT_type_mask) == PGT_l3_page_table) && (type == PGT_l2_page_table) ) - return 0; + return -EINVAL; if ( ((x & PGT_type_mask) == PGT_l4_page_table) && (type == PGT_l3_page_table) ) - return 0; + return -EINVAL; MEM_LOG("Bad type (saw %" PRtype_info " != exp %" PRtype_info ") " "for mfn %lx (pfn %lx)", x, type, page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page))); - return 0; + return -EINVAL; } else if ( unlikely(!(x & PGT_validated)) ) { - /* Someone else is updating validation of this page. Wait... 
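 *
 * This busy-wait is the other spot where preemption is threaded in: a
 * vcpu stuck behind another CPU's validation can now give up with
 * -EINTR instead of spinning, which is safe here because no reference
 * has been taken yet.  The hypercall layer then turns either
 * preemption code into a restartable continuation; sketch of the
 * pattern used by do_mmuext_op()/do_mmu_update() further down (names
 * as in this patch):
 *
 *     rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1);
 *     if ( rc == -EINTR )
 *         rc = -EAGAIN;               // no work done yet: plain retry
 *     ...
 *     if ( rc == -EAGAIN )            // encode progress, exit to guest
 *         rc = hypercall_create_continuation(
 *             __HYPERVISOR_mmuext_op, "hihi",
 *             uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);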
*/ - while ( (y = page->u.inuse.type_info) == x ) - cpu_relax(); - goto again; - } - } - while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); + if ( !(x & PGT_partial) ) + { + /* Someone else is updating validation of this page. Wait... */ + while ( (y = page->u.inuse.type_info) == x ) + { + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; + cpu_relax(); + } + continue; + } + /* Type ref count was left at 1 when PGT_partial got set. */ + ASSERT((x & PGT_count_mask) == 1); + nx = x & ~PGT_partial; + } + + if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) ) + break; + + if ( preemptible && hypercall_preempt_check() ) + return -EINTR; + } if ( unlikely((x & PGT_type_mask) != type) ) { @@ -2032,25 +2210,42 @@ int get_page_type(struct page_info *page if ( unlikely(!(nx & PGT_validated)) ) { - /* Try to validate page type; drop the new reference on failure. */ - if ( unlikely(!alloc_page_type(page, type)) ) - { - MEM_LOG("Error while validating mfn %lx (pfn %lx) for type %" - PRtype_info ": caf=%08x taf=%" PRtype_info, - page_to_mfn(page), get_gpfn_from_mfn(page_to_mfn(page)), - type, page->count_info, page->u.inuse.type_info); - /* Noone else can get a reference. We hold the only ref. */ - page->u.inuse.type_info = 0; - return 0; - } - - /* Noone else is updating simultaneously. */ - __set_bit(_PGT_validated, &page->u.inuse.type_info); - } - - return 1; -} - + if ( !(x & PGT_partial) ) + { + page->nr_validated_ptes = 0; + page->partial_pte = 0; + } + return alloc_page_type(page, type, preemptible); + } + + return 0; +} + +void put_page_type(struct page_info *page) +{ + int rc = __put_page_type(page, 0); + ASSERT(rc == 0); + (void)rc; +} + +int get_page_type(struct page_info *page, unsigned long type) +{ + int rc = __get_page_type(page, type, 0); + if ( likely(rc == 0) ) + return 1; + ASSERT(rc == -EINVAL); + return 0; +} + +int put_page_type_preemptible(struct page_info *page) +{ + return __put_page_type(page, 1); +} + +int get_page_type_preemptible(struct page_info *page, unsigned long type) +{ + return __get_page_type(page, type, 1); +} void cleanup_page_cacheattr(struct page_info *page) { @@ -2087,7 +2282,7 @@ int new_guest_cr3(unsigned long mfn) l4e_from_pfn( mfn, (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED)), - pagetable_get_pfn(v->arch.guest_table), 0); + pagetable_get_pfn(v->arch.guest_table), 0, 0) == 0; if ( unlikely(!okay) ) { MEM_LOG("Error while installing new compat baseptr %lx", mfn); @@ -2102,7 +2297,7 @@ int new_guest_cr3(unsigned long mfn) #endif okay = paging_mode_refcounts(d) ? 
get_page_from_pagenr(mfn, d) - : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d); + : !get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0); if ( unlikely(!okay) ) { MEM_LOG("Error while installing new baseptr %lx", mfn); @@ -2276,9 +2471,7 @@ int do_mmuext_op( { if ( hypercall_preempt_check() ) { - rc = hypercall_create_continuation( - __HYPERVISOR_mmuext_op, "hihi", - uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + rc = -EAGAIN; break; } @@ -2325,10 +2518,14 @@ int do_mmuext_op( if ( paging_mode_refcounts(FOREIGNDOM) ) break; - okay = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM); + rc = get_page_and_type_from_pagenr(mfn, type, FOREIGNDOM, 1); + okay = !rc; if ( unlikely(!okay) ) { - MEM_LOG("Error while pinning mfn %lx", mfn); + if ( rc == -EINTR ) + rc = -EAGAIN; + else if ( rc != -EAGAIN ) + MEM_LOG("Error while pinning mfn %lx", mfn); break; } @@ -2373,8 +2570,11 @@ int do_mmuext_op( { put_page_and_type(page); put_page(page); - /* A page is dirtied when its pin status is cleared. */ - paging_mark_dirty(d, mfn); + if ( !rc ) + { + /* A page is dirtied when its pin status is cleared. */ + paging_mark_dirty(d, mfn); + } } else { @@ -2398,8 +2598,8 @@ int do_mmuext_op( if ( paging_mode_refcounts(d) ) okay = get_page_from_pagenr(mfn, d); else - okay = get_page_and_type_from_pagenr( - mfn, PGT_root_page_table, d); + okay = !get_page_and_type_from_pagenr( + mfn, PGT_root_page_table, d, 0); if ( unlikely(!okay) ) { MEM_LOG("Error while installing new mfn %lx", mfn); @@ -2517,6 +2717,11 @@ int do_mmuext_op( guest_handle_add_offset(uops, 1); } + if ( rc == -EAGAIN ) + rc = hypercall_create_continuation( + __HYPERVISOR_mmuext_op, "hihi", + uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + process_deferred_ops(); perfc_add(num_mmuext_ops, i); @@ -2576,9 +2781,7 @@ int do_mmu_update( { if ( hypercall_preempt_check() ) { - rc = hypercall_create_continuation( - __HYPERVISOR_mmu_update, "hihi", - ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + rc = -EAGAIN; break; } @@ -2601,7 +2804,7 @@ int do_mmu_update( */ case MMU_NORMAL_PT_UPDATE: case MMU_PT_UPDATE_PRESERVE_AD: - rc = xsm_mmu_normal_update(d, req.val); + rc = xsm_mmu_normal_update(d, FOREIGNDOM, req.val); if ( rc ) break; @@ -2653,27 +2856,29 @@ int do_mmu_update( cmd == MMU_PT_UPDATE_PRESERVE_AD); } break; -#if CONFIG_PAGING_LEVELS >= 3 case PGT_l3_page_table: { l3_pgentry_t l3e = l3e_from_intpte(req.val); - okay = mod_l3_entry(va, l3e, mfn, - cmd == MMU_PT_UPDATE_PRESERVE_AD); + rc = mod_l3_entry(va, l3e, mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, 1); + okay = !rc; } break; -#endif #if CONFIG_PAGING_LEVELS >= 4 case PGT_l4_page_table: { l4_pgentry_t l4e = l4e_from_intpte(req.val); - okay = mod_l4_entry(va, l4e, mfn, - cmd == MMU_PT_UPDATE_PRESERVE_AD); + rc = mod_l4_entry(va, l4e, mfn, + cmd == MMU_PT_UPDATE_PRESERVE_AD, 1); + okay = !rc; } break; #endif } put_page_type(page); + if ( rc == -EINTR ) + rc = -EAGAIN; } break; @@ -2741,6 +2946,11 @@ int do_mmu_update( guest_handle_add_offset(ureqs, 1); } + + if ( rc == -EAGAIN ) + rc = hypercall_create_continuation( + __HYPERVISOR_mmu_update, "hihi", + ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); process_deferred_ops(); @@ -3111,7 +3321,7 @@ int do_update_va_mapping(unsigned long v if ( unlikely(!access_ok(va, 1) && !paging_mode_external(d)) ) return -EINVAL; - rc = xsm_update_va_mapping(d, val); + rc = xsm_update_va_mapping(d, FOREIGNDOM, val); if ( rc ) return rc; @@ -3695,9 +3905,8 @@ static int 
ptwr_emulated_update( nl1e = l1e_from_intpte(val); if ( unlikely(!get_page_from_l1e(nl1e, d)) ) { - if ( (CONFIG_PAGING_LEVELS >= 3) && is_pv_32bit_domain(d) && - (bytes == 4) && (unaligned_addr & 4) && !do_cmpxchg && - (l1e_get_flags(nl1e) & _PAGE_PRESENT) ) + if ( is_pv_32bit_domain(d) && (bytes == 4) && (unaligned_addr & 4) && + !do_cmpxchg && (l1e_get_flags(nl1e) & _PAGE_PRESENT) ) { /* * If this is an upper-half write to a PAE PTE then we assume that diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/hap/hap.c --- a/xen/arch/x86/mm/hap/hap.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/mm/hap/hap.c Fri Sep 12 14:47:40 2008 +0900 @@ -37,6 +37,7 @@ #include <asm/shared.h> #include <asm/hap.h> #include <asm/paging.h> +#include <asm/p2m.h> #include <asm/domain.h> #include <xen/numa.h> diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/shadow/common.c --- a/xen/arch/x86/mm/shadow/common.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/mm/shadow/common.c Fri Sep 12 14:47:40 2008 +0900 @@ -39,6 +39,7 @@ #include <xen/numa.h> #include "private.h" +DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags); /* Set up the shadow-specific parts of a domain struct at start of day. * Called for every domain from arch_domain_create() */ @@ -630,6 +631,8 @@ void oos_fixup_add(struct vcpu *v, mfn_t if ( mfn_x(oos_fixup[idx].smfn[next]) != INVALID_MFN ) { + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_EVICT); + /* Reuse this slot and remove current writable mapping. */ sh_remove_write_access_from_sl1p(v, gmfn, oos_fixup[idx].smfn[next], @@ -645,6 +648,8 @@ void oos_fixup_add(struct vcpu *v, mfn_t oos_fixup[idx].smfn[next] = smfn; oos_fixup[idx].off[next] = off; oos_fixup[idx].next = (next + 1) % SHADOW_OOS_FIXUPS; + + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_OOS_FIXUP_ADD); return; } } @@ -687,6 +692,16 @@ static int oos_remove_write_access(struc } +static inline void trace_resync(int event, mfn_t gmfn) +{ + if ( tb_init_done ) + { + /* Convert gmfn to gfn */ + unsigned long gfn = mfn_to_gfn(current->domain, gmfn); + __trace_var(event, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn); + } +} + /* Pull all the entries on an out-of-sync page back into sync. */ static void _sh_resync(struct vcpu *v, mfn_t gmfn, struct oos_fixup *fixup, mfn_t snp) @@ -700,8 +715,8 @@ static void _sh_resync(struct vcpu *v, m & ~SHF_L1_ANY)); ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn))); - SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n", - v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va); + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n", + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); /* Need to pull write access so the page *stays* in sync. */ if ( oos_remove_write_access(v, gmfn, fixup) ) @@ -719,6 +734,7 @@ static void _sh_resync(struct vcpu *v, m /* Now we know all the entries are synced, and will stay that way */ pg->shadow_flags &= ~SHF_out_of_sync; perfc_incr(shadow_resync); + trace_resync(TRC_SHADOW_RESYNC_FULL, gmfn); } @@ -930,6 +946,7 @@ void sh_resync_all(struct vcpu *v, int s /* Update the shadows and leave the page OOS. */ if ( sh_skip_sync(v, oos[idx]) ) continue; + trace_resync(TRC_SHADOW_RESYNC_ONLY, oos[idx]); _sh_resync_l1(other, oos[idx], oos_snapshot[idx]); } else @@ -945,15 +962,16 @@ void sh_resync_all(struct vcpu *v, int s } } -/* Allow a shadowed page to go out of sync */ +/* Allow a shadowed page to go out of sync. 
Unsyncs are traced in + * multi.c:sh_page_fault() */ int sh_unsync(struct vcpu *v, mfn_t gmfn) { struct page_info *pg; ASSERT(shadow_locked_by_me(v->domain)); - SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n", - v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va); + SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx\n", + v->domain->domain_id, v->vcpu_id, mfn_x(gmfn)); pg = mfn_to_page(gmfn); @@ -970,6 +988,7 @@ int sh_unsync(struct vcpu *v, mfn_t gmfn pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write; oos_hash_add(v, gmfn); perfc_incr(shadow_unsync); + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC); return 1; } @@ -1005,6 +1024,7 @@ void shadow_promote(struct vcpu *v, mfn_ ASSERT(!test_bit(type, &page->shadow_flags)); set_bit(type, &page->shadow_flags); + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PROMOTE); } void shadow_demote(struct vcpu *v, mfn_t gmfn, u32 type) @@ -1027,6 +1047,8 @@ void shadow_demote(struct vcpu *v, mfn_t #endif clear_bit(_PGC_page_table, &page->count_info); } + + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_DEMOTE); } /**************************************************************************/ @@ -1094,6 +1116,7 @@ sh_validate_guest_entry(struct vcpu *v, ASSERT((page->shadow_flags & (SHF_L4_64|SHF_L3_64|SHF_L2H_64|SHF_L2_64|SHF_L1_64)) == 0); #endif + this_cpu(trace_shadow_path_flags) |= (result<<(TRCE_SFLAG_SET_CHANGED)); return result; } @@ -1295,6 +1318,18 @@ static void shadow_unhook_mappings(struc } } +static inline void trace_shadow_prealloc_unpin(struct domain *d, mfn_t smfn) +{ + if ( tb_init_done ) + { + /* Convert smfn to gfn */ + unsigned long gfn; + ASSERT(mfn_valid(smfn)); + gfn = mfn_to_gfn(d, _mfn(mfn_to_shadow_page(smfn)->backpointer)); + __trace_var(TRC_SHADOW_PREALLOC_UNPIN, 0/*!tsc*/, + sizeof(gfn), (unsigned char*)&gfn); + } +} /* Make sure there are at least count order-sized pages * available in the shadow page pool. */ @@ -1327,6 +1362,7 @@ static void _shadow_prealloc( smfn = shadow_page_to_mfn(sp); /* Unpin this top-level shadow */ + trace_shadow_prealloc_unpin(d, smfn); sh_unpin(v, smfn); /* See if that freed up enough space */ @@ -1343,6 +1379,7 @@ static void _shadow_prealloc( { if ( !pagetable_is_null(v2->arch.shadow_table[i]) ) { + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_PREALLOC_UNHOOK); shadow_unhook_mappings(v, pagetable_get_mfn(v2->arch.shadow_table[i])); @@ -2200,6 +2237,16 @@ void sh_destroy_shadow(struct vcpu *v, m } } +static inline void trace_shadow_wrmap_bf(mfn_t gmfn) +{ + if ( tb_init_done ) + { + /* Convert gmfn to gfn */ + unsigned long gfn = mfn_to_gfn(current->domain, gmfn); + __trace_var(TRC_SHADOW_WRMAP_BF, 0/*!tsc*/, sizeof(gfn), (unsigned char*)&gfn); + } +} + /**************************************************************************/ /* Remove all writeable mappings of a guest frame from the shadow tables * Returns non-zero if we need to flush TLBs. 
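 *
 * The trace helpers added in this file share one shape: bail out
 * unless the trace buffer is live (tb_init_done), translate the
 * machine frame to a guest frame so the record means something to the
 * analysis tools, then emit a fixed-size record with __trace_var().
 * The brute-force hash walk below, for instance, is announced via
 * trace_shadow_wrmap_bf().  Minimal sketch of such an emitter
 * (trace_example is a hypothetical name; the calls are as used above):
 *
 *     static inline void trace_example(mfn_t gmfn)
 *     {
 *         if ( tb_init_done )
 *         {
 *             unsigned long gfn = mfn_to_gfn(current->domain, gmfn);
 *             // second argument 0: no cycle stamp, as in the helpers
 *             __trace_var(TRC_SHADOW_WRMAP_BF, 0, sizeof(gfn),
 *                         (unsigned char *)&gfn);
 *         }
 *     }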
@@ -2265,6 +2312,8 @@ int sh_remove_write_access(struct vcpu * || (pg->u.inuse.type_info & PGT_count_mask) == 0 ) return 0; + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP); + perfc_incr(shadow_writeable); /* If this isn't a "normal" writeable page, the domain is trying to @@ -2285,11 +2334,14 @@ int sh_remove_write_access(struct vcpu * * and that mapping is likely to be in the current pagetable, * in the guest's linear map (on non-HIGHPTE linux and windows)*/ -#define GUESS(_a, _h) do { \ +#define GUESS(_a, _h) do { \ if ( v->arch.paging.mode->shadow.guess_wrmap(v, (_a), gmfn) ) \ - perfc_incr(shadow_writeable_h_ ## _h); \ - if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \ - return 1; \ + perfc_incr(shadow_writeable_h_ ## _h); \ + if ( (pg->u.inuse.type_info & PGT_count_mask) == 0 ) \ + { \ + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); \ + return 1; \ + } \ } while (0) if ( level == 0 && fault_addr ) @@ -2377,6 +2429,7 @@ int sh_remove_write_access(struct vcpu * #endif /* SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC */ /* Brute-force search of all the shadows, by walking the hash */ + trace_shadow_wrmap_bf(gmfn); if ( level == 0 ) perfc_incr(shadow_writeable_bf_1); else diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/shadow/multi.c --- a/xen/arch/x86/mm/shadow/multi.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/mm/shadow/multi.c Fri Sep 12 14:47:40 2008 +0900 @@ -225,6 +225,7 @@ static uint32_t set_ad_bits(void *guest_ static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty) { guest_intpte_t old, new; + int ret = 0; old = *(guest_intpte_t *)walk_p; new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0); @@ -234,10 +235,16 @@ static uint32_t set_ad_bits(void *guest_ * into the guest table as well. If the guest table has changed * under out feet then leave it alone. */ *(guest_intpte_t *)walk_p = new; - if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) - return 1; - } - return 0; + if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) + ret = 1; + + /* FIXME -- this code is longer than necessary */ + if(set_dirty) + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD); + else + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A); + } + return ret; } /* This validation is called with lock held, and after write permission @@ -1432,6 +1439,7 @@ static int shadow_set_l1e(struct vcpu *v { /* About to install a new reference */ if ( shadow_mode_refcounts(d) ) { + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_GET_REF); if ( shadow_get_page_from_l1e(new_sl1e, d) == 0 ) { /* Doesn't look like a pagetable. 
*/ @@ -1461,6 +1469,7 @@ static int shadow_set_l1e(struct vcpu *v { shadow_vram_put_l1e(old_sl1e, sl1e, sl1mfn, d); shadow_put_page_from_l1e(old_sl1e, d); + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SHADOW_L1_PUT_REF); } } return flags; @@ -2896,6 +2905,7 @@ static inline void check_for_early_unsha { perfc_incr(shadow_early_unshadow); sh_remove_shadows(v, gmfn, 1, 0 /* Fast, can fail to unshadow */ ); + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EARLY_UNSHADOW); } v->arch.paging.shadow.last_emulated_mfn_for_unshadow = mfn_x(gmfn); #endif @@ -3012,6 +3022,132 @@ static void sh_prefetch(struct vcpu *v, #endif /* SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH */ +#if GUEST_PAGING_LEVELS == 4 +typedef u64 guest_va_t; +typedef u64 guest_pa_t; +#elif GUEST_PAGING_LEVELS == 3 +typedef u32 guest_va_t; +typedef u64 guest_pa_t; +#else +typedef u32 guest_va_t; +typedef u32 guest_pa_t; +#endif + +static inline void trace_shadow_gen(u32 event, guest_va_t va) +{ + if ( tb_init_done ) + { + event |= (GUEST_PAGING_LEVELS-2)<<8; + __trace_var(event, 0/*!tsc*/, sizeof(va), (unsigned char*)&va); + } +} + +static inline void trace_shadow_fixup(guest_l1e_t gl1e, + guest_va_t va) +{ + if ( tb_init_done ) + { + struct { + /* for PAE, guest_l1e may be 64 while guest_va may be 32; + so put it first for alignment sake. */ + guest_l1e_t gl1e; + guest_va_t va; + u32 flags; + } __attribute__((packed)) d; + u32 event; + + event = TRC_SHADOW_FIXUP | ((GUEST_PAGING_LEVELS-2)<<8); + + d.gl1e = gl1e; + d.va = va; + d.flags = this_cpu(trace_shadow_path_flags); + + __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d); + } +} + +static inline void trace_not_shadow_fault(guest_l1e_t gl1e, + guest_va_t va) +{ + if ( tb_init_done ) + { + struct { + /* for PAE, guest_l1e may be 64 while guest_va may be 32; + so put it first for alignment sake. */ + guest_l1e_t gl1e; + guest_va_t va; + u32 flags; + } __attribute__((packed)) d; + u32 event; + + event = TRC_SHADOW_NOT_SHADOW | ((GUEST_PAGING_LEVELS-2)<<8); + + d.gl1e = gl1e; + d.va = va; + d.flags = this_cpu(trace_shadow_path_flags); + + __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d); + } +} + +static inline void trace_shadow_emulate_other(u32 event, + guest_va_t va, + gfn_t gfn) +{ + if ( tb_init_done ) + { + struct { + /* for PAE, guest_l1e may be 64 while guest_va may be 32; + so put it first for alignment sake. */ +#if GUEST_PAGING_LEVELS == 2 + u32 gfn; +#else + u64 gfn; +#endif + guest_va_t va; + } __attribute__((packed)) d; + + event |= ((GUEST_PAGING_LEVELS-2)<<8); + + d.gfn=gfn_x(gfn); + d.va = va; + + __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d); + } +} + +#if GUEST_PAGING_LEVELS == 3 +static DEFINE_PER_CPU(guest_va_t,trace_emulate_initial_va); +static DEFINE_PER_CPU(int,trace_extra_emulation_count); +#endif +static DEFINE_PER_CPU(guest_pa_t,trace_emulate_write_val); + +static inline void trace_shadow_emulate(guest_l1e_t gl1e, unsigned long va) +{ + if ( tb_init_done ) + { + struct { + /* for PAE, guest_l1e may be 64 while guest_va may be 32; + so put it first for alignment sake. 
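 *
 * Packing note: this record is __attribute__((packed)) and is copied
 * into the trace buffer verbatim, so field order fixes the wire
 * format.  With the wide gl1e/write_val fields first, the sizes work
 * out as follows (assuming a 4-byte guest_l1e_t on 2-level guests):
 *
 *     // 2-level guest: 4 + 4 + 4 (va) + 4 (flags word) = 16 bytes
 *     // 3-level guest: 8 + 8 + 4 (va) + 4              = 24 bytes
 *     // 4-level guest: 8 + 8 + 8 (va) + 4              = 28 bytes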
*/ + guest_l1e_t gl1e, write_val; + guest_va_t va; + unsigned flags:29, emulation_count:3; + } __attribute__((packed)) d; + u32 event; + + event = TRC_SHADOW_EMULATE | ((GUEST_PAGING_LEVELS-2)<<8); + + d.gl1e = gl1e; + d.write_val.l1 = this_cpu(trace_emulate_write_val); + d.va = va; +#if GUEST_PAGING_LEVELS == 3 + d.emulation_count = this_cpu(trace_extra_emulation_count); +#endif + d.flags = this_cpu(trace_shadow_path_flags); + + __trace_var(event, 0/*!tsc*/, sizeof(d), (unsigned char*)&d); + } +} /**************************************************************************/ /* Entry points into the shadow code */ @@ -3027,8 +3163,8 @@ static int sh_page_fault(struct vcpu *v, { struct domain *d = v->domain; walk_t gw; - gfn_t gfn; - mfn_t gmfn, sl1mfn=_mfn(0); + gfn_t gfn = _gfn(0); + mfn_t gmfn, sl1mfn = _mfn(0); shadow_l1e_t sl1e, *ptr_sl1e; paddr_t gpa; struct sh_emulate_ctxt emul_ctxt; @@ -3043,7 +3179,7 @@ static int sh_page_fault(struct vcpu *v, SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u, rip=%lx\n", v->domain->domain_id, v->vcpu_id, va, regs->error_code, - regs->rip); + regs->eip); perfc_incr(shadow_fault); @@ -3132,6 +3268,7 @@ static int sh_page_fault(struct vcpu *v, reset_early_unshadow(v); perfc_incr(shadow_fault_fast_gnp); SHADOW_PRINTK("fast path not-present\n"); + trace_shadow_gen(TRC_SHADOW_FAST_PROPAGATE, va); return 0; } else @@ -3145,6 +3282,7 @@ static int sh_page_fault(struct vcpu *v, perfc_incr(shadow_fault_fast_mmio); SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa); reset_early_unshadow(v); + trace_shadow_gen(TRC_SHADOW_FAST_MMIO, va); return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT) ? EXCRET_fault_fixed : 0); } @@ -3155,6 +3293,7 @@ static int sh_page_fault(struct vcpu *v, * Retry and let the hardware give us the right fault next time. */ perfc_incr(shadow_fault_fast_fail); SHADOW_PRINTK("fast path false alarm!\n"); + trace_shadow_gen(TRC_SHADOW_FALSE_FAST_PATH, va); return EXCRET_fault_fixed; } } @@ -3190,7 +3329,7 @@ static int sh_page_fault(struct vcpu *v, perfc_incr(shadow_fault_bail_real_fault); SHADOW_PRINTK("not a shadow fault\n"); reset_early_unshadow(v); - return 0; + goto propagate; } /* It's possible that the guest has put pagetables in memory that it has @@ -3200,7 +3339,7 @@ static int sh_page_fault(struct vcpu *v, if ( unlikely(d->is_shutting_down) ) { SHADOW_PRINTK("guest is shutting down\n"); - return 0; + goto propagate; } /* What kind of access are we dealing with? */ @@ -3218,7 +3357,7 @@ static int sh_page_fault(struct vcpu *v, SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n", gfn_x(gfn), mfn_x(gmfn)); reset_early_unshadow(v); - return 0; + goto propagate; } #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) @@ -3229,6 +3368,8 @@ static int sh_page_fault(struct vcpu *v, shadow_lock(d); + TRACE_CLEAR_PATH_FLAGS; + rc = gw_remove_write_accesses(v, va, &gw); /* First bit set: Removed write access to a page. */ @@ -3281,6 +3422,7 @@ static int sh_page_fault(struct vcpu *v, * Get out of the fault handler immediately. 
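 *
 * Event encoding shared by these trace points: the guest paging mode
 * is folded into the event word as (GUEST_PAGING_LEVELS-2)<<8, so one
 * TRC_SHADOW_* identifier family covers 2-, 3- and 4-level guests.
 * A hypothetical decoder-side sketch (the two-bit mask width is an
 * assumption based on the three possible level values):
 *
 *     unsigned int levels = ((event >> 8) & 3) + 2;  // 2, 3 or 4
 *     unsigned int id     = event & ~(3u << 8);      // base TRC_SHADOW_*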
*/ ASSERT(d->is_shutting_down); shadow_unlock(d); + trace_shadow_gen(TRC_SHADOW_DOMF_DYING, va); return 0; } @@ -3383,6 +3525,7 @@ static int sh_page_fault(struct vcpu *v, d->arch.paging.log_dirty.fault_count++; reset_early_unshadow(v); + trace_shadow_fixup(gw.l1e, va); done: sh_audit_gw(v, &gw); SHADOW_PRINTK("fixed\n"); @@ -3405,6 +3548,8 @@ static int sh_page_fault(struct vcpu *v, mfn_x(gmfn)); perfc_incr(shadow_fault_emulate_failed); sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */); + trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_USER, + va, gfn); goto done; } @@ -3421,6 +3566,8 @@ static int sh_page_fault(struct vcpu *v, shadow_audit_tables(v); shadow_unlock(d); + this_cpu(trace_emulate_write_val) = 0; + #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION early_emulation: #endif @@ -3446,6 +3593,8 @@ static int sh_page_fault(struct vcpu *v, "injection: cr2=%#lx, mfn=%#lx\n", va, mfn_x(gmfn)); sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */); + trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ, + va, gfn); return EXCRET_fault_fixed; } } @@ -3478,6 +3627,10 @@ static int sh_page_fault(struct vcpu *v, * to support more operations in the emulator. More likely, * though, this is a hint that this page should not be shadowed. */ shadow_remove_all_shadows(v, gmfn); + + trace_shadow_emulate_other(TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED, + va, gfn); + goto emulate_done; } #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION @@ -3504,7 +3657,8 @@ static int sh_page_fault(struct vcpu *v, #if GUEST_PAGING_LEVELS == 3 /* PAE guest */ if ( r == X86EMUL_OKAY ) { - int i; + int i, emulation_count=0; + this_cpu(trace_emulate_initial_va) = va; /* Emulate up to four extra instructions in the hope of catching * the "second half" of a 64-bit pagetable write. */ for ( i = 0 ; i < 4 ; i++ ) @@ -3513,10 +3667,12 @@ static int sh_page_fault(struct vcpu *v, v->arch.paging.last_write_was_pt = 0; r = x86_emulate(&emul_ctxt.ctxt, emul_ops); if ( r == X86EMUL_OKAY ) - { + { + emulation_count++; if ( v->arch.paging.last_write_was_pt ) { perfc_incr(shadow_em_ex_pt); + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN); break; /* Don't emulate past the other half of the write */ } else @@ -3525,12 +3681,16 @@ static int sh_page_fault(struct vcpu *v, else { perfc_incr(shadow_em_ex_fail); + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATION_LAST_FAILED); break; /* Don't emulate again if we failed! */ } } + this_cpu(trace_extra_emulation_count)=emulation_count; } #endif /* PAE guest */ + trace_shadow_emulate(gw.l1e, va); + emulate_done: SHADOW_PRINTK("emulated\n"); return EXCRET_fault_fixed; @@ -3543,6 +3703,7 @@ static int sh_page_fault(struct vcpu *v, shadow_audit_tables(v); reset_early_unshadow(v); shadow_unlock(d); + trace_shadow_gen(TRC_SHADOW_MMIO, va); return (handle_mmio_with_translation(va, gpa >> PAGE_SHIFT) ? 
EXCRET_fault_fixed : 0); @@ -3552,6 +3713,10 @@ static int sh_page_fault(struct vcpu *v, shadow_audit_tables(v); reset_early_unshadow(v); shadow_unlock(d); + +propagate: + trace_not_shadow_fault(gw.l1e, va); + return 0; } @@ -3990,7 +4155,7 @@ sh_detach_old_tables(struct vcpu *v) sh_unmap_domain_page_global(v->arch.paging.shadow.guest_vtable); v->arch.paging.shadow.guest_vtable = NULL; } -#endif +#endif // !NDEBUG //// @@ -4446,6 +4611,7 @@ static int sh_guess_wrmap(struct vcpu *v sl1e = shadow_l1e_remove_flags(sl1e, _PAGE_RW); r = shadow_set_l1e(v, sl1p, sl1e, sl1mfn); ASSERT( !(r & SHADOW_SET_ERROR) ); + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_WRMAP_GUESS_FOUND); return 1; } #endif @@ -4800,7 +4966,7 @@ static void emulate_unmap_dest(struct vc static int sh_x86_emulate_write(struct vcpu *v, unsigned long vaddr, void *src, - u32 bytes, struct sh_emulate_ctxt *sh_ctxt) + u32 bytes, struct sh_emulate_ctxt *sh_ctxt) { void *addr; @@ -4814,6 +4980,22 @@ sh_x86_emulate_write(struct vcpu *v, uns shadow_lock(v->domain); memcpy(addr, src, bytes); + + if ( tb_init_done ) + { +#if GUEST_PAGING_LEVELS == 3 + if ( vaddr == this_cpu(trace_emulate_initial_va) ) + memcpy(&this_cpu(trace_emulate_write_val), src, bytes); + else if ( (vaddr & ~(0x7UL)) == this_cpu(trace_emulate_initial_va) ) + { + TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_EMULATE_FULL_PT); + memcpy(&this_cpu(trace_emulate_write_val), + (void *)(((unsigned long) addr) & ~(0x7UL)), GUEST_PTE_SIZE); + } +#else + memcpy(&this_cpu(trace_emulate_write_val), src, bytes); +#endif + } emulate_unmap_dest(v, addr, bytes, sh_ctxt); shadow_audit_tables(v); diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/mm/shadow/private.h --- a/xen/arch/x86/mm/shadow/private.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/mm/shadow/private.h Fri Sep 12 14:47:40 2008 +0900 @@ -90,6 +90,43 @@ extern int shadow_audit_enable; #define SHADOW_DEBUG_EMULATE 1 #define SHADOW_DEBUG_P2M 1 #define SHADOW_DEBUG_LOGDIRTY 0 + +/****************************************************************************** + * Tracing + */ +DECLARE_PER_CPU(uint32_t,trace_shadow_path_flags); + +#define TRACE_SHADOW_PATH_FLAG(_x) \ + do { \ + this_cpu(trace_shadow_path_flags) |= (1<<(_x)); \ + } while(0) + +#define TRACE_CLEAR_PATH_FLAGS \ + this_cpu(trace_shadow_path_flags) = 0 + +enum { + TRCE_SFLAG_SET_AD, + TRCE_SFLAG_SET_A, + TRCE_SFLAG_SHADOW_L1_GET_REF, + TRCE_SFLAG_SHADOW_L1_PUT_REF, + TRCE_SFLAG_L2_PROPAGATE, + TRCE_SFLAG_SET_CHANGED, + TRCE_SFLAG_SET_FLUSH, + TRCE_SFLAG_SET_ERROR, + TRCE_SFLAG_DEMOTE, + TRCE_SFLAG_PROMOTE, + TRCE_SFLAG_WRMAP, + TRCE_SFLAG_WRMAP_GUESS_FOUND, + TRCE_SFLAG_WRMAP_BRUTE_FORCE, + TRCE_SFLAG_EARLY_UNSHADOW, + TRCE_SFLAG_EMULATION_2ND_PT_WRITTEN, + TRCE_SFLAG_EMULATION_LAST_FAILED, + TRCE_SFLAG_EMULATE_FULL_PT, + TRCE_SFLAG_PREALLOC_UNHOOK, + TRCE_SFLAG_UNSYNC, + TRCE_SFLAG_OOS_FIXUP_ADD, + TRCE_SFLAG_OOS_FIXUP_EVICT, +}; /****************************************************************************** * The shadow lock. 
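 *
 * A closing note on the tracing block above: the per-cpu
 * trace_shadow_path_flags word is an accumulator.
 * TRACE_CLEAR_PATH_FLAGS resets it when sh_page_fault() starts, each
 * interesting branch ORs in one TRCE_SFLAG_* bit via
 * TRACE_SHADOW_PATH_FLAG(), and the final fixup/emulate/propagate
 * record carries the whole word, a 32-bit summary of the path the
 * fault took.  Usage sketch (macros as defined above):
 *
 *     TRACE_CLEAR_PATH_FLAGS;                        // fault entry
 *     ...
 *     TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_UNSYNC);     // branch taken
 *     ...
 *     d.flags = this_cpu(trace_shadow_path_flags);   // attach to record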
@@ -143,6 +180,12 @@ extern int shadow_audit_enable; } while (0) +/* Size (in bytes) of a guest PTE */ +#if GUEST_PAGING_LEVELS >= 3 +# define GUEST_PTE_SIZE 8 +#else +# define GUEST_PTE_SIZE 4 +#endif /****************************************************************************** * Auditing routines diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/physdev.c --- a/xen/arch/x86/physdev.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/physdev.c Fri Sep 12 14:47:40 2008 +0900 @@ -58,9 +58,6 @@ static int get_free_pirq(struct domain * return i; } -/* - * Caller hold the irq_lock - */ static int map_domain_pirq(struct domain *d, int pirq, int vector, struct physdev_map_pirq *map) { @@ -136,13 +133,12 @@ done: return ret; } -/* - * The pirq should has been unbound before this call - */ +/* The pirq should have been unbound before this call. */ static int unmap_domain_pirq(struct domain *d, int pirq) { - int ret = 0; - int vector; + unsigned long flags; + irq_desc_t *desc; + int vector, ret = 0; if ( d == NULL || pirq < 0 || pirq >= NR_PIRQS ) return -EINVAL; @@ -159,33 +155,29 @@ static int unmap_domain_pirq(struct doma gdprintk(XENLOG_G_ERR, "domain %X: pirq %x not mapped still\n", d->domain_id, pirq); ret = -EINVAL; - } - else - { - unsigned long flags; - irq_desc_t *desc; - - desc = &irq_desc[vector]; - spin_lock_irqsave(&desc->lock, flags); - if ( desc->msi_desc ) - pci_disable_msi(vector); - - if ( desc->handler == &pci_msi_type ) - { - /* MSI is not shared, so should be released already */ - BUG_ON(desc->status & IRQ_GUEST); - irq_desc[vector].handler = &no_irq_type; - } - spin_unlock_irqrestore(&desc->lock, flags); - - d->arch.pirq_vector[pirq] = d->arch.vector_pirq[vector] = 0; - } + goto done; + } + + desc = &irq_desc[vector]; + spin_lock_irqsave(&desc->lock, flags); + if ( desc->msi_desc ) + pci_disable_msi(vector); + + if ( desc->handler == &pci_msi_type ) + { + /* MSI is not shared, so should be released already */ + BUG_ON(desc->status & IRQ_GUEST); + irq_desc[vector].handler = &no_irq_type; + } + spin_unlock_irqrestore(&desc->lock, flags); + + d->arch.pirq_vector[pirq] = d->arch.vector_pirq[vector] = 0; ret = irq_deny_access(d, pirq); - if ( ret ) gdprintk(XENLOG_G_ERR, "deny irq %x access failed\n", pirq); + done: return ret; } @@ -194,10 +186,6 @@ static int physdev_map_pirq(struct physd struct domain *d; int vector, pirq, ret = 0; unsigned long flags; - - /* if msi_enable is not enabled, map always succeeds */ - if ( !msi_enable ) - return 0; if ( !IS_PRIV(current->domain) ) return -EPERM; @@ -308,14 +296,8 @@ static int physdev_unmap_pirq(struct phy unsigned long flags; int ret; - if ( !msi_enable ) - return 0; - if ( !IS_PRIV(current->domain) ) return -EPERM; - - if ( !unmap ) - return -EINVAL; if ( unmap->domid == DOMID_SELF ) d = rcu_lock_domain(current->domain); @@ -323,14 +305,12 @@ static int physdev_unmap_pirq(struct phy d = rcu_lock_domain_by_id(unmap->domid); if ( d == NULL ) - { - rcu_unlock_domain(d); return -ESRCH; - } spin_lock_irqsave(&d->arch.irq_lock, flags); ret = unmap_domain_pirq(d, unmap->pirq); spin_unlock_irqrestore(&d->arch.irq_lock, flags); + rcu_unlock_domain(d); return ret; @@ -452,20 +432,14 @@ ret_t do_physdev_op(int cmd, XEN_GUEST_H irq = irq_op.irq; ret = -EINVAL; - if ( ((irq < 0) && (irq != AUTO_ASSIGN)) || (irq >= NR_IRQS) ) + if ( (irq < 0) || (irq >= NR_IRQS) ) break; irq_op.vector = assign_irq_vector(irq); - ret = 0; - - if ( msi_enable ) - { - spin_lock_irqsave(&dom0->arch.irq_lock, flags); - if ( irq != AUTO_ASSIGN ) - ret = 
map_domain_pirq(dom0, irq_op.irq, irq_op.vector, NULL); - spin_unlock_irqrestore(&dom0->arch.irq_lock, flags); - } + spin_lock_irqsave(&dom0->arch.irq_lock, flags); + ret = map_domain_pirq(dom0, irq_op.irq, irq_op.vector, NULL); + spin_unlock_irqrestore(&dom0->arch.irq_lock, flags); if ( copy_to_guest(arg, &irq_op, 1) != 0 ) ret = -EFAULT; diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/platform_hypercall.c --- a/xen/arch/x86/platform_hypercall.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/platform_hypercall.c Fri Sep 12 14:47:40 2008 +0900 @@ -192,6 +192,10 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe break; case XENPF_firmware_info: + ret = xsm_firmware_info(); + if ( ret ) + break; + switch ( op->u.firmware_info.type ) { case XEN_FW_DISK_INFO: { @@ -280,10 +284,18 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe break; case XENPF_enter_acpi_sleep: + ret = xsm_acpi_sleep(); + if ( ret ) + break; + ret = acpi_enter_sleep(&op->u.enter_acpi_sleep); break; case XENPF_change_freq: + ret = xsm_change_freq(); + if ( ret ) + break; + ret = -ENOSYS; if ( cpufreq_controller != FREQCTL_dom0_kernel ) break; @@ -305,6 +317,10 @@ ret_t do_platform_op(XEN_GUEST_HANDLE(xe cpumask_t cpumap; XEN_GUEST_HANDLE(uint8) cpumap_bitmap; XEN_GUEST_HANDLE(uint64) idletimes; + + ret = xsm_getidletime(); + if ( ret ) + break; ret = -ENOSYS; if ( cpufreq_controller != FREQCTL_dom0_kernel ) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/smpboot.c --- a/xen/arch/x86/smpboot.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/smpboot.c Fri Sep 12 14:47:40 2008 +0900 @@ -1225,15 +1225,6 @@ int __cpu_disable(void) if (cpu == 0) return -EBUSY; - /* - * Only S3 is using this path, and thus idle vcpus are running on all - * APs when we are called. To support full cpu hotplug, other - * notification mechanisms should be introduced (e.g., migrate vcpus - * off this physical cpu before rendezvous point). - */ - if (!is_idle_vcpu(current)) - return -EINVAL; - local_irq_disable(); clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ @@ -1249,6 +1240,9 @@ int __cpu_disable(void) fixup_irqs(map); /* It's now safe to remove this processor from the online map */ cpu_clear(cpu, cpu_online_map); + + cpu_disable_scheduler(); + return 0; } @@ -1275,28 +1269,6 @@ static int take_cpu_down(void *unused) return __cpu_disable(); } -/* - * XXX: One important thing missed here is to migrate vcpus - * from dead cpu to other online ones and then put whole - * system into a stop state. It assures a safe environment - * for a cpu hotplug/remove at normal running state. - * - * However for xen PM case, at this point: - * -> All other domains should be notified with PM event, - * and then in following states: - * * Suspend state, or - * * Paused state, which is a force step to all - * domains if they do nothing to suspend - * -> All vcpus of dom0 (except vcpu0) have already beem - * hot removed - * with the net effect that all other cpus only have idle vcpu - * running. In this special case, we can avoid vcpu migration - * then and system can be considered in a stop state. - * - * So current cpu hotplug is a special version for PM specific - * usage, and need more effort later for full cpu hotplug. 
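 *
 * With cpu_disable_scheduler() now able to migrate vcpus off a dying
 * CPU, the restriction described above is lifted and the idle-vcpu
 * check in __cpu_disable() goes away.  The replacement ordering, per
 * the surrounding hunks: re-target IRQs, leave cpu_online_map, then
 * evacuate whatever is still runnable; only the boot CPU remains
 * un-offlinable.  Condensed sketch:
 *
 *     fixup_irqs(map);                 // no new interrupts land here
 *     cpu_clear(cpu, cpu_online_map);  // now invisible to cpu masks
 *     cpu_disable_scheduler();         // move remaining vcpus away
 *     ...
 *     if (cpu == 0)                    // in cpu_down(): the BSP may
 *         return -EINVAL;              // never be taken down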
- * (ktian1) - */ int cpu_down(unsigned int cpu) { int err = 0; @@ -1304,6 +1276,12 @@ int cpu_down(unsigned int cpu) spin_lock(&cpu_add_remove_lock); if (num_online_cpus() == 1) { err = -EBUSY; + goto out; + } + + /* Can not offline BSP */ + if (cpu == 0) { + err = -EINVAL; goto out; } diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/time.c --- a/xen/arch/x86/time.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/time.c Fri Sep 12 14:47:40 2008 +0900 @@ -993,15 +993,16 @@ static void local_time_calibration(void) * All CPUS snapshot their local TSC and extrapolation of system time. */ struct calibration_rendezvous { + cpumask_t cpu_calibration_map; atomic_t nr_cpus; s_time_t master_stime; }; static void time_calibration_rendezvous(void *_r) { - unsigned int total_cpus = num_online_cpus(); struct cpu_calibration *c = &this_cpu(cpu_calibration); struct calibration_rendezvous *r = _r; + unsigned int total_cpus = cpus_weight(r->cpu_calibration_map); if ( smp_processor_id() == 0 ) { @@ -1029,11 +1030,13 @@ static void time_calibration(void *unuse static void time_calibration(void *unused) { struct calibration_rendezvous r = { + .cpu_calibration_map = cpu_online_map, .nr_cpus = ATOMIC_INIT(0) }; /* @wait=1 because we must wait for all cpus before freeing @r. */ - on_each_cpu(time_calibration_rendezvous, &r, 0, 1); + on_selected_cpus(r.cpu_calibration_map, + time_calibration_rendezvous, &r, 0, 1); } void init_percpu_time(void) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/arch/x86/traps.c Fri Sep 12 14:47:40 2008 +0900 @@ -47,7 +47,7 @@ #include <xen/version.h> #include <xen/kexec.h> #include <xen/trace.h> -#include <asm/paging.h> +#include <xen/paging.h> #include <asm/system.h> #include <asm/io.h> #include <asm/atomic.h> @@ -2116,6 +2116,36 @@ static int emulate_privileged_op(struct if ( wrmsr_safe(regs->ecx, eax, edx) != 0 ) goto fail; break; + case MSR_AMD64_NB_CFG: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || + boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 ) + goto fail; + if ( !IS_PRIV(v->domain) ) + break; + if ( (rdmsr_safe(MSR_AMD64_NB_CFG, l, h) != 0) || + (eax != l) || + ((edx ^ h) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) ) + goto invalid; + if ( wrmsr_safe(MSR_AMD64_NB_CFG, eax, edx) != 0 ) + goto fail; + break; + case MSR_FAM10H_MMIO_CONF_BASE: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || + boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x11 ) + goto fail; + if ( !IS_PRIV(v->domain) ) + break; + if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, l, h) != 0) || + (((((u64)h << 32) | l) ^ res) & + ~((1 << FAM10H_MMIO_CONF_ENABLE_BIT) | + (FAM10H_MMIO_CONF_BUSRANGE_MASK << + FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | + ((u64)FAM10H_MMIO_CONF_BASE_MASK << + FAM10H_MMIO_CONF_BASE_SHIFT))) ) + goto invalid; + if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, eax, edx) != 0 ) + goto fail; + break; case MSR_IA32_PERF_CTL: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) goto fail; @@ -2124,11 +2154,18 @@ static int emulate_privileged_op(struct if ( wrmsr_safe(regs->ecx, eax, edx) != 0 ) goto fail; break; + case MSR_IA32_THERM_CONTROL: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) + goto fail; + if ( wrmsr_safe(regs->ecx, eax, edx) != 0 ) + goto fail; + break; default: if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) ) break; if ( (rdmsr_safe(regs->ecx, l, h) != 0) || (eax != l) || (edx != h) ) + invalid: gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from " "%08x:%08x to %08x:%08x.\n", 
_p(regs->ecx), h, l, edx, eax); @@ -2198,6 +2235,12 @@ static int emulate_privileged_op(struct regs->eax |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | MSR_IA32_MISC_ENABLE_XTPR_DISABLE; + break; + case MSR_IA32_THERM_CONTROL: + if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) + goto fail; + if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) ) + goto fail; break; default: if ( rdmsr_hypervisor_regs(regs->ecx, &l, &h) ) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/domain.c --- a/xen/common/domain.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/common/domain.c Fri Sep 12 14:47:40 2008 +0900 @@ -651,9 +651,11 @@ void vcpu_reset(struct vcpu *v) set_bit(_VPF_down, &v->pause_flags); + clear_bit(v->vcpu_id, d->poll_mask); + v->poll_evtchn = 0; + v->fpu_initialised = 0; v->fpu_dirtied = 0; - v->is_polling = 0; v->is_initialised = 0; v->nmi_pending = 0; v->mce_pending = 0; diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/domctl.c --- a/xen/common/domctl.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/common/domctl.c Fri Sep 12 14:47:40 2008 +0900 @@ -655,9 +655,6 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc spin_lock(&d->page_alloc_lock); if ( new_max >= d->tot_pages ) { - ret = guest_physmap_max_mem_pages(d, new_max); - if ( ret != 0 ) - break; d->max_pages = new_max; ret = 0; } @@ -729,16 +726,11 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc if ( d == NULL ) break; - ret = xsm_irq_permission(d, pirq, op->u.irq_permission.allow_access); - if ( ret ) - goto irq_permission_out; - if ( op->u.irq_permission.allow_access ) ret = irq_permit_access(d, pirq); else ret = irq_deny_access(d, pirq); - irq_permission_out: rcu_unlock_domain(d); } break; @@ -757,17 +749,12 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc d = rcu_lock_domain_by_id(op->domain); if ( d == NULL ) break; - - ret = xsm_iomem_permission(d, mfn, op->u.iomem_permission.allow_access); - if ( ret ) - goto iomem_permission_out; if ( op->u.iomem_permission.allow_access ) ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); else ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); - iomem_permission_out: rcu_unlock_domain(d); } break; @@ -813,6 +800,12 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc { put_domain(e); goto set_target_out; + } + + ret = xsm_set_target(d, e); + if ( ret ) { + put_domain(e); + goto set_target_out; } /* Hold reference on @e until we destroy @d. */ diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/event_channel.c --- a/xen/common/event_channel.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/common/event_channel.c Fri Sep 12 14:47:40 2008 +0900 @@ -545,6 +545,7 @@ static int evtchn_set_pending(struct vcp static int evtchn_set_pending(struct vcpu *v, int port) { struct domain *d = v->domain; + int vcpuid; /* * The following bit operations must happen in strict order. @@ -564,15 +565,19 @@ static int evtchn_set_pending(struct vcp } /* Check if some VCPU might be polling for this event. */ - if ( unlikely(d->is_polling) ) - { - d->is_polling = 0; - smp_mb(); /* check vcpu poll-flags /after/ clearing domain poll-flag */ - for_each_vcpu ( d, v ) + if ( likely(bitmap_empty(d->poll_mask, MAX_VIRT_CPUS)) ) + return 0; + + /* Wake any interested (or potentially interested) pollers. 
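
The rewritten evtchn_set_pending() above replaces the per-domain is_polling flag with a per-VCPU poll_mask bitmap plus v->poll_evtchn (> 0: polling one specific port; < 0: possibly several ports). A freestanding sketch of the same scan over a 64-bit mask; __builtin_ctzll (GCC/Clang) stands in for find_first_bit/find_next_bit, and a flag stands in for vcpu_unblock():

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_VCPUS 64

    struct vcpu { int poll_evtchn; int woken; };

    /* Wake every polling VCPU interested in @port. */
    static void wake_pollers(uint64_t *poll_mask, struct vcpu *vcpus, int port)
    {
        uint64_t m = *poll_mask;
        while (m) {
            int id = __builtin_ctzll(m);   /* lowest set bit = lowest id */
            struct vcpu *v = &vcpus[id];
            m &= m - 1;
            if (v->poll_evtchn <= 0 || v->poll_evtchn == port) {
                *poll_mask &= ~(1ULL << id);
                v->poll_evtchn = 0;
                v->woken = 1;              /* stands in for vcpu_unblock() */
            }
        }
    }

    int main(void)
    {
        struct vcpu vcpus[MAX_VCPUS] = { [0] = { .poll_evtchn = 5 },
                                         [1] = { .poll_evtchn = -1 } };
        uint64_t mask = (1ULL << 0) | (1ULL << 1);
        wake_pollers(&mask, vcpus, 7);
        printf("v0=%d v1=%d\n", vcpus[0].woken, vcpus[1].woken); /* v0=0 v1=1 */
        return 0;
    }
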
*/ + for ( vcpuid = find_first_bit(d->poll_mask, MAX_VIRT_CPUS); + vcpuid < MAX_VIRT_CPUS; + vcpuid = find_next_bit(d->poll_mask, MAX_VIRT_CPUS, vcpuid+1) ) + { + v = d->vcpu[vcpuid]; + if ( ((v->poll_evtchn <= 0) || (v->poll_evtchn == port)) && + test_and_clear_bit(vcpuid, d->poll_mask) ) { - if ( !v->is_polling ) - continue; - v->is_polling = 0; + v->poll_evtchn = 0; vcpu_unblock(v); } } diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/rangeset.c --- a/xen/common/rangeset.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/common/rangeset.c Fri Sep 12 14:47:40 2008 +0900 @@ -10,6 +10,7 @@ #include <xen/sched.h> #include <xen/errno.h> #include <xen/rangeset.h> +#include <xsm/xsm.h> /* An inclusive range [s,e] and pointer to next range in ascending order. */ struct range { @@ -95,6 +96,10 @@ int rangeset_add_range( { struct range *x, *y; int rc = 0; + + rc = xsm_add_range(r->domain, r->name, s, e); + if ( rc ) + return rc; ASSERT(s <= e); @@ -164,6 +169,10 @@ int rangeset_remove_range( struct range *x, *y, *t; int rc = 0; + rc = xsm_remove_range(r->domain, r->name, s, e); + if ( rc ) + return rc; + ASSERT(s <= e); spin_lock(&r->lock); diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/sched_credit.c --- a/xen/common/sched_credit.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/common/sched_credit.c Fri Sep 12 14:47:40 2008 +0900 @@ -1107,6 +1107,10 @@ csched_load_balance(int cpu, struct csch BUG_ON( cpu != snext->vcpu->processor ); + /* If this CPU is going offline we shouldn't steal work. */ + if ( unlikely(!cpu_online(cpu)) ) + goto out; + if ( snext->pri == CSCHED_PRI_IDLE ) CSCHED_STAT_CRANK(load_balance_idle); else if ( snext->pri == CSCHED_PRI_TS_OVER ) @@ -1149,6 +1153,7 @@ csched_load_balance(int cpu, struct csch return speer; } + out: /* Failed to find more important work elsewhere... */ __runq_remove(snext); return snext; diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/schedule.c --- a/xen/common/schedule.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/common/schedule.c Fri Sep 12 14:47:40 2008 +0900 @@ -63,11 +63,31 @@ static struct scheduler ops; (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ ) \ : (typeof(ops.fn(__VA_ARGS__)))0 ) +static inline void trace_runstate_change(struct vcpu *v, int new_state) +{ + struct { uint32_t vcpu:16, domain:16; } d; + uint32_t event; + + if ( likely(!tb_init_done) ) + return; + + d.vcpu = v->vcpu_id; + d.domain = v->domain->domain_id; + + event = TRC_SCHED_RUNSTATE_CHANGE; + event |= ( v->runstate.state & 0x3 ) << 8; + event |= ( new_state & 0x3 ) << 4; + + __trace_var(event, 1/*tsc*/, sizeof(d), (unsigned char *)&d); +} + static inline void vcpu_runstate_change( struct vcpu *v, int new_state, s_time_t new_entry_time) { ASSERT(v->runstate.state != new_state); ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock)); + + trace_runstate_change(v, new_state); v->runstate.time[v->runstate.state] += new_entry_time - v->runstate.state_entry_time; @@ -198,6 +218,27 @@ void vcpu_wake(struct vcpu *v) TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); } +void vcpu_unblock(struct vcpu *v) +{ + if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) ) + return; + + /* Polling period ends when a VCPU is unblocked. */ + if ( unlikely(v->poll_evtchn != 0) ) + { + v->poll_evtchn = 0; + /* + * We *must* re-clear _VPF_blocked to avoid racing other wakeups of + * this VCPU (and it then going back to sleep on poll_mask). + * Test-and-clear is idiomatic and ensures clear_bit not reordered. 
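
trace_runstate_change() above packs the old and new runstates into spare bits of the trace event word. A standalone encode/decode sketch: the base value TRC_SCHED_RUNSTATE_CHANGE = TRC_SCHED_MIN + 1 = 0x21001 comes from the trace.h hunk later in this patch, and the runstate numbering (running = 0, runnable = 1, blocked = 2, offline = 3) is the public VCPU interface's:

    #include <stdint.h>
    #include <stdio.h>

    #define TRC_SCHED_RUNSTATE_CHANGE 0x00021001u

    static uint32_t encode(uint32_t base, int old_state, int new_state)
    {
        return base | ((old_state & 0x3) << 8) | ((new_state & 0x3) << 4);
    }

    int main(void)
    {
        uint32_t ev = encode(TRC_SCHED_RUNSTATE_CHANGE,
                             1 /* runnable */, 0 /* running */);
        printf("event=%#x old=%u new=%u\n",
               ev, (ev >> 8) & 0x3, (ev >> 4) & 0x3);
        return 0;
    }
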
+ */ + if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) + clear_bit(_VPF_blocked, &v->pause_flags); + } + + vcpu_wake(v); +} + static void vcpu_migrate(struct vcpu *v) { unsigned long flags; @@ -247,6 +288,48 @@ void vcpu_force_reschedule(struct vcpu * } } +/* + * This function is used by cpu_hotplug code from stop_machine context. + * Hence we can avoid needing to take the + */ +void cpu_disable_scheduler(void) +{ + struct domain *d; + struct vcpu *v; + unsigned int cpu = smp_processor_id(); + + for_each_domain ( d ) + { + for_each_vcpu ( d, v ) + { + if ( is_idle_vcpu(v) ) + continue; + + if ( (cpus_weight(v->cpu_affinity) == 1) && + cpu_isset(cpu, v->cpu_affinity) ) + { + printk("Breaking vcpu affinity for domain %d vcpu %d\n", + v->domain->domain_id, v->vcpu_id); + cpus_setall(v->cpu_affinity); + } + + /* + * Migrate single-shot timers to CPU0. A new cpu will automatically + * be chosen when the timer is next re-set. + */ + if ( v->singleshot_timer.cpu == cpu ) + migrate_timer(&v->singleshot_timer, 0); + + if ( v->processor == cpu ) + { + set_bit(_VPF_migrating, &v->pause_flags); + vcpu_sleep_nosync(v); + vcpu_migrate(v); + } + } + } +} + static int __vcpu_set_affinity( struct vcpu *v, cpumask_t *affinity, bool_t old_lock_status, bool_t new_lock_status) @@ -337,7 +420,7 @@ static long do_poll(struct sched_poll *s struct vcpu *v = current; struct domain *d = v->domain; evtchn_port_t port; - long rc = 0; + long rc; unsigned int i; /* Fairly arbitrary limit. */ @@ -348,11 +431,24 @@ static long do_poll(struct sched_poll *s return -EFAULT; set_bit(_VPF_blocked, &v->pause_flags); - v->is_polling = 1; - d->is_polling = 1; - + v->poll_evtchn = -1; + set_bit(v->vcpu_id, d->poll_mask); + +#ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */ /* Check for events /after/ setting flags: avoids wakeup waiting race. */ - smp_wmb(); + smp_mb(); + + /* + * Someone may have seen we are blocked but not that we are polling, or + * vice versa. We are certainly being woken, so clean up and bail. Beyond + * this point others can be guaranteed to clean up for us if they wake us. + */ + rc = 0; + if ( (v->poll_evtchn == 0) || + !test_bit(_VPF_blocked, &v->pause_flags) || + !test_bit(v->vcpu_id, d->poll_mask) ) + goto out; +#endif for ( i = 0; i < sched_poll->nr_ports; i++ ) { @@ -369,6 +465,9 @@ static long do_poll(struct sched_poll *s goto out; } + if ( sched_poll->nr_ports == 1 ) + v->poll_evtchn = port; + if ( sched_poll->timeout != 0 ) set_timer(&v->poll_timer, sched_poll->timeout); @@ -378,7 +477,8 @@ static long do_poll(struct sched_poll *s return 0; out: - v->is_polling = 0; + v->poll_evtchn = 0; + clear_bit(v->vcpu_id, d->poll_mask); clear_bit(_VPF_blocked, &v->pause_flags); return rc; } @@ -628,7 +728,9 @@ static void vcpu_periodic_timer_work(str return; periodic_next_event = v->periodic_last_event + v->periodic_period; - if ( now > periodic_next_event ) + + /* The timer subsystem may call us up to TIME_SLOP ahead of deadline. */ + if ( (now + TIME_SLOP) > periodic_next_event ) { send_timer_event(v); v->periodic_last_event = now; @@ -758,11 +860,8 @@ static void poll_timer_fn(void *data) { struct vcpu *v = data; - if ( !v->is_polling ) - return; - - v->is_polling = 0; - vcpu_unblock(v); + if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) ) + vcpu_unblock(v); } /* Initialise the data structures. 
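
The do_poll() hunk above depends on memory ordering: the VCPU publishes its blocked/polling flags, issues a full barrier (implicit in set_bit() on x86, hence the #ifndef), and only then checks for pending wakeups, so a concurrent waker cannot slip between the check and the sleep. A toy C11 model of that publish-then-check handshake (names and harness are illustrative, not Xen code):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int blocked, polling, event_pending;

    /* Publish "I am blocked and polling" before looking for events. */
    static int poller_should_sleep(void)
    {
        atomic_store_explicit(&blocked, 1, memory_order_relaxed);
        atomic_store_explicit(&polling, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);   /* the smp_mb() */
        if (atomic_load_explicit(&event_pending, memory_order_relaxed)) {
            /* Lost the race: a waker may have seen stale flags. Clean up. */
            atomic_store_explicit(&polling, 0, memory_order_relaxed);
            atomic_store_explicit(&blocked, 0, memory_order_relaxed);
            return 0;
        }
        return 1;
    }

    int main(void)
    {
        atomic_store(&event_pending, 1);
        printf("sleep=%d\n", poller_should_sleep());  /* sleep=0 */
        return 0;
    }
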
*/ diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/sysctl.c --- a/xen/common/sysctl.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/common/sysctl.c Fri Sep 12 14:47:40 2008 +0900 @@ -149,6 +149,10 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc char c; uint32_t i; + ret = xsm_debug_keys(); + if ( ret ) + break; + for ( i = 0; i < op->u.debug_keys.nr_keys; i++ ) { if ( copy_from_guest_offset(&c, op->u.debug_keys.keys, i, 1) ) @@ -166,6 +170,10 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc nr_cpus = min_t(uint32_t, op->u.getcpuinfo.max_cpus, NR_CPUS); + ret = xsm_getcpuinfo(); + if ( ret ) + break; + for ( i = 0; i < nr_cpus; i++ ) { /* Assume no holes in idle-vcpu map. */ @@ -188,6 +196,10 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc case XEN_SYSCTL_availheap: { + ret = xsm_availheap(); + if ( ret ) + break; + op->u.availheap.avail_bytes = avail_domheap_pages_region( op->u.availheap.node, op->u.availheap.min_bitwidth, diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/common/trace.c --- a/xen/common/trace.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/common/trace.c Fri Sep 12 14:47:40 2008 +0900 @@ -58,6 +58,7 @@ static int t_buf_highwater; /* Number of records lost due to per-CPU trace buffer being full. */ static DEFINE_PER_CPU(unsigned long, lost_records); +static DEFINE_PER_CPU(unsigned long, lost_records_first_tsc); /* a flag recording whether initialization has been done */ /* or more properly, if the tbuf subsystem is enabled right now */ @@ -147,6 +148,31 @@ static int tb_set_size(int size) return 0; } +int trace_will_trace_event(u32 event) +{ + if ( !tb_init_done ) + return 0; + + /* + * Copied from __trace_var() + */ + if ( (tb_event_mask & event) == 0 ) + return 0; + + /* match class */ + if ( ((tb_event_mask >> TRC_CLS_SHIFT) & (event >> TRC_CLS_SHIFT)) == 0 ) + return 0; + + /* then match subclass */ + if ( (((tb_event_mask >> TRC_SUBCLS_SHIFT) & 0xf ) + & ((event >> TRC_SUBCLS_SHIFT) & 0xf )) == 0 ) + return 0; + + if ( !cpu_isset(smp_processor_id(), tb_cpu_mask) ) + return 0; + + return 1; +} /** * init_trace_bufs - performs initialization of the per-cpu trace buffers. @@ -354,22 +380,27 @@ static inline int insert_wrap_record(str NULL); } -#define LOST_REC_SIZE 8 +#define LOST_REC_SIZE (4 + 8 + 16) /* header + tsc + sizeof(struct ed) */ static inline int insert_lost_records(struct t_buf *buf) { struct { u32 lost_records; - } ed; - + u32 did:16, vid:16; + u64 first_tsc; + } __attribute__((packed)) ed; + + ed.vid = current->vcpu_id; + ed.did = current->domain->domain_id; ed.lost_records = this_cpu(lost_records); + ed.first_tsc = this_cpu(lost_records_first_tsc); this_cpu(lost_records) = 0; return __insert_record(buf, TRC_LOST_RECORDS, sizeof(ed), - 0 /* !cycles */, + 1 /* cycles */, LOST_REC_SIZE, (unsigned char *)&ed); } @@ -401,7 +432,8 @@ void __trace_var(u32 event, int cycles, int extra_word; int started_below_highwater; - ASSERT(tb_init_done); + if( !tb_init_done ) + return; /* Convert byte count into word count, rounding up */ extra_word = (extra / sizeof(u32)); @@ -479,7 +511,8 @@ void __trace_var(u32 event, int cycles, /* Do we have enough space for everything? 
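
trace_will_trace_event() above lets callers skip building a trace record that would only be filtered out later. A standalone sketch of the three-level mask test; TRC_CLS_SHIFT = 16 and TRC_SUBCLS_SHIFT = 12 are assumed to match the public trace header:

    #include <stdint.h>
    #include <stdio.h>

    #define TRC_CLS_SHIFT    16
    #define TRC_SUBCLS_SHIFT 12

    static int will_trace(uint32_t mask, uint32_t event)
    {
        if ((mask & event) == 0)
            return 0;
        if (((mask >> TRC_CLS_SHIFT) & (event >> TRC_CLS_SHIFT)) == 0)
            return 0;                                   /* class mismatch    */
        if ((((mask >> TRC_SUBCLS_SHIFT) & 0xf) &
             ((event >> TRC_SUBCLS_SHIFT) & 0xf)) == 0)
            return 0;                                   /* subclass mismatch */
        return 1;
    }

    int main(void)
    {
        uint32_t sched_all = 0x0002f000;  /* TRC_SCHED, all subclasses */
        printf("%d\n", will_trace(sched_all, 0x00021001)); /* 1           */
        printf("%d\n", will_trace(sched_all, 0x00081001)); /* 0: HVM class */
        return 0;
    }
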
*/ if ( total_size > bytes_to_tail ) { - this_cpu(lost_records)++; + if ( ++this_cpu(lost_records) == 1 ) + this_cpu(lost_records_first_tsc)=(u64)get_cycles(); local_irq_restore(flags); return; } diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/drivers/acpi/hwregs.c --- a/xen/drivers/acpi/hwregs.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/drivers/acpi/hwregs.c Fri Sep 12 14:47:40 2008 +0900 @@ -239,11 +239,13 @@ acpi_status acpi_set_register(u32 regist case ACPI_REGISTER_PM2_CONTROL: +#if 0 /* Redundant read in original Linux code. */ status = acpi_hw_register_read(ACPI_REGISTER_PM2_CONTROL, ®ister_value); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } +#endif ACPI_DEBUG_PRINT((ACPI_DB_IO, "PM2 control: Read %X from %8.8X%8.8X\n", diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/drivers/passthrough/iommu.c --- a/xen/drivers/passthrough/iommu.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/drivers/passthrough/iommu.c Fri Sep 12 14:47:40 2008 +0900 @@ -33,11 +33,13 @@ int amd_iov_detect(void); * pv Enable IOMMU for PV domains * no-pv Disable IOMMU for PV domains (default) * force|required Don't boot unless IOMMU is enabled + * passthrough Bypass VT-d translation for Dom0 */ custom_param("iommu", parse_iommu_param); int iommu_enabled = 0; int iommu_pv_enabled = 0; int force_iommu = 0; +int iommu_passthrough = 0; static void __init parse_iommu_param(char *s) { @@ -58,6 +60,8 @@ static void __init parse_iommu_param(cha iommu_pv_enabled = 0; else if ( !strcmp(s, "force") || !strcmp(s, "required") ) force_iommu = 1; + else if ( !strcmp(s, "passthrough") ) + iommu_passthrough = 1; s = ss + 1; } while ( ss ); diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/drivers/passthrough/vtd/iommu.c --- a/xen/drivers/passthrough/vtd/iommu.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/drivers/passthrough/vtd/iommu.c Fri Sep 12 14:47:40 2008 +0900 @@ -1090,12 +1090,13 @@ static int domain_context_mapping_one( } spin_lock_irqsave(&iommu->lock, flags); - -#ifdef CONTEXT_PASSTHRU - if ( ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) ) + if ( iommu_passthrough && + ecap_pass_thru(iommu->ecap) && (domain->domain_id == 0) ) + { context_set_translation_type(*context, CONTEXT_TT_PASS_THRU); + agaw = level_to_agaw(iommu->nr_pt_levels); + } else -#endif { /* Ensure we have pagetables allocated down to leaf PTE. 
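
parse_iommu_param() above walks a comma-separated boot option. A minimal standalone sketch of the same strchr() loop, covering just the flags this changeset touches (the NULL check before advancing is added to keep the sketch strictly well-defined):

    #include <stdio.h>
    #include <string.h>

    static int iommu_pv, force_iommu, iommu_passthrough;

    static void parse_iommu_param(char *s)
    {
        char *ss;
        do {
            ss = strchr(s, ',');
            if (ss)
                *ss = '\0';
            if (!strcmp(s, "pv"))
                iommu_pv = 1;
            else if (!strcmp(s, "force") || !strcmp(s, "required"))
                force_iommu = 1;
            else if (!strcmp(s, "passthrough"))
                iommu_passthrough = 1;
            if (ss)
                s = ss + 1;
        } while (ss);
    }

    int main(void)
    {
        char opt[] = "pv,passthrough";
        parse_iommu_param(opt);
        printf("pv=%d pt=%d force=%d\n",
               iommu_pv, iommu_passthrough, force_iommu);
        return 0;
    }
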
*/ if ( hd->pgd_maddr == 0 ) @@ -1459,11 +1460,13 @@ int intel_iommu_map_page( u64 pg_maddr; int pte_present; -#ifdef CONTEXT_PASSTHRU + drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list); + iommu = drhd->iommu; + /* do nothing if dom0 and iommu supports pass thru */ - if ( ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) ) + if ( iommu_passthrough && + ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) ) return 0; -#endif pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1); if ( pg_maddr == 0 ) @@ -1500,11 +1503,10 @@ int intel_iommu_unmap_page(struct domain drhd = list_entry(acpi_drhd_units.next, typeof(*drhd), list); iommu = drhd->iommu; -#ifdef CONTEXT_PASSTHRU /* do nothing if dom0 and iommu supports pass thru */ - if ( ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) ) + if ( iommu_passthrough && + ecap_pass_thru(iommu->ecap) && (d->domain_id == 0) ) return 0; -#endif dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K); diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-ia64/shadow.h --- a/xen/include/asm-ia64/shadow.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/asm-ia64/shadow.h Fri Sep 12 14:47:40 2008 +0900 @@ -63,8 +63,6 @@ shadow_mark_page_dirty(struct domain *d, return 0; } -#define guest_physmap_max_mem_pages(d, n) (0) - #endif // _XEN_SHADOW_H /* diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/bitops.h --- a/xen/include/asm-x86/bitops.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/asm-x86/bitops.h Fri Sep 12 14:47:40 2008 +0900 @@ -116,8 +116,8 @@ static inline void __clear_bit(int nr, v __clear_bit(nr, addr); \ }) -#define smp_mb__before_clear_bit() barrier() -#define smp_mb__after_clear_bit() barrier() +#define smp_mb__before_clear_bit() ((void)0) +#define smp_mb__after_clear_bit() ((void)0) /** * __change_bit - Toggle a bit in memory diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/guest_access.h --- a/xen/include/asm-x86/guest_access.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/asm-x86/guest_access.h Fri Sep 12 14:47:40 2008 +0900 @@ -8,7 +8,7 @@ #define __ASM_X86_GUEST_ACCESS_H__ #include <asm/uaccess.h> -#include <asm/shadow.h> +#include <asm/paging.h> #include <asm/hvm/support.h> #include <asm/hvm/guest_access.h> @@ -87,10 +87,10 @@ * Allows use of faster __copy_* functions. 
*/ #define guest_handle_okay(hnd, nr) \ - (shadow_mode_external(current->domain) || \ + (paging_mode_external(current->domain) || \ array_access_ok((hnd).p, (nr), sizeof(*(hnd).p))) #define guest_handle_subrange_okay(hnd, first, last) \ - (shadow_mode_external(current->domain) || \ + (paging_mode_external(current->domain) || \ array_access_ok((hnd).p + (first), \ (last)-(first)+1, \ sizeof(*(hnd).p))) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/hvm/trace.h --- a/xen/include/asm-x86/hvm/trace.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/asm-x86/hvm/trace.h Fri Sep 12 14:47:40 2008 +0900 @@ -56,16 +56,13 @@ #define TRC_PAR_LONG(par) (par) #endif -#define HVMTRACE_ND(evt, cycles, vcpu, count, d1, d2, d3, d4, d5, d6) \ +#define HVMTRACE_ND(evt, cycles, count, d1, d2, d3, d4, d5, d6) \ do { \ if ( unlikely(tb_init_done) && DO_TRC_HVM_ ## evt ) \ { \ struct { \ - u32 did:16, vid:16; \ u32 d[6]; \ } _d; \ - _d.did=(vcpu)->domain->domain_id; \ - _d.vid=(vcpu)->vcpu_id; \ _d.d[0]=(d1); \ _d.d[1]=(d2); \ _d.d[2]=(d3); \ @@ -77,32 +74,32 @@ } \ } while(0) -#define HVMTRACE_6D(evt, vcpu, d1, d2, d3, d4, d5, d6) \ - HVMTRACE_ND(evt, 0, vcpu, 6, d1, d2, d3, d4, d5, d6) -#define HVMTRACE_5D(evt, vcpu, d1, d2, d3, d4, d5) \ - HVMTRACE_ND(evt, 0, vcpu, 5, d1, d2, d3, d4, d5, 0) -#define HVMTRACE_4D(evt, vcpu, d1, d2, d3, d4) \ - HVMTRACE_ND(evt, 0, vcpu, 4, d1, d2, d3, d4, 0, 0) -#define HVMTRACE_3D(evt, vcpu, d1, d2, d3) \ - HVMTRACE_ND(evt, 0, vcpu, 3, d1, d2, d3, 0, 0, 0) -#define HVMTRACE_2D(evt, vcpu, d1, d2) \ - HVMTRACE_ND(evt, 0, vcpu, 2, d1, d2, 0, 0, 0, 0) -#define HVMTRACE_1D(evt, vcpu, d1) \ - HVMTRACE_ND(evt, 0, vcpu, 1, d1, 0, 0, 0, 0, 0) -#define HVMTRACE_0D(evt, vcpu) \ - HVMTRACE_ND(evt, 0, vcpu, 0, 0, 0, 0, 0, 0, 0) +#define HVMTRACE_6D(evt, d1, d2, d3, d4, d5, d6) \ + HVMTRACE_ND(evt, 0, 6, d1, d2, d3, d4, d5, d6) +#define HVMTRACE_5D(evt, d1, d2, d3, d4, d5) \ + HVMTRACE_ND(evt, 0, 5, d1, d2, d3, d4, d5, 0) +#define HVMTRACE_4D(evt, d1, d2, d3, d4) \ + HVMTRACE_ND(evt, 0, 4, d1, d2, d3, d4, 0, 0) +#define HVMTRACE_3D(evt, d1, d2, d3) \ + HVMTRACE_ND(evt, 0, 3, d1, d2, d3, 0, 0, 0) +#define HVMTRACE_2D(evt, d1, d2) \ + HVMTRACE_ND(evt, 0, 2, d1, d2, 0, 0, 0, 0) +#define HVMTRACE_1D(evt, d1) \ + HVMTRACE_ND(evt, 0, 1, d1, 0, 0, 0, 0, 0) +#define HVMTRACE_0D(evt) \ + HVMTRACE_ND(evt, 0, 0, 0, 0, 0, 0, 0, 0) #ifdef __x86_64__ -#define HVMTRACE_LONG_1D(evt, vcpu, d1) \ - HVMTRACE_2D(evt ## 64, vcpu, (d1) & 0xFFFFFFFF, (d1) >> 32) -#define HVMTRACE_LONG_2D(evt,vcpu,d1,d2, ...) \ - HVMTRACE_3D(evt ## 64, vcpu, d1, d2) -#define HVMTRACE_LONG_3D(evt, vcpu, d1, d2, d3, ...) \ - HVMTRACE_4D(evt ## 64, vcpu, d1, d2, d3) -#define HVMTRACE_LONG_4D(evt, vcpu, d1, d2, d3, d4, ...) \ - HVMTRACE_5D(evt ## 64, vcpu, d1, d2, d3, d4) +#define HVMTRACE_LONG_1D(evt, d1) \ + HVMTRACE_2D(evt ## 64, (d1) & 0xFFFFFFFF, (d1) >> 32) +#define HVMTRACE_LONG_2D(evt, d1, d2, ...) \ + HVMTRACE_3D(evt ## 64, d1, d2) +#define HVMTRACE_LONG_3D(evt, d1, d2, d3, ...) \ + HVMTRACE_4D(evt ## 64, d1, d2, d3) +#define HVMTRACE_LONG_4D(evt, d1, d2, d3, d4, ...) 
\ + HVMTRACE_5D(evt ## 64, d1, d2, d3, d4) #else #define HVMTRACE_LONG_1D HVMTRACE_1D #define HVMTRACE_LONG_2D HVMTRACE_2D diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/io_apic.h --- a/xen/include/asm-x86/io_apic.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/asm-x86/io_apic.h Fri Sep 12 14:47:40 2008 +0900 @@ -162,8 +162,6 @@ static inline void io_apic_modify(unsign /* 1 if "noapic" boot option passed */ extern int skip_ioapic_setup; -extern int msi_enable; - /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/asm-x86/mm.h Fri Sep 12 14:47:40 2008 +0900 @@ -57,6 +57,17 @@ struct page_info * (except page table pages when the guest is in shadow mode). */ u32 tlbflush_timestamp; + + /* + * When PGT_partial is true then this field is valid and indicates + * that PTEs in the range [0, @nr_validated_ptes) have been validated. + * If @partial_pte is true then PTE at @nr_validated_ptes+1 has been + * partially validated. + */ + struct { + u16 nr_validated_ptes; + bool_t partial_pte; + }; /* * Guest pages with a shadow. This does not conflict with @@ -86,9 +97,12 @@ struct page_info /* PAE only: is this an L2 page directory containing Xen-private mappings? */ #define _PGT_pae_xen_l2 26 #define PGT_pae_xen_l2 (1U<<_PGT_pae_xen_l2) - - /* 26-bit count of uses of this frame as its current type. */ -#define PGT_count_mask ((1U<<26)-1) +/* Has this page been *partially* validated for use as its current type? */ +#define _PGT_partial 25 +#define PGT_partial (1U<<_PGT_partial) + + /* 25-bit count of uses of this frame as its current type. */ +#define PGT_count_mask ((1U<<25)-1) /* Cleared when the owning guest 'frees' this page. 
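
The mm.h hunk above steals bit 25 from the type-use count for PGT_partial, shrinking PGT_count_mask to 25 bits. A toy model of packing flags and a count into one 32-bit word, with the overflow check a real get_page_type() must keep so the count never spills into the flag bits:

    #include <stdint.h>
    #include <stdio.h>

    #define PGT_partial    (1u << 25)
    #define PGT_count_mask ((1u << 25) - 1)

    static int get_type_ref(uint32_t *type_info)
    {
        if ((*type_info & PGT_count_mask) == PGT_count_mask)
            return -1;               /* count would overflow into flags */
        (*type_info)++;
        return 0;
    }

    int main(void)
    {
        uint32_t ti = PGT_partial | 3;   /* partially validated, 3 users */
        get_type_ref(&ti);
        printf("partial=%d count=%u\n",
               !!(ti & PGT_partial), ti & PGT_count_mask); /* partial=1 count=4 */
        return 0;
    }
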
*/ #define _PGC_allocated 31 @@ -154,7 +168,8 @@ extern unsigned long total_pages; extern unsigned long total_pages; void init_frametable(void); -void free_page_type(struct page_info *page, unsigned long type); +int free_page_type(struct page_info *page, unsigned long type, + int preemptible); int _shadow_mode_refcounts(struct domain *d); void cleanup_page_cacheattr(struct page_info *page); @@ -165,6 +180,8 @@ int get_page(struct page_info *page, st int get_page(struct page_info *page, struct domain *domain); void put_page_type(struct page_info *page); int get_page_type(struct page_info *page, unsigned long type); +int put_page_type_preemptible(struct page_info *page); +int get_page_type_preemptible(struct page_info *page, unsigned long type); int get_page_from_l1e(l1_pgentry_t l1e, struct domain *d); void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d); @@ -174,6 +191,19 @@ static inline void put_page_and_type(str put_page(page); } +static inline int put_page_and_type_preemptible(struct page_info *page, + int preemptible) +{ + int rc = 0; + + if ( preemptible ) + rc = put_page_type_preemptible(page); + else + put_page_type(page); + if ( likely(rc == 0) ) + put_page(page); + return rc; +} static inline int get_page_and_type(struct page_info *page, struct domain *domain, diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/msr-index.h --- a/xen/include/asm-x86/msr-index.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/asm-x86/msr-index.h Fri Sep 12 14:47:40 2008 +0900 @@ -194,10 +194,22 @@ #define _K8_VMCR_SVME_DISABLE 4 #define K8_VMCR_SVME_DISABLE (1 << _K8_VMCR_SVME_DISABLE) +/* AMD64 MSRs */ +#define MSR_AMD64_NB_CFG 0xc001001f +#define AMD64_NB_CFG_CF8_EXT_ENABLE_BIT 46 + /* AMD Family10h machine check MSRs */ #define MSR_F10_MC4_MISC1 0xc0000408 #define MSR_F10_MC4_MISC2 0xc0000409 #define MSR_F10_MC4_MISC3 0xc000040A + +/* Other AMD Fam10h MSRs */ +#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 +#define FAM10H_MMIO_CONF_ENABLE_BIT 0 +#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf +#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 +#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff +#define FAM10H_MMIO_CONF_BASE_SHIFT 20 /* K6 MSRs */ #define MSR_K6_EFER 0xc0000080 diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/asm-x86/shadow.h --- a/xen/include/asm-x86/shadow.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/asm-x86/shadow.h Fri Sep 12 14:47:40 2008 +0900 @@ -115,8 +115,6 @@ static inline void shadow_remove_all_sha sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */); } -#define guest_physmap_max_mem_pages(d, n) (0) - #endif /* _XEN_SHADOW_H */ /* diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/public/trace.h --- a/xen/include/public/trace.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/public/trace.h Fri Sep 12 14:47:40 2008 +0900 @@ -37,6 +37,7 @@ #define TRC_HVM 0x0008f000 /* Xen HVM trace */ #define TRC_MEM 0x0010f000 /* Xen memory trace */ #define TRC_PV 0x0020f000 /* Xen PV traces */ +#define TRC_SHADOW 0x0040f000 /* Xen shadow tracing */ #define TRC_ALL 0x0ffff000 #define TRC_HD_TO_EVENT(x) ((x)&0x0fffffff) #define TRC_HD_CYCLE_FLAG (1UL<<31) @@ -50,26 +51,30 @@ #define TRC_HVM_ENTRYEXIT 0x00081000 /* VMENTRY and #VMEXIT */ #define TRC_HVM_HANDLER 0x00082000 /* various HVM handlers */ +#define TRC_SCHED_MIN 0x00021000 /* Just runstate changes */ +#define TRC_SCHED_VERBOSE 0x00028000 /* More inclusive scheduling */ + /* Trace events per class */ #define TRC_LOST_RECORDS (TRC_GEN + 1) #define TRC_TRACE_WRAP_BUFFER (TRC_GEN + 2) #define TRC_TRACE_CPU_CHANGE 
(TRC_GEN + 3) -#define TRC_SCHED_DOM_ADD (TRC_SCHED + 1) -#define TRC_SCHED_DOM_REM (TRC_SCHED + 2) -#define TRC_SCHED_SLEEP (TRC_SCHED + 3) -#define TRC_SCHED_WAKE (TRC_SCHED + 4) -#define TRC_SCHED_YIELD (TRC_SCHED + 5) -#define TRC_SCHED_BLOCK (TRC_SCHED + 6) -#define TRC_SCHED_SHUTDOWN (TRC_SCHED + 7) -#define TRC_SCHED_CTL (TRC_SCHED + 8) -#define TRC_SCHED_ADJDOM (TRC_SCHED + 9) -#define TRC_SCHED_SWITCH (TRC_SCHED + 10) -#define TRC_SCHED_S_TIMER_FN (TRC_SCHED + 11) -#define TRC_SCHED_T_TIMER_FN (TRC_SCHED + 12) -#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED + 13) -#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED + 14) -#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED + 15) +#define TRC_SCHED_RUNSTATE_CHANGE (TRC_SCHED_MIN + 1) +#define TRC_SCHED_DOM_ADD (TRC_SCHED_VERBOSE + 1) +#define TRC_SCHED_DOM_REM (TRC_SCHED_VERBOSE + 2) +#define TRC_SCHED_SLEEP (TRC_SCHED_VERBOSE + 3) +#define TRC_SCHED_WAKE (TRC_SCHED_VERBOSE + 4) +#define TRC_SCHED_YIELD (TRC_SCHED_VERBOSE + 5) +#define TRC_SCHED_BLOCK (TRC_SCHED_VERBOSE + 6) +#define TRC_SCHED_SHUTDOWN (TRC_SCHED_VERBOSE + 7) +#define TRC_SCHED_CTL (TRC_SCHED_VERBOSE + 8) +#define TRC_SCHED_ADJDOM (TRC_SCHED_VERBOSE + 9) +#define TRC_SCHED_SWITCH (TRC_SCHED_VERBOSE + 10) +#define TRC_SCHED_S_TIMER_FN (TRC_SCHED_VERBOSE + 11) +#define TRC_SCHED_T_TIMER_FN (TRC_SCHED_VERBOSE + 12) +#define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED_VERBOSE + 13) +#define TRC_SCHED_SWITCH_INFPREV (TRC_SCHED_VERBOSE + 14) +#define TRC_SCHED_SWITCH_INFNEXT (TRC_SCHED_VERBOSE + 15) #define TRC_MEM_PAGE_GRANT_MAP (TRC_MEM + 1) #define TRC_MEM_PAGE_GRANT_UNMAP (TRC_MEM + 2) @@ -88,6 +93,22 @@ #define TRC_PV_PTWR_EMULATION_PAE (TRC_PV + 12) /* Indicates that addresses in trace record are 64 bits */ #define TRC_64_FLAG (0x100) + +#define TRC_SHADOW_NOT_SHADOW (TRC_SHADOW + 1) +#define TRC_SHADOW_FAST_PROPAGATE (TRC_SHADOW + 2) +#define TRC_SHADOW_FAST_MMIO (TRC_SHADOW + 3) +#define TRC_SHADOW_FALSE_FAST_PATH (TRC_SHADOW + 4) +#define TRC_SHADOW_MMIO (TRC_SHADOW + 5) +#define TRC_SHADOW_FIXUP (TRC_SHADOW + 6) +#define TRC_SHADOW_DOMF_DYING (TRC_SHADOW + 7) +#define TRC_SHADOW_EMULATE (TRC_SHADOW + 8) +#define TRC_SHADOW_EMULATE_UNSHADOW_USER (TRC_SHADOW + 9) +#define TRC_SHADOW_EMULATE_UNSHADOW_EVTINJ (TRC_SHADOW + 10) +#define TRC_SHADOW_EMULATE_UNSHADOW_UNHANDLED (TRC_SHADOW + 11) +#define TRC_SHADOW_WRMAP_BF (TRC_SHADOW + 12) +#define TRC_SHADOW_PREALLOC_UNPIN (TRC_SHADOW + 13) +#define TRC_SHADOW_RESYNC_FULL (TRC_SHADOW + 14) +#define TRC_SHADOW_RESYNC_ONLY (TRC_SHADOW + 15) /* trace events per subclass */ #define TRC_HVM_VMENTRY (TRC_HVM_ENTRYEXIT + 0x01) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/cpuidle.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/xen/cpuidle.h Fri Sep 12 14:47:40 2008 +0900 @@ -0,0 +1,82 @@ +/* + * cpuidle.h - xen idle state module derived from Linux + * + * (C) 2007 Venkatesh Pallipadi <venkatesh.pallipadi@xxxxxxxxx> + * Shaohua Li <shaohua.li@xxxxxxxxx> + * Adam Belay <abelay@xxxxxxxxxx> + * Copyright (C) 2008 Intel Corporation + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +#ifndef _XEN_CPUIDLE_H +#define _XEN_CPUIDLE_H + +#define ACPI_PROCESSOR_MAX_POWER 8 +#define CPUIDLE_NAME_LEN 16 + +struct acpi_processor_cx +{ + u8 valid; + u8 type; + u32 address; + u8 space_id; + u32 latency; + u32 latency_ticks; + u32 power; + u32 usage; + u64 time; + u32 target_residency; +}; + +struct acpi_processor_flags +{ + u8 bm_control:1; + u8 bm_check:1; + u8 has_cst:1; + u8 power_setup_done:1; + u8 bm_rld_set:1; +}; + +struct acpi_processor_power +{ + unsigned int cpu; + struct acpi_processor_flags flags; + struct acpi_processor_cx *last_state; + struct acpi_processor_cx *safe_state; + u32 last_residency; + void *gdata; /* governor specific data */ + u32 count; + struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER]; +}; + +struct cpuidle_governor +{ + char name[CPUIDLE_NAME_LEN]; + unsigned int rating; + + int (*enable) (struct acpi_processor_power *dev); + void (*disable) (struct acpi_processor_power *dev); + + int (*select) (struct acpi_processor_power *dev); + void (*reflect) (struct acpi_processor_power *dev); +}; + +extern struct cpuidle_governor *cpuidle_current_governor; + +#endif /* _XEN_CPUIDLE_H */ diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/iommu.h --- a/xen/include/xen/iommu.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/xen/iommu.h Fri Sep 12 14:47:40 2008 +0900 @@ -31,6 +31,7 @@ extern int iommu_enabled; extern int iommu_enabled; extern int iommu_pv_enabled; extern int force_iommu; +extern int iommu_passthrough; #define domain_hvm_iommu(d) (&d->arch.hvm_domain.hvm_iommu) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/sched.h --- a/xen/include/xen/sched.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/xen/sched.h Fri Sep 12 14:47:40 2008 +0900 @@ -106,8 +106,6 @@ struct vcpu bool_t fpu_initialised; /* Has the FPU been used since it was last saved? */ bool_t fpu_dirtied; - /* Is this VCPU polling any event channels (SCHEDOP_poll)? */ - bool_t is_polling; /* Initialization completed for this VCPU? */ bool_t is_initialised; /* Currently running on a CPU? */ @@ -133,6 +131,13 @@ struct vcpu bool_t paused_for_shutdown; /* VCPU affinity is temporarily locked from controller changes? */ bool_t affinity_locked; + + /* + * > 0: a single port is being polled; + * = 0: nothing is being polled (vcpu should be clear in d->poll_mask); + * < 0: multiple ports may be being polled. + */ + int poll_evtchn; unsigned long pause_flags; atomic_t pause_count; @@ -209,14 +214,15 @@ struct domain struct domain *target; /* Is this guest being debugged by dom0? */ bool_t debugger_attached; - /* Are any VCPUs polling event channels (SCHEDOP_poll)? */ - bool_t is_polling; /* Is this guest dying (i.e., a zombie)? */ enum { DOMDYING_alive, DOMDYING_dying, DOMDYING_dead } is_dying; /* Domain is paused by controller software? */ bool_t is_paused_by_controller; /* Domain's VCPUs are pinned 1:1 to physical CPUs? */ bool_t is_pinned; + + /* Are any VCPUs polling event channels (SCHEDOP_poll)? 
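
struct cpuidle_governor above is a plain operations table. As a hypothetical illustration (the menu governor added elsewhere in this changeset weighs exit latency against predicted idle time; this one just picks the deepest valid C-state), with trimmed, compile-standalone copies of the structures from the header:

    #include <stdio.h>

    #define ACPI_PROCESSOR_MAX_POWER 8
    #define CPUIDLE_NAME_LEN 16

    struct acpi_processor_cx { unsigned char valid; unsigned int latency; };

    struct acpi_processor_power {
        unsigned int count;
        struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER];
    };

    struct cpuidle_governor {
        char name[CPUIDLE_NAME_LEN];
        unsigned int rating;
        int  (*enable)  (struct acpi_processor_power *dev);
        void (*disable) (struct acpi_processor_power *dev);
        int  (*select)  (struct acpi_processor_power *dev);
        void (*reflect) (struct acpi_processor_power *dev);
    };

    /* Always choose the deepest valid state; state 0 is the fallback. */
    static int deepest_select(struct acpi_processor_power *dev)
    {
        int i;
        for (i = (int)dev->count - 1; i > 0; i--)
            if (dev->states[i].valid)
                return i;
        return 0;
    }

    struct cpuidle_governor deepest_governor = {
        .name   = "deepest",
        .rating = 1,
        .select = deepest_select,
    };

    int main(void)
    {
        struct acpi_processor_power p = { .count = 3 };
        p.states[1].valid = 1;
        p.states[2].valid = 1;
        printf("state=%d\n", deepest_governor.select(&p)); /* state=2 */
        return 0;
    }
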
*/ + DECLARE_BITMAP(poll_mask, MAX_VIRT_CPUS); /* Guest has shut down (inc. reason code)? */ spinlock_t shutdown_lock; @@ -507,6 +513,7 @@ static inline int vcpu_runnable(struct v atomic_read(&v->domain->pause_count)); } +void vcpu_unblock(struct vcpu *v); void vcpu_pause(struct vcpu *v); void vcpu_pause_nosync(struct vcpu *v); void domain_pause(struct domain *d); @@ -517,17 +524,12 @@ void cpu_init(void); void cpu_init(void); void vcpu_force_reschedule(struct vcpu *v); +void cpu_disable_scheduler(void); int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity); int vcpu_lock_affinity(struct vcpu *v, cpumask_t *affinity); void vcpu_unlock_affinity(struct vcpu *v, cpumask_t *affinity); void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate); - -static inline void vcpu_unblock(struct vcpu *v) -{ - if ( test_and_clear_bit(_VPF_blocked, &v->pause_flags) ) - vcpu_wake(v); -} #define IS_PRIV(_d) ((_d)->is_privileged) #define IS_PRIV_FOR(_d, _t) (IS_PRIV(_d) || ((_d)->target && (_d)->target == (_t))) diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xen/trace.h --- a/xen/include/xen/trace.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/xen/trace.h Fri Sep 12 14:47:40 2008 +0900 @@ -33,6 +33,8 @@ void init_trace_bufs(void); /* used to retrieve the physical address of the trace buffers */ int tb_control(struct xen_sysctl_tbuf_op *tbc); + +int trace_will_trace_event(u32 event); void __trace_var(u32 event, int cycles, int extra, unsigned char *extra_data); diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/include/xsm/xsm.h --- a/xen/include/xsm/xsm.h Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/include/xsm/xsm.h Fri Sep 12 14:47:40 2008 +0900 @@ -64,16 +64,17 @@ struct xsm_operations { int (*getvcpucontext) (struct domain *d); int (*getvcpuinfo) (struct domain *d); int (*domain_settime) (struct domain *d); + int (*set_target) (struct domain *d, struct domain *e); int (*tbufcontrol) (void); int (*readconsole) (uint32_t clear); int (*sched_id) (void); int (*setdomainmaxmem) (struct domain *d); int (*setdomainhandle) (struct domain *d); int (*setdebugging) (struct domain *d); - int (*irq_permission) (struct domain *d, uint8_t pirq, uint8_t access); - int (*iomem_permission) (struct domain *d, unsigned long mfn, - uint8_t access); int (*perfcontrol) (void); + int (*debug_keys) (void); + int (*getcpuinfo) (void); + int (*availheap) (void); int (*evtchn_unbound) (struct domain *d, struct evtchn *chn, domid_t id2); int (*evtchn_interdomain) (struct domain *d1, struct evtchn *chn1, @@ -106,13 +107,13 @@ struct xsm_operations { int (*kexec) (void); int (*schedop_shutdown) (struct domain *d1, struct domain *d2); + int (*add_range) (struct domain *d, char *name, unsigned long s, unsigned long e); + int (*remove_range) (struct domain *d, char *name, unsigned long s, unsigned long e); long (*__do_xsm_op) (XEN_GUEST_HANDLE(xsm_op_t) op); #ifdef CONFIG_X86 int (*shadow_control) (struct domain *d, uint32_t op); - int (*ioport_permission) (struct domain *d, uint32_t ioport, - uint8_t access); int (*getpageframeinfo) (struct page_info *page); int (*getmemlist) (struct domain *d); int (*hypercall_init) (struct domain *d); @@ -130,13 +131,26 @@ struct xsm_operations { int (*microcode) (void); int (*physinfo) (void); int (*platform_quirk) (uint32_t); + int (*firmware_info) (void); + int (*acpi_sleep) (void); + int (*change_freq) (void); + int (*getidletime) (void); int (*machine_memory_map) (void); int (*domain_memory_map) (struct domain *d); - int (*mmu_normal_update) (struct domain *d, intpte_t fpte); + int 
(*mmu_normal_update) (struct domain *d, struct domain *f, + intpte_t fpte); int (*mmu_machphys_update) (struct domain *d, unsigned long mfn); - int (*update_va_mapping) (struct domain *d, l1_pgentry_t pte); + int (*update_va_mapping) (struct domain *d, struct domain *f, + l1_pgentry_t pte); int (*add_to_physmap) (struct domain *d1, struct domain *d2); int (*remove_from_physmap) (struct domain *d1, struct domain *d2); + int (*sendtrigger) (struct domain *d); + int (*test_assign_device) (uint32_t machine_bdf); + int (*assign_device) (struct domain *d, uint32_t machine_bdf); + int (*deassign_device) (struct domain *d, uint32_t machine_bdf); + int (*bind_pt_irq) (struct domain *d, struct xen_domctl_bind_pt_irq *bind); + int (*pin_mem_cacheattr) (struct domain *d); + int (*ext_vcpucontext) (struct domain *d, uint32_t cmd); #endif }; @@ -215,6 +229,11 @@ static inline int xsm_domain_settime (st return xsm_call(domain_settime(d)); } +static inline int xsm_set_target (struct domain *d, struct domain *e) +{ + return xsm_call(set_target(d, e)); +} + static inline int xsm_tbufcontrol (void) { return xsm_call(tbufcontrol()); @@ -245,21 +264,24 @@ static inline int xsm_setdebugging (stru return xsm_call(setdebugging(d)); } -static inline int xsm_irq_permission (struct domain *d, uint8_t pirq, - uint8_t access) -{ - return xsm_call(irq_permission(d, pirq, access)); -} - -static inline int xsm_iomem_permission (struct domain *d, unsigned long mfn, - uint8_t access) -{ - return xsm_call(iomem_permission(d, mfn, access)); -} - static inline int xsm_perfcontrol (void) { return xsm_call(perfcontrol()); +} + +static inline int xsm_debug_keys (void) +{ + return xsm_call(debug_keys()); +} + +static inline int xsm_availheap (void) +{ + return xsm_call(availheap()); +} + +static inline int xsm_getcpuinfo (void) +{ + return xsm_call(getcpuinfo()); } static inline int xsm_evtchn_unbound (struct domain *d1, struct evtchn *chn, @@ -385,6 +407,18 @@ static inline int xsm_schedop_shutdown ( static inline int xsm_schedop_shutdown (struct domain *d1, struct domain *d2) { return xsm_call(schedop_shutdown(d1, d2)); +} + +static inline int xsm_add_range (struct domain *d, char *name, unsigned long s, + unsigned long e) +{ + return xsm_call(add_range(d, name, s, e)); +} + +static inline int xsm_remove_range (struct domain *d, char *name, unsigned long s, + unsigned long e) +{ + return xsm_call(remove_range(d, name, s, e)); } static inline long __do_xsm_op (XEN_GUEST_HANDLE(xsm_op_t) op) @@ -413,12 +447,6 @@ static inline int xsm_shadow_control (st return xsm_call(shadow_control(d, op)); } -static inline int xsm_ioport_permission (struct domain *d, uint32_t ioport, - uint8_t access) -{ - return xsm_call(ioport_permission(d, ioport, access)); -} - static inline int xsm_getpageframeinfo (struct page_info *page) { return xsm_call(getpageframeinfo(page)); @@ -504,6 +532,26 @@ static inline int xsm_platform_quirk (ui return xsm_call(platform_quirk(quirk)); } +static inline int xsm_firmware_info (void) +{ + return xsm_call(firmware_info()); +} + +static inline int xsm_acpi_sleep (void) +{ + return xsm_call(acpi_sleep()); +} + +static inline int xsm_change_freq (void) +{ + return xsm_call(change_freq()); +} + +static inline int xsm_getidletime (void) +{ + return xsm_call(getidletime()); +} + static inline int xsm_machine_memory_map(void) { return xsm_call(machine_memory_map()); @@ -514,9 +562,10 @@ static inline int xsm_domain_memory_map( return xsm_call(domain_memory_map(d)); } -static inline int xsm_mmu_normal_update (struct 
domain *d, intpte_t fpte) -{ - return xsm_call(mmu_normal_update(d, fpte)); +static inline int xsm_mmu_normal_update (struct domain *d, struct domain *f, + intpte_t fpte) +{ + return xsm_call(mmu_normal_update(d, f, fpte)); } static inline int xsm_mmu_machphys_update (struct domain *d, unsigned long mfn) @@ -524,9 +573,10 @@ static inline int xsm_mmu_machphys_updat return xsm_call(mmu_machphys_update(d, mfn)); } -static inline int xsm_update_va_mapping(struct domain *d, l1_pgentry_t pte) -{ - return xsm_call(update_va_mapping(d, pte)); +static inline int xsm_update_va_mapping(struct domain *d, struct domain *f, + l1_pgentry_t pte) +{ + return xsm_call(update_va_mapping(d, f, pte)); } static inline int xsm_add_to_physmap(struct domain *d1, struct domain *d2) @@ -538,6 +588,42 @@ static inline int xsm_remove_from_physma { return xsm_call(remove_from_physmap(d1, d2)); } + +static inline int xsm_sendtrigger(struct domain *d) +{ + return xsm_call(sendtrigger(d)); +} + +static inline int xsm_test_assign_device(uint32_t machine_bdf) +{ + return xsm_call(test_assign_device(machine_bdf)); +} + +static inline int xsm_assign_device(struct domain *d, uint32_t machine_bdf) +{ + return xsm_call(assign_device(d, machine_bdf)); +} + +static inline int xsm_deassign_device(struct domain *d, uint32_t machine_bdf) +{ + return xsm_call(deassign_device(d, machine_bdf)); +} + +static inline int xsm_bind_pt_irq(struct domain *d, + struct xen_domctl_bind_pt_irq *bind) +{ + return xsm_call(bind_pt_irq(d, bind)); +} + +static inline int xsm_pin_mem_cacheattr(struct domain *d) +{ + return xsm_call(pin_mem_cacheattr(d)); +} + +static inline int xsm_ext_vcpucontext(struct domain *d, uint32_t cmd) +{ + return xsm_call(ext_vcpucontext(d, cmd)); +} #endif /* CONFIG_X86 */ #endif /* __XSM_H */ diff -r 4ddd63b4be9b -r ec8eaab557d8 xen/xsm/dummy.c --- a/xen/xsm/dummy.c Fri Sep 12 14:32:45 2008 +0900 +++ b/xen/xsm/dummy.c Fri Sep 12 14:47:40 2008 +0900 @@ -84,6 +84,11 @@ static int dummy_domain_settime (struct return 0; } +static int dummy_set_target (struct domain *d, struct domain *e) +{ + return 0; +} + static int dummy_tbufcontrol (void) { return 0; @@ -114,18 +119,22 @@ static int dummy_setdebugging (struct do return 0; } -static int dummy_irq_permission (struct domain *d, uint8_t pirq, uint8_t access) -{ - return 0; -} - -static int dummy_iomem_permission (struct domain *d, unsigned long mfn, - uint8_t access) -{ - return 0; -} - static int dummy_perfcontrol (void) +{ + return 0; +} + +static int dummy_debug_keys (void) +{ + return 0; +} + +static int dummy_getcpuinfo (void) +{ + return 0; +} + +static int dummy_availheap (void) { return 0; } @@ -259,18 +268,23 @@ static long dummy___do_xsm_op(XEN_GUEST_ return -ENOSYS; } +static int dummy_add_range (struct domain *d, char *name, unsigned long s, unsigned long e) +{ + return 0; +} + +static int dummy_remove_range (struct domain *d, char *name, unsigned long s, + unsigned long e) +{ + return 0; +} + #ifdef CONFIG_X86 static int dummy_shadow_control (struct domain *d, uint32_t op) { return 0; } -static int dummy_ioport_permission (struct domain *d, uint32_t ioport, - uint8_t access) -{ - return 0; -} - static int dummy_getpageframeinfo (struct page_info *page) { return 0; @@ -356,6 +370,26 @@ static int dummy_platform_quirk (uint32_ return 0; } +static int dummy_firmware_info (void) +{ + return 0; +} + +static int dummy_acpi_sleep (void) +{ + return 0; +} + +static int dummy_change_freq (void) +{ + return 0; +} + +static int dummy_getidletime (void) +{ + return 0; +} + 
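
All of the new xsm_*() wrappers above funnel through one pattern: an operations table whose every slot has a permissive dummy default, so hook sites can call unconditionally and a policy module (such as Flask) merely swaps the table at boot. A standalone sketch with a single hook; flask_like_ops and the -1 denial value are illustrative, not the real module:

    #include <stdio.h>

    struct xsm_operations { int (*debug_keys)(void); };

    static int dummy_debug_keys(void) { return 0; }
    static struct xsm_operations dummy_ops = { .debug_keys = dummy_debug_keys };
    static struct xsm_operations *xsm_ops = &dummy_ops;

    static int xsm_debug_keys(void) { return xsm_ops->debug_keys(); }

    /* A policy module replaces the table with its own enforcing hooks. */
    static int deny_debug_keys(void) { return -1; /* think -EPERM */ }
    static struct xsm_operations flask_like_ops = {
        .debug_keys = deny_debug_keys
    };

    int main(void)
    {
        printf("default: %d\n", xsm_debug_keys());  /*  0: allowed */
        xsm_ops = &flask_like_ops;
        printf("policy:  %d\n", xsm_debug_keys());  /* -1: denied  */
        return 0;
    }
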
static int dummy_machine_memory_map (void) { return 0; @@ -366,7 +400,8 @@ static int dummy_domain_memory_map (stru return 0; } -static int dummy_mmu_normal_update (struct domain *d, intpte_t fpte) +static int dummy_mmu_normal_update (struct domain *d, struct domain *f, + intpte_t fpte) { return 0; } @@ -376,12 +411,48 @@ static int dummy_mmu_machphys_update (st return 0; } _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog