[Xen-changelog] [xen-unstable] merge
# HG changeset patch # User Tim Deegan <tim@xxxxxxx> # Date 1322760252 0 # Node ID f30a33c5b5bd1e2b0bdffa3a649490157e451a4e # Parent f25a004a6de8efc15d95408f3e92081393360acb # Parent 3e5683b6b37f9010772dac4a92166b1666485ddd merge --- diff -r f25a004a6de8 -r f30a33c5b5bd Config.mk --- a/Config.mk Thu Dec 01 17:21:24 2011 +0000 +++ b/Config.mk Thu Dec 01 17:24:12 2011 +0000 @@ -232,7 +232,7 @@ OCAML_TOOLS ?= y CONFIG_MINITERM ?= n CONFIG_LOMOUNT ?= n -CONFIG_SYSTEM_LIBAIO ?= n +CONFIG_SYSTEM_LIBAIO ?= y ifeq ($(OCAML_TOOLS),y) OCAML_TOOLS := $(shell ocamlopt -v > /dev/null 2>&1 && echo "y" || echo "n") diff -r f25a004a6de8 -r f30a33c5b5bd docs/gen-html-index --- a/docs/gen-html-index Thu Dec 01 17:21:24 2011 +0000 +++ b/docs/gen-html-index Thu Dec 01 17:24:12 2011 +0000 @@ -10,7 +10,6 @@ use Getopt::Long; use IO::File; use File::Basename; -use List::MoreUtils qw/ uniq /; Getopt::Long::Configure('bundling'); @@ -99,6 +98,12 @@ } } +sub uniq (@) { + my %h; + foreach (@_) { $h{$_} = 1; } + return keys %h; +} + for (@docs) { s,^\Q$outdir\E/,, } @docs = grep { -e "$outdir/$_" && (make_linktext($_) ne "NO-INDEX") } @docs; diff -r f25a004a6de8 -r f30a33c5b5bd docs/man/xl.pod.1 --- a/docs/man/xl.pod.1 Thu Dec 01 17:21:24 2011 +0000 +++ b/docs/man/xl.pod.1 Thu Dec 01 17:24:12 2011 +0000 @@ -32,19 +32,51 @@ =head1 NOTES +=over 4 + +=item start the script B</etc/init.d/xencommons> at boot time + Most B<xl> operations rely upon B<xenstored> and B<xenconsoled>: make sure you start the script B</etc/init.d/xencommons> at boot time to initialize all the daemons needed by B<xl>. +=item setup a B<xenbr0> bridge in dom0 + In the most common network configuration, you need to setup a bridge in dom0 named B<xenbr0> in order to have a working network in the guest domains. Please refer to the documentation of your Linux distribution to know how to setup the bridge. +=item B<autoballoon> + +If you specify the amount of memory dom0 has, passing B<dom0_mem> to +Xen, it is highly reccomended to disable B<autoballoon>. Edit +B</etc/xen/xl.conf> and set it to 0. + +=item run xl as B<root> + Most B<xl> commands require root privileges to run due to the communications channels used to talk to the hypervisor. Running as non root will return an error. +=back + +=head1 GLOBAL OPTIONS + +Some global options are always available: + +=over 4 + +=item B<-v> + +Verbose. + +=item B<-N> + +Dry run: do not actually execute the command. + +=back + =head1 DOMAIN SUBCOMMANDS The following subcommands manipulate domains directly. As stated @@ -52,13 +84,19 @@ =over 4 -=item B<create> [I<OPTIONS>] I<configfile> +=item B<button-press> I<domain-id> I<button> -The create subcommand requires a config file: see L<xl.cfg(5)> for -full details of that file format and possible options. +Indicate an ACPI button press to the domain. I<button> is may be 'power' or +'sleep'. This command is only available for HVM domains. -I<configfile> can either be an absolute path to a file, or a relative -path to a file located in /etc/xen. +=item B<create> [I<configfile>] [I<OPTIONS>] + +The create subcommand takes a config file as first argument: see +L<xl.cfg> for full details of that file format and possible options. +If I<configfile> is missing B<XL> creates the domain starting from the +default value for every option. + +I<configfile> has to be an absolute path to a file. Create will return B<as soon> as the domain is started. This B<does not> mean the guest OS in the domain has actually booted, or is @@ -76,11 +114,6 @@ Use the given configuration file. 
-=item B<-n>, B<--dryrun> - -Dry run - prints the resulting configuration in SXP but does not create -the domain. - =item B<-p> Leave the domain paused after it is created. @@ -88,7 +121,15 @@ =item B<-c> Attach console to the domain as soon as it has started. This is -useful for determining issues with crashing domains. +useful for determining issues with crashing domains and just as a +general convenience since you often want to watch the +domain boot. + +=item B<key=value> + +It is possible to pass I<key=value> pairs on the command line to provide +options as if they were written in the configuration file; these override +whatever is in the I<configfile>. =back @@ -105,7 +146,7 @@ =back -=item B<console> I<domain-id> +=item B<console> [I<OPTIONS>] I<domain-id> Attach to domain I<domain-id>'s console. If you've set up your domains to have a traditional log in console this will look much like a normal @@ -113,17 +154,20 @@ Use the key combination Ctrl+] to detach the domain console. -=item B<vncviewer> [I<OPTIONS>] I<domain-id> - -Attach to domain's VNC server, forking a vncviewer process. - B<OPTIONS> =over 4 -=item I<--autopass> +=item I<-t [pv|serial]> -Pass VNC password to vncviewer via stdin. +Connect to a PV console or connect to an emulated serial console. +PV consoles are the only consoles available for PV domains while HVM +domains can have both. If this option is not specified it defaults to +emulated serial for HVM guests and PV console for PV guests. + +=item I<-n NUM> + +Connect to console number I<NUM>. Console numbers start from 0. =back @@ -153,6 +197,10 @@ be written to a distribution specific directory for dump files. Such as: /var/lib/xen/dump or /var/xen/dump. +=item B<getenforce> + +Returns the current enforcing mode of the Flask Xen security module. + =item B<help> [I<--long>] Displays the short help message (i.e. common commands). @@ -226,7 +274,8 @@ =item B<s - shutdown> -FIXME: Why would you ever see this state? +The guest OS has shut down (SCHEDOP_shutdown has been called) but the +domain is not dying yet. =item B<c - crashed> @@ -239,8 +288,6 @@ The domain is in process of dying, but hasn't completely shutdown or crashed. -FIXME: Is this right? - =back B<NOTES> @@ -256,6 +303,10 @@ =back +=item B<loadpolicy> I<policyfile> + +Loads a new policy int the Flask Xen security module. + =item B<mem-max> I<domain-id> I<mem> Specify the maximum amount of memory the domain is able to use, appending 't' @@ -297,7 +348,7 @@ =item B<-e> On the new host, do not wait in the background (on <host>) for the death of the -domain. +domain. See the corresponding option of the I<create> subcommand. =item B<-C> I<config> @@ -317,6 +368,7 @@ command run from the console. The command returns as soon as it has executed the reboot action, which may be significantly before the domain actually reboots. +It requires PV drivers installed in your guest OS. The behavior of what happens to a domain when it reboots is set by the B<on_reboot> parameter of the domain configuration file when the @@ -337,6 +389,7 @@ =item B<-e> Do not wait in the background for the death of the domain on the new host. +See the corresponding option of the I<create> subcommand. 
=item B<-d> @@ -344,6 +397,10 @@ =back +=item B<setenforce> I<1|0|Enforcing|Permissive> + +Sets the current enforcing mode of the Flask Xen security module + =item B<save> [I<OPTIONS>] I<domain-id> I<CheckpointFile> [I<ConfigFile>] Saves a running domain to a state file so that it can be restored @@ -353,7 +410,6 @@ Passing a config file argument allows the user to manually select the VM config file used to create the domain. - =over 4 =item B<-c> @@ -370,6 +426,7 @@ succeed, and may take a variable length of time depending on what services must be shutdown in the domain. The command returns immediately after signally the domain unless that B<-w> flag is used. +For HVM domains it requires PV drivers to be installed in your guest OS. The behavior of what happens to a domain when it reboots is set by the B<on_shutdown> parameter of the domain configuration file when the @@ -387,9 +444,17 @@ =item B<sysrq> I<domain-id> I<letter> -Send a I<Magic System Request> signal to the domain. For more -information on available magic sys req operations, see sysrq.txt in -your Linux Kernel sources. +Send a <Magic System Request> to the domain, each type of request is +represented by a different letter. +It can be used to send SysRq requests to Linux guests, see sysrq.txt in +your Linux Kernel sources for more information. +It requires PV drivers to be installed in your guest OS. + +=item B<trigger> I<domain-id> I<nmi|reset|init|power|sleep> [I<VCPU>] + +Send a trigger to a domain, where the trigger can be: nmi, reset, init, power +or sleep. Optionally a specific vcpu number can be passed as an argument. +This command is only available for HVM domains. =item B<unpause> I<domain-id> @@ -410,10 +475,6 @@ configured VCPU count is an error. Trying to set VCPUs to < 1 will be quietly ignored. -Because this operation requires cooperation from the domain operating -system, there is no guarantee that it will succeed. This command will -not work with a full virt domain. - =item B<vcpu-list> [I<domain-id>] Lists VCPU information for a specific domain. If no domain is @@ -430,27 +491,19 @@ this, by ensuring certain VCPUs can only run on certain physical CPUs. -=item B<button-press> I<domain-id> I<button> +=item B<vncviewer> [I<OPTIONS>] I<domain-id> -Indicate an ACPI button press to the domain. I<button> is may be 'power' or -'sleep'. +Attach to domain's VNC server, forking a vncviewer process. -=item B<trigger> I<domain-id> I<nmi|reset|init|power|sleep> [I<VCPU>] +B<OPTIONS> -Send a trigger to a domain, where the trigger can be: nmi, reset, init, power -or sleep. Optionally a specific vcpu number can be passed as an argument. +=over 4 -=item B<getenforce> +=item I<--autopass> -Returns the current enforcing mode of the Flask Xen security module. +Pass VNC password to vncviewer via stdin. -=item B<setenforce> I<1|0|Enforcing|Permissive> - -Sets the current enforcing mode of the Flask Xen security module - -=item B<loadpolicy> I<policyfile> - -Loads a new policy int the Flask Xen security module. +=back =back @@ -460,7 +513,8 @@ =item B<debug-keys> I<keys> -Send debug I<keys> to Xen. +Send debug I<keys> to Xen. It is the same as pressing the Xen +"conswitch" (Ctrl-A by default) three times and then pressing "keys". =item B<dmesg> [B<-c>] @@ -483,39 +537,41 @@ Print information about the Xen host in I<name : value> format. When reporting a Xen bug, please provide this information as part of the -bug report. +bug report. See I<http://wiki.xen.org/xenwiki/ReportingBugs> on how to +report Xen bugs. 
-Sample output looks as follows (lines wrapped manually to make the man -page more readable): +Sample output looks as follows: - host : talon - release : 2.6.12.6-xen0 - version : #1 Mon Nov 14 14:26:26 EST 2005 - machine : i686 - nr_cpus : 2 + host : scarlett + release : 3.1.0-rc4+ + version : #1001 SMP Wed Oct 19 11:09:54 UTC 2011 + machine : x86_64 + nr_cpus : 4 nr_nodes : 1 - cores_per_socket : 1 + cores_per_socket : 4 threads_per_core : 1 - cpu_mhz : 696 - hw_caps : 0383fbff:00000000:00000000:00000040 - total_memory : 767 - free_memory : 37 - xen_major : 3 - xen_minor : 0 - xen_extra : -devel - xen_caps : xen-3.0-x86_32 + cpu_mhz : 2266 + hw_caps : bfebfbff:28100800:00000000:00003b40:009ce3bd:00000000:00000001:00000000 + virt_caps : hvm hvm_directio + total_memory : 6141 + free_memory : 4274 + free_cpus : 0 + xen_major : 4 + xen_minor : 2 + xen_extra : -unstable + xen_caps : xen-3.0-x86_64 xen-3.0-x86_32p hvm-3.0-x86_32 hvm-3.0-x86_32p hvm-3.0-x86_64 xen_scheduler : credit xen_pagesize : 4096 - platform_params : virt_start=0xfc000000 - xen_changeset : Mon Nov 14 18:13:38 2005 +0100 - 7793:090e44133d40 - cc_compiler : gcc version 3.4.3 (Mandrakelinux - 10.2 3.4.3-7mdk) - cc_compile_by : sdague - cc_compile_domain : (none) - cc_compile_date : Mon Nov 14 14:16:48 EST 2005 + platform_params : virt_start=0xffff800000000000 + xen_changeset : Wed Nov 02 17:09:09 2011 +0000 24066:54a5e994a241 + xen_commandline : com1=115200,8n1 guest_loglvl=all dom0_mem=750M console=com1 + cc_compiler : gcc version 4.4.5 (Debian 4.4.5-8) + cc_compile_by : sstabellini + cc_compile_domain : uk.xensource.com + cc_compile_date : Tue Nov 8 12:03:05 UTC 2011 xend_config_format : 4 + B<FIELDS> Not all fields will be explained here, but some of the less obvious @@ -527,7 +583,8 @@ A vector showing what hardware capabilities are supported by your processor. This is equivalent to, though more cryptic, the flags -field in /proc/cpuinfo on a normal Linux machine. +field in /proc/cpuinfo on a normal Linux machine: they both derive from +the feature bits returned by the cpuid command on x86 platforms. =item B<free_memory> @@ -568,6 +625,9 @@ =item B<pci-list-assignable-devices> List all the assignable PCI devices. +These are devices in the system which are configured to be +available for passthrough and are bound to a suitable PCI +backend driver in domain 0 rather than a real driver. =back @@ -635,10 +695,6 @@ Use the given configuration file. -=item B<-n>, B<--dryrun> - -Dry run - prints the resulting configuration. - =back =item B<cpupool-list> [I<-c|--cpus>] [I<cpu-pool>] @@ -676,8 +732,8 @@ =head1 VIRTUAL DEVICE COMMANDS Most virtual devices can be added and removed while guests are -running. The effect to the guest OS is much the same as any hotplug -event. +running, assuming that the necessary support exists in the guest. The +effect to the guest OS is much the same as any hotplug event. =head2 BLOCK DEVICES @@ -699,7 +755,8 @@ =item I<disc-spec-component> A disc specification in the same format used for the B<disk> variable in -the domain config file. See F<xl-disk-configuration>. +the domain config file. See +L<http://xenbits.xen.org/docs/unstable/misc/xl-disk-configuration.txt>. =back @@ -754,8 +811,9 @@ Creates a new network device in the domain specified by I<domain-id>. I<network-device> describes the device to attach, using the same format as the -B<vif> string in the domain config file. See L<xl.cfg(5)> for the -description. +B<vif> string in the domain config file. 
See L<xl.cfg> and +L<http://xenbits.xen.org/docs/unstable/misc/xl-network-configuration.html> +for more informations. =item B<network-detach> I<domain-id> I<devid|mac> @@ -793,17 +851,100 @@ =back +=head2 TMEM + +=over 4 + +=item B<tmem-list> I[<-l>] I<domain-id> + +List tmem pools. If I<-l> is specified, also list tmem stats. + +=item B<tmem-freeze> I<domain-id> + +Freeze tmem pools. + +=item B<tmem-destroy> I<domain-id> + +Destroy tmem pools. + +=item B<tmem-thaw> I<domain-id> + +Thaw tmem pools. + +=item B<tmem-set> I<domain-id> [I<OPTIONS>] + +Change tmem settings. + +B<OPTIONS> + +=over 4 + +=item B<-w> I<WEIGHT> + +Weight (int) + +=item B<-c> I<CAP> + +Cap (int) + +=item B<-p> I<COMPRESS> + +Compress (int) + +=back + +=item B<tmem-shared-auth> I<domain-id> [I<OPTIONS>] + +De/authenticate shared tmem pool. + +B<OPTIONS> + +=over 4 + +=item B<-u> I<UUID> + +Specify uuid (abcdef01-2345-6789-1234-567890abcdef) + +=item B<-a> I<AUTH> + +0=auth,1=deauth + +=back + +=item B<tmem-freeable> + +Get information about how much freeable memory (MB) is in-use by tmem. + +=back + +=head1 TO BE DOCUMENTED + +We need better documentation for: + +=over 4 + +=item B<tmem> + +Trascendent Memory. + +=item B<Flask> + +Xen Flask security module. + +=back + =head1 SEE ALSO -L<xl.cfg(5)>, L<xlcpupool.cfg(5)>, B<xentop(1)> +The following man pages: -=head1 AUTHOR +L<xl.cfg>(5), L<xlcpupool.cfg>(5), B<xentop>(1) - Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx> - Vincent Hanquez <vincent.hanquez@xxxxxxxxxxxxx> - Ian Jackson <ian.jackson@xxxxxxxxxxxxx> - Ian Campbell <Ian.Campbell@xxxxxxxxxx> +And the following documents on the xen.org website: + +L<http://xenbits.xen.org/docs/unstable/misc/xl-network-configuration.html> +L<http://xenbits.xen.org/docs/unstable/misc/xl-disk-configuration.txt> =head1 BUGS -Send bugs to xen-devel@xxxxxxxxxxxxxxxxxxxx +Send bugs to xen-devel@xxxxxxxxxxxxxxxxxxx, see +http://wiki.xen.org/xenwiki/ReportingBugs on how to send bug reports. diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/Makefile --- a/tools/libxc/Makefile Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/Makefile Thu Dec 01 17:24:12 2011 +0000 @@ -42,7 +42,7 @@ GUEST_SRCS-y := GUEST_SRCS-y += xg_private.c xc_suspend.c GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c -GUEST_SRCS-$(CONFIG_MIGRATE) += xc_offline_page.c +GUEST_SRCS-$(CONFIG_MIGRATE) += xc_offline_page.c xc_compression.c GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c vpath %.c ../../xen/common/libelf diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_compression.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/libxc/xc_compression.c Thu Dec 01 17:24:12 2011 +0000 @@ -0,0 +1,552 @@ +/****************************************************************************** + * xc_compression.c + * + * Checkpoint Compression using Page Delta Algorithm. + * - A LRU cache of recently dirtied guest pages is maintained. + * - For each dirty guest page in the checkpoint, if a previous version of the + * page exists in the cache, XOR both pages and send the non-zero sections + * to the receiver. The cache is then updated with the newer copy of guest page. + * - The receiver will XOR the non-zero sections against its copy of the guest + * page, thereby bringing the guest page up-to-date with the sender side. + * + * Copyright (c) 2011 Shriram Rajagopalan (rshriram@xxxxxxxxx). 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; + * version 2.1 of the License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/types.h> +#include <inttypes.h> +#include <errno.h> +#include "xc_private.h" +#include "xenctrl.h" +#include "xg_save_restore.h" +#include "xg_private.h" +#include "xc_dom.h" + +/* Page Cache for Delta Compression*/ +#define DELTA_CACHE_SIZE (XC_PAGE_SIZE * 8192) + +/* Internal page buffer to hold dirty pages of a checkpoint, + * to be compressed after the domain is resumed for execution. + */ +#define PAGE_BUFFER_SIZE (XC_PAGE_SIZE * 8192) + +struct cache_page +{ + char *page; + xen_pfn_t pfn; + struct cache_page *next; + struct cache_page *prev; +}; + +struct compression_ctx +{ + /* compression buffer - holds compressed data */ + char *compbuf; + unsigned long compbuf_size; + unsigned long compbuf_pos; + + /* Page buffer to hold pages to be compressed */ + char *inputbuf; + /* pfns of pages to be compressed */ + xen_pfn_t *sendbuf_pfns; + unsigned int pfns_len; + unsigned int pfns_index; + + /* Compression Cache (LRU) */ + char *cache_base; + struct cache_page **pfn2cache; + struct cache_page *cache; + struct cache_page *page_list_head; + struct cache_page *page_list_tail; + unsigned long dom_pfnlist_size; +}; + +#define RUNFLAG 0 +#define SKIPFLAG ((char)128) +#define FLAGMASK SKIPFLAG +#define LENMASK ((char)127) + +/* + * see xg_save_restore.h for details on the compressed stream format. + * delta size = 4 bytes. + * run header = 1 byte (1 bit for runtype, 7bits for run length). + * i.e maximum size of a run = 127 * 4 = 508 bytes. + * Worst case compression: Entire page has changed. + * In the worst case, the size of the compressed page is + * 8 runs of 508 bytes + 1 run of 32 bytes + 9 run headers + * = 4105 bytes. + * We could detect this worst case and send the entire page with a + * FULL_PAGE marker, reducing the total size to 4097 bytes. The cost + * of this size reduction is an additional memcpy, on top of two previous + * memcpy (to the compressed stream and the cache page in the for loop). + * + * We might as well sacrifice an extra 8 bytes instead of a memcpy. + */ +#define WORST_COMP_PAGE_SIZE (XC_PAGE_SIZE + 9) + +/* + * A zero length skip indicates full page. + */ +#define EMPTY_PAGE 0 +#define FULL_PAGE SKIPFLAG +#define FULL_PAGE_SIZE (XC_PAGE_SIZE + 1) +#define MAX_DELTAS (XC_PAGE_SIZE/sizeof(uint32_t)) + +/* + * Add a pagetable page or a new page (uncached) + * if srcpage is a pagetable page, cache_page is null. + * if srcpage is a page that was not previously in the cache, + * cache_page points to a free page slot in the cache where + * this new page can be copied to. 
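+ * In either case the page travels uncompressed: add_full_page() emits a
+ * single FULL_PAGE marker byte followed by the raw XC_PAGE_SIZE bytes of
+ * the page, consuming FULL_PAGE_SIZE bytes of compbuf per page.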
+ */ +static int add_full_page(comp_ctx *ctx, char *srcpage, char *cache_page) +{ + char *dest = (ctx->compbuf + ctx->compbuf_pos); + + if ( (ctx->compbuf_pos + FULL_PAGE_SIZE) > ctx->compbuf_size) + return -1; + + if (cache_page) + memcpy(cache_page, srcpage, XC_PAGE_SIZE); + dest[0] = FULL_PAGE; + memcpy(&dest[1], srcpage, XC_PAGE_SIZE); + ctx->compbuf_pos += FULL_PAGE_SIZE; + + return FULL_PAGE_SIZE; +} + +static int compress_page(comp_ctx *ctx, char *srcpage, char *cache_page) +{ + char *dest = (ctx->compbuf + ctx->compbuf_pos); + uint32_t *new, *old; + + int off, runptr = 0; + int wascopying = 0, copying = 0, bytes_skipped = 0; + int complen = 0, pageoff = 0, runbytes = 0; + + char runlen = 0; + + if ( (ctx->compbuf_pos + WORST_COMP_PAGE_SIZE) > ctx->compbuf_size) + return -1; + + /* + * There are no alignment issues here since srcpage is + * domU's page passed from xc_domain_save and cache_page is + * a ptr to cache page (cache is page aligned). + */ + new = (uint32_t*)srcpage; + old = (uint32_t*)cache_page; + + for (off = 0; off <= MAX_DELTAS; off++) + { + /* + * At (off == MAX_DELTAS), we are processing the last run + * in the page. Since there is no XORing, make wascopying != copying + * to satisfy the if-block below. + */ + copying = ((off < MAX_DELTAS) ? (old[off] != new[off]) : !wascopying); + + if (runlen) + { + /* switching between run types or current run is full */ + if ( (wascopying != copying) || (runlen == LENMASK) ) + { + runbytes = runlen * sizeof(uint32_t); + runlen |= (wascopying ? RUNFLAG : SKIPFLAG); + dest[complen++] = runlen; + + if (wascopying) /* RUNFLAG */ + { + pageoff = runptr * sizeof(uint32_t); + memcpy(dest + complen, srcpage + pageoff, runbytes); + memcpy(cache_page + pageoff, srcpage + pageoff, runbytes); + complen += runbytes; + } + else /* SKIPFLAG */ + { + bytes_skipped += runbytes; + } + + runlen = 0; + runptr = off; + } + } + runlen++; + wascopying = copying; + } + + /* + * Check for empty page. + */ + if (bytes_skipped == XC_PAGE_SIZE) + { + complen = 1; + dest[0] = EMPTY_PAGE; + } + ctx->compbuf_pos += complen; + + return complen; +} + +static +char *get_cache_page(comp_ctx *ctx, xen_pfn_t pfn, + int *israw) +{ + struct cache_page *item = NULL; + + item = ctx->pfn2cache[pfn]; + + if (!item) + { + *israw = 1; + + /* If the list is full, evict a page from the tail end. */ + item = ctx->page_list_tail; + if (item->pfn != INVALID_P2M_ENTRY) + ctx->pfn2cache[item->pfn] = NULL; + + item->pfn = pfn; + ctx->pfn2cache[pfn] = item; + } + + /* if requested item is in cache move to head of list */ + if (item != ctx->page_list_head) + { + if (item == ctx->page_list_tail) + { + /* item at tail of list. 
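+ * Detach it here; the code below re-links it at the head of the
+ * list, completing the LRU move-to-front.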
*/ + ctx->page_list_tail = item->prev; + (ctx->page_list_tail)->next = NULL; + } + else + { + /* item in middle of list */ + item->prev->next = item->next; + item->next->prev = item->prev; + } + + item->prev = NULL; + item->next = ctx->page_list_head; + (ctx->page_list_head)->prev = item; + ctx->page_list_head = item; + } + + return (ctx->page_list_head)->page; +} + +/* Remove pagetable pages from cache and move to tail, as free pages */ +static +void invalidate_cache_page(comp_ctx *ctx, xen_pfn_t pfn) +{ + struct cache_page *item = NULL; + + item = ctx->pfn2cache[pfn]; + if (item) + { + if (item != ctx->page_list_tail) + { + /* item at head of list */ + if (item == ctx->page_list_head) + { + ctx->page_list_head = (ctx->page_list_head)->next; + (ctx->page_list_head)->prev = NULL; + } + else /* item in middle of list */ + { + item->prev->next = item->next; + item->next->prev = item->prev; + } + + item->next = NULL; + item->prev = ctx->page_list_tail; + (ctx->page_list_tail)->next = item; + ctx->page_list_tail = item; + } + ctx->pfn2cache[pfn] = NULL; + (ctx->page_list_tail)->pfn = INVALID_P2M_ENTRY; + } +} + +int xc_compression_add_page(xc_interface *xch, comp_ctx *ctx, + char *page, xen_pfn_t pfn, int israw) +{ + if (pfn > ctx->dom_pfnlist_size) + { + ERROR("Invalid pfn passed into " + "xc_compression_add_page %" PRIpfn "\n", pfn); + return -2; + } + + /* pagetable page */ + if (israw) + invalidate_cache_page(ctx, pfn); + ctx->sendbuf_pfns[ctx->pfns_len] = israw ? INVALID_P2M_ENTRY : pfn; + memcpy(ctx->inputbuf + ctx->pfns_len * XC_PAGE_SIZE, page, XC_PAGE_SIZE); + ctx->pfns_len++; + + /* check if we have run out of space. If so, + * we need to synchronously compress the pages and flush them out + */ + if (ctx->pfns_len == NRPAGES(PAGE_BUFFER_SIZE)) + return -1; + return 0; +} + +int xc_compression_compress_pages(xc_interface *xch, comp_ctx *ctx, + char *compbuf, unsigned long compbuf_size, + unsigned long *compbuf_len) +{ + char *cache_copy = NULL, *current_page = NULL; + int israw, rc = 1; + + if (!ctx->pfns_len || (ctx->pfns_index == ctx->pfns_len)) { + ctx->pfns_len = ctx->pfns_index = 0; + return 0; + } + + ctx->compbuf_pos = 0; + ctx->compbuf = compbuf; + ctx->compbuf_size = compbuf_size; + + for (; ctx->pfns_index < ctx->pfns_len; ctx->pfns_index++) + { + israw = 0; + cache_copy = NULL; + current_page = ctx->inputbuf + ctx->pfns_index * XC_PAGE_SIZE; + + if (ctx->sendbuf_pfns[ctx->pfns_index] == INVALID_P2M_ENTRY) + israw = 1; + else + cache_copy = get_cache_page(ctx, + ctx->sendbuf_pfns[ctx->pfns_index], + &israw); + + if (israw) + rc = (add_full_page(ctx, current_page, cache_copy) >= 0); + else + rc = (compress_page(ctx, current_page, cache_copy) >= 0); + + if ( !rc ) + { + /* Out of space in outbuf! 
flush and come back */ + rc = -1; + break; + } + } + if (compbuf_len) + *compbuf_len = ctx->compbuf_pos; + + return rc; +} + +inline +void xc_compression_reset_pagebuf(xc_interface *xch, comp_ctx *ctx) +{ + ctx->pfns_index = ctx->pfns_len = 0; +} + +int xc_compression_uncompress_page(xc_interface *xch, char *compbuf, + unsigned long compbuf_size, + unsigned long *compbuf_pos, char *destpage) +{ + unsigned long pos; + unsigned int len = 0, pagepos = 0; + char flag; + + pos = *compbuf_pos; + if (pos >= compbuf_size) + { + ERROR("Out of bounds exception in compression buffer (a):" + "read ptr:%lu, bufsize = %lu\n", + *compbuf_pos, compbuf_size); + return -1; + } + + switch (compbuf[pos]) + { + case EMPTY_PAGE: + pos++; + break; + + case FULL_PAGE: + { + /* Check if the input buffer has 4KB of data */ + if ((pos + FULL_PAGE_SIZE) > compbuf_size) + { + ERROR("Out of bounds exception in compression buffer (b):" + "read ptr = %lu, bufsize = %lu\n", + *compbuf_pos, compbuf_size); + return -1; + } + memcpy(destpage, &compbuf[pos + 1], XC_PAGE_SIZE); + pos += FULL_PAGE_SIZE; + } + break; + + default: /* Normal page with one or more runs */ + { + do + { + flag = compbuf[pos] & FLAGMASK; + len = (compbuf[pos] & LENMASK) * sizeof(uint32_t); + /* Sanity Check: Zero-length runs are allowed only for + * FULL_PAGE and EMPTY_PAGE. + */ + if (!len) + { + ERROR("Zero length run encountered for normal page: " + "buffer (d):read ptr = %lu, flag = %u, " + "bufsize = %lu, pagepos = %u\n", + pos, (unsigned int)flag, compbuf_size, pagepos); + return -1; + } + + pos++; + if (flag == RUNFLAG) + { + /* Check if the input buffer has len bytes of data + * and whether it would fit in the destination page. + */ + if (((pos + len) > compbuf_size) + || ((pagepos + len) > XC_PAGE_SIZE)) + { + ERROR("Out of bounds exception in compression " + "buffer (c):read ptr = %lu, runlen = %u, " + "bufsize = %lu, pagepos = %u\n", + pos, len, compbuf_size, pagepos); + return -1; + } + memcpy(&destpage[pagepos], &compbuf[pos], len); + pos += len; + } + pagepos += len; + } while ((pagepos < XC_PAGE_SIZE) && (pos < compbuf_size)); + + /* Make sure we have copied/skipped 4KB worth of data */ + if (pagepos != XC_PAGE_SIZE) + { + ERROR("Invalid data in compression buffer:" + "read ptr = %lu, bufsize = %lu, pagepos = %u\n", + pos, compbuf_size, pagepos); + return -1; + } + } + } + *compbuf_pos = pos; + return 0; +} + +void xc_compression_free_context(xc_interface *xch, comp_ctx *ctx) +{ + if (!ctx) return; + + if (ctx->inputbuf) + free(ctx->inputbuf); + if (ctx->sendbuf_pfns) + free(ctx->sendbuf_pfns); + if (ctx->cache_base) + free(ctx->cache_base); + if (ctx->pfn2cache) + free(ctx->pfn2cache); + if (ctx->cache) + free(ctx->cache); + free(ctx); +} + +comp_ctx *xc_compression_create_context(xc_interface *xch, + unsigned long p2m_size) +{ + unsigned long i; + comp_ctx *ctx = NULL; + unsigned long num_cache_pages = DELTA_CACHE_SIZE/XC_PAGE_SIZE; + + ctx = (comp_ctx *)malloc(sizeof(comp_ctx)); + if (!ctx) + { + ERROR("Failed to allocate compression_ctx\n"); + goto error; + } + memset(ctx, 0, sizeof(comp_ctx)); + + ctx->inputbuf = xc_memalign(xch, XC_PAGE_SIZE, PAGE_BUFFER_SIZE); + if (!ctx->inputbuf) + { + ERROR("Failed to allocate page buffer\n"); + goto error; + } + + ctx->cache_base = xc_memalign(xch, XC_PAGE_SIZE, DELTA_CACHE_SIZE); + if (!ctx->cache_base) + { + ERROR("Failed to allocate delta cache\n"); + goto error; + } + + ctx->sendbuf_pfns = malloc(NRPAGES(PAGE_BUFFER_SIZE) * + sizeof(xen_pfn_t)); + if (!ctx->sendbuf_pfns) + { + 
ERROR("Could not alloc sendbuf_pfns\n"); + goto error; + } + memset(ctx->sendbuf_pfns, -1, + NRPAGES(PAGE_BUFFER_SIZE) * sizeof(xen_pfn_t)); + + ctx->pfn2cache = calloc(p2m_size, sizeof(struct cache_page *)); + if (!ctx->pfn2cache) + { + ERROR("Could not alloc pfn2cache map\n"); + goto error; + } + + ctx->cache = malloc(num_cache_pages * sizeof(struct cache_page)); + if (!ctx->cache) + { + ERROR("Could not alloc compression cache\n"); + goto error; + } + + for (i = 0; i < num_cache_pages; i++) + { + ctx->cache[i].pfn = INVALID_P2M_ENTRY; + ctx->cache[i].page = ctx->cache_base + i * XC_PAGE_SIZE; + ctx->cache[i].prev = (i == 0) ? NULL : &(ctx->cache[i - 1]); + ctx->cache[i].next = ((i+1) == num_cache_pages)? NULL : + &(ctx->cache[i + 1]); + } + ctx->page_list_head = &(ctx->cache[0]); + ctx->page_list_tail = &(ctx->cache[num_cache_pages -1]); + ctx->dom_pfnlist_size = p2m_size; + + return ctx; +error: + xc_compression_free_context(xch, ctx); + return NULL; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_domain_restore.c --- a/tools/libxc/xc_domain_restore.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xc_domain_restore.c Thu Dec 01 17:24:12 2011 +0000 @@ -43,6 +43,7 @@ xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */ int completed; /* Set when a consistent image is available */ int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. */ + int compressing; /* Set when sender signals that pages would be sent compressed (for Remus) */ struct domain_info_context dinfo; }; @@ -663,6 +664,10 @@ /* pages is of length nr_physpages, pfn_types is of length nr_pages */ unsigned int nr_physpages, nr_pages; + /* checkpoint compression state */ + int compressing; + unsigned long compbuf_pos, compbuf_size; + /* Types of the pfns in the current region */ unsigned long* pfn_types; @@ -701,6 +706,7 @@ { int count, countpages, oldcount, i; void* ptmp; + unsigned long compbuf_size; if ( RDEXACT(fd, &count, sizeof(count)) ) { @@ -820,6 +826,40 @@ } return pagebuf_get_one(xch, ctx, buf, fd, dom); + case XC_SAVE_ID_ENABLE_COMPRESSION: + /* We cannot set compression flag directly in pagebuf structure, + * since this pagebuf still has uncompressed pages that are yet to + * be applied. We enable the compression field in pagebuf structure + * after receiving the first tailbuf. + */ + ctx->compressing = 1; + // DPRINTF("compression flag received"); + return pagebuf_get_one(xch, ctx, buf, fd, dom); + + case XC_SAVE_ID_COMPRESSED_DATA: + + /* read the length of compressed chunk coming in */ + if ( RDEXACT(fd, &compbuf_size, sizeof(unsigned long)) ) + { + PERROR("Error when reading compbuf_size"); + return -1; + } + if (!compbuf_size) return 1; + + buf->compbuf_size += compbuf_size; + if (!(ptmp = realloc(buf->pages, buf->compbuf_size))) { + ERROR("Could not (re)allocate compression buffer"); + return -1; + } + buf->pages = ptmp; + + if ( RDEXACT(fd, buf->pages + (buf->compbuf_size - compbuf_size), + compbuf_size) ) { + PERROR("Error when reading compression buffer"); + return -1; + } + return compbuf_size; + default: if ( (count > MAX_BATCH_SIZE) || (count < 0) ) { ERROR("Max batch size exceeded (%d). Giving up.", count); @@ -857,6 +897,13 @@ if (!countpages) return count; + /* If Remus Checkpoint Compression is turned on, we will only be + * receiving the pfn lists now. 
The compressed pages will come in later, + * following a <XC_SAVE_ID_COMPRESSED_DATA, compressedChunkSize> tuple. + */ + if (buf->compressing) + return pagebuf_get_one(xch, ctx, buf, fd, dom); + oldcount = buf->nr_physpages; buf->nr_physpages += countpages; if (!buf->pages) { @@ -885,6 +932,7 @@ int rc; buf->nr_physpages = buf->nr_pages = 0; + buf->compbuf_pos = buf->compbuf_size = 0; do { rc = pagebuf_get_one(xch, ctx, buf, fd, dom); @@ -1102,7 +1150,21 @@ /* In verify mode, we use a copy; otherwise we work in place */ page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE); - memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, PAGE_SIZE); + /* Remus - page decompression */ + if (pagebuf->compressing) + { + if (xc_compression_uncompress_page(xch, pagebuf->pages, + pagebuf->compbuf_size, + &pagebuf->compbuf_pos, + (char *)page)) + { + ERROR("Failed to uncompress page (pfn=%lx)\n", pfn); + goto err_mapped; + } + } + else + memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, + PAGE_SIZE); pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; @@ -1364,6 +1426,7 @@ if ( !ctx->completed ) { pagebuf.nr_physpages = pagebuf.nr_pages = 0; + pagebuf.compbuf_pos = pagebuf.compbuf_size = 0; if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) { PERROR("Error when reading batch"); goto out; @@ -1406,6 +1469,7 @@ } pagebuf.nr_physpages = pagebuf.nr_pages = 0; + pagebuf.compbuf_pos = pagebuf.compbuf_size = 0; n += j; /* crude stats */ @@ -1449,6 +1513,13 @@ */ if ( !ctx->last_checkpoint ) fcntl(io_fd, F_SETFL, orig_io_fd_flags | O_NONBLOCK); + + /* + * If sender had sent enable compression flag, switch to compressed + * checkpoints mode once the first checkpoint is received. + */ + if (ctx->compressing) + pagebuf.compressing = 1; } if (pagebuf.viridian != 0) diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_domain_save.c --- a/tools/libxc/xc_domain_save.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xc_domain_save.c Thu Dec 01 17:24:12 2011 +0000 @@ -218,6 +218,56 @@ return noncached_write(xch, ob, fd, buf, len); } +static int write_compressed(xc_interface *xch, comp_ctx *compress_ctx, + int dobuf, struct outbuf* ob, int fd) +{ + int rc = 0; + int header = sizeof(int) + sizeof(unsigned long); + int marker = XC_SAVE_ID_COMPRESSED_DATA; + unsigned long compbuf_len = 0; + + do + { + /* check for available space (atleast 8k) */ + if ((ob->pos + header + XC_PAGE_SIZE * 2) > ob->size) + { + if (outbuf_flush(xch, ob, fd) < 0) + { + ERROR("Error when flushing outbuf intermediate"); + return -1; + } + } + + rc = xc_compression_compress_pages(xch, compress_ctx, + ob->buf + ob->pos + header, + ob->size - ob->pos - header, + &compbuf_len); + if (!rc) + return 0; + + if (outbuf_hardwrite(xch, ob, fd, &marker, sizeof(marker)) < 0) + { + PERROR("Error when writing marker (errno %d)", errno); + return -1; + } + + if (outbuf_hardwrite(xch, ob, fd, &compbuf_len, sizeof(compbuf_len)) < 0) + { + PERROR("Error when writing compbuf_len (errno %d)", errno); + return -1; + } + + ob->pos += (size_t) compbuf_len; + if (!dobuf && outbuf_flush(xch, ob, fd) < 0) + { + ERROR("Error when writing compressed chunk"); + return -1; + } + } while (rc != 0); + + return 0; +} + struct time_stats { struct timeval wall; long long d0_cpu, d1_cpu; @@ -815,11 +865,35 @@ unsigned long mfn; - struct outbuf ob; + /* Without checkpoint compression, the dirty pages, pfn arrays + * and tailbuf (vcpu ctx, shared info page, etc.) are written + * directly to outbuf. 
All of this is done while the domain is + * suspended. + * + * When checkpoint compression is enabled, the dirty pages are + * buffered, compressed "after" the domain is resumed and then + * written to outbuf. Since tailbuf data are collected while a + * domain is suspended, they cannot be directly written to the + * outbuf as there is no dirty page data preceeding tailbuf. + * + * So,two output buffers are maintained. Tailbuf data goes into + * ob_tailbuf. The dirty pages are compressed after resuming the + * domain and written to ob_pagebuf. ob_tailbuf is then appended + * to ob_pagebuf and finally flushed out. + */ + struct outbuf ob_pagebuf, ob_tailbuf, *ob = NULL; struct save_ctx _ctx; struct save_ctx *ctx = &_ctx; struct domain_info_context *dinfo = &ctx->dinfo; + /* Compression context */ + comp_ctx *compress_ctx= NULL; + /* Even if XCFLAGS_CHECKPOINT_COMPRESS is set, we enable compression only + * after sending XC_SAVE_ID_ENABLE_COMPRESSION and the tailbuf for + * first time. + */ + int compressing = 0; + int completed = 0; if ( hvm && !callbacks->switch_qemu_logdirty ) @@ -829,7 +903,7 @@ return 1; } - outbuf_init(xch, &ob, OUTBUF_SIZE); + outbuf_init(xch, &ob_pagebuf, OUTBUF_SIZE); memset(ctx, 0, sizeof(*ctx)); @@ -917,6 +991,16 @@ } } + if ( flags & XCFLAGS_CHECKPOINT_COMPRESS ) + { + if (!(compress_ctx = xc_compression_create_context(xch, dinfo->p2m_size))) + { + ERROR("Failed to create compression context"); + goto out; + } + outbuf_init(xch, &ob_tailbuf, OUTBUF_SIZE/4); + } + last_iter = !live; /* pretend we sent all the pages last iteration */ @@ -1025,9 +1109,11 @@ } copypages: -#define wrexact(fd, buf, len) write_buffer(xch, last_iter, &ob, (fd), (buf), (len)) -#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, &ob, (fd), (buf), (len)) +#define wrexact(fd, buf, len) write_buffer(xch, last_iter, ob, (fd), (buf), (len)) +#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, ob, (fd), (buf), (len)) +#define wrcompressed(fd) write_compressed(xch, compress_ctx, last_iter, ob, (fd)) + ob = &ob_pagebuf; /* Holds pfn_types, pages/compressed pages */ /* Now write out each data page, canonicalising page tables as we go... */ for ( ; ; ) { @@ -1270,7 +1356,7 @@ { /* If the page is not a normal data page, write out any run of pages we may have previously acumulated */ - if ( run ) + if ( !compressing && run ) { if ( wruncached(io_fd, live, (char*)region_base+(PAGE_SIZE*(j-run)), @@ -1305,7 +1391,41 @@ goto out; } - if ( wruncached(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE ) + if (compressing) + { + int c_err; + /* Mark pagetable page to be sent uncompressed */ + c_err = xc_compression_add_page(xch, compress_ctx, page, + pfn, 1 /* raw page */); + if (c_err == -2) /* OOB PFN */ + { + ERROR("Could not add pagetable page " + "(pfn:%" PRIpfn "to page buffer\n", pfn); + goto out; + } + + if (c_err == -1) + { + /* + * We are out of buffer space to hold dirty + * pages. Compress and flush the current buffer + * to make space. This is a corner case, that + * slows down checkpointing as the compression + * happens while domain is suspended. Happens + * seldom and if you find this occuring + * frequently, increase the PAGE_BUFFER_SIZE + * in xc_compression.c. 
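+ * With the default PAGE_BUFFER_SIZE of (XC_PAGE_SIZE * 8192), this
+ * path triggers only when more than 8192 dirty pages accumulate
+ * between flushes.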
+ */ + if (wrcompressed(io_fd) < 0) + { + ERROR("Error when writing compressed" + " data (4b)\n"); + goto out; + } + } + } + else if ( wruncached(io_fd, live, page, + PAGE_SIZE) != PAGE_SIZE ) { PERROR("Error when writing to state file (4b)" " (errno %d)", errno); @@ -1315,7 +1435,34 @@ else { /* We have a normal page: accumulate it for writing. */ - run++; + if (compressing) + { + int c_err; + /* For checkpoint compression, accumulate the page in the + * page buffer, to be compressed later. + */ + c_err = xc_compression_add_page(xch, compress_ctx, spage, + pfn, 0 /* not raw page */); + + if (c_err == -2) /* OOB PFN */ + { + ERROR("Could not add page " + "(pfn:%" PRIpfn "to page buffer\n", pfn); + goto out; + } + + if (c_err == -1) + { + if (wrcompressed(io_fd) < 0) + { + ERROR("Error when writing compressed" + " data (4c)\n"); + goto out; + } + } + } + else + run++; } } /* end of the write out for this batch */ @@ -1423,6 +1570,15 @@ DPRINTF("All memory is saved\n"); + /* After last_iter, buffer the rest of pagebuf & tailbuf data into a + * separate output buffer and flush it after the compressed page chunks. + */ + if (compressing) + { + ob = &ob_tailbuf; + ob->pos = 0; + } + { struct { int id; @@ -1534,6 +1690,25 @@ } } + /* Enable compression logic on both sides by sending this + * one time marker. + * NOTE: We could have simplified this procedure by sending + * the enable/disable compression flag before the beginning of + * the main for loop. But this would break compatibility for + * live migration code, with older versions of xen. So we have + * to enable it after the last_iter, when the XC_SAVE_ID_* + * elements are sent. + */ + if (!compressing && (flags & XCFLAGS_CHECKPOINT_COMPRESS)) + { + i = XC_SAVE_ID_ENABLE_COMPRESSION; + if ( wrexact(io_fd, &i, sizeof(int)) ) + { + PERROR("Error when writing enable_compression marker"); + goto out; + } + } + /* Zero terminate */ i = 0; if ( wrexact(io_fd, &i, sizeof(int)) ) @@ -1778,14 +1953,38 @@ if ( !rc && callbacks->postcopy ) callbacks->postcopy(callbacks->data); + /* guest has been resumed. Now we can compress data + * at our own pace. + */ + if (!rc && compressing) + { + ob = &ob_pagebuf; + if (wrcompressed(io_fd) < 0) + { + ERROR("Error when writing compressed data, after postcopy\n"); + rc = 1; + goto out; + } + /* Append the tailbuf data to the main outbuf */ + if ( wrexact(io_fd, ob_tailbuf.buf, ob_tailbuf.pos) ) + { + rc = 1; + PERROR("Error when copying tailbuf into outbuf"); + goto out; + } + } + /* Flush last write and discard cache for file. 
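 * With compression enabled, ob points at ob_pagebuf again by this
 * point and the tailbuf has already been appended to it, so this
 * single flush writes out the complete checkpoint.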
*/ - if ( outbuf_flush(xch, &ob, io_fd) < 0 ) { + if ( outbuf_flush(xch, ob, io_fd) < 0 ) { PERROR("Error when flushing output buffer"); rc = 1; } discard_file_cache(xch, io_fd, 1 /* flush */); + /* Enable compression now, finally */ + compressing = (flags & XCFLAGS_CHECKPOINT_COMPRESS); + /* checkpoint_cb can spend arbitrarily long in between rounds */ if (!rc && callbacks->checkpoint && callbacks->checkpoint(callbacks->data) > 0) @@ -1827,6 +2026,9 @@ DPRINTF("Warning - couldn't disable qemu log-dirty mode"); } + if (compress_ctx) + xc_compression_free_context(xch, compress_ctx); + if ( live_shinfo ) munmap(live_shinfo, PAGE_SIZE); diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_linux.c --- a/tools/libxc/xc_linux.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xc_linux.c Thu Dec 01 17:24:12 2011 +0000 @@ -55,6 +55,18 @@ errno = saved_errno; } +void *xc_memalign(xc_interface *xch, size_t alignment, size_t size) +{ + int ret; + void *ptr; + + ret = posix_memalign(&ptr, alignment, size); + if (ret != 0 || !ptr) + return NULL; + + return ptr; +} + /* * Local variables: * mode: C diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_linux_osdep.c --- a/tools/libxc/xc_linux_osdep.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xc_linux_osdep.c Thu Dec 01 17:24:12 2011 +0000 @@ -91,10 +91,9 @@ { size_t size = npages * XC_PAGE_SIZE; void *p; - int ret; - ret = posix_memalign(&p, XC_PAGE_SIZE, size); - if (ret != 0 || !p) + p = xc_memalign(xch, XC_PAGE_SIZE, size); + if (!p) return NULL; if ( mlock(p, size) < 0 ) diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_minios.c --- a/tools/libxc/xc_minios.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xc_minios.c Thu Dec 01 17:24:12 2011 +0000 @@ -73,7 +73,7 @@ static void *minios_privcmd_alloc_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, int npages) { - return memalign(PAGE_SIZE, npages * PAGE_SIZE); + return xc_memalign(xch, PAGE_SIZE, npages * PAGE_SIZE); } static void minios_privcmd_free_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, void *ptr, int npages) @@ -437,6 +437,11 @@ fsync(fd); } +void *xc_memalign(xc_interface *xch, size_t alignment, size_t size) +{ + return memalign(alignment, size); +} + static xc_osdep_handle minios_gnttab_open(xc_gnttab *xcg) { int fd = alloc_fd(FTYPE_GNTMAP); diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_netbsd.c --- a/tools/libxc/xc_netbsd.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xc_netbsd.c Thu Dec 01 17:24:12 2011 +0000 @@ -71,8 +71,9 @@ static void *netbsd_privcmd_alloc_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, int npages) { size_t size = npages * XC_PAGE_SIZE; - void *p = valloc(size); + void *p; + p = xc_memalign(xch, XC_PAGE_SIZE, size); if (!p) return NULL; @@ -378,6 +379,11 @@ errno = saved_errno; } +void *xc_memalign(xc_interface *xch, size_t alignment, size_t size) +{ + return valloc(size); +} + static struct xc_osdep_ops *netbsd_osdep_init(xc_interface *xch, enum xc_osdep_type type) { switch ( type ) diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_solaris.c --- a/tools/libxc/xc_solaris.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xc_solaris.c Thu Dec 01 17:24:12 2011 +0000 @@ -70,7 +70,7 @@ static void *solaris_privcmd_alloc_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, int npages) { - return memalign(XC_PAGE_SIZE, npages * XC_PAGE_SIZE); + return xc_memalign(xch, XC_PAGE_SIZE, npages * XC_PAGE_SIZE); } static void solaris_privcmd_free_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, void *ptr, int npages) @@ -314,6 
+314,11 @@ // TODO: Implement for Solaris! } +void *xc_memalign(xc_interface *xch, size_t alignment, size_t size) +{ + return memalign(alignment, size); +} + static struct xc_osdep_ops *solaris_osdep_init(xc_interface *xch, enum xc_osdep_type type) { switch ( type ) diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xenctrl.h Thu Dec 01 17:24:12 2011 +0000 @@ -1156,6 +1156,8 @@ uint64_t *time, xc_hypercall_buffer_t *data); +void *xc_memalign(xc_interface *xch, size_t alignment, size_t size); + /** * Memory maps a range within one domain to a local address range. Mappings * should be unmapped with munmap and should follow the same rules as mmap @@ -1935,4 +1937,64 @@ int verbose); /* Useful for callers who also use libelf. */ +/** + * Checkpoint Compression + */ +typedef struct compression_ctx comp_ctx; +comp_ctx *xc_compression_create_context(xc_interface *xch, + unsigned long p2m_size); +void xc_compression_free_context(xc_interface *xch, comp_ctx *ctx); + +/** + * Add a page to compression page buffer, to be compressed later. + * + * returns 0 if the page was successfully added to the page buffer + * + * returns -1 if there is no space in buffer. In this case, the + * application should call xc_compression_compress_pages to compress + * the buffer (or atleast part of it), thereby freeing some space in + * the page buffer. + * + * returns -2 if the pfn is out of bounds, where the bound is p2m_size + * parameter passed during xc_compression_create_context. + */ +int xc_compression_add_page(xc_interface *xch, comp_ctx *ctx, char *page, + unsigned long pfn, int israw); + +/** + * Delta compress pages in the compression buffer and inserts the + * compressed data into the supplied compression buffer compbuf, whose + * size is compbuf_size. + * After compression, the pages are copied to the internal LRU cache. + * + * This function compresses as many pages as possible into the + * supplied compression buffer. It maintains an internal iterator to + * keep track of pages in the input buffer that are yet to be compressed. + * + * returns -1 if the compression buffer has run out of space. + * returns 1 on success. + * returns 0 if no more pages are left to be compressed. + * When the return value is non-zero, compbuf_len indicates the actual + * amount of data present in compbuf (<=compbuf_size). + */ +int xc_compression_compress_pages(xc_interface *xch, comp_ctx *ctx, + char *compbuf, unsigned long compbuf_size, + unsigned long *compbuf_len); + +/** + * Resets the internal page buffer that holds dirty pages before compression. + * Also resets the iterators. + */ +void xc_compression_reset_pagebuf(xc_interface *xch, comp_ctx *ctx); + +/** + * Caller must supply the compression buffer (compbuf), + * its size (compbuf_size) and a reference to index variable (compbuf_pos) + * that is used internally. Each call pulls out one page from the compressed + * chunk and copies it to dest. 
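+ *
+ * Returns 0 on success and -1 if the chunk is malformed or a read
+ * would run past compbuf_size.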
+ */ +int xc_compression_uncompress_page(xc_interface *xch, char *compbuf, + unsigned long compbuf_size, + unsigned long *compbuf_pos, char *dest); + #endif /* XENCTRL_H */ diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xenguest.h --- a/tools/libxc/xenguest.h Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xenguest.h Thu Dec 01 17:24:12 2011 +0000 @@ -27,6 +27,7 @@ #define XCFLAGS_DEBUG 2 #define XCFLAGS_HVM 4 #define XCFLAGS_STDVGA 8 +#define XCFLAGS_CHECKPOINT_COMPRESS 16 #define X86_64_B_SIZE 64 #define X86_32_B_SIZE 32 diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xg_save_restore.h --- a/tools/libxc/xg_save_restore.h Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxc/xg_save_restore.h Thu Dec 01 17:24:12 2011 +0000 @@ -67,7 +67,7 @@ * * consists of p2m_size bytes comprising an array of xen_pfn_t sized entries. * - * BODY PHASE + * BODY PHASE - Format A (for live migration or Remus without compression) * ---------- * * A series of chunks with a common header: @@ -87,6 +87,122 @@ * * If chunk type is 0 then body phase is complete. * + * + * BODY PHASE - Format B (for Remus with compression) + * ---------- + * + * A series of chunks with a common header: + * int : chunk type + * + * If the chunk type is +ve then chunk contains array of PFNs corresponding + * to guest memory and type contains the number of PFNs in the batch: + * + * unsigned long[] : PFN array, length == number of pages in batch + * Each entry consists of XEN_DOMCTL_PFINFO_* + * in bits 31-28 and the PFN number in bits 27-0. + * + * If the chunk type is -ve then chunk consists of one of a number of + * metadata types. See definitions of XC_SAVE_ID_* below. + * + * If the chunk type is -ve and equals XC_SAVE_ID_COMPRESSED_DATA, then the + * chunk consists of compressed page data, in the following format: + * + * unsigned long : Size of the compressed chunk to follow + * compressed data : variable length data of size indicated above. + * This chunk consists of compressed page data. + * The number of pages in one chunk depends on + * the amount of space available in the sender's + * output buffer. + * + * Format of compressed data: + * compressed_data = <deltas>* + * delta = <marker, run*> + * marker = (RUNFLAG|SKIPFLAG) bitwise-or RUNLEN [1 byte marker] + * RUNFLAG = 0 + * SKIPFLAG = 1 << 7 + * RUNLEN = 7-bit unsigned value indicating number of WORDS in the run + * run = string of bytes of length sizeof(WORD) * RUNLEN + * + * If marker contains RUNFLAG, then RUNLEN * sizeof(WORD) bytes of data following + * the marker is copied into the target page at the appropriate offset indicated by + * the offset_ptr + * If marker contains SKIPFLAG, then the offset_ptr is advanced + * by RUNLEN * sizeof(WORD). + * + * If chunk type is 0 then body phase is complete. + * + * There can be one or more chunks with type XC_SAVE_ID_COMPRESSED_DATA, + * containing compressed pages. The compressed chunks are collated to form + * one single compressed chunk for the entire iteration. The number of pages + * present in this final compressed chunk will be equal to the total number + * of valid PFNs specified by the +ve chunks. + * + * At the sender side, compressed pages are inserted into the output stream + * in the same order as they would have been if compression logic was absent. + * + * Until last iteration, the BODY is sent in Format A, to maintain live + * migration compatibility with receivers of older Xen versions. 
+ * At the last iteration, if Remus compression was enabled, the sender sends + * a trigger, XC_SAVE_ID_ENABLE_COMPRESSION to tell the receiver to parse the + * BODY in Format B from the next iteration onwards. + * + * An example sequence of chunks received in Format B: + * +16 +ve chunk + * unsigned long[16] PFN array + * +100 +ve chunk + * unsigned long[100] PFN array + * +50 +ve chunk + * unsigned long[50] PFN array + * + * XC_SAVE_ID_COMPRESSED_DATA TAG + * N Length of compressed data + * N bytes of DATA Decompresses to 166 pages + * + * XC_SAVE_ID_* other xc save chunks + * 0 END BODY TAG + * + * Corner case with checkpoint compression: + * At sender side, after pausing the domain, dirty pages are usually + * copied out to a temporary buffer. After the domain is resumed, + * compression is done and the compressed chunk(s) are sent, followed by + * other XC_SAVE_ID_* chunks. + * If the temporary buffer gets full while scanning for dirty pages, + * the sender stops buffering of dirty pages, compresses the temporary + * buffer and sends the compressed data with XC_SAVE_ID_COMPRESSED_DATA. + * The sender then resumes the buffering of dirty pages and continues + * scanning for the dirty pages. + * For e.g., assume that the temporary buffer can hold 4096 pages and + * there are 5000 dirty pages. The following is the sequence of chunks + * that the receiver will see: + * + * +1024 +ve chunk + * unsigned long[1024] PFN array + * +1024 +ve chunk + * unsigned long[1024] PFN array + * +1024 +ve chunk + * unsigned long[1024] PFN array + * +1024 +ve chunk + * unsigned long[1024] PFN array + * + * XC_SAVE_ID_COMPRESSED_DATA TAG + * N Length of compressed data + * N bytes of DATA Decompresses to 4096 pages + * + * +4 +ve chunk + * unsigned long[4] PFN array + * + * XC_SAVE_ID_COMPRESSED_DATA TAG + * M Length of compressed data + * M bytes of DATA Decompresses to 4 pages + * + * XC_SAVE_ID_* other xc save chunks + * 0 END BODY TAG + * + * In other words, XC_SAVE_ID_COMPRESSED_DATA can be interleaved with + * +ve chunks arbitrarily. But at the receiver end, the following condition + * always holds true until the end of BODY PHASE: + * num(PFN entries +ve chunks) >= num(pages received in compressed form) + * * TAIL PHASE * ---------- * @@ -135,6 +251,8 @@ #define XC_SAVE_ID_LAST_CHECKPOINT -9 /* Commit to restoring after completion of current iteration. 
*/ #define XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION -10 #define XC_SAVE_ID_HVM_VIRIDIAN -11 +#define XC_SAVE_ID_COMPRESSED_DATA -12 /* Marker to indicate arrival of compressed data */ +#define XC_SAVE_ID_ENABLE_COMPRESSION -13 /* Marker to enable compression logic at receiver side */ /* ** We process save/restore/migrate in batches of pages; the below diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl.c --- a/tools/libxl/libxl.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxl/libxl.c Thu Dec 01 17:24:12 2011 +0000 @@ -3330,6 +3330,19 @@ return 0; } +int libxl_fd_set_cloexec(int fd) +{ + int flags = 0; + + if ((flags = fcntl(fd, F_GETFD)) == -1) { + flags = 0; + } + if ((flags & FD_CLOEXEC)) { + return 0; + } + return fcntl(fd, F_SETFD, flags | FD_CLOEXEC); +} + /* * Local variables: * mode: C diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl.h --- a/tools/libxl/libxl.h Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxl/libxl.h Thu Dec 01 17:24:12 2011 +0000 @@ -635,6 +635,9 @@ const char *libxl_run_dir_path(void); const char *libxl_xenpaging_dir_path(void); +/* misc */ +int libxl_fd_set_cloexec(int fd); + #endif /* LIBXL_H */ /* diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl_internal.c --- a/tools/libxl/libxl_internal.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxl/libxl_internal.c Thu Dec 01 17:24:12 2011 +0000 @@ -306,19 +306,6 @@ return 0; } -int libxl__fd_set_cloexec(int fd) -{ - int flags = 0; - - if ((flags = fcntl(fd, F_GETFD)) == -1) { - flags = 0; - } - if ((flags & FD_CLOEXEC)) { - return 0; - } - return fcntl(fd, F_SETFD, flags | FD_CLOEXEC); -} - libxl_device_model_version libxl__device_model_version_running(libxl__gc *gc, uint32_t domid) { diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl_internal.h --- a/tools/libxl/libxl_internal.h Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxl/libxl_internal.h Thu Dec 01 17:24:12 2011 +0000 @@ -503,7 +503,6 @@ _hidden int libxl__file_reference_map(libxl_file_reference *f); _hidden int libxl__file_reference_unmap(libxl_file_reference *f); -_hidden int libxl__fd_set_cloexec(int fd); _hidden int libxl__e820_alloc(libxl__gc *gc, uint32_t domid, libxl_domain_config *d_config); diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl_qmp.c --- a/tools/libxl/libxl_qmp.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxl/libxl_qmp.c Thu Dec 01 17:24:12 2011 +0000 @@ -324,7 +324,7 @@ if (fcntl(qmp->qmp_fd, F_SETFL, flags | O_NONBLOCK) == -1) { return -1; } - libxl__fd_set_cloexec(qmp->qmp_fd); + libxl_fd_set_cloexec(qmp->qmp_fd); memset(&qmp->addr, 0, sizeof (&qmp->addr)); qmp->addr.sun_family = AF_UNIX; diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/xl_cmdimpl.c --- a/tools/libxl/xl_cmdimpl.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/libxl/xl_cmdimpl.c Thu Dec 01 17:24:12 2011 +0000 @@ -1459,8 +1459,12 @@ union { uint32_t u32; char b[4]; } u32buf; uint32_t badflags; - restore_fd = migrate_fd >= 0 ? 
migrate_fd : - open(restore_file, O_RDONLY); + if (migrate_fd >= 0) { + restore_fd = migrate_fd; + } else { + restore_fd = open(restore_file, O_RDONLY); + libxl_fd_set_cloexec(restore_fd); + } CHK_ERRNO( libxl_read_exactly(ctx, restore_fd, &hdr, sizeof(hdr), restore_file, "header") ); diff -r f25a004a6de8 -r f30a33c5b5bd tools/python/xen/lowlevel/checkpoint/checkpoint.c --- a/tools/python/xen/lowlevel/checkpoint/checkpoint.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/python/xen/lowlevel/checkpoint/checkpoint.c Thu Dec 01 17:24:12 2011 +0000 @@ -104,13 +104,14 @@ PyObject* postcopy_cb = NULL; PyObject* checkpoint_cb = NULL; unsigned int interval = 0; + unsigned int flags = 0; int fd; struct save_callbacks callbacks; int rc; - if (!PyArg_ParseTuple(args, "O|OOOI", &iofile, &suspend_cb, &postcopy_cb, - &checkpoint_cb, &interval)) + if (!PyArg_ParseTuple(args, "O|OOOII", &iofile, &suspend_cb, &postcopy_cb, + &checkpoint_cb, &interval, &flags)) return NULL; self->interval = interval; @@ -160,7 +161,7 @@ callbacks.data = self; self->threadstate = PyEval_SaveThread(); - rc = checkpoint_start(&self->cps, fd, &callbacks); + rc = checkpoint_start(&self->cps, fd, &callbacks, flags); PyEval_RestoreThread(self->threadstate); if (rc < 0) { diff -r f25a004a6de8 -r f30a33c5b5bd tools/python/xen/lowlevel/checkpoint/checkpoint.h --- a/tools/python/xen/lowlevel/checkpoint/checkpoint.h Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/python/xen/lowlevel/checkpoint/checkpoint.h Thu Dec 01 17:24:12 2011 +0000 @@ -40,13 +40,15 @@ timer_t timer; } checkpoint_state; +#define CHECKPOINT_FLAGS_COMPRESSION 1 char* checkpoint_error(checkpoint_state* s); void checkpoint_init(checkpoint_state* s); int checkpoint_open(checkpoint_state* s, unsigned int domid); void checkpoint_close(checkpoint_state* s); int checkpoint_start(checkpoint_state* s, int fd, - struct save_callbacks* callbacks); + struct save_callbacks* callbacks, + unsigned int remus_flags); int checkpoint_suspend(checkpoint_state* s); int checkpoint_resume(checkpoint_state* s); int checkpoint_postflush(checkpoint_state* s); diff -r f25a004a6de8 -r f30a33c5b5bd tools/python/xen/lowlevel/checkpoint/libcheckpoint.c --- a/tools/python/xen/lowlevel/checkpoint/libcheckpoint.c Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/python/xen/lowlevel/checkpoint/libcheckpoint.c Thu Dec 01 17:24:12 2011 +0000 @@ -170,7 +170,8 @@ } int checkpoint_start(checkpoint_state* s, int fd, - struct save_callbacks* callbacks) + struct save_callbacks* callbacks, + unsigned int remus_flags) { int hvm, rc; int flags = XCFLAGS_LIVE; @@ -188,6 +189,8 @@ if (switch_qemu_logdirty(s, 1)) return -1; } + if (remus_flags & CHECKPOINT_FLAGS_COMPRESSION) + flags |= XCFLAGS_CHECKPOINT_COMPRESS; callbacks->switch_qemu_logdirty = noop_switch_logdirty; diff -r f25a004a6de8 -r f30a33c5b5bd tools/python/xen/remus/save.py --- a/tools/python/xen/remus/save.py Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/python/xen/remus/save.py Thu Dec 01 17:24:12 2011 +0000 @@ -133,7 +133,7 @@ class Saver(object): def __init__(self, domid, fd, suspendcb=None, resumecb=None, - checkpointcb=None, interval=0): + checkpointcb=None, interval=0, flags=0): """Create a Saver object for taking guest checkpoints. domid: name, number or UUID of a running domain fd: a stream to which checkpoint data will be written. @@ -141,12 +141,14 @@ resumecb: callback invoked before guest resumes checkpointcb: callback invoked when a checkpoint is complete. Return True to take another checkpoint, or False to stop. 
+ flags: Remus flags to be passed to xc_domain_save """ self.fd = fd self.suspendcb = suspendcb self.resumecb = resumecb self.checkpointcb = checkpointcb self.interval = interval + self.flags = flags self.vm = vm.VM(domid) @@ -164,7 +166,8 @@ try: self.checkpointer.open(self.vm.domid) self.checkpointer.start(self.fd, self.suspendcb, self.resumecb, - self.checkpointcb, self.interval) + self.checkpointcb, self.interval, + self.flags) except xen.lowlevel.checkpoint.error, e: raise CheckpointError(e) finally: diff -r f25a004a6de8 -r f30a33c5b5bd tools/remus/remus --- a/tools/remus/remus Thu Dec 01 17:21:24 2011 +0000 +++ b/tools/remus/remus Thu Dec 01 17:24:12 2011 +0000 @@ -16,6 +16,9 @@ class CfgException(Exception): pass class Cfg(object): + + REMUS_FLAGS_COMPRESSION = 1 + def __init__(self): # must be set self.domid = 0 @@ -25,6 +28,7 @@ self.port = XendOptions.instance().get_xend_relocation_port() self.interval = 200 self.netbuffer = True + self.flags = self.REMUS_FLAGS_COMPRESSION self.timer = False parser = optparse.OptionParser() @@ -38,6 +42,8 @@ help='replicate to /dev/null (no disk checkpoints, only memory & net buffering)') parser.add_option('', '--no-net', dest='nonet', action='store_true', help='run without net buffering (benchmark option)') + parser.add_option('', '--no-compression', dest='nocompress', action='store_true', + help='run without checkpoint compression') parser.add_option('', '--timer', dest='timer', action='store_true', help='force pause at checkpoint interval (experimental)') self.parser = parser @@ -56,6 +62,8 @@ self.nullremus = True if opts.nonet: self.netbuffer = False + if opts.nocompress: + self.flags &= ~self.REMUS_FLAGS_COMPRESSION if opts.timer: self.timer = True @@ -190,7 +198,7 @@ rc = 0 checkpointer = save.Saver(cfg.domid, fd, postsuspend, preresume, commit, - interval) + interval, cfg.flags) try: checkpointer.start() diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/Makefile --- a/xen/arch/x86/Makefile Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/arch/x86/Makefile Thu Dec 01 17:24:12 2011 +0000 @@ -30,9 +30,10 @@ obj-y += msi.o obj-y += ioport_emulate.o obj-y += irq.o -obj-y += microcode.o obj-y += microcode_amd.o obj-y += microcode_intel.o +# This must come after the vendor specific files. 
+obj-y += microcode.o obj-y += mm.o obj-y += mpparse.o obj-y += nmi.o diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/efi/boot.c --- a/xen/arch/x86/efi/boot.c Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/arch/x86/efi/boot.c Thu Dec 01 17:24:12 2011 +0000 @@ -49,6 +49,7 @@ static struct file __initdata cfg; static struct file __initdata kernel; static struct file __initdata ramdisk; +static struct file __initdata ucode; static struct file __initdata xsm; static multiboot_info_t __initdata mbi = { @@ -174,6 +175,8 @@ efi_bs->FreePages(kernel.addr, PFN_UP(kernel.size)); if ( ramdisk.addr ) efi_bs->FreePages(ramdisk.addr, PFN_UP(ramdisk.size)); + if ( ucode.addr ) + efi_bs->FreePages(ucode.addr, PFN_UP(ucode.size)); if ( xsm.addr ) efi_bs->FreePages(xsm.addr, PFN_UP(xsm.size)); @@ -806,6 +809,17 @@ efi_bs->FreePool(name.w); } + name.s = get_value(&cfg, section.s, "ucode"); + if ( !name.s ) + name.s = get_value(&cfg, "global", "ucode"); + if ( name.s ) + { + microcode_set_module(mbi.mods_count); + split_value(name.s); + read_file(dir_handle, s2w(&name), &ucode); + efi_bs->FreePool(name.w); + } + name.s = get_value(&cfg, section.s, "xsm"); if ( name.s ) { diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/microcode.c --- a/xen/arch/x86/microcode.c Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/arch/x86/microcode.c Thu Dec 01 17:24:12 2011 +0000 @@ -22,20 +22,56 @@ */ #include <xen/config.h> +#include <xen/cpu.h> #include <xen/lib.h> #include <xen/kernel.h> #include <xen/init.h> +#include <xen/notifier.h> #include <xen/sched.h> #include <xen/smp.h> +#include <xen/softirq.h> #include <xen/spinlock.h> +#include <xen/tasklet.h> #include <xen/guest_access.h> -#include <asm/current.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/processor.h> +#include <asm/setup.h> #include <asm/microcode.h> +static module_t __initdata ucode_mod; +static void *(*__initdata ucode_mod_map)(const module_t *); +static unsigned int __initdata ucode_mod_idx; +static bool_t __initdata ucode_mod_forced; +static cpumask_t __initdata init_mask; + +void __init microcode_set_module(unsigned int idx) +{ + ucode_mod_idx = idx; + ucode_mod_forced = 1; +} + +static void __init parse_ucode(char *s) +{ + if ( !ucode_mod_forced ) + ucode_mod_idx = simple_strtoul(s, NULL, 0); +} +custom_param("ucode", parse_ucode); + +void __init microcode_grab_module( + unsigned long *module_map, + const multiboot_info_t *mbi, + void *(*map)(const module_t *)) +{ + module_t *mod = (module_t *)__va(mbi->mods_addr); + + if ( !ucode_mod_idx || ucode_mod_idx >= mbi->mods_count || + !__test_and_clear_bit(ucode_mod_idx, module_map) ) + return; + ucode_mod = mod[ucode_mod_idx]; + ucode_mod_map = map; +} + const struct microcode_ops *microcode_ops; static DEFINE_SPINLOCK(microcode_mutex); @@ -69,30 +105,50 @@ int err; struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); struct cpu_signature nsig; + unsigned int cpu2; - if ( !uci->mc.mc_valid ) - return -EIO; + spin_lock(&microcode_mutex); - /* - * Let's verify that the 'cached' ucode does belong - * to this cpu (a bit of paranoia): - */ - err = microcode_ops->collect_cpu_info(cpu, &nsig); + err = microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig); if ( err ) { - microcode_fini_cpu(cpu); + __microcode_fini_cpu(cpu); + spin_unlock(&microcode_mutex); return err; } - if ( microcode_ops->microcode_resume_match(cpu, &nsig) ) + if ( uci->mc.mc_valid ) { - return microcode_ops->apply_microcode(cpu); + err = microcode_ops->microcode_resume_match(cpu, uci->mc.mc_valid); + if ( err >= 0 ) + { + if ( err ) + err =
microcode_ops->apply_microcode(cpu); + spin_unlock(&microcode_mutex); + return err; + } } - else + + nsig = uci->cpu_sig; + __microcode_fini_cpu(cpu); + uci->cpu_sig = nsig; + + err = -EIO; + for_each_online_cpu ( cpu2 ) { - microcode_fini_cpu(cpu); - return -EIO; + uci = &per_cpu(ucode_cpu_info, cpu2); + if ( uci->mc.mc_valid && + microcode_ops->microcode_resume_match(cpu, uci->mc.mc_valid) > 0 ) + { + err = microcode_ops->apply_microcode(cpu); + break; + } } + + __microcode_fini_cpu(cpu); + spin_unlock(&microcode_mutex); + + return err; } static int microcode_update_cpu(const void *buf, size_t size) @@ -162,3 +218,78 @@ return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info); } + +static void __init _do_microcode_update(unsigned long data) +{ + microcode_update_cpu((void *)data, ucode_mod.mod_end); + cpumask_set_cpu(smp_processor_id(), &init_mask); +} + +static int __init microcode_init(void) +{ + void *data; + static struct tasklet __initdata tasklet; + unsigned int cpu; + + if ( !microcode_ops || !ucode_mod.mod_end ) + return 0; + + data = ucode_mod_map(&ucode_mod); + if ( !data ) + return -ENOMEM; + + softirq_tasklet_init(&tasklet, _do_microcode_update, (unsigned long)data); + + for_each_online_cpu ( cpu ) + { + tasklet_schedule_on_cpu(&tasklet, cpu); + do { + process_pending_softirqs(); + } while ( !cpumask_test_cpu(cpu, &init_mask) ); + } + + ucode_mod_map(NULL); + + return 0; +} +__initcall(microcode_init); + +static int microcode_percpu_callback( + struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch ( action ) + { + case CPU_DEAD: + microcode_fini_cpu(cpu); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block microcode_percpu_nfb = { + .notifier_call = microcode_percpu_callback, +}; + +static int __init microcode_presmp_init(void) +{ + if ( microcode_ops ) + { + if ( ucode_mod.mod_end ) + { + void *data = ucode_mod_map(&ucode_mod); + + if ( data ) + microcode_update_cpu(data, ucode_mod.mod_end); + + ucode_mod_map(NULL); + } + + register_cpu_notifier(&microcode_percpu_nfb); + } + + return 0; +} +presmp_initcall(microcode_presmp_init); diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/microcode_amd.c --- a/xen/arch/x86/microcode_amd.c Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/arch/x86/microcode_amd.c Thu Dec 01 17:24:12 2011 +0000 @@ -23,27 +23,53 @@ #include <xen/spinlock.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/processor.h> #include <asm/microcode.h> #define pr_debug(x...)
((void)0) +struct equiv_cpu_entry { + uint32_t installed_cpu; + uint32_t fixed_errata_mask; + uint32_t fixed_errata_compare; + uint16_t equiv_cpu; + uint16_t reserved; +} __attribute__((packed)); + +struct microcode_header_amd { + uint32_t data_code; + uint32_t patch_id; + uint8_t mc_patch_data_id[2]; + uint8_t mc_patch_data_len; + uint8_t init_flag; + uint32_t mc_patch_data_checksum; + uint32_t nb_dev_id; + uint32_t sb_dev_id; + uint16_t processor_rev_id; + uint8_t nb_rev_id; + uint8_t sb_rev_id; + uint8_t bios_api_rev; + uint8_t reserved1[3]; + uint32_t match_reg[8]; +} __attribute__((packed)); + #define UCODE_MAGIC 0x00414d44 #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 #define UCODE_MAX_SIZE (2048) -#define DEFAULT_UCODE_DATASIZE (896) #define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) -#define DWSIZE (sizeof(uint32_t)) + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[(UCODE_MAX_SIZE - MC_HEADER_SIZE) / 4]; + unsigned int equiv_cpu_table_size; + struct equiv_cpu_entry equiv_cpu_table[]; +}; /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); -struct equiv_cpu_entry *equiv_cpu_table; - static int collect_cpu_info(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data[cpu]; @@ -65,10 +91,11 @@ return 0; } -static int microcode_fits(void *mc, int cpu) +static int microcode_fits(const struct microcode_amd *mc_amd, int cpu) { struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); - struct microcode_header_amd *mc_header = mc; + const struct microcode_header_amd *mc_header = &mc_amd->hdr; + const struct equiv_cpu_entry *equiv_cpu_table = mc_amd->equiv_cpu_table; unsigned int current_cpu_id; unsigned int equiv_cpu_id = 0x0; unsigned int i; @@ -99,7 +126,7 @@ } if ( mc_header->patch_id <= uci->cpu_sig.rev ) - return -EINVAL; + return 0; printk(KERN_DEBUG "microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", @@ -186,17 +213,15 @@ return 0; } -static int install_equiv_cpu_table(const void *buf, uint32_t size, - unsigned long *offset) +static int install_equiv_cpu_table( + struct microcode_amd *mc_amd, + const uint32_t *buf_pos, + unsigned long *offset) { - const uint32_t *buf_pos = buf; - unsigned long off; - - off = *offset; - *offset = 0; + uint32_t size = buf_pos[2]; /* No more data */ - if ( off >= size ) + if ( size + 12 >= *offset ) return -EINVAL; if ( buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE ) @@ -213,15 +238,8 @@ return -EINVAL; } - equiv_cpu_table = xmalloc_bytes(size); - if ( equiv_cpu_table == NULL ) - { - printk(KERN_ERR "microcode: error, can't allocate " - "memory for equiv CPU table\n"); - return -ENOMEM; - } - - memcpy(equiv_cpu_table, (const void *)&buf_pos[3], size); + memcpy(mc_amd->equiv_cpu_table, &buf_pos[3], size); + mc_amd->equiv_cpu_table_size = size; *offset = size + 12; /* add header length */ @@ -231,11 +249,11 @@ static int cpu_request_microcode(int cpu, const void *buf, size_t size) { const uint32_t *buf_pos; - unsigned long offset = 0; + struct microcode_amd *mc_amd, *mc_old; + unsigned long offset = size; int error = 0; int ret; struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); - void *mc; /* We should bind the task to the CPU */ BUG_ON(cpu != raw_smp_processor_id()); @@ -249,59 +267,85 @@ return -EINVAL; } - error = install_equiv_cpu_table(buf, (uint32_t)(buf_pos[2]), &offset); + mc_amd = 
xmalloc_bytes(sizeof(*mc_amd) + buf_pos[2]); + if ( !mc_amd ) + { + printk(KERN_ERR "microcode: error! " + "Can not allocate memory for microcode patch\n"); + return -ENOMEM; + } + + error = install_equiv_cpu_table(mc_amd, buf, &offset); if ( error ) { + xfree(mc_amd); printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); return -EINVAL; } - mc = xmalloc_bytes(UCODE_MAX_SIZE); - if ( mc == NULL ) - { - printk(KERN_ERR "microcode: error! " - "Can not allocate memory for microcode patch\n"); - error = -ENOMEM; - goto out; - } - + mc_old = uci->mc.mc_amd; /* implicitly validates uci->mc.mc_valid */ - uci->mc.mc_amd = mc; + uci->mc.mc_amd = mc_amd; /* * It's possible the data file has multiple matching ucode, * let's keep searching till the latest version */ - while ( (ret = get_next_ucode_from_buffer_amd(mc, buf, size, &offset)) == 0) + while ( (ret = get_next_ucode_from_buffer_amd(&mc_amd->hdr, buf, size, + &offset)) == 0 ) { - error = microcode_fits(mc, cpu); + error = microcode_fits(mc_amd, cpu); if (error <= 0) continue; error = apply_microcode(cpu); if (error == 0) + { + error = 1; break; + } } + if ( ret < 0 ) + error = ret; + /* On success keep the microcode patch for * re-apply on resume. */ - if (error) { - xfree(mc); - mc = NULL; + if (error == 1) + { + xfree(mc_old); + return 0; } - uci->mc.mc_amd = mc; - -out: - xfree(equiv_cpu_table); - equiv_cpu_table = NULL; + xfree(mc_amd); + uci->mc.mc_amd = mc_old; return error; } -static int microcode_resume_match(int cpu, struct cpu_signature *nsig) +static int microcode_resume_match(int cpu, const void *mc) { - return 0; + struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); + struct microcode_amd *mc_amd = uci->mc.mc_amd; + const struct microcode_amd *src = mc; + int res = microcode_fits(src, cpu); + + if ( res <= 0 ) + return res; + + if ( src != mc_amd ) + { + xfree(mc_amd); + mc_amd = xmalloc_bytes(sizeof(*src) + src->equiv_cpu_table_size); + uci->mc.mc_amd = mc_amd; + if ( !mc_amd ) + return -ENOMEM; + memcpy(mc_amd, src, UCODE_MAX_SIZE); + memcpy(mc_amd->equiv_cpu_table, src->equiv_cpu_table, + src->equiv_cpu_table_size); + } + + return 1; } static const struct microcode_ops microcode_amd_ops = { @@ -317,4 +361,4 @@ microcode_ops = &microcode_amd_ops; return 0; } -__initcall(microcode_init_amd); +presmp_initcall(microcode_init_amd); diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/microcode_intel.c --- a/xen/arch/x86/microcode_intel.c Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/arch/x86/microcode_intel.c Thu Dec 01 17:24:12 2011 +0000 @@ -30,12 +30,43 @@ #include <xen/spinlock.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/processor.h> #include <asm/microcode.h> #define pr_debug(x...)
((void)0) +struct microcode_header_intel { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode_intel { + struct microcode_header_intel hdr; + unsigned int bits[0]; +}; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; + #define DEFAULT_UCODE_DATASIZE (2000) #define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) @@ -98,7 +129,8 @@ } static inline int microcode_update_match( - int cpu_num, struct microcode_header_intel *mc_header, int sig, int pf) + int cpu_num, const struct microcode_header_intel *mc_header, + int sig, int pf) { struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu_num); @@ -200,11 +232,11 @@ * return 1 - found update * return < 0 - error */ -static int get_matching_microcode(void *mc, int cpu) +static int get_matching_microcode(const void *mc, int cpu) { struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header; + const struct microcode_header_intel *mc_header = mc; + const struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); int ext_sigcount, i; struct extended_signature *ext_sig; @@ -229,6 +261,8 @@ } return 0; find: + if ( uci->mc.mc_intel && uci->mc.mc_intel->hdr.rev >= mc_header->rev ) + return 0; pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", cpu, mc_header->rev, uci->cpu_sig.rev); @@ -239,10 +273,8 @@ return -ENOMEM; } - /* free previous update file */ + memcpy(new_mc, mc, total_size); xfree(uci->mc.mc_intel); - - memcpy(new_mc, mc, total_size); uci->mc.mc_intel = new_mc; return 1; } @@ -361,12 +393,9 @@ return error; } -static int microcode_resume_match(int cpu, struct cpu_signature *nsig) +static int microcode_resume_match(int cpu, const void *mc) { - struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu); - - return (sigmatch(nsig->sig, uci->cpu_sig.sig, nsig->pf, uci->cpu_sig.pf) && - (uci->cpu_sig.rev > nsig->rev)); + return get_matching_microcode(mc, cpu); } static const struct microcode_ops microcode_intel_ops = { @@ -382,4 +411,4 @@ microcode_ops = &microcode_intel_ops; return 0; } -__initcall(microcode_init_intel); +presmp_initcall(microcode_init_intel); diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/arch/x86/setup.c Thu Dec 01 17:24:12 2011 +0000 @@ -550,10 +550,10 @@ { char *memmap_type = NULL; char *cmdline, *kextra, *loader; - unsigned int initrdidx = 1; + unsigned int initrdidx; multiboot_info_t *mbi = __va(mbi_p); module_t *mod = (module_t *)__va(mbi->mods_addr); - unsigned long nr_pages, modules_headroom; + unsigned long nr_pages, modules_headroom, *module_map; int i, j, e820_warn = 0, bytes = 0; bool_t acpi_boot_table_init_done = 0; struct ns16550_defaults ns16550 = { @@ -1229,7 +1229,13 @@ init_IRQ(); - xsm_init(&initrdidx, mbi, bootstrap_map); + module_map = xmalloc_array(unsigned long, BITS_TO_LONGS(mbi->mods_count)); + bitmap_fill(module_map, mbi->mods_count); +
__clear_bit(0, module_map); /* Dom0 kernel is always first */ + + xsm_init(module_map, mbi, bootstrap_map); + + microcode_grab_module(module_map, mbi, bootstrap_map); timer_init(); @@ -1356,6 +1362,12 @@ if ( xen_cpuidle ) xen_processor_pmbits |= XEN_PROCESSOR_PM_CX; + initrdidx = find_first_bit(module_map, mbi->mods_count); + if ( bitmap_weight(module_map, mbi->mods_count) > 1 ) + printk(XENLOG_WARNING + "Multiple initrd candidates, picking module #%u\n", + initrdidx); + /* * We're going to setup domain0 using the module(s) that we stashed safely * above our heap. The second module, if present, is an initrd ramdisk. diff -r f25a004a6de8 -r f30a33c5b5bd xen/include/asm-x86/microcode.h --- a/xen/include/asm-x86/microcode.h Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/include/asm-x86/microcode.h Thu Dec 01 17:24:12 2011 +0000 @@ -7,74 +7,12 @@ struct ucode_cpu_info; struct microcode_ops { - int (*microcode_resume_match)(int cpu, struct cpu_signature *nsig); + int (*microcode_resume_match)(int cpu, const void *mc); int (*cpu_request_microcode)(int cpu, const void *buf, size_t size); int (*collect_cpu_info)(int cpu, struct cpu_signature *csig); int (*apply_microcode)(int cpu); }; -struct microcode_header_intel { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode_intel { - struct microcode_header_intel hdr; - unsigned int bits[0]; -}; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; - -struct equiv_cpu_entry { - uint32_t installed_cpu; - uint32_t fixed_errata_mask; - uint32_t fixed_errata_compare; - uint16_t equiv_cpu; - uint16_t reserved; -} __attribute__((packed)); - -struct microcode_header_amd { - uint32_t data_code; - uint32_t patch_id; - uint8_t mc_patch_data_id[2]; - uint8_t mc_patch_data_len; - uint8_t init_flag; - uint32_t mc_patch_data_checksum; - uint32_t nb_dev_id; - uint32_t sb_dev_id; - uint16_t processor_rev_id; - uint8_t nb_rev_id; - uint8_t sb_rev_id; - uint8_t bios_api_rev; - uint8_t reserved1[3]; - uint32_t match_reg[8]; -} __attribute__((packed)); - -struct microcode_amd { - struct microcode_header_amd hdr; - unsigned int mpb[0]; -}; - struct cpu_signature { unsigned int sig; unsigned int pf; diff -r f25a004a6de8 -r f30a33c5b5bd xen/include/asm-x86/processor.h --- a/xen/include/asm-x86/processor.h Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/include/asm-x86/processor.h Thu Dec 01 17:24:12 2011 +0000 @@ -599,6 +599,7 @@ int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val); int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val); +void microcode_set_module(unsigned int); int microcode_update(XEN_GUEST_HANDLE(const_void), unsigned long len); int microcode_resume_cpu(int cpu); diff -r f25a004a6de8 -r f30a33c5b5bd xen/include/asm-x86/setup.h --- a/xen/include/asm-x86/setup.h Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/include/asm-x86/setup.h Thu Dec 01 17:24:12 2011 +0000 @@ -44,4 +44,7 @@ int xen_in_range(unsigned long mfn); void arch_get_xen_caps(xen_capabilities_info_t *info); +void microcode_grab_module( + unsigned long *, const multiboot_info_t *, void *(*)(const module_t *)); + #endif diff -r f25a004a6de8 -r f30a33c5b5bd xen/include/xsm/xsm.h 
--- a/xen/include/xsm/xsm.h Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/include/xsm/xsm.h Thu Dec 01 17:24:12 2011 +0000 @@ -454,14 +454,15 @@ } #ifdef XSM_ENABLE -extern int xsm_init(unsigned int *initrdidx, const multiboot_info_t *mbi, +extern int xsm_init(unsigned long *module_map, const multiboot_info_t *mbi, void *(*bootstrap_map)(const module_t *)); -extern int xsm_policy_init(unsigned int *initrdidx, const multiboot_info_t *mbi, +extern int xsm_policy_init(unsigned long *module_map, + const multiboot_info_t *mbi, void *(*bootstrap_map)(const module_t *)); extern int register_xsm(struct xsm_operations *ops); extern int unregister_xsm(struct xsm_operations *ops); #else -static inline int xsm_init (unsigned int *initrdidx, +static inline int xsm_init (unsigned long *module_map, const multiboot_info_t *mbi, void *(*bootstrap_map)(const module_t *)) { diff -r f25a004a6de8 -r f30a33c5b5bd xen/xsm/xsm_core.c --- a/xen/xsm/xsm_core.c Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/xsm/xsm_core.c Thu Dec 01 17:24:12 2011 +0000 @@ -43,7 +43,7 @@ } } -int __init xsm_init(unsigned int *initrdidx, const multiboot_info_t *mbi, +int __init xsm_init(unsigned long *module_map, const multiboot_info_t *mbi, void *(*bootstrap_map)(const module_t *)) { int ret = 0; @@ -52,7 +52,7 @@ if ( XSM_MAGIC ) { - ret = xsm_policy_init(initrdidx, mbi, bootstrap_map); + ret = xsm_policy_init(module_map, mbi, bootstrap_map); if ( ret ) { bootstrap_map(NULL); diff -r f25a004a6de8 -r f30a33c5b5bd xen/xsm/xsm_policy.c --- a/xen/xsm/xsm_policy.c Thu Dec 01 17:21:24 2011 +0000 +++ b/xen/xsm/xsm_policy.c Thu Dec 01 17:24:12 2011 +0000 @@ -20,11 +20,12 @@ #include <xsm/xsm.h> #include <xen/multiboot.h> +#include <asm/bitops.h> char *__initdata policy_buffer = NULL; u32 __initdata policy_size = 0; -int xsm_policy_init(unsigned int *initrdidx, const multiboot_info_t *mbi, +int xsm_policy_init(unsigned long *module_map, const multiboot_info_t *mbi, void *(*bootstrap_map)(const module_t *)) { int i; @@ -35,10 +36,13 @@ /* * Try all modules and see whichever could be the binary policy. - * Adjust the initrdidx if module[1] is the binary policy. + * Adjust module_map for the module that is the binary policy. */ for ( i = mbi->mods_count-1; i >= 1; i-- ) { + if ( !test_bit(i, module_map) ) + continue; + _policy_start = bootstrap_map(mod + i); _policy_len = mod[i].mod_end; @@ -50,8 +54,7 @@ printk("Policy len 0x%lx, start at %p.\n", _policy_len,_policy_start); - if ( i == 1 ) - *initrdidx = (mbi->mods_count > 2) ? 2 : 0; + __clear_bit(i, module_map); break; }
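A note on the Format B stream documented at the top of this changeset: once the receiver has seen XC_SAVE_ID_ENABLE_COMPRESSION, its job is mechanical. It tracks how many PFNs the +ve chunks have announced and checks that compressed records never decompress to more pages than are outstanding. The C sketch below illustrates that loop under stated assumptions: read_exact() and uncompress_chunk() are hypothetical stand-ins, the header and length field widths are guesses for illustration, and the real logic lives in the libxc restore path, not in this patch.

    #include <stdint.h>
    #include <stdlib.h>
    #include <unistd.h>

    #define XC_SAVE_ID_COMPRESSED_DATA (-12)   /* value from this changeset */

    /* Hypothetical helper: read exactly len bytes or fail. */
    static int read_exact(int fd, void *buf, size_t len)
    {
        while ( len )
        {
            ssize_t n = read(fd, buf, len);
            if ( n <= 0 )
                return -1;
            buf = (char *)buf + n;
            len -= (size_t)n;
        }
        return 0;
    }

    /* Hypothetical stand-in for the Remus decompressor: returns the
     * number of pages recovered from the compressed record, or -1. */
    extern int uncompress_chunk(const char *data, uint32_t len);

    int receive_format_b_body(int fd)
    {
        long pfns_outstanding = 0; /* PFNs announced, pages not yet seen */

        for ( ;; )
        {
            int32_t count;                 /* assumed 4-byte chunk header */

            if ( read_exact(fd, &count, sizeof(count)) )
                return -1;
            if ( count == 0 )              /* END BODY TAG */
                return 0;
            if ( count > 0 )               /* +ve chunk: PFN array only */
            {
                uint64_t *pfns;

                if ( count > (1 << 20) )   /* arbitrary sanity bound */
                    return -1;
                pfns = malloc(count * sizeof(*pfns));
                if ( !pfns || read_exact(fd, pfns, count * sizeof(*pfns)) )
                {
                    free(pfns);
                    return -1;
                }
                pfns_outstanding += count; /* page data arrives compressed */
                free(pfns);                /* a real receiver keeps these */
            }
            else if ( count == XC_SAVE_ID_COMPRESSED_DATA )
            {
                uint32_t len;              /* assumed length field width */
                char *data;
                int pages;

                if ( read_exact(fd, &len, sizeof(len)) )
                    return -1;
                data = malloc(len);
                if ( !data || read_exact(fd, data, len) )
                {
                    free(data);
                    return -1;
                }
                pages = uncompress_chunk(data, len);
                free(data);
                if ( pages < 0 )
                    return -1;
                pfns_outstanding -= pages;
                if ( pfns_outstanding < 0 )
                    return -1;             /* invariant violated */
            }
            else
            {
                /* Other XC_SAVE_ID_* records need type-specific
                 * parsing; omitted from this sketch. */
                return -1;
            }
        }
    }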
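On the microcode-loading half of the merge: the patch lets Xen consume a vendor microcode blob as an ordinary boot module. Two illustrative configurations follow; the file names and paths are placeholders and bootloader syntax varies, so nothing below is taken verbatim from the patch. In the EFI loader, the blob is named by a ucode= key in Xen's config file, looked up first in the chosen section and then under [global], per the boot.c hunk above:

    [global]
    ucode=microcode.bin

On a multiboot setup, the new ucode=<index> hypervisor command-line option names the module slot instead. Indices count the dom0 kernel as module 0, which is why setup.c clears bit 0 of module_map before any subsystem claims a module; here the microcode blob is module 2:

    kernel /boot/xen.gz ucode=2
    module /boot/vmlinuz-dom0
    module /boot/initrd-dom0.img
    module /boot/microcode.bin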
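Finally, on the Remus side: checkpoint compression is now enabled by default (cfg.flags starts with REMUS_FLAGS_COMPRESSION set) and is threaded through save.Saver and the checkpoint bindings down to xc_domain_save as XCFLAGS_CHECKPOINT_COMPRESS. The benchmark opt-out added by this changeset is the new switch; the domain name below is a placeholder and any further arguments depend on your deployment:

    remus --no-compression mydomain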