
[Xen-changelog] [xen-unstable] merge



# HG changeset patch
# User Tim Deegan <tim@xxxxxxx>
# Date 1322760252 0
# Node ID f30a33c5b5bd1e2b0bdffa3a649490157e451a4e
# Parent  f25a004a6de8efc15d95408f3e92081393360acb
# Parent  3e5683b6b37f9010772dac4a92166b1666485ddd
merge
---


diff -r f25a004a6de8 -r f30a33c5b5bd Config.mk
--- a/Config.mk Thu Dec 01 17:21:24 2011 +0000
+++ b/Config.mk Thu Dec 01 17:24:12 2011 +0000
@@ -232,7 +232,7 @@
 OCAML_TOOLS        ?= y
 CONFIG_MINITERM    ?= n
 CONFIG_LOMOUNT     ?= n
-CONFIG_SYSTEM_LIBAIO ?= n
+CONFIG_SYSTEM_LIBAIO ?= y
 
 ifeq ($(OCAML_TOOLS),y)
 OCAML_TOOLS := $(shell ocamlopt -v > /dev/null 2>&1 && echo "y" || echo "n")
diff -r f25a004a6de8 -r f30a33c5b5bd docs/gen-html-index
--- a/docs/gen-html-index       Thu Dec 01 17:21:24 2011 +0000
+++ b/docs/gen-html-index       Thu Dec 01 17:24:12 2011 +0000
@@ -10,7 +10,6 @@
 use Getopt::Long;
 use IO::File;
 use File::Basename;
-use List::MoreUtils qw/ uniq /;
 
 Getopt::Long::Configure('bundling');
 
@@ -99,6 +98,12 @@
     }
 }
 
+# Local replacement for List::MoreUtils::uniq; note that, unlike the
+# original, it does not preserve the order of its arguments.
+sub uniq (@) {
+    my %h;
+    foreach (@_) { $h{$_} = 1; }
+    return keys %h;
+}
+    
 for (@docs) { s,^\Q$outdir\E/,, }
 
 @docs = grep { -e "$outdir/$_" && (make_linktext($_) ne "NO-INDEX") } @docs;
diff -r f25a004a6de8 -r f30a33c5b5bd docs/man/xl.pod.1
--- a/docs/man/xl.pod.1 Thu Dec 01 17:21:24 2011 +0000
+++ b/docs/man/xl.pod.1 Thu Dec 01 17:24:12 2011 +0000
@@ -32,19 +32,51 @@
 
 =head1 NOTES
 
+=over 4
+
+=item start the script B</etc/init.d/xencommons> at boot time
+
 Most B<xl> operations rely upon B<xenstored> and B<xenconsoled>: make
 sure you start the script B</etc/init.d/xencommons> at boot time to
 initialize all the daemons needed by B<xl>.
 
+=item set up a B<xenbr0> bridge in dom0
+
 In the most common network configuration, you need to setup a bridge in dom0
 named B<xenbr0> in order to have a working network in the guest domains.
 Please refer to the documentation of your Linux distribution to know how to
 setup the bridge.
 
+=item B<autoballoon>
+
+If you specify the amount of memory dom0 has, by passing B<dom0_mem> to
+Xen, it is highly recommended to disable B<autoballoon>. Edit
+B</etc/xen/xl.conf> and set it to 0.
+
+=item run xl as B<root>
+
 Most B<xl> commands require root privileges to run due to the
 communications channels used to talk to the hypervisor.  Running as
 non root will return an error.
 
+=back
+
+=head1 GLOBAL OPTIONS
+
+Some global options are always available:
+
+=over 4
+
+=item B<-v>
+
+Verbose.
+
+=item B<-N>
+
+Dry run: do not actually execute the command.
+
+=back
+
 =head1 DOMAIN SUBCOMMANDS
 
 The following subcommands manipulate domains directly.  As stated
@@ -52,13 +84,19 @@
 
 =over 4
 
-=item B<create> [I<OPTIONS>] I<configfile>
+=item B<button-press> I<domain-id> I<button>
 
-The create subcommand requires a config file: see L<xl.cfg(5)> for
-full details of that file format and possible options.
+Indicate an ACPI button press to the domain. I<button> may be 'power' or
+'sleep'. This command is only available for HVM domains.
 
-I<configfile> can either be an absolute path to a file, or a relative
-path to a file located in /etc/xen.
+=item B<create> [I<configfile>] [I<OPTIONS>]
+
+The create subcommand takes a config file as its first argument: see
+L<xl.cfg> for full details of that file format and possible options.
+If I<configfile> is missing, B<xl> creates the domain using the
+default value for every option.
+
+I<configfile> has to be an absolute path to a file.
 
 Create will return B<as soon> as the domain is started.  This B<does
 not> mean the guest OS in the domain has actually booted, or is
@@ -76,11 +114,6 @@
 
 Use the given configuration file.
 
-=item B<-n>, B<--dryrun>
-
-Dry run - prints the resulting configuration in SXP but does not create
-the domain.
-
 =item B<-p>
 
 Leave the domain paused after it is created.
@@ -88,7 +121,15 @@
 =item B<-c>
 
 Attach console to the domain as soon as it has started.  This is
-useful for determining issues with crashing domains.
+useful for determining issues with crashing domains, and as a general
+convenience since you often want to watch the domain boot.
+
+=item B<key=value>
+
+It is possible to pass I<key=value> pairs on the command line to provide
+options as if they were written in the configuration file; these override
+whatever is in the I<configfile>.
 
 =back
 
@@ -105,7 +146,7 @@
 
 =back
 
-=item B<console> I<domain-id>
+=item B<console> [I<OPTIONS>] I<domain-id>
 
 Attach to domain I<domain-id>'s console.  If you've set up your domains to
 have a traditional log in console this will look much like a normal
@@ -113,17 +154,20 @@
 
 Use the key combination Ctrl+] to detach the domain console.
 
-=item B<vncviewer> [I<OPTIONS>] I<domain-id>
-
-Attach to domain's VNC server, forking a vncviewer process.
-
 B<OPTIONS>
 
 =over 4
 
-=item I<--autopass>
+=item I<-t [pv|serial]>
 
-Pass VNC password to vncviewer via stdin.
+Connect to a PV console or an emulated serial console.
+PV consoles are the only consoles available for PV domains, while HVM
+domains can have both. If this option is not specified it defaults to
+emulated serial for HVM guests and PV console for PV guests.
+
+=item I<-n NUM>
+
+Connect to console number I<NUM>. Console numbers start from 0.
 
 =back
 
@@ -153,6 +197,10 @@
 be written to a distribution specific directory for dump files.  Such
 as: /var/lib/xen/dump or /var/xen/dump.
 
+=item B<getenforce>
+
+Returns the current enforcing mode of the Flask Xen security module.
+
 =item B<help> [I<--long>]
 
 Displays the short help message (i.e. common commands).
@@ -226,7 +274,8 @@
 
 =item B<s - shutdown>
 
-FIXME: Why would you ever see this state?
+The guest OS has shut down (SCHEDOP_shutdown has been called) but the
+domain is not dying yet.
 
 =item B<c - crashed>
 
@@ -239,8 +288,6 @@
 The domain is in process of dying, but hasn't completely shutdown or
 crashed.
 
-FIXME: Is this right?
-
 =back
 
 B<NOTES>
@@ -256,6 +303,10 @@
 
 =back
 
+=item B<loadpolicy> I<policyfile>
+
+Loads a new policy into the Flask Xen security module.
+
 =item B<mem-max> I<domain-id> I<mem>
 
 Specify the maximum amount of memory the domain is able to use, appending 't'
@@ -297,7 +348,7 @@
 =item B<-e>
 
 On the new host, do not wait in the background (on <host>) for the death of the
-domain.
+domain. See the corresponding option of the I<create> subcommand.
 
 =item B<-C> I<config>
 
@@ -317,6 +368,7 @@
 command run from the console.  The command returns as soon as it has
 executed the reboot action, which may be significantly before the
 domain actually reboots.
+It requires PV drivers installed in your guest OS.
 
 The behavior of what happens to a domain when it reboots is set by the
 B<on_reboot> parameter of the domain configuration file when the
@@ -337,6 +389,7 @@
 =item B<-e>
 
 Do not wait in the background for the death of the domain on the new host.
+See the corresponding option of the I<create> subcommand.
 
 =item B<-d>
 
@@ -344,6 +397,10 @@
 
 =back
 
+=item B<setenforce> I<1|0|Enforcing|Permissive>
+
+Sets the current enforcing mode of the Flask Xen security module.
+
 =item B<save> [I<OPTIONS>] I<domain-id> I<CheckpointFile> [I<ConfigFile>]
 
 Saves a running domain to a state file so that it can be restored
@@ -353,7 +410,6 @@
 Passing a config file argument allows the user to manually select the VM config
 file used to create the domain.
 
-
 =over 4
 
 =item B<-c>
@@ -370,6 +426,7 @@
 succeed, and may take a variable length of time depending on what
 services must be shut down in the domain.  The command returns
 immediately after signalling the domain unless the B<-w> flag is used.
+For HVM domains it requires PV drivers to be installed in your guest OS.
 
 The behavior of what happens to a domain when it reboots is set by the
 B<on_shutdown> parameter of the domain configuration file when the
@@ -387,9 +444,17 @@
 
 =item B<sysrq> I<domain-id> I<letter>
 
-Send a I<Magic System Request> signal to the domain.  For more
-information on available magic sys req operations, see sysrq.txt in
-your Linux Kernel sources.
+Send a I<Magic System Request> to the domain; each type of request is
+represented by a different letter.
+It can be used to send SysRq requests to Linux guests: see sysrq.txt in
+your Linux kernel sources for more information.
+It requires PV drivers to be installed in your guest OS.
+
+=item B<trigger> I<domain-id> I<nmi|reset|init|power|sleep> [I<VCPU>]
+
+Send a trigger to a domain, where the trigger can be: nmi, reset, init, power
+or sleep.  Optionally a specific vcpu number can be passed as an argument.
+This command is only available for HVM domains.
 
 =item B<unpause> I<domain-id>
 
@@ -410,10 +475,6 @@
 configured VCPU count is an error.  Trying to set VCPUs to < 1 will be
 quietly ignored.
 
-Because this operation requires cooperation from the domain operating
-system, there is no guarantee that it will succeed.  This command will
-not work with a full virt domain.
-
 =item B<vcpu-list> [I<domain-id>]
 
 Lists VCPU information for a specific domain.  If no domain is
@@ -430,27 +491,19 @@
 this, by ensuring certain VCPUs can only run on certain physical
 CPUs.
 
-=item B<button-press> I<domain-id> I<button>
+=item B<vncviewer> [I<OPTIONS>] I<domain-id>
 
-Indicate an ACPI button press to the domain. I<button> is may be 'power' or
-'sleep'.
+Attach to domain's VNC server, forking a vncviewer process.
 
-=item B<trigger> I<domain-id> I<nmi|reset|init|power|sleep> [I<VCPU>]
+B<OPTIONS>
 
-Send a trigger to a domain, where the trigger can be: nmi, reset, init, power
-or sleep.  Optionally a specific vcpu number can be passed as an argument.
+=over 4
 
-=item B<getenforce>
+=item I<--autopass>
 
-Returns the current enforcing mode of the Flask Xen security module.
+Pass VNC password to vncviewer via stdin.
 
-=item B<setenforce> I<1|0|Enforcing|Permissive>
-
-Sets the current enforcing mode of the Flask Xen security module
-
-=item B<loadpolicy> I<policyfile>
-
-Loads a new policy int the Flask Xen security module.
+=back
 
 =back
 
@@ -460,7 +513,8 @@
 
 =item B<debug-keys> I<keys>
 
-Send debug I<keys> to Xen.
+Send debug I<keys> to Xen. It is the same as pressing the Xen
+"conswitch" (Ctrl-A by default) three times and then pressing "keys".
 
 =item B<dmesg> [B<-c>]
 
@@ -483,39 +537,41 @@
 
 Print information about the Xen host in I<name : value> format.  When
 reporting a Xen bug, please provide this information as part of the
-bug report.
+bug report. See I<http://wiki.xen.org/xenwiki/ReportingBugs> for how to
+report Xen bugs.
 
-Sample output looks as follows (lines wrapped manually to make the man
-page more readable):
+Sample output looks as follows:
 
- host                   : talon
- release                : 2.6.12.6-xen0
- version                : #1 Mon Nov 14 14:26:26 EST 2005
- machine                : i686
- nr_cpus                : 2
+ host                   : scarlett
+ release                : 3.1.0-rc4+
+ version                : #1001 SMP Wed Oct 19 11:09:54 UTC 2011
+ machine                : x86_64
+ nr_cpus                : 4
  nr_nodes               : 1
- cores_per_socket       : 1
+ cores_per_socket       : 4
  threads_per_core       : 1
- cpu_mhz                : 696
- hw_caps                : 0383fbff:00000000:00000000:00000040
- total_memory           : 767
- free_memory            : 37
- xen_major              : 3
- xen_minor              : 0
- xen_extra              : -devel
- xen_caps               : xen-3.0-x86_32
+ cpu_mhz                : 2266
+ hw_caps                : bfebfbff:28100800:00000000:00003b40:009ce3bd:00000000:00000001:00000000
+ virt_caps              : hvm hvm_directio
+ total_memory           : 6141
+ free_memory            : 4274
+ free_cpus              : 0
+ xen_major              : 4
+ xen_minor              : 2
+ xen_extra              : -unstable
+ xen_caps               : xen-3.0-x86_64 xen-3.0-x86_32p hvm-3.0-x86_32 hvm-3.0-x86_32p hvm-3.0-x86_64
  xen_scheduler          : credit
  xen_pagesize           : 4096
- platform_params        : virt_start=0xfc000000
- xen_changeset          : Mon Nov 14 18:13:38 2005 +0100 
-                          7793:090e44133d40
- cc_compiler            : gcc version 3.4.3 (Mandrakelinux 
-                          10.2 3.4.3-7mdk)
- cc_compile_by          : sdague
- cc_compile_domain      : (none)
- cc_compile_date        : Mon Nov 14 14:16:48 EST 2005
+ platform_params        : virt_start=0xffff800000000000
+ xen_changeset          : Wed Nov 02 17:09:09 2011 +0000 24066:54a5e994a241
+ xen_commandline        : com1=115200,8n1 guest_loglvl=all dom0_mem=750M console=com1
+ cc_compiler            : gcc version 4.4.5 (Debian 4.4.5-8) 
+ cc_compile_by          : sstabellini
+ cc_compile_domain      : uk.xensource.com
+ cc_compile_date        : Tue Nov  8 12:03:05 UTC 2011
  xend_config_format     : 4
 
+
 B<FIELDS>
 
 Not all fields will be explained here, but some of the less obvious
@@ -527,7 +583,8 @@
 
 A vector showing what hardware capabilities are supported by your
 processor.  This is equivalent to, though more cryptic, the flags
-field in /proc/cpuinfo on a normal Linux machine.
+field in /proc/cpuinfo on a normal Linux machine: they both derive from
+the feature bits returned by the CPUID instruction on x86 platforms.
 
 =item B<free_memory>
 
@@ -568,6 +625,9 @@
 =item B<pci-list-assignable-devices>
 
 List all the assignable PCI devices.
+These are devices in the system which are configured to be
+available for passthrough and are bound to a suitable PCI
+backend driver in domain 0 rather than a real driver.
 
 =back
 
@@ -635,10 +695,6 @@
 
 Use the given configuration file.
 
-=item B<-n>, B<--dryrun>
-
-Dry run - prints the resulting configuration.
-
 =back
 
 =item B<cpupool-list> [I<-c|--cpus>] [I<cpu-pool>]
@@ -676,8 +732,8 @@
 =head1 VIRTUAL DEVICE COMMANDS
 
 Most virtual devices can be added and removed while guests are
-running.  The effect to the guest OS is much the same as any hotplug
-event.
+running, assuming that the necessary support exists in the guest.  The
+effect on the guest OS is much the same as any hotplug event.
 
 =head2 BLOCK DEVICES
 
@@ -699,7 +755,8 @@
 =item I<disc-spec-component>
 
 A disc specification in the same format used for the B<disk> variable in
-the domain config file. See F<xl-disk-configuration>.
+the domain config file. See
+L<http://xenbits.xen.org/docs/unstable/misc/xl-disk-configuration.txt>.
 
 =back
 
@@ -754,8 +811,9 @@
 
 Creates a new network device in the domain specified by I<domain-id>.
 I<network-device> describes the device to attach, using the same format as the
-B<vif> string in the domain config file. See L<xl.cfg(5)> for the
-description.
+B<vif> string in the domain config file. See L<xl.cfg> and
+L<http://xenbits.xen.org/docs/unstable/misc/xl-network-configuration.html>
+for more information.
 
 =item B<network-detach> I<domain-id> I<devid|mac>
 
@@ -793,17 +851,100 @@
 
 =back
 
+=head2 TMEM
+
+=over 4
+
+=item B<tmem-list> [I<-l>] I<domain-id>
+
+List tmem pools. If I<-l> is specified, also list tmem stats.
+
+=item B<tmem-freeze> I<domain-id>
+
+Freeze tmem pools.
+
+=item B<tmem-destroy> I<domain-id>
+
+Destroy tmem pools.
+
+=item B<tmem-thaw> I<domain-id>
+
+Thaw tmem pools.
+
+=item B<tmem-set> I<domain-id> [I<OPTIONS>]
+
+Change tmem settings.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-w> I<WEIGHT>
+
+Weight (int)
+
+=item B<-c> I<CAP>
+
+Cap (int)
+
+=item B<-p> I<COMPRESS>
+
+Compress (int)
+
+=back
+
+=item B<tmem-shared-auth> I<domain-id> [I<OPTIONS>]
+
+De/authenticate shared tmem pool.
+
+B<OPTIONS>
+
+=over 4
+
+=item B<-u> I<UUID>
+
+Specify uuid (abcdef01-2345-6789-1234-567890abcdef)
+
+=item B<-a> I<AUTH>
+
+0=auth,1=deauth
+
+=back
+
+=item B<tmem-freeable>
+
+Get information about how much freeable memory (MB) is in-use by tmem.
+
+=back
+
+=head1 TO BE DOCUMENTED
+
+We need better documentation for:
+
+=over 4
+
+=item B<tmem>
+
+Transcendent Memory.
+
+=item B<Flask>
+
+Xen Flask security module.
+
+=back
+
 =head1 SEE ALSO
 
-L<xl.cfg(5)>, L<xlcpupool.cfg(5)>, B<xentop(1)>
+The following man pages:
 
-=head1 AUTHOR
+L<xl.cfg>(5), L<xlcpupool.cfg>(5), B<xentop>(1)
 
-  Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx>
-  Vincent Hanquez <vincent.hanquez@xxxxxxxxxxxxx>
-  Ian Jackson <ian.jackson@xxxxxxxxxxxxx>
-  Ian Campbell <Ian.Campbell@xxxxxxxxxx>
+And the following documents on the xen.org website:
+
+L<http://xenbits.xen.org/docs/unstable/misc/xl-network-configuration.html>
+L<http://xenbits.xen.org/docs/unstable/misc/xl-disk-configuration.txt>
 
 =head1 BUGS
 
-Send bugs to xen-devel@xxxxxxxxxxxxxxxxxxxx
+Send bugs to xen-devel@xxxxxxxxxxxxxxxxxxx; see
+http://wiki.xen.org/xenwiki/ReportingBugs for how to send bug reports.
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/Makefile
--- a/tools/libxc/Makefile      Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/Makefile      Thu Dec 01 17:24:12 2011 +0000
@@ -42,7 +42,7 @@
 GUEST_SRCS-y :=
 GUEST_SRCS-y += xg_private.c xc_suspend.c
 GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
-GUEST_SRCS-$(CONFIG_MIGRATE) += xc_offline_page.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_offline_page.c xc_compression.c
 GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
 
 vpath %.c ../../xen/common/libelf
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_compression.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_compression.c      Thu Dec 01 17:24:12 2011 +0000
@@ -0,0 +1,552 @@
+/******************************************************************************
+ * xc_compression.c
+ *
+ * Checkpoint Compression using Page Delta Algorithm.
+ * - A LRU cache of recently dirtied guest pages is maintained.
+ * - For each dirty guest page in the checkpoint, if a previous version of the
+ * page exists in the cache, XOR both pages and send the non-zero sections
+ * to the receiver. The cache is then updated with the newer copy of guest page.
+ * - The receiver will XOR the non-zero sections against its copy of the guest
+ * page, thereby bringing the guest page up-to-date with the sender side.
+ *
+ * Copyright (c) 2011 Shriram Rajagopalan (rshriram@xxxxxxxxx).
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <inttypes.h>
+#include <errno.h>
+#include "xc_private.h"
+#include "xenctrl.h"
+#include "xg_save_restore.h"
+#include "xg_private.h"
+#include "xc_dom.h"
+
+/* Page Cache for Delta Compression*/
+#define DELTA_CACHE_SIZE (XC_PAGE_SIZE * 8192)
+
+/* Internal page buffer to hold dirty pages of a checkpoint,
+ * to be compressed after the domain is resumed for execution.
+ */
+#define PAGE_BUFFER_SIZE (XC_PAGE_SIZE * 8192)
+
+struct cache_page
+{
+    char *page;
+    xen_pfn_t pfn;
+    struct cache_page *next;
+    struct cache_page *prev;
+};
+
+struct compression_ctx
+{
+    /* compression buffer - holds compressed data */
+    char *compbuf;
+    unsigned long compbuf_size;
+    unsigned long compbuf_pos;
+
+    /* Page buffer to hold pages to be compressed */
+    char *inputbuf;
+    /* pfns of pages to be compressed */
+    xen_pfn_t *sendbuf_pfns;
+    unsigned int pfns_len;
+    unsigned int pfns_index;
+
+    /* Compression Cache (LRU) */
+    char *cache_base;
+    struct cache_page **pfn2cache;
+    struct cache_page *cache;
+    struct cache_page *page_list_head;
+    struct cache_page *page_list_tail;
+    unsigned long dom_pfnlist_size;
+};
+
+#define RUNFLAG 0
+#define SKIPFLAG ((char)128)
+#define FLAGMASK SKIPFLAG
+#define LENMASK ((char)127)
+
+/*
+ * see xg_save_restore.h for details on the compressed stream format.
+ * delta size = 4 bytes.
+ * run header = 1 byte (1 bit for runtype, 7 bits for run length).
+ *  i.e. maximum size of a run = 127 * 4 = 508 bytes.
+ * Worst case compression: Entire page has changed.
+ * In the worst case, the size of the compressed page is
+ *  8 runs of 508 bytes + 1 run of 32 bytes + 9 run headers 
+ *  = 4105 bytes.
+ * We could detect this worst case and send the entire page with a
+ * FULL_PAGE marker, reducing the total size to 4097 bytes. The cost
+ * of this size reduction is an additional memcpy, on top of two previous
+ * memcpy (to the compressed stream and the cache page in the for loop).
+ *
+ * We might as well sacrifice an extra 8 bytes instead of a memcpy.
+ */
+#define WORST_COMP_PAGE_SIZE (XC_PAGE_SIZE + 9)
+
+/*
+ * A zero length skip indicates full page.
+ */
+#define EMPTY_PAGE 0
+#define FULL_PAGE SKIPFLAG
+#define FULL_PAGE_SIZE (XC_PAGE_SIZE + 1)
+#define MAX_DELTAS (XC_PAGE_SIZE/sizeof(uint32_t))
+
+/*
+ * Add a pagetable page or a new page (uncached)
+ * if srcpage is a pagetable page, cache_page is null.
+ * if srcpage is a page that was not previously in the cache,
+ *  cache_page points to a free page slot in the cache where
+ *  this new page can be copied to.
+ */
+static int add_full_page(comp_ctx *ctx, char *srcpage, char *cache_page)
+{
+    char *dest = (ctx->compbuf + ctx->compbuf_pos);
+
+    if ( (ctx->compbuf_pos + FULL_PAGE_SIZE) > ctx->compbuf_size)
+        return -1;
+
+    if (cache_page)
+        memcpy(cache_page, srcpage, XC_PAGE_SIZE);
+    dest[0] = FULL_PAGE;
+    memcpy(&dest[1], srcpage, XC_PAGE_SIZE);
+    ctx->compbuf_pos += FULL_PAGE_SIZE;
+
+    return FULL_PAGE_SIZE;
+}
+
+static int compress_page(comp_ctx *ctx, char *srcpage, char *cache_page)
+{
+    char *dest = (ctx->compbuf + ctx->compbuf_pos);
+    uint32_t *new, *old;
+
+    int off, runptr = 0;
+    int wascopying = 0, copying = 0, bytes_skipped = 0;
+    int complen = 0, pageoff = 0, runbytes = 0;
+
+    char runlen = 0;
+
+    if ( (ctx->compbuf_pos + WORST_COMP_PAGE_SIZE) > ctx->compbuf_size)
+        return -1;
+
+    /*
+     * There are no alignment issues here since srcpage is
+     * domU's page passed from xc_domain_save and cache_page is
+     * a ptr to cache page (cache is page aligned).
+     */
+    new = (uint32_t*)srcpage;
+    old = (uint32_t*)cache_page;
+
+    for (off = 0; off <= MAX_DELTAS; off++)
+    {
+        /*
+         * At (off == MAX_DELTAS), we are processing the last run
+         * in the page. Since there is no XORing, make wascopying != copying
+         * to satisfy the if-block below.
+         */
+        copying = ((off < MAX_DELTAS) ? (old[off] != new[off]) : !wascopying);
+
+        if (runlen)
+        {
+            /* switching between run types or current run is full */
+            if ( (wascopying != copying) || (runlen == LENMASK) )
+            {
+                runbytes = runlen * sizeof(uint32_t);
+                runlen |= (wascopying ? RUNFLAG : SKIPFLAG);
+                dest[complen++] = runlen;
+
+                if (wascopying) /* RUNFLAG */
+                {
+                    pageoff = runptr * sizeof(uint32_t);
+                    memcpy(dest + complen, srcpage + pageoff, runbytes);
+                    memcpy(cache_page + pageoff, srcpage + pageoff, runbytes);
+                    complen += runbytes;
+                }
+                else /* SKIPFLAG */
+                {
+                    bytes_skipped += runbytes;
+                }
+
+                runlen = 0;
+                runptr = off;
+            }
+        }
+        runlen++;
+        wascopying = copying;
+    }
+
+    /*
+     * Check for empty page.
+     */
+    if (bytes_skipped == XC_PAGE_SIZE)
+    {
+        complen = 1;
+        dest[0] = EMPTY_PAGE;
+    }
+    ctx->compbuf_pos += complen;
+
+    return complen;
+}
+
+static
+char *get_cache_page(comp_ctx *ctx, xen_pfn_t pfn,
+                     int *israw)
+{
+    struct cache_page *item = NULL;
+
+    item = ctx->pfn2cache[pfn];
+
+    if (!item)
+    {
+        *israw = 1;
+
+        /* If the list is full, evict a page from the tail end. */
+        item = ctx->page_list_tail;
+        if (item->pfn != INVALID_P2M_ENTRY)
+            ctx->pfn2cache[item->pfn] = NULL;
+
+        item->pfn = pfn;
+        ctx->pfn2cache[pfn] = item;
+    }
+        
+    /* If the requested item is in the cache, move it to the head of the list */
+    if (item != ctx->page_list_head)
+    {
+        if (item == ctx->page_list_tail)
+        {
+            /* item at tail of list. */
+            ctx->page_list_tail = item->prev;
+            (ctx->page_list_tail)->next = NULL;
+        }
+        else
+        {
+            /* item in middle of list */
+            item->prev->next = item->next;
+            item->next->prev = item->prev;
+        }
+
+        item->prev = NULL;
+        item->next = ctx->page_list_head;
+        (ctx->page_list_head)->prev = item;
+        ctx->page_list_head = item;
+    }
+
+    return (ctx->page_list_head)->page;
+}
+
+/* Remove pagetable pages from cache and move to tail, as free pages */
+static
+void invalidate_cache_page(comp_ctx *ctx, xen_pfn_t pfn)
+{
+    struct cache_page *item = NULL;
+
+    item = ctx->pfn2cache[pfn];
+    if (item)
+    {
+        if (item != ctx->page_list_tail)
+        {
+            /* item at head of list */
+            if (item == ctx->page_list_head)
+            {
+                ctx->page_list_head = (ctx->page_list_head)->next;
+                (ctx->page_list_head)->prev = NULL;
+            }
+            else /* item in middle of list */
+            {            
+                item->prev->next = item->next;
+                item->next->prev = item->prev;
+            }
+
+            item->next = NULL;
+            item->prev = ctx->page_list_tail;
+            (ctx->page_list_tail)->next = item;
+            ctx->page_list_tail = item;
+        }
+        ctx->pfn2cache[pfn] = NULL;
+        (ctx->page_list_tail)->pfn = INVALID_P2M_ENTRY;
+    }
+}
+
+int xc_compression_add_page(xc_interface *xch, comp_ctx *ctx,
+                            char *page, xen_pfn_t pfn, int israw)
+{
+    if (pfn > ctx->dom_pfnlist_size)
+    {
+        ERROR("Invalid pfn passed into "
+              "xc_compression_add_page %" PRIpfn "\n", pfn);
+        return -2;
+    }
+
+    /* pagetable page */
+    if (israw)
+        invalidate_cache_page(ctx, pfn);
+    ctx->sendbuf_pfns[ctx->pfns_len] = israw ? INVALID_P2M_ENTRY : pfn;
+    memcpy(ctx->inputbuf + ctx->pfns_len * XC_PAGE_SIZE, page, XC_PAGE_SIZE);
+    ctx->pfns_len++;
+
+    /* check if we have run out of space. If so,
+     * we need to synchronously compress the pages and flush them out
+     */
+    if (ctx->pfns_len == NRPAGES(PAGE_BUFFER_SIZE))
+        return -1;
+    return 0;
+}
+
+int xc_compression_compress_pages(xc_interface *xch, comp_ctx *ctx,
+                                  char *compbuf, unsigned long compbuf_size,
+                                  unsigned long *compbuf_len)
+{
+    char *cache_copy = NULL, *current_page = NULL;
+    int israw, rc = 1;
+
+    if (!ctx->pfns_len || (ctx->pfns_index == ctx->pfns_len)) {
+        ctx->pfns_len = ctx->pfns_index = 0;
+        return 0;
+    }
+
+    ctx->compbuf_pos = 0;
+    ctx->compbuf = compbuf;
+    ctx->compbuf_size = compbuf_size;
+
+    for (; ctx->pfns_index < ctx->pfns_len; ctx->pfns_index++)
+    {
+        israw = 0;
+        cache_copy = NULL;
+        current_page = ctx->inputbuf + ctx->pfns_index * XC_PAGE_SIZE;
+
+        if (ctx->sendbuf_pfns[ctx->pfns_index] == INVALID_P2M_ENTRY)
+            israw = 1;
+        else
+            cache_copy = get_cache_page(ctx,
+                                        ctx->sendbuf_pfns[ctx->pfns_index],
+                                        &israw);
+
+        if (israw)
+            rc = (add_full_page(ctx, current_page, cache_copy) >= 0);
+        else
+            rc = (compress_page(ctx, current_page, cache_copy) >= 0);
+
+        if ( !rc )
+        {
+            /* Out of space in outbuf! flush and come back */
+            rc = -1;
+            break;
+        }
+    }
+    if (compbuf_len)
+        *compbuf_len = ctx->compbuf_pos;
+
+    return rc;
+}
+
+inline
+void xc_compression_reset_pagebuf(xc_interface *xch, comp_ctx *ctx)
+{
+    ctx->pfns_index = ctx->pfns_len = 0;
+}
+
+int xc_compression_uncompress_page(xc_interface *xch, char *compbuf,
+                                   unsigned long compbuf_size,
+                                   unsigned long *compbuf_pos, char *destpage)
+{
+    unsigned long pos;
+    unsigned int len = 0, pagepos = 0;
+    char flag;
+
+    pos = *compbuf_pos;
+    if (pos >= compbuf_size)
+    {
+        ERROR("Out of bounds exception in compression buffer (a):"
+              "read ptr:%lu, bufsize = %lu\n",
+              *compbuf_pos, compbuf_size);
+        return -1;
+    }
+
+    switch (compbuf[pos])
+    {
+    case EMPTY_PAGE:
+        pos++;
+        break;
+
+    case FULL_PAGE:
+        {
+            /* Check if the input buffer has 4KB of data */
+            if ((pos + FULL_PAGE_SIZE) > compbuf_size)
+            {
+                ERROR("Out of bounds exception in compression buffer (b):"
+                      "read ptr = %lu, bufsize = %lu\n",
+                      *compbuf_pos, compbuf_size);
+                return -1;
+            }
+            memcpy(destpage, &compbuf[pos + 1], XC_PAGE_SIZE);
+            pos += FULL_PAGE_SIZE;
+        }
+        break;
+
+    default: /* Normal page with one or more runs */
+        {
+            do
+            {
+                flag = compbuf[pos] & FLAGMASK;
+                len = (compbuf[pos] & LENMASK) * sizeof(uint32_t);
+                /* Sanity Check: Zero-length runs are allowed only for
+                 * FULL_PAGE and EMPTY_PAGE.
+                 */
+                if (!len)
+                {
+                    ERROR("Zero length run encountered for normal page: "
+                          "buffer (d):read ptr = %lu, flag = %u, "
+                          "bufsize = %lu, pagepos = %u\n",
+                          pos, (unsigned int)flag, compbuf_size, pagepos);
+                    return -1;
+                }
+
+                pos++;
+                if (flag == RUNFLAG)
+                {
+                    /* Check if the input buffer has len bytes of data
+                     * and whether it would fit in the destination page.
+                     */
+                    if (((pos + len) > compbuf_size)
+                        || ((pagepos + len) > XC_PAGE_SIZE))
+                    {
+                        ERROR("Out of bounds exception in compression "
+                              "buffer (c):read ptr = %lu, runlen = %u, "
+                              "bufsize = %lu, pagepos = %u\n",
+                              pos, len, compbuf_size, pagepos);
+                        return -1;
+                    }
+                    memcpy(&destpage[pagepos], &compbuf[pos], len);
+                    pos += len;
+                }
+                pagepos += len;
+            } while ((pagepos < XC_PAGE_SIZE) && (pos < compbuf_size));
+
+            /* Make sure we have copied/skipped 4KB worth of data */
+            if (pagepos != XC_PAGE_SIZE)
+            {
+                ERROR("Invalid data in compression buffer:"
+                      "read ptr = %lu, bufsize = %lu, pagepos = %u\n",
+                      pos, compbuf_size, pagepos);
+                return -1;
+            }
+        }
+    }
+    *compbuf_pos = pos;
+    return 0;
+}
+
+void xc_compression_free_context(xc_interface *xch, comp_ctx *ctx)
+{
+    if (!ctx) return;
+
+    if (ctx->inputbuf)
+        free(ctx->inputbuf);
+    if (ctx->sendbuf_pfns)
+        free(ctx->sendbuf_pfns);
+    if (ctx->cache_base)
+        free(ctx->cache_base);
+    if (ctx->pfn2cache)
+        free(ctx->pfn2cache);
+    if (ctx->cache)
+        free(ctx->cache);
+    free(ctx);
+}
+
+comp_ctx *xc_compression_create_context(xc_interface *xch,
+                                        unsigned long p2m_size)
+{
+    unsigned long i;
+    comp_ctx *ctx = NULL;
+    unsigned long num_cache_pages = DELTA_CACHE_SIZE/XC_PAGE_SIZE;
+
+    ctx = (comp_ctx *)malloc(sizeof(comp_ctx));
+    if (!ctx)
+    {
+        ERROR("Failed to allocate compression_ctx\n");
+        goto error;
+    }
+    memset(ctx, 0, sizeof(comp_ctx));
+
+    ctx->inputbuf = xc_memalign(xch, XC_PAGE_SIZE, PAGE_BUFFER_SIZE);
+    if (!ctx->inputbuf)
+    {
+        ERROR("Failed to allocate page buffer\n");
+        goto error;
+    }
+
+    ctx->cache_base = xc_memalign(xch, XC_PAGE_SIZE, DELTA_CACHE_SIZE);
+    if (!ctx->cache_base)
+    {
+        ERROR("Failed to allocate delta cache\n");
+        goto error;
+    }
+
+    ctx->sendbuf_pfns = malloc(NRPAGES(PAGE_BUFFER_SIZE) *
+                               sizeof(xen_pfn_t));
+    if (!ctx->sendbuf_pfns)
+    {
+        ERROR("Could not alloc sendbuf_pfns\n");
+        goto error;
+    }
+    memset(ctx->sendbuf_pfns, -1,
+           NRPAGES(PAGE_BUFFER_SIZE) * sizeof(xen_pfn_t));
+
+    ctx->pfn2cache = calloc(p2m_size, sizeof(struct cache_page *));
+    if (!ctx->pfn2cache)
+    {
+        ERROR("Could not alloc pfn2cache map\n");
+        goto error;
+    }
+
+    ctx->cache = malloc(num_cache_pages * sizeof(struct cache_page));
+    if (!ctx->cache)
+    {
+        ERROR("Could not alloc compression cache\n");
+        goto error;
+    }
+
+    for (i = 0; i < num_cache_pages; i++)
+    {
+        ctx->cache[i].pfn = INVALID_P2M_ENTRY;
+        ctx->cache[i].page = ctx->cache_base + i * XC_PAGE_SIZE;
+        ctx->cache[i].prev = (i == 0) ? NULL : &(ctx->cache[i - 1]);
+        ctx->cache[i].next = ((i+1) == num_cache_pages)? NULL :
+            &(ctx->cache[i + 1]);
+    }
+    ctx->page_list_head = &(ctx->cache[0]);
+    ctx->page_list_tail = &(ctx->cache[num_cache_pages -1]);
+    ctx->dom_pfnlist_size = p2m_size;
+
+    return ctx;
+error:
+    xc_compression_free_context(xch, ctx);
+    return NULL;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
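
To make the run encoding above concrete, here is a minimal standalone
decoder for one compressed page, following the RUNFLAG/SKIPFLAG/LENMASK
format defined in xc_compression.c. This is an illustrative sketch, not
part of the patch; it assumes 4096-byte pages and a destination page that
already holds the previous version of the data.

    /* Hedged sketch: apply one page delta from compbuf[*pos] to destpage. */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define PAGE_SZ    4096
    #define RUNFLAG    0
    #define SKIPFLAG   ((char)128)
    #define FLAGMASK   SKIPFLAG
    #define LENMASK    ((char)127)
    #define EMPTY_PAGE 0
    #define FULL_PAGE  SKIPFLAG

    static int apply_page_delta(const char *compbuf, size_t size,
                                size_t *pos, char *destpage)
    {
        size_t p = *pos, pagepos = 0;

        if (p >= size)
            return -1;
        if (compbuf[p] == EMPTY_PAGE) {        /* page unchanged */
            *pos = p + 1;
            return 0;
        }
        if (compbuf[p] == FULL_PAGE) {         /* raw page follows marker */
            if (p + 1 + PAGE_SZ > size)
                return -1;
            memcpy(destpage, &compbuf[p + 1], PAGE_SZ);
            *pos = p + 1 + PAGE_SZ;
            return 0;
        }
        do {                                   /* one or more runs */
            char flag = compbuf[p] & FLAGMASK;
            size_t len = (compbuf[p] & LENMASK) * sizeof(uint32_t);

            if (!len)                          /* zero runs are markers only */
                return -1;
            p++;
            if (flag == RUNFLAG) {             /* changed words: copy them */
                if (p + len > size || pagepos + len > PAGE_SZ)
                    return -1;
                memcpy(&destpage[pagepos], &compbuf[p], len);
                p += len;
            }                                  /* SKIPFLAG: words unchanged */
            pagepos += len;
        } while (pagepos < PAGE_SZ && p < size);

        if (pagepos != PAGE_SZ)
            return -1;
        *pos = p;
        return 0;
    }
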
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xc_domain_restore.c   Thu Dec 01 17:24:12 2011 +0000
@@ -43,6 +43,7 @@
     xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region.  */
     int completed; /* Set when a consistent image is available */
     int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. */
+    int compressing; /* Set when sender signals that pages would be sent compressed (for Remus) */
     struct domain_info_context dinfo;
 };
 
@@ -663,6 +664,10 @@
     /* pages is of length nr_physpages, pfn_types is of length nr_pages */
     unsigned int nr_physpages, nr_pages;
 
+    /* checkpoint compression state */
+    int compressing;
+    unsigned long compbuf_pos, compbuf_size;
+
     /* Types of the pfns in the current region */
     unsigned long* pfn_types;
 
@@ -701,6 +706,7 @@
 {
     int count, countpages, oldcount, i;
     void* ptmp;
+    unsigned long compbuf_size;
 
     if ( RDEXACT(fd, &count, sizeof(count)) )
     {
@@ -820,6 +826,40 @@
         }
         return pagebuf_get_one(xch, ctx, buf, fd, dom);
 
+    case XC_SAVE_ID_ENABLE_COMPRESSION:
+        /* We cannot set compression flag directly in pagebuf structure,
+         * since this pagebuf still has uncompressed pages that are yet to
+         * be applied. We enable the compression field in pagebuf structure
+         * after receiving the first tailbuf.
+         */
+        ctx->compressing = 1;
+        // DPRINTF("compression flag received");
+        return pagebuf_get_one(xch, ctx, buf, fd, dom);
+
+    case XC_SAVE_ID_COMPRESSED_DATA:
+
+        /* read the length of compressed chunk coming in */
+        if ( RDEXACT(fd, &compbuf_size, sizeof(unsigned long)) )
+        {
+            PERROR("Error when reading compbuf_size");
+            return -1;
+        }
+        if (!compbuf_size) return 1;
+
+        buf->compbuf_size += compbuf_size;
+        if (!(ptmp = realloc(buf->pages, buf->compbuf_size))) {
+            ERROR("Could not (re)allocate compression buffer");
+            return -1;
+        }
+        buf->pages = ptmp;
+
+        if ( RDEXACT(fd, buf->pages + (buf->compbuf_size - compbuf_size),
+                     compbuf_size) ) {
+            PERROR("Error when reading compression buffer");
+            return -1;
+        }
+        return compbuf_size;
+
     default:
         if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
             ERROR("Max batch size exceeded (%d). Giving up.", count);
@@ -857,6 +897,13 @@
     if (!countpages)
         return count;
 
+    /* If Remus Checkpoint Compression is turned on, we will only be
+     * receiving the pfn lists now. The compressed pages will come in later,
+     * following a <XC_SAVE_ID_COMPRESSED_DATA, compressedChunkSize> tuple.
+     */
+    if (buf->compressing)
+        return pagebuf_get_one(xch, ctx, buf, fd, dom);
+
     oldcount = buf->nr_physpages;
     buf->nr_physpages += countpages;
     if (!buf->pages) {
@@ -885,6 +932,7 @@
     int rc;
 
     buf->nr_physpages = buf->nr_pages = 0;
+    buf->compbuf_pos = buf->compbuf_size = 0;
 
     do {
         rc = pagebuf_get_one(xch, ctx, buf, fd, dom);
@@ -1102,7 +1150,21 @@
         /* In verify mode, we use a copy; otherwise we work in place */
         page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE);
 
-        memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, PAGE_SIZE);
+        /* Remus - page decompression */
+        if (pagebuf->compressing)
+        {
+            if (xc_compression_uncompress_page(xch, pagebuf->pages,
+                                               pagebuf->compbuf_size,
+                                               &pagebuf->compbuf_pos,
+                                               (char *)page))
+            {
+                ERROR("Failed to uncompress page (pfn=%lx)\n", pfn);
+                goto err_mapped;
+            }
+        }
+        else
+            memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE,
+                   PAGE_SIZE);
 
         pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
 
@@ -1364,6 +1426,7 @@
 
         if ( !ctx->completed ) {
             pagebuf.nr_physpages = pagebuf.nr_pages = 0;
+            pagebuf.compbuf_pos = pagebuf.compbuf_size = 0;
             if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) {
                 PERROR("Error when reading batch");
                 goto out;
@@ -1406,6 +1469,7 @@
         }
 
         pagebuf.nr_physpages = pagebuf.nr_pages = 0;
+        pagebuf.compbuf_pos = pagebuf.compbuf_size = 0;
 
         n += j; /* crude stats */
 
@@ -1449,6 +1513,13 @@
          */
         if ( !ctx->last_checkpoint )
             fcntl(io_fd, F_SETFL, orig_io_fd_flags | O_NONBLOCK);
+
+        /*
+         * If sender had sent enable compression flag, switch to compressed
+         * checkpoints mode once the first checkpoint is received.
+         */
+        if (ctx->compressing)
+            pagebuf.compressing = 1;
     }
 
     if (pagebuf.viridian != 0)
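
On the wire, a compressed chunk is just a negative chunk type followed by
an unsigned long length and the opaque payload, as handled in
pagebuf_get_one() above. A hedged standalone sketch of a reader for one
such chunk; the XC_SAVE_ID_COMPRESSED_DATA value and the plain read loop
(standing in for RDEXACT) are assumptions for illustration:

    #include <stdlib.h>
    #include <unistd.h>

    /* Assumed to match xg_save_restore.h in this tree. */
    #define XC_SAVE_ID_COMPRESSED_DATA (-12)

    static int read_exact(int fd, void *buf, size_t len)
    {
        char *p = buf;

        while (len) {
            ssize_t n = read(fd, p, len);
            if (n <= 0)
                return -1;
            p += n;
            len -= n;
        }
        return 0;
    }

    /* Returns a malloc'd buffer holding one compressed chunk, or NULL. */
    static char *read_compressed_chunk(int fd, unsigned long *size_out)
    {
        int marker;
        unsigned long size;
        char *buf;

        if (read_exact(fd, &marker, sizeof(marker)) ||
            marker != XC_SAVE_ID_COMPRESSED_DATA)
            return NULL;
        if (read_exact(fd, &size, sizeof(size)) || !size)
            return NULL;
        if (!(buf = malloc(size)))
            return NULL;
        if (read_exact(fd, buf, size)) {
            free(buf);
            return NULL;
        }
        *size_out = size;
        return buf;
    }
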
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xc_domain_save.c      Thu Dec 01 17:24:12 2011 +0000
@@ -218,6 +218,56 @@
         return noncached_write(xch, ob, fd, buf, len);
 }
 
+static int write_compressed(xc_interface *xch, comp_ctx *compress_ctx,
+                            int dobuf, struct outbuf* ob, int fd)
+{
+    int rc = 0;
+    int header = sizeof(int) + sizeof(unsigned long);
+    int marker = XC_SAVE_ID_COMPRESSED_DATA;
+    unsigned long compbuf_len = 0;
+
+    do
+    {
+        /* check for available space (at least 8k) */
+        if ((ob->pos + header + XC_PAGE_SIZE * 2) > ob->size)
+        {
+            if (outbuf_flush(xch, ob, fd) < 0)
+            {
+                ERROR("Error when flushing outbuf intermediate");
+                return -1;
+            }
+        }
+
+        rc = xc_compression_compress_pages(xch, compress_ctx,
+                                           ob->buf + ob->pos + header,
+                                           ob->size - ob->pos - header,
+                                           &compbuf_len);
+        if (!rc)
+            return 0;
+
+        if (outbuf_hardwrite(xch, ob, fd, &marker, sizeof(marker)) < 0)
+        {
+            PERROR("Error when writing marker (errno %d)", errno);
+            return -1;
+        }
+
+        if (outbuf_hardwrite(xch, ob, fd, &compbuf_len, sizeof(compbuf_len)) < 0)
+        {
+            PERROR("Error when writing compbuf_len (errno %d)", errno);
+            return -1;
+        }
+
+        ob->pos += (size_t) compbuf_len;
+        if (!dobuf && outbuf_flush(xch, ob, fd) < 0)
+        {
+            ERROR("Error when writing compressed chunk");
+            return -1;
+        }
+    } while (rc != 0);
+
+    return 0;
+}
+
 struct time_stats {
     struct timeval wall;
     long long d0_cpu, d1_cpu;
@@ -815,11 +865,35 @@
 
     unsigned long mfn;
 
-    struct outbuf ob;
+    /* Without checkpoint compression, the dirty pages, pfn arrays
+     * and tailbuf (vcpu ctx, shared info page, etc.)  are written
+     * directly to outbuf. All of this is done while the domain is
+     * suspended.
+     *
+     * When checkpoint compression is enabled, the dirty pages are
+     * buffered, compressed "after" the domain is resumed and then
+     * written to outbuf. Since tailbuf data are collected while a
+     * domain is suspended, they cannot be directly written to the
+     * outbuf as there is no dirty page data preceding tailbuf.
+     *
+     * So, two output buffers are maintained. Tailbuf data goes into
+     * ob_tailbuf. The dirty pages are compressed after resuming the
+     * domain and written to ob_pagebuf. ob_tailbuf is then appended
+     * to ob_pagebuf and finally flushed out.
+     */
+    struct outbuf ob_pagebuf, ob_tailbuf, *ob = NULL;
     struct save_ctx _ctx;
     struct save_ctx *ctx = &_ctx;
     struct domain_info_context *dinfo = &ctx->dinfo;
 
+    /* Compression context */
+    comp_ctx *compress_ctx = NULL;
+    /* Even if XCFLAGS_CHECKPOINT_COMPRESS is set, we enable compression only
+     * after sending XC_SAVE_ID_ENABLE_COMPRESSION and the tailbuf for
+     * first time.
+     */
+    int compressing = 0;
+
     int completed = 0;
 
     if ( hvm && !callbacks->switch_qemu_logdirty )
@@ -829,7 +903,7 @@
         return 1;
     }
 
-    outbuf_init(xch, &ob, OUTBUF_SIZE);
+    outbuf_init(xch, &ob_pagebuf, OUTBUF_SIZE);
 
     memset(ctx, 0, sizeof(*ctx));
 
@@ -917,6 +991,16 @@
         }
     }
 
+    if ( flags & XCFLAGS_CHECKPOINT_COMPRESS )
+    {
+        if (!(compress_ctx = xc_compression_create_context(xch, dinfo->p2m_size)))
+        {
+            ERROR("Failed to create compression context");
+            goto out;
+        }
+        outbuf_init(xch, &ob_tailbuf, OUTBUF_SIZE/4);
+    }
+
     last_iter = !live;
 
     /* pretend we sent all the pages last iteration */
@@ -1025,9 +1109,11 @@
     }
 
   copypages:
-#define wrexact(fd, buf, len) write_buffer(xch, last_iter, &ob, (fd), (buf), (len))
-#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, &ob, (fd), (buf), (len))
+#define wrexact(fd, buf, len) write_buffer(xch, last_iter, ob, (fd), (buf), (len))
+#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, ob, (fd), (buf), (len))
+#define wrcompressed(fd) write_compressed(xch, compress_ctx, last_iter, ob, (fd))
 
+    ob = &ob_pagebuf; /* Holds pfn_types, pages/compressed pages */
     /* Now write out each data page, canonicalising page tables as we go... */
     for ( ; ; )
     {
@@ -1270,7 +1356,7 @@
                 {
                     /* If the page is not a normal data page, write out any
                        run of pages we may have previously accumulated */
-                    if ( run )
+                    if ( !compressing && run )
                     {
                         if ( wruncached(io_fd, live,
                                        (char*)region_base+(PAGE_SIZE*(j-run)), 
@@ -1305,7 +1391,41 @@
                         goto out;
                     }
 
-                    if ( wruncached(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE 
)
+                    if (compressing)
+                    {
+                        int c_err;
+                        /* Mark pagetable page to be sent uncompressed */
+                        c_err = xc_compression_add_page(xch, compress_ctx, page,
+                                                        pfn, 1 /* raw page */);
+                        if (c_err == -2) /* OOB PFN */
+                        {
+                            ERROR("Could not add pagetable page "
+                                  "(pfn:%" PRIpfn ") to page buffer\n", pfn);
+                            goto out;
+                        }
+
+                        if (c_err == -1)
+                        {
+                            /*
+                             * We are out of buffer space to hold dirty
+                             * pages. Compress and flush the current buffer
+                             * to make space. This is a corner case that
+                             * slows down checkpointing, as the compression
+                             * happens while the domain is suspended. It happens
+                             * seldom; if you find this occurring
+                             * frequently, increase the PAGE_BUFFER_SIZE
+                             * in xc_compression.c.
+                             */
+                            if (wrcompressed(io_fd) < 0)
+                            {
+                                ERROR("Error when writing compressed"
+                                      " data (4b)\n");
+                                goto out;
+                            }
+                        }
+                    }
+                    else if ( wruncached(io_fd, live, page,
+                                         PAGE_SIZE) != PAGE_SIZE )
                     {
                         PERROR("Error when writing to state file (4b)"
                               " (errno %d)", errno);
@@ -1315,7 +1435,34 @@
                 else
                 {
                     /* We have a normal page: accumulate it for writing. */
-                    run++;
+                    if (compressing)
+                    {
+                        int c_err;
+                        /* For checkpoint compression, accumulate the page in the
+                         * page buffer, to be compressed later.
+                         */
+                        c_err = xc_compression_add_page(xch, compress_ctx, spage,
+                                                        pfn, 0 /* not raw page */);
+
+                        if (c_err == -2) /* OOB PFN */
+                        {
+                            ERROR("Could not add page "
+                                  "(pfn:%" PRIpfn ") to page buffer\n", pfn);
+                            goto out;
+                        }
+
+                        if (c_err == -1)
+                        {
+                            if (wrcompressed(io_fd) < 0)
+                            {
+                                ERROR("Error when writing compressed"
+                                      " data (4c)\n");
+                                goto out;
+                            }
+                        }
+                    }
+                    else
+                        run++;
                 }
             } /* end of the write out for this batch */
 
@@ -1423,6 +1570,15 @@
 
     DPRINTF("All memory is saved\n");
 
+    /* After last_iter, buffer the rest of pagebuf & tailbuf data into a
+     * separate output buffer and flush it after the compressed page chunks.
+     */
+    if (compressing)
+    {
+        ob = &ob_tailbuf;
+        ob->pos = 0;
+    }
+
     {
         struct {
             int id;
@@ -1534,6 +1690,25 @@
         }
     }
 
+    /* Enable compression logic on both sides by sending this
+     * one time marker.
+     * NOTE: We could have simplified this procedure by sending
+     * the enable/disable compression flag before the beginning of
+     * the main for loop. But this would break compatibility for
+     * live migration code, with older versions of xen. So we have
+     * to enable it after the last_iter, when the XC_SAVE_ID_*
+     * elements are sent.
+     */
+    if (!compressing && (flags & XCFLAGS_CHECKPOINT_COMPRESS))
+    {
+        i = XC_SAVE_ID_ENABLE_COMPRESSION;
+        if ( wrexact(io_fd, &i, sizeof(int)) )
+        {
+            PERROR("Error when writing enable_compression marker");
+            goto out;
+        }
+    }
+
     /* Zero terminate */
     i = 0;
     if ( wrexact(io_fd, &i, sizeof(int)) )
@@ -1778,14 +1953,38 @@
     if ( !rc && callbacks->postcopy )
         callbacks->postcopy(callbacks->data);
 
+    /* guest has been resumed. Now we can compress data
+     * at our own pace.
+     */
+    if (!rc && compressing)
+    {
+        ob = &ob_pagebuf;
+        if (wrcompressed(io_fd) < 0)
+        {
+            ERROR("Error when writing compressed data, after postcopy\n");
+            rc = 1;
+            goto out;
+        }
+        /* Append the tailbuf data to the main outbuf */
+        if ( wrexact(io_fd, ob_tailbuf.buf, ob_tailbuf.pos) )
+        {
+            rc = 1;
+            PERROR("Error when copying tailbuf into outbuf");
+            goto out;
+        }
+    }
+
     /* Flush last write and discard cache for file. */
-    if ( outbuf_flush(xch, &ob, io_fd) < 0 ) {
+    if ( outbuf_flush(xch, ob, io_fd) < 0 ) {
         PERROR("Error when flushing output buffer");
         rc = 1;
     }
 
     discard_file_cache(xch, io_fd, 1 /* flush */);
 
+    /* Enable compression now, finally */
+    compressing = (flags & XCFLAGS_CHECKPOINT_COMPRESS);
+
     /* checkpoint_cb can spend arbitrarily long in between rounds */
     if (!rc && callbacks->checkpoint &&
         callbacks->checkpoint(callbacks->data) > 0)
@@ -1827,6 +2026,9 @@
             DPRINTF("Warning - couldn't disable qemu log-dirty mode");
     }
 
+    if (compress_ctx)
+        xc_compression_free_context(xch, compress_ctx);
+
     if ( live_shinfo )
         munmap(live_shinfo, PAGE_SIZE);
 
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_linux.c
--- a/tools/libxc/xc_linux.c    Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xc_linux.c    Thu Dec 01 17:24:12 2011 +0000
@@ -55,6 +55,18 @@
     errno = saved_errno;
 }
 
+void *xc_memalign(xc_interface *xch, size_t alignment, size_t size)
+{
+    int ret;
+    void *ptr;
+
+    ret = posix_memalign(&ptr, alignment, size);
+    if (ret != 0 || !ptr)
+        return NULL;
+
+    return ptr;
+}
+
 /*
  * Local variables:
  * mode: C
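
The new xc_memalign gives every OS backend a single entry point for
page-aligned allocations; on Linux it wraps posix_memalign as shown above.
A standalone illustration of the same pattern, for reference only:

    #include <stdlib.h>

    #define XC_PAGE_SIZE 4096

    static void *page_aligned_alloc(size_t size)
    {
        void *ptr;

        /* posix_memalign returns 0 on success; checking the pointer as
         * well, as the patch does, is harmless belt and braces. */
        if (posix_memalign(&ptr, XC_PAGE_SIZE, size) != 0 || !ptr)
            return NULL;
        return ptr;
    }

    int main(void)
    {
        void *buf = page_aligned_alloc(8 * XC_PAGE_SIZE);

        if (!buf)
            return 1;
        free(buf);
        return 0;
    }
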
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_linux_osdep.c
--- a/tools/libxc/xc_linux_osdep.c      Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xc_linux_osdep.c      Thu Dec 01 17:24:12 2011 +0000
@@ -91,10 +91,9 @@
 {
     size_t size = npages * XC_PAGE_SIZE;
     void *p;
-    int ret;
 
-    ret = posix_memalign(&p, XC_PAGE_SIZE, size);
-    if (ret != 0 || !p)
+    p = xc_memalign(xch, XC_PAGE_SIZE, size);
+    if (!p)
         return NULL;
 
     if ( mlock(p, size) < 0 )
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_minios.c
--- a/tools/libxc/xc_minios.c   Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xc_minios.c   Thu Dec 01 17:24:12 2011 +0000
@@ -73,7 +73,7 @@
 
 static void *minios_privcmd_alloc_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, int npages)
 {
-    return memalign(PAGE_SIZE, npages * PAGE_SIZE);
+    return xc_memalign(xch, PAGE_SIZE, npages * PAGE_SIZE);
 }
 
 static void minios_privcmd_free_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, void *ptr, int npages)
@@ -437,6 +437,11 @@
         fsync(fd);
 }
 
+void *xc_memalign(xc_interface *xch, size_t alignment, size_t size)
+{
+    return memalign(alignment, size);
+}
+
 static xc_osdep_handle minios_gnttab_open(xc_gnttab *xcg)
 {
     int fd = alloc_fd(FTYPE_GNTMAP);
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_netbsd.c
--- a/tools/libxc/xc_netbsd.c   Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xc_netbsd.c   Thu Dec 01 17:24:12 2011 +0000
@@ -71,8 +71,9 @@
 static void *netbsd_privcmd_alloc_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, int npages)
 {
     size_t size = npages * XC_PAGE_SIZE;
-    void *p = valloc(size);
+    void *p;
 
+    p = xc_memalign(xch, XC_PAGE_SIZE, size);
     if (!p)
         return NULL;
 
@@ -378,6 +379,11 @@
     errno = saved_errno;
 }
 
+void *xc_memalign(xc_interface *xch, size_t alignment, size_t size)
+{
+    return valloc(size);
+}
+
 static struct xc_osdep_ops *netbsd_osdep_init(xc_interface *xch, enum xc_osdep_type type)
 {
     switch ( type )
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xc_solaris.c
--- a/tools/libxc/xc_solaris.c  Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xc_solaris.c  Thu Dec 01 17:24:12 2011 +0000
@@ -70,7 +70,7 @@
 
 static void *solaris_privcmd_alloc_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, int npages)
 {
-    return memalign(XC_PAGE_SIZE, npages * XC_PAGE_SIZE);
+    return xc_memalign(xch, XC_PAGE_SIZE, npages * XC_PAGE_SIZE);
 }
 
 static void solaris_privcmd_free_hypercall_buffer(xc_interface *xch, xc_osdep_handle h, void *ptr, int npages)
@@ -314,6 +314,11 @@
     // TODO: Implement for Solaris!
 }
 
+void *xc_memalign(xc_interface *xch, size_t alignment, size_t size)
+{
+    return memalign(alignment, size);
+}
+
 static struct xc_osdep_ops *solaris_osdep_init(xc_interface *xch, enum xc_osdep_type type)
 {
     switch ( type )
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xenctrl.h     Thu Dec 01 17:24:12 2011 +0000
@@ -1156,6 +1156,8 @@
                       uint64_t *time,
                       xc_hypercall_buffer_t *data);
 
+void *xc_memalign(xc_interface *xch, size_t alignment, size_t size);
+
 /**
  * Memory maps a range within one domain to a local address range.  Mappings
  * should be unmapped with munmap and should follow the same rules as mmap
@@ -1935,4 +1937,64 @@
                         int verbose);
 /* Useful for callers who also use libelf. */
 
+/**
+ * Checkpoint Compression
+ */
+typedef struct compression_ctx comp_ctx;
+comp_ctx *xc_compression_create_context(xc_interface *xch,
+                                       unsigned long p2m_size);
+void xc_compression_free_context(xc_interface *xch, comp_ctx *ctx);
+
+/**
+ * Add a page to compression page buffer, to be compressed later.
+ *
+ * returns 0 if the page was successfully added to the page buffer
+ *
+ * returns -1 if there is no space in buffer. In this case, the
+ *  application should call xc_compression_compress_pages to compress
+ *  the buffer (or at least part of it), thereby freeing some space in
+ *  the page buffer.
+ *
+ * returns -2 if the pfn is out of bounds, where the bound is p2m_size
+ *  parameter passed during xc_compression_create_context.
+ */
+int xc_compression_add_page(xc_interface *xch, comp_ctx *ctx, char *page,
+                           unsigned long pfn, int israw);
+
+/**
+ * Delta compress pages in the compression buffer and inserts the
+ * compressed data into the supplied compression buffer compbuf, whose
+ * size is compbuf_size.
+ * After compression, the pages are copied to the internal LRU cache.
+ *
+ * This function compresses as many pages as possible into the
+ * supplied compression buffer. It maintains an internal iterator to
+ * keep track of pages in the input buffer that are yet to be compressed.
+ *
+ * returns -1 if the compression buffer has run out of space.  
+ * returns 1 on success.
+ * returns 0 if no more pages are left to be compressed.
+ *  When the return value is non-zero, compbuf_len indicates the actual
+ *  amount of data present in compbuf (<=compbuf_size).
+ */
+int xc_compression_compress_pages(xc_interface *xch, comp_ctx *ctx,
+                                 char *compbuf, unsigned long compbuf_size,
+                                 unsigned long *compbuf_len);
+
+/**
+ * Resets the internal page buffer that holds dirty pages before compression.
+ * Also resets the iterators.
+ */
+void xc_compression_reset_pagebuf(xc_interface *xch, comp_ctx *ctx);
+
+/**
+ * Caller must supply the compression buffer (compbuf),
+ * its size (compbuf_size) and a reference to an index variable (compbuf_pos)
+ * that is used internally. Each call pulls out one page from the compressed
+ * chunk and copies it to dest.
+ */
+int xc_compression_uncompress_page(xc_interface *xch, char *compbuf,
+                                  unsigned long compbuf_size,
+                                  unsigned long *compbuf_pos, char *dest);
+
 #endif /* XENCTRL_H */
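
The return-value contract documented above suggests a sender loop along the following lines. This is only a sketch, and send_chunk() is a hypothetical transport helper, not part of this interface:

    /* Sketch: flush the page buffer through the compressor.  Assumes the
     * buffer was filled beforehand with xc_compression_add_page(). */
    extern int send_chunk(const char *buf, unsigned long len); /* hypothetical */

    static int flush_compressed(xc_interface *xch, comp_ctx *ctx,
                                char *compbuf, unsigned long compbuf_size)
    {
        unsigned long len;
        int rc;

        /* 1 == done, data ready; -1 == compbuf full, data ready, more
         * pages pending; 0 == nothing left to compress. */
        while ( (rc = xc_compression_compress_pages(xch, ctx, compbuf,
                                                    compbuf_size, &len)) != 0 )
        {
            if ( send_chunk(compbuf, len) < 0 )
                return -1;
        }
        xc_compression_reset_pagebuf(xch, ctx);
        return 0;
    }
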
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xenguest.h    Thu Dec 01 17:24:12 2011 +0000
@@ -27,6 +27,7 @@
 #define XCFLAGS_DEBUG     2
 #define XCFLAGS_HVM       4
 #define XCFLAGS_STDVGA    8
+#define XCFLAGS_CHECKPOINT_COMPRESS    16
 #define X86_64_B_SIZE   64 
 #define X86_32_B_SIZE   32
 
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxc/xg_save_restore.h     Thu Dec 01 17:24:12 2011 +0000
@@ -67,7 +67,7 @@
  *
  *   consists of p2m_size bytes comprising an array of xen_pfn_t sized entries.
  *
- * BODY PHASE
+ * BODY PHASE - Format A (for live migration or Remus without compression)
  * ----------
  *
  * A series of chunks with a common header:
@@ -87,6 +87,122 @@
  *
  * If chunk type is 0 then body phase is complete.
  *
+ *
+ * BODY PHASE - Format B (for Remus with compression)
+ * ----------
+ *
+ * A series of chunks with a common header:
+ *   int              : chunk type
+ *
+ * If the chunk type is +ve then the chunk contains an array of PFNs
+ * corresponding to guest memory, and the type value gives the number of
+ * PFNs in the batch:
+ *
+ *     unsigned long[]  : PFN array, length == number of pages in batch
+ *                        Each entry consists of XEN_DOMCTL_PFINFO_*
+ *                        in bits 31-28 and the PFN number in bits 27-0.
+ *
+ * If the chunk type is -ve then the chunk consists of one of a number of
+ * metadata types.  See definitions of XC_SAVE_ID_* below.
+ *
+ * If the chunk type is -ve and equals XC_SAVE_ID_COMPRESSED_DATA, then the
+ * chunk consists of compressed page data, in the following format:
+ *
+ *     unsigned long        : Size of the compressed chunk to follow
+ *     compressed data :      variable-length data of the size indicated above.
+ *                            This chunk consists of compressed page data.
+ *                            The number of pages in one chunk depends on
+ *                            the amount of space available in the sender's
+ *                            output buffer.
+ *
+ * Format of compressed data:
+ *   compressed_data = <deltas>*
+ *   delta           = <marker, run*>
+ *   marker          = (RUNFLAG|SKIPFLAG) bitwise-or RUNLEN [1 byte marker]
+ *   RUNFLAG         = 0
+ *   SKIPFLAG        = 1 << 7
+ *   RUNLEN          = 7-bit unsigned value indicating number of WORDS in the run
+ *   run             = string of bytes of length sizeof(WORD) * RUNLEN
+ *
+ *    If marker contains RUNFLAG, then the RUNLEN * sizeof(WORD) bytes of data
+ *   following the marker are copied into the target page at the offset
+ *   indicated by the offset_ptr.
+ *    If marker contains SKIPFLAG, then the offset_ptr is advanced
+ *   by RUNLEN * sizeof(WORD).
+ *
+ * If chunk type is 0 then body phase is complete.
+ *
+ * There can be one or more chunks with type XC_SAVE_ID_COMPRESSED_DATA,
+ * containing compressed pages. The compressed chunks are collated to form
+ * one single compressed chunk for the entire iteration. The number of pages
+ * present in this final compressed chunk will be equal to the total number
+ * of valid PFNs specified by the +ve chunks.
+ *
+ * At the sender side, compressed pages are inserted into the output stream
+ * in the same order as they would have been had the compression logic been
+ * absent.
+ *
+ * Until the last iteration, the BODY is sent in Format A, to maintain live
+ * migration compatibility with receivers running older Xen versions.
+ * At the last iteration, if Remus compression was enabled, the sender sends
+ * a trigger, XC_SAVE_ID_ENABLE_COMPRESSION, to tell the receiver to parse the
+ * BODY in Format B from the next iteration onwards.
+ *
+ * An example sequence of chunks received in Format B:
+ *     +16                              +ve chunk
+ *     unsigned long[16]                PFN array
+ *     +100                             +ve chunk
+ *     unsigned long[100]               PFN array
+ *     +50                              +ve chunk
+ *     unsigned long[50]                PFN array
+ *
+ *     XC_SAVE_ID_COMPRESSED_DATA       TAG
+ *       N                              Length of compressed data
+ *       N bytes of DATA                Decompresses to 166 pages
+ *
+ *     XC_SAVE_ID_*                     other xc save chunks
+ *     0                                END BODY TAG
+ *
+ * Corner case with checkpoint compression:
+ *     At the sender side, after pausing the domain, dirty pages are usually
+ *   copied out to a temporary buffer. After the domain is resumed,
+ *   compression is done and the compressed chunk(s) are sent, followed by
+ *   other XC_SAVE_ID_* chunks.
+ *     If the temporary buffer fills up while scanning for dirty pages,
+ *   the sender stops buffering dirty pages, compresses the temporary
+ *   buffer and sends the compressed data with XC_SAVE_ID_COMPRESSED_DATA.
+ *   The sender then resumes buffering dirty pages and continues
+ *   scanning for dirty pages.
+ *     For example, assume that the temporary buffer can hold 4096 pages and
+ *   there are 5000 dirty pages. The following is the sequence of chunks
+ *   that the receiver will see:
+ *
+ *     +1024                       +ve chunk
+ *     unsigned long[1024]         PFN array
+ *     +1024                       +ve chunk
+ *     unsigned long[1024]         PFN array
+ *     +1024                       +ve chunk
+ *     unsigned long[1024]         PFN array
+ *     +1024                       +ve chunk
+ *     unsigned long[1024]         PFN array
+ *
+ *     XC_SAVE_ID_COMPRESSED_DATA  TAG
+ *      N                          Length of compressed data
+ *      N bytes of DATA            Decompresses to 4096 pages
+ *
+ *     +4                          +ve chunk
+ *     unsigned long[4]            PFN array
+ *
+ *     XC_SAVE_ID_COMPRESSED_DATA  TAG
+ *      M                          Length of compressed data
+ *      M bytes of DATA            Decompresses to 4 pages
+ *
+ *     XC_SAVE_ID_*                other xc save chunks
+ *     0                           END BODY TAG
+ *
+ *     In other words, XC_SAVE_ID_COMPRESSED_DATA can be interleaved with
+ *   +ve chunks arbitrarily. But at the receiver end, the following condition
+ *   always holds true until the end of BODY PHASE:
+ *    num(PFN entries in +ve chunks) >= num(pages received in compressed form)
+ *
  * TAIL PHASE
  * ----------
  *
@@ -135,6 +251,8 @@
 #define XC_SAVE_ID_LAST_CHECKPOINT    -9 /* Commit to restoring after completion of current iteration. */
 #define XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION -10
 #define XC_SAVE_ID_HVM_VIRIDIAN       -11
+#define XC_SAVE_ID_COMPRESSED_DATA    -12 /* Marker to indicate arrival of compressed data */
+#define XC_SAVE_ID_ENABLE_COMPRESSION -13 /* Marker to enable compression logic at receiver side */
 
 /*
 ** We process save/restore/migrate in batches of pages; the below
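
The marker byte encoding described above maps onto a very small decoder. A sketch follows; WORD is assumed to be 4 bytes here, and the function name and bounds handling are illustrative rather than taken from the changeset:

    #include <string.h>

    #define DELTA_WORD   4            /* assumed sizeof(WORD) */
    #define SKIPFLAG     (1 << 7)
    #define RUNLEN_MASK  0x7f

    /* Apply one delta stream of 'len' bytes to a previously cached page. */
    static void apply_deltas(const unsigned char *buf, unsigned long len,
                             unsigned char *page)
    {
        unsigned long pos = 0, offset = 0;

        while ( pos < len )
        {
            unsigned char marker = buf[pos++];
            unsigned long run = (unsigned long)(marker & RUNLEN_MASK) * DELTA_WORD;

            if ( marker & SKIPFLAG )
                offset += run;                         /* words unchanged */
            else
            {
                memcpy(page + offset, buf + pos, run); /* words rewritten */
                pos += run;
                offset += run;
            }
        }
    }
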
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl.c
--- a/tools/libxl/libxl.c       Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxl/libxl.c       Thu Dec 01 17:24:12 2011 +0000
@@ -3330,6 +3330,19 @@
     return 0;
 }
 
+int libxl_fd_set_cloexec(int fd)
+{
+    int flags = 0;
+
+    if ((flags = fcntl(fd, F_GETFD)) == -1) {
+        flags = 0;
+    }
+    if ((flags & FD_CLOEXEC)) {
+        return 0;
+    }
+    return fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
+}
+
 /*
  * Local variables:
  * mode: C
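
The helper is promoted from libxl_internal so that callers outside the library can use it too; a minimal usage sketch (the path is hypothetical):

    #include <fcntl.h>
    #include "libxl.h"

    /* Sketch: open a restore file and keep the fd from leaking into
     * children later forked by libxl. */
    static int open_restore_file(const char *path)
    {
        int fd = open(path, O_RDONLY);

        if ( fd >= 0 )
            libxl_fd_set_cloexec(fd);
        return fd;
    }

This mirrors what the xl_cmdimpl.c hunk below does for the domain restore path.
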
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl.h
--- a/tools/libxl/libxl.h       Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxl/libxl.h       Thu Dec 01 17:24:12 2011 +0000
@@ -635,6 +635,9 @@
 const char *libxl_run_dir_path(void);
 const char *libxl_xenpaging_dir_path(void);
 
+/* misc */
+int libxl_fd_set_cloexec(int fd);
+
 #endif /* LIBXL_H */
 
 /*
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl_internal.c
--- a/tools/libxl/libxl_internal.c      Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxl/libxl_internal.c      Thu Dec 01 17:24:12 2011 +0000
@@ -306,19 +306,6 @@
     return 0;
 }
 
-int libxl__fd_set_cloexec(int fd)
-{
-    int flags = 0;
-
-    if ((flags = fcntl(fd, F_GETFD)) == -1) {
-        flags = 0;
-    }
-    if ((flags & FD_CLOEXEC)) {
-        return 0;
-    }
-    return fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
-}
-
 libxl_device_model_version libxl__device_model_version_running(libxl__gc *gc,
                                                                uint32_t domid)
 {
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl_internal.h
--- a/tools/libxl/libxl_internal.h      Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxl/libxl_internal.h      Thu Dec 01 17:24:12 2011 +0000
@@ -503,7 +503,6 @@
 
 _hidden int libxl__file_reference_map(libxl_file_reference *f);
 _hidden int libxl__file_reference_unmap(libxl_file_reference *f);
-_hidden int libxl__fd_set_cloexec(int fd);
 
 _hidden int libxl__e820_alloc(libxl__gc *gc, uint32_t domid, libxl_domain_config *d_config);
 
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/libxl_qmp.c
--- a/tools/libxl/libxl_qmp.c   Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxl/libxl_qmp.c   Thu Dec 01 17:24:12 2011 +0000
@@ -324,7 +324,7 @@
     if (fcntl(qmp->qmp_fd, F_SETFL, flags | O_NONBLOCK) == -1) {
         return -1;
     }
-    libxl__fd_set_cloexec(qmp->qmp_fd);
+    libxl_fd_set_cloexec(qmp->qmp_fd);
 
     memset(&qmp->addr, 0, sizeof (&qmp->addr));
     qmp->addr.sun_family = AF_UNIX;
diff -r f25a004a6de8 -r f30a33c5b5bd tools/libxl/xl_cmdimpl.c
--- a/tools/libxl/xl_cmdimpl.c  Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/libxl/xl_cmdimpl.c  Thu Dec 01 17:24:12 2011 +0000
@@ -1459,8 +1459,12 @@
         union { uint32_t u32; char b[4]; } u32buf;
         uint32_t badflags;
 
-        restore_fd = migrate_fd >= 0 ? migrate_fd :
-            open(restore_file, O_RDONLY);
+        if (migrate_fd >= 0) {
+            restore_fd = migrate_fd;
+        } else {
+            restore_fd = open(restore_file, O_RDONLY);
+            libxl_fd_set_cloexec(restore_fd);
+        }
 
         CHK_ERRNO( libxl_read_exactly(ctx, restore_fd, &hdr,
                    sizeof(hdr), restore_file, "header") );
diff -r f25a004a6de8 -r f30a33c5b5bd tools/python/xen/lowlevel/checkpoint/checkpoint.c
--- a/tools/python/xen/lowlevel/checkpoint/checkpoint.c Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/python/xen/lowlevel/checkpoint/checkpoint.c Thu Dec 01 17:24:12 2011 +0000
@@ -104,13 +104,14 @@
   PyObject* postcopy_cb = NULL;
   PyObject* checkpoint_cb = NULL;
   unsigned int interval = 0;
+  unsigned int flags = 0;
 
   int fd;
   struct save_callbacks callbacks;
   int rc;
 
-  if (!PyArg_ParseTuple(args, "O|OOOI", &iofile, &suspend_cb, &postcopy_cb,
-                       &checkpoint_cb, &interval))
+  if (!PyArg_ParseTuple(args, "O|OOOII", &iofile, &suspend_cb, &postcopy_cb,
+                       &checkpoint_cb, &interval, &flags))
     return NULL;
 
   self->interval = interval;
@@ -160,7 +161,7 @@
   callbacks.data = self;
 
   self->threadstate = PyEval_SaveThread();
-  rc = checkpoint_start(&self->cps, fd, &callbacks);
+  rc = checkpoint_start(&self->cps, fd, &callbacks, flags);
   PyEval_RestoreThread(self->threadstate);
 
   if (rc < 0) {
diff -r f25a004a6de8 -r f30a33c5b5bd tools/python/xen/lowlevel/checkpoint/checkpoint.h
--- a/tools/python/xen/lowlevel/checkpoint/checkpoint.h Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/python/xen/lowlevel/checkpoint/checkpoint.h Thu Dec 01 17:24:12 2011 +0000
@@ -40,13 +40,15 @@
     timer_t timer;
 } checkpoint_state;
 
+#define CHECKPOINT_FLAGS_COMPRESSION 1
 char* checkpoint_error(checkpoint_state* s);
 
 void checkpoint_init(checkpoint_state* s);
 int checkpoint_open(checkpoint_state* s, unsigned int domid);
 void checkpoint_close(checkpoint_state* s);
 int checkpoint_start(checkpoint_state* s, int fd,
-                    struct save_callbacks* callbacks);
+                    struct save_callbacks* callbacks,
+                    unsigned int remus_flags);
 int checkpoint_suspend(checkpoint_state* s);
 int checkpoint_resume(checkpoint_state* s);
 int checkpoint_postflush(checkpoint_state* s);
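
Callers pass the new flags argument through checkpoint_start(); a sketch of the compressed case, assuming s, fd and callbacks are initialised exactly as before this change:

    /* Sketch: start checkpointing with compression requested. */
    static int start_compressed(checkpoint_state *s, int fd,
                                struct save_callbacks *cbs)
    {
        return checkpoint_start(s, fd, cbs, CHECKPOINT_FLAGS_COMPRESSION);
    }
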
diff -r f25a004a6de8 -r f30a33c5b5bd tools/python/xen/lowlevel/checkpoint/libcheckpoint.c
--- a/tools/python/xen/lowlevel/checkpoint/libcheckpoint.c      Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/python/xen/lowlevel/checkpoint/libcheckpoint.c      Thu Dec 01 17:24:12 2011 +0000
@@ -170,7 +170,8 @@
 }
 
 int checkpoint_start(checkpoint_state* s, int fd,
-                    struct save_callbacks* callbacks)
+                    struct save_callbacks* callbacks,
+                    unsigned int remus_flags)
 {
     int hvm, rc;
     int flags = XCFLAGS_LIVE;
@@ -188,6 +189,8 @@
        if (switch_qemu_logdirty(s, 1))
            return -1;
     }
+    if (remus_flags & CHECKPOINT_FLAGS_COMPRESSION)
+      flags |= XCFLAGS_CHECKPOINT_COMPRESS;
 
     callbacks->switch_qemu_logdirty = noop_switch_logdirty;
 
diff -r f25a004a6de8 -r f30a33c5b5bd tools/python/xen/remus/save.py
--- a/tools/python/xen/remus/save.py    Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/python/xen/remus/save.py    Thu Dec 01 17:24:12 2011 +0000
@@ -133,7 +133,7 @@
 
 class Saver(object):
     def __init__(self, domid, fd, suspendcb=None, resumecb=None,
-                 checkpointcb=None, interval=0):
+                 checkpointcb=None, interval=0, flags=0):
         """Create a Saver object for taking guest checkpoints.
         domid:        name, number or UUID of a running domain
         fd:           a stream to which checkpoint data will be written.
@@ -141,12 +141,14 @@
         resumecb:     callback invoked before guest resumes
         checkpointcb: callback invoked when a checkpoint is complete. Return
                       True to take another checkpoint, or False to stop.
+        flags:        Remus flags to be passed to xc_domain_save
         """
         self.fd = fd
         self.suspendcb = suspendcb
         self.resumecb = resumecb
         self.checkpointcb = checkpointcb
         self.interval = interval
+        self.flags = flags
 
         self.vm = vm.VM(domid)
 
@@ -164,7 +166,8 @@
             try:
                 self.checkpointer.open(self.vm.domid)
                 self.checkpointer.start(self.fd, self.suspendcb, self.resumecb,
-                                        self.checkpointcb, self.interval)
+                                        self.checkpointcb, self.interval,
+                                        self.flags)
             except xen.lowlevel.checkpoint.error, e:
                 raise CheckpointError(e)
         finally:
diff -r f25a004a6de8 -r f30a33c5b5bd tools/remus/remus
--- a/tools/remus/remus Thu Dec 01 17:21:24 2011 +0000
+++ b/tools/remus/remus Thu Dec 01 17:24:12 2011 +0000
@@ -16,6 +16,9 @@
 class CfgException(Exception): pass
 
 class Cfg(object):
+
+    REMUS_FLAGS_COMPRESSION = 1
+
     def __init__(self):
         # must be set
         self.domid = 0
@@ -25,6 +28,7 @@
         self.port = XendOptions.instance().get_xend_relocation_port()
         self.interval = 200
         self.netbuffer = True
+        self.flags = self.REMUS_FLAGS_COMPRESSION
         self.timer = False
 
         parser = optparse.OptionParser()
@@ -38,6 +42,8 @@
                           help='replicate to /dev/null (no disk checkpoints, only memory & net buffering)')
         parser.add_option('', '--no-net', dest='nonet', action='store_true',
                           help='run without net buffering (benchmark option)')
+        parser.add_option('', '--no-compression', dest='nocompress', action='store_true',
+                          help='run without checkpoint compression')
         parser.add_option('', '--timer', dest='timer', action='store_true',
                           help='force pause at checkpoint interval (experimental)')
         self.parser = parser
@@ -56,6 +62,8 @@
             self.nullremus = True
         if opts.nonet:
             self.netbuffer = False
+        if opts.nocompress:
+            self.flags &= ~self.REMUS_FLAGS_COMPRESSION
         if opts.timer:
             self.timer = True
 
@@ -190,7 +198,7 @@
     rc = 0
 
     checkpointer = save.Saver(cfg.domid, fd, postsuspend, preresume, commit,
-                              interval)
+                              interval, cfg.flags)
 
     try:
         checkpointer.start()
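
Checkpoint compression is now on by default in the remus tool, with the new switch to opt out; an illustrative invocation (domain name and host are placeholders):

    remus --no-compression myguest backup.example.com
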
diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/arch/x86/Makefile     Thu Dec 01 17:24:12 2011 +0000
@@ -30,9 +30,10 @@
 obj-y += msi.o
 obj-y += ioport_emulate.o
 obj-y += irq.o
-obj-y += microcode.o
 obj-y += microcode_amd.o
 obj-y += microcode_intel.o
+# This must come after the vendor-specific files.
+obj-y += microcode.o
 obj-y += mm.o
 obj-y += mpparse.o
 obj-y += nmi.o
diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/efi/boot.c
--- a/xen/arch/x86/efi/boot.c   Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/arch/x86/efi/boot.c   Thu Dec 01 17:24:12 2011 +0000
@@ -49,6 +49,7 @@
 static struct file __initdata cfg;
 static struct file __initdata kernel;
 static struct file __initdata ramdisk;
+static struct file __initdata ucode;
 static struct file __initdata xsm;
 
 static multiboot_info_t __initdata mbi = {
@@ -174,6 +175,8 @@
         efi_bs->FreePages(kernel.addr, PFN_UP(kernel.size));
     if ( ramdisk.addr )
         efi_bs->FreePages(ramdisk.addr, PFN_UP(ramdisk.size));
+    if ( ucode.addr )
+        efi_bs->FreePages(ucode.addr, PFN_UP(ucode.size));
     if ( xsm.addr )
         efi_bs->FreePages(xsm.addr, PFN_UP(xsm.size));
 
@@ -806,6 +809,17 @@
         efi_bs->FreePool(name.w);
     }
 
+    name.s = get_value(&cfg, section.s, "ucode");
+    if ( !name.s )
+        name.s = get_value(&cfg, "global", "ucode");
+    if ( name.s )
+    {
+        microcode_set_module(mbi.mods_count);
+        split_value(name.s);
+        read_file(dir_handle, s2w(&name), &ucode);
+        efi_bs->FreePool(name.w);
+    }
+
     name.s = get_value(&cfg, section.s, "xsm");
     if ( name.s )
     {
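
With this hunk the EFI loader accepts a "ucode=" key, per-section or under [global], naming a microcode blob to load as an extra boot module; an illustrative xen.cfg fragment (the file name is a placeholder):

    [global]
    ucode=intel-ucode.bin
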
diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/microcode.c
--- a/xen/arch/x86/microcode.c  Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/arch/x86/microcode.c  Thu Dec 01 17:24:12 2011 +0000
@@ -22,20 +22,56 @@
  */
 
 #include <xen/config.h>
+#include <xen/cpu.h>
 #include <xen/lib.h>
 #include <xen/kernel.h>
 #include <xen/init.h>
+#include <xen/notifier.h>
 #include <xen/sched.h>
 #include <xen/smp.h>
+#include <xen/softirq.h>
 #include <xen/spinlock.h>
+#include <xen/tasklet.h>
 #include <xen/guest_access.h>
 
-#include <asm/current.h>
 #include <asm/msr.h>
-#include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <asm/setup.h>
 #include <asm/microcode.h>
 
+static module_t __initdata ucode_mod;
+static void *(*__initdata ucode_mod_map)(const module_t *);
+static unsigned int __initdata ucode_mod_idx;
+static bool_t __initdata ucode_mod_forced;
+static cpumask_t __initdata init_mask;
+
+void __init microcode_set_module(unsigned int idx)
+{
+    ucode_mod_idx = idx;
+    ucode_mod_forced = 1;
+}
+
+static void __init parse_ucode(char *s)
+{
+    if ( !ucode_mod_forced )
+        ucode_mod_idx = simple_strtoul(s, NULL, 0);
+}
+custom_param("ucode", parse_ucode);
+
+void __init microcode_grab_module(
+    unsigned long *module_map,
+    const multiboot_info_t *mbi,
+    void *(*map)(const module_t *))
+{
+    module_t *mod = (module_t *)__va(mbi->mods_addr);
+
+    if ( !ucode_mod_idx || ucode_mod_idx >= mbi->mods_count ||
+         !__test_and_clear_bit(ucode_mod_idx, module_map) )
+        return;
+    ucode_mod = mod[ucode_mod_idx];
+    ucode_mod_map = map;
+}
+
 const struct microcode_ops *microcode_ops;
 
 static DEFINE_SPINLOCK(microcode_mutex);
@@ -69,30 +105,50 @@
     int err;
     struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
     struct cpu_signature nsig;
+    unsigned int cpu2;
 
-    if ( !uci->mc.mc_valid )
-        return -EIO;
+    spin_lock(&microcode_mutex);
 
-    /*
-     * Let's verify that the 'cached' ucode does belong
-     * to this cpu (a bit of paranoia):
-     */
-    err = microcode_ops->collect_cpu_info(cpu, &nsig);
+    err = microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig);
     if ( err )
     {
-        microcode_fini_cpu(cpu);
+        __microcode_fini_cpu(cpu);
+        spin_unlock(&microcode_mutex);
         return err;
     }
 
-    if ( microcode_ops->microcode_resume_match(cpu, &nsig) )
+    if ( uci->mc.mc_valid )
     {
-        return microcode_ops->apply_microcode(cpu);
+        err = microcode_ops->microcode_resume_match(cpu, uci->mc.mc_valid);
+        if ( err >= 0 )
+        {
+            if ( err )
+                err = microcode_ops->apply_microcode(cpu);
+            spin_unlock(&microcode_mutex);
+            return err;
+        }
     }
-    else
+
+    nsig = uci->cpu_sig;
+    __microcode_fini_cpu(cpu);
+    uci->cpu_sig = nsig;
+
+    err = -EIO;
+    for_each_online_cpu ( cpu2 )
     {
-        microcode_fini_cpu(cpu);
-        return -EIO;
+        uci = &per_cpu(ucode_cpu_info, cpu2);
+        if ( uci->mc.mc_valid &&
+             microcode_ops->microcode_resume_match(cpu, uci->mc.mc_valid) > 0 )
+        {
+            err = microcode_ops->apply_microcode(cpu);
+            break;
+        }
     }
+
+    __microcode_fini_cpu(cpu);
+    spin_unlock(&microcode_mutex);
+
+    return err;
 }
 
 static int microcode_update_cpu(const void *buf, size_t size)
@@ -162,3 +218,78 @@
 
     return continue_hypercall_on_cpu(info->cpu, do_microcode_update, info);
 }
+
+static void __init _do_microcode_update(unsigned long data)
+{
+    microcode_update_cpu((void *)data, ucode_mod.mod_end);
+    cpumask_set_cpu(smp_processor_id(), &init_mask);
+}
+
+static int __init microcode_init(void)
+{
+    void *data;
+    static struct tasklet __initdata tasklet;
+    unsigned int cpu;
+
+    if ( !microcode_ops || !ucode_mod.mod_end )
+        return 0;
+
+    data = ucode_mod_map(&ucode_mod);
+    if ( !data )
+        return -ENOMEM;
+
+    softirq_tasklet_init(&tasklet, _do_microcode_update, (unsigned long)data);
+
+    for_each_online_cpu ( cpu )
+    {
+        tasklet_schedule_on_cpu(&tasklet, cpu);
+        do {
+            process_pending_softirqs();
+        } while ( !cpumask_test_cpu(cpu, &init_mask) );
+    }
+
+    ucode_mod_map(NULL);
+
+    return 0;
+}
+__initcall(microcode_init);
+
+static int microcode_percpu_callback(
+    struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+    unsigned int cpu = (unsigned long)hcpu;
+
+    switch ( action )
+    {
+    case CPU_DEAD:
+        microcode_fini_cpu(cpu);
+        break;
+    }
+
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block microcode_percpu_nfb = {
+    .notifier_call = microcode_percpu_callback,
+};
+
+static int __init microcode_presmp_init(void)
+{
+    if ( microcode_ops )
+    {
+        if ( ucode_mod.mod_end )
+        {
+            void *data = ucode_mod_map(&ucode_mod);
+
+            if ( data )
+                microcode_update_cpu(data, ucode_mod.mod_end);
+
+            ucode_mod_map(NULL);
+        }
+
+        register_cpu_notifier(&microcode_percpu_nfb);
+    }
+
+    return 0;
+}
+presmp_initcall(microcode_presmp_init);
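
For non-EFI boots the blob is selected by boot-module index through the new "ucode=" hypervisor parameter parsed above (parse_ucode() reads a plain integer). An illustrative GRUB 2 entry, where module 0 is the dom0 kernel, module 1 the initrd and module 2 the microcode blob:

    multiboot /boot/xen.gz ucode=2
    module /boot/vmlinuz-dom0 root=/dev/xvda1
    module /boot/initrd-dom0.img
    module /boot/microcode.bin
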
diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/microcode_amd.c
--- a/xen/arch/x86/microcode_amd.c      Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/arch/x86/microcode_amd.c      Thu Dec 01 17:24:12 2011 +0000
@@ -23,27 +23,53 @@
 #include <xen/spinlock.h>
 
 #include <asm/msr.h>
-#include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/microcode.h>
 
 #define pr_debug(x...) ((void)0)
 
+struct equiv_cpu_entry {
+    uint32_t installed_cpu;
+    uint32_t fixed_errata_mask;
+    uint32_t fixed_errata_compare;
+    uint16_t equiv_cpu;
+    uint16_t reserved;
+} __attribute__((packed));
+
+struct microcode_header_amd {
+    uint32_t data_code;
+    uint32_t patch_id;
+    uint8_t  mc_patch_data_id[2];
+    uint8_t  mc_patch_data_len;
+    uint8_t  init_flag;
+    uint32_t mc_patch_data_checksum;
+    uint32_t nb_dev_id;
+    uint32_t sb_dev_id;
+    uint16_t processor_rev_id;
+    uint8_t  nb_rev_id;
+    uint8_t  sb_rev_id;
+    uint8_t  bios_api_rev;
+    uint8_t  reserved1[3];
+    uint32_t match_reg[8];
+} __attribute__((packed));
+
 #define UCODE_MAGIC                0x00414d44
 #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
 #define UCODE_UCODE_TYPE           0x00000001
 
 #define UCODE_MAX_SIZE          (2048)
-#define DEFAULT_UCODE_DATASIZE  (896)
 #define MC_HEADER_SIZE          (sizeof(struct microcode_header_amd))
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
-#define DWSIZE                  (sizeof(uint32_t))
+
+struct microcode_amd {
+    struct microcode_header_amd hdr;
+    unsigned int mpb[(UCODE_MAX_SIZE - MC_HEADER_SIZE) / 4];
+    unsigned int equiv_cpu_table_size;
+    struct equiv_cpu_entry equiv_cpu_table[];
+};
 
 /* serialize access to the physical write */
 static DEFINE_SPINLOCK(microcode_update_lock);
 
-struct equiv_cpu_entry *equiv_cpu_table;
-
 static int collect_cpu_info(int cpu, struct cpu_signature *csig)
 {
     struct cpuinfo_x86 *c = &cpu_data[cpu];
@@ -65,10 +91,11 @@
     return 0;
 }
 
-static int microcode_fits(void *mc, int cpu)
+static int microcode_fits(const struct microcode_amd *mc_amd, int cpu)
 {
     struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
-    struct microcode_header_amd *mc_header = mc;
+    const struct microcode_header_amd *mc_header = &mc_amd->hdr;
+    const struct equiv_cpu_entry *equiv_cpu_table = mc_amd->equiv_cpu_table;
     unsigned int current_cpu_id;
     unsigned int equiv_cpu_id = 0x0;
     unsigned int i;
@@ -99,7 +126,7 @@
     }
 
     if ( mc_header->patch_id <= uci->cpu_sig.rev )
-        return -EINVAL;
+        return 0;
 
     printk(KERN_DEBUG "microcode: CPU%d found a matching microcode "
            "update with version 0x%x (current=0x%x)\n",
@@ -186,17 +213,15 @@
     return 0;
 }
 
-static int install_equiv_cpu_table(const void *buf, uint32_t size,
-                                   unsigned long *offset)
+static int install_equiv_cpu_table(
+    struct microcode_amd *mc_amd,
+    const uint32_t *buf_pos,
+    unsigned long *offset)
 {
-    const uint32_t *buf_pos = buf;
-    unsigned long off;
-
-    off = *offset;
-    *offset = 0;
+    uint32_t size = buf_pos[2];
 
     /* No more data */
-    if ( off >= size )
+    if ( size + 12 >= *offset )
         return -EINVAL;
 
     if ( buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE )
@@ -213,15 +238,8 @@
         return -EINVAL;
     }
 
-    equiv_cpu_table = xmalloc_bytes(size);
-    if ( equiv_cpu_table == NULL )
-    {
-        printk(KERN_ERR "microcode: error, can't allocate "
-               "memory for equiv CPU table\n");
-        return -ENOMEM;
-    }
-
-    memcpy(equiv_cpu_table, (const void *)&buf_pos[3], size);
+    memcpy(mc_amd->equiv_cpu_table, &buf_pos[3], size);
+    mc_amd->equiv_cpu_table_size = size;
 
     *offset = size + 12;       /* add header length */
 
@@ -231,11 +249,11 @@
 static int cpu_request_microcode(int cpu, const void *buf, size_t size)
 {
     const uint32_t *buf_pos;
-    unsigned long offset = 0;
+    struct microcode_amd *mc_amd, *mc_old;
+    unsigned long offset = size;
     int error = 0;
     int ret;
     struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
-    void *mc;
 
     /* We should bind the task to the CPU */
     BUG_ON(cpu != raw_smp_processor_id());
@@ -249,59 +267,85 @@
         return -EINVAL;
     }
 
-    error = install_equiv_cpu_table(buf, (uint32_t)(buf_pos[2]), &offset);
+    mc_amd = xmalloc_bytes(sizeof(*mc_amd) + buf_pos[2]);
+    if ( !mc_amd )
+    {
+        printk(KERN_ERR "microcode: error! "
+               "Can not allocate memory for microcode patch\n");
+        return -ENOMEM;
+    }
+
+    error = install_equiv_cpu_table(mc_amd, buf, &offset);
     if ( error )
     {
+        xfree(mc_amd);
         printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
         return -EINVAL;
     }
 
-    mc = xmalloc_bytes(UCODE_MAX_SIZE);
-    if ( mc == NULL )
-    {
-        printk(KERN_ERR "microcode: error! "
-               "Can not allocate memory for microcode patch\n");
-        error = -ENOMEM;
-        goto out;
-    }
-
+    mc_old = uci->mc.mc_amd;
     /* implicitly validates uci->mc.mc_valid */
-    uci->mc.mc_amd = mc;
+    uci->mc.mc_amd = mc_amd;
 
     /*
      * It's possible the data file has multiple matching ucode,
      * lets keep searching till the latest version
      */
-    while ( (ret = get_next_ucode_from_buffer_amd(mc, buf, size, &offset)) == 0)
+    while ( (ret = get_next_ucode_from_buffer_amd(&mc_amd->hdr, buf, size,
+                                                  &offset)) == 0 )
     {
-        error = microcode_fits(mc, cpu);
+        error = microcode_fits(mc_amd, cpu);
         if (error <= 0)
             continue;
 
         error = apply_microcode(cpu);
         if (error == 0)
+        {
+            error = 1;
             break;
+        }
     }
 
+    if ( ret < 0 )
+        error = ret;
+
     /* On success keep the microcode patch for
      * re-apply on resume.
      */
-    if (error) {
-        xfree(mc);
-        mc = NULL;
+    if (error == 1)
+    {
+        xfree(mc_old);
+        return 0;
     }
-    uci->mc.mc_amd = mc;
-
-out:
-    xfree(equiv_cpu_table);
-    equiv_cpu_table = NULL;
+    xfree(mc_amd);
+    uci->mc.mc_amd = mc_old;
 
     return error;
 }
 
-static int microcode_resume_match(int cpu, struct cpu_signature *nsig)
+static int microcode_resume_match(int cpu, const void *mc)
 {
-    return 0;
+    struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
+    struct microcode_amd *mc_amd = uci->mc.mc_amd;
+    const struct microcode_amd *src = mc;
+    int res = microcode_fits(src, cpu);
+
+    if ( res <= 0 )
+        return res;
+
+    if ( src != mc_amd )
+    {
+        xfree(mc_amd);
+        mc_amd = xmalloc_bytes(sizeof(*src) + src->equiv_cpu_table_size);
+        uci->mc.mc_amd = mc_amd;
+        if ( !mc_amd )
+            return -ENOMEM;
+        memcpy(mc_amd, src, UCODE_MAX_SIZE);
+        memcpy(mc_amd->equiv_cpu_table, src->equiv_cpu_table,
+               src->equiv_cpu_table_size);
+    }
+
+    return 1;
 }
 
 static const struct microcode_ops microcode_amd_ops = {
@@ -317,4 +361,4 @@
         microcode_ops = &microcode_amd_ops;
     return 0;
 }
-__initcall(microcode_init_amd);
+presmp_initcall(microcode_init_amd);
diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/microcode_intel.c
--- a/xen/arch/x86/microcode_intel.c    Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/arch/x86/microcode_intel.c    Thu Dec 01 17:24:12 2011 +0000
@@ -30,12 +30,43 @@
 #include <xen/spinlock.h>
 
 #include <asm/msr.h>
-#include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/microcode.h>
 
 #define pr_debug(x...) ((void)0)
 
+struct microcode_header_intel {
+    unsigned int hdrver;
+    unsigned int rev;
+    unsigned int date;
+    unsigned int sig;
+    unsigned int cksum;
+    unsigned int ldrver;
+    unsigned int pf;
+    unsigned int datasize;
+    unsigned int totalsize;
+    unsigned int reserved[3];
+};
+
+struct microcode_intel {
+    struct microcode_header_intel hdr;
+    unsigned int bits[0];
+};
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+    unsigned int sig;
+    unsigned int pf;
+    unsigned int cksum;
+};
+
+struct extended_sigtable {
+    unsigned int count;
+    unsigned int cksum;
+    unsigned int reserved[3];
+    struct extended_signature sigs[0];
+};
+
 #define DEFAULT_UCODE_DATASIZE  (2000)
 #define MC_HEADER_SIZE          (sizeof(struct microcode_header_intel))
 #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
@@ -98,7 +129,8 @@
 }
 
 static inline int microcode_update_match(
-    int cpu_num, struct microcode_header_intel *mc_header, int sig, int pf)
+    int cpu_num, const struct microcode_header_intel *mc_header,
+    int sig, int pf)
 {
     struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu_num);
 
@@ -200,11 +232,11 @@
  * return 1 - found update
  * return < 0 - error
  */
-static int get_matching_microcode(void *mc, int cpu)
+static int get_matching_microcode(const void *mc, int cpu)
 {
     struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
-    struct microcode_header_intel *mc_header = mc;
-    struct extended_sigtable *ext_header;
+    const struct microcode_header_intel *mc_header = mc;
+    const struct extended_sigtable *ext_header;
     unsigned long total_size = get_totalsize(mc_header);
     int ext_sigcount, i;
     struct extended_signature *ext_sig;
@@ -229,6 +261,8 @@
     }
     return 0;
  find:
+    if ( uci->mc.mc_intel && uci->mc.mc_intel->hdr.rev >= mc_header->rev )
+        return 0;
     pr_debug("microcode: CPU%d found a matching microcode update with"
              " version 0x%x (current=0x%x)\n",
              cpu, mc_header->rev, uci->cpu_sig.rev);
@@ -239,10 +273,8 @@
         return -ENOMEM;
     }
 
-    /* free previous update file */
+    memcpy(new_mc, mc, total_size);
     xfree(uci->mc.mc_intel);
-
-    memcpy(new_mc, mc, total_size);
     uci->mc.mc_intel = new_mc;
     return 1;
 }
@@ -361,12 +393,9 @@
     return error;
 }
 
-static int microcode_resume_match(int cpu, struct cpu_signature *nsig)
+static int microcode_resume_match(int cpu, const void *mc)
 {
-    struct ucode_cpu_info *uci = &per_cpu(ucode_cpu_info, cpu);
-
-    return (sigmatch(nsig->sig, uci->cpu_sig.sig, nsig->pf, uci->cpu_sig.pf) &&
-            (uci->cpu_sig.rev > nsig->rev));
+    return get_matching_microcode(mc, cpu);
 }
 
 static const struct microcode_ops microcode_intel_ops = {
@@ -382,4 +411,4 @@
         microcode_ops = &microcode_intel_ops;
     return 0;
 }
-__initcall(microcode_init_intel);
+presmp_initcall(microcode_init_intel);
diff -r f25a004a6de8 -r f30a33c5b5bd xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/arch/x86/setup.c      Thu Dec 01 17:24:12 2011 +0000
@@ -550,10 +550,10 @@
 {
     char *memmap_type = NULL;
     char *cmdline, *kextra, *loader;
-    unsigned int initrdidx = 1;
+    unsigned int initrdidx;
     multiboot_info_t *mbi = __va(mbi_p);
     module_t *mod = (module_t *)__va(mbi->mods_addr);
-    unsigned long nr_pages, modules_headroom;
+    unsigned long nr_pages, modules_headroom, *module_map;
     int i, j, e820_warn = 0, bytes = 0;
     bool_t acpi_boot_table_init_done = 0;
     struct ns16550_defaults ns16550 = {
@@ -1229,7 +1229,13 @@
 
     init_IRQ();
 
-    xsm_init(&initrdidx, mbi, bootstrap_map);
+    module_map = xmalloc_array(unsigned long, BITS_TO_LONGS(mbi->mods_count));
+    bitmap_fill(module_map, mbi->mods_count);
+    __clear_bit(0, module_map); /* Dom0 kernel is always first */
+
+    xsm_init(module_map, mbi, bootstrap_map);
+
+    microcode_grab_module(module_map, mbi, bootstrap_map);
 
     timer_init();
 
@@ -1356,6 +1362,12 @@
     if ( xen_cpuidle )
         xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;
 
+    initrdidx = find_first_bit(module_map, mbi->mods_count);
+    if ( bitmap_weight(module_map, mbi->mods_count) > 1 )
+        printk(XENLOG_WARNING
+               "Multiple initrd candidates, picking module #%u\n",
+               initrdidx);
+
     /*
      * We're going to setup domain0 using the module(s) that we stashed safely
      * above our heap. The second module, if present, is an initrd ramdisk.
diff -r f25a004a6de8 -r f30a33c5b5bd xen/include/asm-x86/microcode.h
--- a/xen/include/asm-x86/microcode.h   Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/include/asm-x86/microcode.h   Thu Dec 01 17:24:12 2011 +0000
@@ -7,74 +7,12 @@
 struct ucode_cpu_info;
 
 struct microcode_ops {
-    int (*microcode_resume_match)(int cpu, struct cpu_signature *nsig);
+    int (*microcode_resume_match)(int cpu, const void *mc);
     int (*cpu_request_microcode)(int cpu, const void *buf, size_t size);
     int (*collect_cpu_info)(int cpu, struct cpu_signature *csig);
     int (*apply_microcode)(int cpu);
 };
 
-struct microcode_header_intel {
-    unsigned int hdrver;
-    unsigned int rev;
-    unsigned int date;
-    unsigned int sig;
-    unsigned int cksum;
-    unsigned int ldrver;
-    unsigned int pf;
-    unsigned int datasize;
-    unsigned int totalsize;
-    unsigned int reserved[3];
-};
-
-struct microcode_intel {
-    struct microcode_header_intel hdr;
-    unsigned int bits[0];
-};
-
-/* microcode format is extended from prescott processors */
-struct extended_signature {
-    unsigned int sig;
-    unsigned int pf;
-    unsigned int cksum;
-};
-
-struct extended_sigtable {
-    unsigned int count;
-    unsigned int cksum;
-    unsigned int reserved[3];
-    struct extended_signature sigs[0];
-};
-
-struct equiv_cpu_entry {
-    uint32_t installed_cpu;
-    uint32_t fixed_errata_mask;
-    uint32_t fixed_errata_compare;
-    uint16_t equiv_cpu;
-    uint16_t reserved;
-} __attribute__((packed));
-
-struct microcode_header_amd {
-    uint32_t data_code;
-    uint32_t patch_id;
-    uint8_t  mc_patch_data_id[2];
-    uint8_t  mc_patch_data_len;
-    uint8_t  init_flag;
-    uint32_t mc_patch_data_checksum;
-    uint32_t nb_dev_id;
-    uint32_t sb_dev_id;
-    uint16_t processor_rev_id;
-    uint8_t  nb_rev_id;
-    uint8_t  sb_rev_id;
-    uint8_t  bios_api_rev;
-    uint8_t  reserved1[3];
-    uint32_t match_reg[8];
-} __attribute__((packed));
-
-struct microcode_amd {
-    struct microcode_header_amd hdr;
-    unsigned int mpb[0];
-};
-
 struct cpu_signature {
     unsigned int sig;
     unsigned int pf;
diff -r f25a004a6de8 -r f30a33c5b5bd xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h   Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/include/asm-x86/processor.h   Thu Dec 01 17:24:12 2011 +0000
@@ -599,6 +599,7 @@
 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val);
 int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val);
 
+void microcode_set_module(unsigned int);
 int microcode_update(XEN_GUEST_HANDLE(const_void), unsigned long len);
 int microcode_resume_cpu(int cpu);
 
diff -r f25a004a6de8 -r f30a33c5b5bd xen/include/asm-x86/setup.h
--- a/xen/include/asm-x86/setup.h       Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/include/asm-x86/setup.h       Thu Dec 01 17:24:12 2011 +0000
@@ -44,4 +44,7 @@
 int xen_in_range(unsigned long mfn);
 void arch_get_xen_caps(xen_capabilities_info_t *info);
 
+void microcode_grab_module(
+    unsigned long *, const multiboot_info_t *, void *(*)(const module_t *));
+
 #endif
diff -r f25a004a6de8 -r f30a33c5b5bd xen/include/xsm/xsm.h
--- a/xen/include/xsm/xsm.h     Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/include/xsm/xsm.h     Thu Dec 01 17:24:12 2011 +0000
@@ -454,14 +454,15 @@
 }
 
 #ifdef XSM_ENABLE
-extern int xsm_init(unsigned int *initrdidx, const multiboot_info_t *mbi,
+extern int xsm_init(unsigned long *module_map, const multiboot_info_t *mbi,
                     void *(*bootstrap_map)(const module_t *));
-extern int xsm_policy_init(unsigned int *initrdidx, const multiboot_info_t *mbi,
+extern int xsm_policy_init(unsigned long *module_map,
+                           const multiboot_info_t *mbi,
                            void *(*bootstrap_map)(const module_t *));
 extern int register_xsm(struct xsm_operations *ops);
 extern int unregister_xsm(struct xsm_operations *ops);
 #else
-static inline int xsm_init (unsigned int *initrdidx,
+static inline int xsm_init (unsigned long *module_map,
                             const multiboot_info_t *mbi,
                             void *(*bootstrap_map)(const module_t *))
 {
diff -r f25a004a6de8 -r f30a33c5b5bd xen/xsm/xsm_core.c
--- a/xen/xsm/xsm_core.c        Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/xsm/xsm_core.c        Thu Dec 01 17:24:12 2011 +0000
@@ -43,7 +43,7 @@
     }
 }
 
-int __init xsm_init(unsigned int *initrdidx, const multiboot_info_t *mbi,
+int __init xsm_init(unsigned long *module_map, const multiboot_info_t *mbi,
                     void *(*bootstrap_map)(const module_t *))
 {
     int ret = 0;
@@ -52,7 +52,7 @@
 
     if ( XSM_MAGIC )
     {
-        ret = xsm_policy_init(initrdidx, mbi, bootstrap_map);
+        ret = xsm_policy_init(module_map, mbi, bootstrap_map);
         if ( ret )
         {
             bootstrap_map(NULL);
diff -r f25a004a6de8 -r f30a33c5b5bd xen/xsm/xsm_policy.c
--- a/xen/xsm/xsm_policy.c      Thu Dec 01 17:21:24 2011 +0000
+++ b/xen/xsm/xsm_policy.c      Thu Dec 01 17:24:12 2011 +0000
@@ -20,11 +20,12 @@
 
 #include <xsm/xsm.h>
 #include <xen/multiboot.h>
+#include <asm/bitops.h>
 
 char *__initdata policy_buffer = NULL;
 u32 __initdata policy_size = 0;
 
-int xsm_policy_init(unsigned int *initrdidx, const multiboot_info_t *mbi,
+int xsm_policy_init(unsigned long *module_map, const multiboot_info_t *mbi,
                     void *(*bootstrap_map)(const module_t *))
 {
     int i;
@@ -35,10 +36,13 @@
 
     /*
      * Try all modules and see whichever could be the binary policy.
-     * Adjust the initrdidx if module[1] is the binary policy.
+     * Adjust module_map for the module that is the binary policy.
      */
     for ( i = mbi->mods_count-1; i >= 1; i-- )
     {
+        if ( !test_bit(i, module_map) )
+            continue;
+
         _policy_start = bootstrap_map(mod + i);
         _policy_len   = mod[i].mod_end;
 
@@ -50,8 +54,7 @@
             printk("Policy len  0x%lx, start at %p.\n",
                    _policy_len,_policy_start);
 
-            if ( i == 1 )
-                *initrdidx = (mbi->mods_count > 2) ? 2 : 0;
+            __clear_bit(i, module_map);
             break;
 
         }

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 

