
[Xen-changelog] merge



# HG changeset patch
# User awilliam@xxxxxxxxxxx
# Node ID 673f62edbfbe4098ea1d5a34d8a77667da762090
# Parent  88f97bb8f3ae7e0fb85dbe8fb420d7f02f844a34
# Parent  d8451bb6278cb5f3f477dd9392213be7c66730b4
merge

diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen0_x86_32
--- a/buildconfigs/linux-defconfig_xen0_x86_32  Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen0_x86_32  Wed Mar  1 19:47:25 2006
@@ -1320,6 +1320,7 @@
 # CONFIG_XEN_BLKDEV_TAP_BE is not set
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
 # CONFIG_XEN_TPMDEV_BACKEND is not set
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen0_x86_64
--- a/buildconfigs/linux-defconfig_xen0_x86_64  Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen0_x86_64  Wed Mar  1 19:47:25 2006
@@ -1244,6 +1244,7 @@
 # CONFIG_XEN_BLKDEV_TAP_BE is not set
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
 # CONFIG_XEN_TPMDEV_BACKEND is not set
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen_x86_32
--- a/buildconfigs/linux-defconfig_xen_x86_32   Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen_x86_32   Wed Mar  1 19:47:25 2006
@@ -2986,6 +2986,7 @@
 # CONFIG_XEN_BLKDEV_TAP_BE is not set
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
 # CONFIG_XEN_TPMDEV_BACKEND is not set
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen_x86_64
--- a/buildconfigs/linux-defconfig_xen_x86_64   Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/linux-defconfig_xen_x86_64   Wed Mar  1 19:47:25 2006
@@ -2656,6 +2656,7 @@
 # CONFIG_XEN_BLKDEV_TAP_BE is not set
 CONFIG_XEN_NETDEV_BACKEND=y
 # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set
+CONFIG_XEN_NETDEV_LOOPBACK=y
 # CONFIG_XEN_TPMDEV_BACKEND is not set
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/mk.linux-2.6-xen
--- a/buildconfigs/mk.linux-2.6-xen     Wed Mar  1 17:01:54 2006
+++ b/buildconfigs/mk.linux-2.6-xen     Wed Mar  1 19:47:25 2006
@@ -2,8 +2,8 @@
 OS           = linux
 
 LINUX_SERIES = 2.6
-LINUX_VER    = 2.6.16-rc4
-LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc4.bz2
+LINUX_VER    = 2.6.16-rc5
+LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc5.bz2
 LINUX_PDIR = linux-$(LINUX_VER)
 
 EXTRAVERSION ?= xen
@@ -34,7 +34,7 @@
        touch $(@D)/.hgskip
        touch $@
 
-pristine-linux-%.16-rc4/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs
+pristine-linux-%.16-rc5/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs
        touch $@ # update timestamp to avoid rebuild
 
 $(LINUX_DIR)/include/linux/autoconf.h: ref-$(OS)-$(LINUX_VER)/.valid-ref
diff -r 88f97bb8f3ae -r 673f62edbfbe docs/src/user.tex
--- a/docs/src/user.tex Wed Mar  1 17:01:54 2006
+++ b/docs/src/user.tex Wed Mar  1 19:47:25 2006
@@ -626,7 +626,7 @@
 allow you to monitor and log the Xen boot process via serial console and
 can be very useful in debugging.
 
-%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1
+%% kernel /boot/xen-2.0.gz dom0_mem=131072 console=com1,vga com1=115200,8n1
 %% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro
 
 In order to configure Xen serial console output, it is necessary to
@@ -637,8 +637,9 @@
 \end{verbatim}}
 \end{quote}
 
-This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 1
-stop bit and no parity. Modify these parameters for your environment.
+This configures Xen to output on COM1 at 115,200 baud, 8 data bits, no
+parity and 1 stop bit. Modify these parameters for your environment.
+See Section~\ref{s:xboot} for an explanation of all boot parameters.
 
 One can also configure XenLinux to share the serial console; to achieve
 this append ``\path{console=ttyS0}'' to your module line.
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/Kconfig
--- a/linux-2.6-xen-sparse/arch/i386/Kconfig    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/Kconfig    Wed Mar  1 19:47:25 2006
@@ -770,7 +770,7 @@
 
 config HOTPLUG_CPU
        bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
-       depends on SMP && HOTPLUG && EXPERIMENTAL
+       depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
        ---help---
          Say Y here to experiment with turning CPUs off and on.  CPUs
          can be controlled through /sys/devices/system/cpu.
@@ -1122,6 +1122,7 @@
 
 config KPROBES
        bool "Kprobes (EXPERIMENTAL)"
+       depends on EXPERIMENTAL && MODULES
        help
          Kprobes allows you to trap at almost any kernel address and
          execute a callback function.  register_kprobe() establishes
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/i386/kernel/Makefile    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/Makefile    Wed Mar  1 19:47:25 2006
@@ -7,7 +7,7 @@
 obj-y  := process.o semaphore.o signal.o entry.o traps.o irq.o \
                ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
                pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
-               quirks.o i8237.o
+               quirks.o i8237.o topology.o
 
 obj-y                          += cpu/
 obj-y                          += timers/
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c     Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c     Wed Mar  1 19:47:25 2006
@@ -44,9 +44,6 @@
 extern int gsi_irq_sharing(int gsi);
 #include <asm/proto.h>
 
-static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
-
-
 #else                          /* X86 */
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c    Wed Mar  1 19:47:25 2006
@@ -4,6 +4,7 @@
 #include <linux/smp.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
+#include <linux/bootmem.h>
 #include <asm/semaphore.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
@@ -18,6 +19,9 @@
 #include <asm/hypervisor.h>
 
 #include "cpu.h"
+
+DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
+EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
 
 #ifndef CONFIG_XEN
 DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
@@ -598,6 +602,8 @@
        struct tss_struct * t = &per_cpu(init_tss, cpu);
 #endif
        struct thread_struct *thread = &current->thread;
+       struct desc_struct *gdt;
+       struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
        if (cpu_test_and_set(cpu, cpu_initialized)) {
                printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -614,7 +620,54 @@
                set_in_cr4(X86_CR4_TSD);
        }
 
-       cpu_gdt_init(&cpu_gdt_descr[cpu]);
+#ifndef CONFIG_XEN
+       /*
+        * This is a horrible hack to allocate the GDT.  The problem
+        * is that cpu_init() is called really early for the boot CPU
+        * (and hence needs bootmem) but much later for the secondary
+        * CPUs, when bootmem will have gone away
+        */
+       if (NODE_DATA(0)->bdata->node_bootmem_map) {
+               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
+               /* alloc_bootmem_pages panics on failure, so no check */
+               memset(gdt, 0, PAGE_SIZE);
+       } else {
+               gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
+               if (unlikely(!gdt)) {
+                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+                       for (;;)
+                               local_irq_enable();
+               }
+       }
+
+       /*
+        * Initialize the per-CPU GDT with the boot GDT,
+        * and set up the GDT descriptor:
+        */
+       memcpy(gdt, cpu_gdt_table, GDT_SIZE);
+
+       /* Set up GDT entry for 16bit stack */
+       *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
+               ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
+               ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
+               (CPU_16BIT_STACK_SIZE - 1);
+
+       cpu_gdt_descr->size = GDT_SIZE - 1;
+       cpu_gdt_descr->address = (unsigned long)gdt;
+#else
+       if (cpu == 0 && cpu_gdt_descr->address == 0) {
+               gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
+               /* alloc_bootmem_pages panics on failure, so no check */
+               memset(gdt, 0, PAGE_SIZE);
+
+               memcpy(gdt, cpu_gdt_table, GDT_SIZE);
+               
+               cpu_gdt_descr->size = GDT_SIZE;
+               cpu_gdt_descr->address = (unsigned long)gdt;
+       }
+#endif
+
+       cpu_gdt_init(cpu_gdt_descr);
 
        /*
         * Set up and load the per-CPU TSS and LDT
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S
--- a/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S  Wed Mar  1 19:47:25 2006
@@ -87,19 +87,9 @@
  */
 .data
 
-       ALIGN
-       .word 0                         # 32 bit align gdt_desc.address
-       .globl cpu_gdt_descr
-cpu_gdt_descr:
-       .word GDT_SIZE
-       .long cpu_gdt_table
-
-       .fill NR_CPUS-1,8,0             # space for the other GDT descriptors
-
 /*
  * The Global Descriptor Table contains 28 quadwords, per-CPU.
  */
-       .align PAGE_SIZE_asm
 ENTRY(cpu_gdt_table)
        .quad 0x0000000000000000        /* NULL descriptor */
        .quad 0x0000000000000000        /* 0x0b reserved */
@@ -148,10 +138,6 @@
        .quad 0x0000000000000000        /* 0xf0 - unused */
        .quad 0x0000000000000000        /* 0xf8 - GDT entry 31: double-fault TSS */
 
-       /* Be sure this is zeroed to avoid false validations in Xen */
-       .fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0
-
-
 /*
  * __xen_guest information
  */
@@ -176,6 +162,7 @@
        .ascii  ",FEATURES=writable_page_tables"
        .ascii           "|writable_descriptor_tables"
        .ascii           "|auto_translated_physmap"
+       .ascii           "|pae_pgdir_above_4gb"
        .ascii           "|supervisor_mode_kernel"
 #ifdef CONFIG_X86_PAE
        .ascii  ",PAE=yes"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c       Wed Mar  1 19:47:25 2006
@@ -2634,8 +2634,10 @@
                spin_unlock_irqrestore(&ioapic_lock, flags);
 
                /* Sanity check */
-               if (reg_00.bits.ID != apic_id)
-                       panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
+               if (reg_00.bits.ID != apic_id) {
+                       printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+                       return -1;
+               }
        }
 
        apic_printk(APIC_VERBOSE, KERN_INFO
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c       Wed Mar  1 19:47:25 2006
@@ -935,6 +935,7 @@
        u32                     gsi_base)
 {
        int                     idx = 0;
+       int                     tmpid;
 
        if (nr_ioapics >= MAX_IO_APICS) {
                printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
@@ -957,9 +958,14 @@
        set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
 #endif
        if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15))
-               mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
+               tmpid = io_apic_get_unique_id(idx, id);
        else
-               mp_ioapics[idx].mpc_apicid = id;
+               tmpid = id;
+       if (tmpid == -1) {
+               nr_ioapics--;
+               return;
+       }
+       mp_ioapics[idx].mpc_apicid = tmpid;
        mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
        
        /* 
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c   Wed Mar  1 19:47:25 2006
@@ -898,12 +898,6 @@
        unsigned long start_eip;
        unsigned short nmi_high = 0, nmi_low = 0;
 
-       if (!cpu_gdt_descr[cpu].address &&
-           !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
-               printk("Failed to allocate GDT for CPU %d\n", cpu);
-               return 1;
-       }
-
        ++cpucount;
 
        /*
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c  Wed Mar  1 19:47:25 2006
@@ -48,6 +48,8 @@
 #include <linux/mca.h>
 #include <linux/sysctl.h>
 #include <linux/percpu.h>
+#include <linux/kernel_stat.h>
+#include <linux/posix-timers.h>
 
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -70,6 +72,7 @@
 #include <asm/arch_hooks.h>
 
 #include <xen/evtchn.h>
+#include <xen/interface/vcpu.h>
 
 #if defined (__i386__)
 #include <asm/i8259.h>
@@ -122,6 +125,13 @@
 /* Keep track of last time we did processing/updating of jiffies and xtime. */
 static u64 processed_system_time;   /* System time (ns) at last processing. */
 static DEFINE_PER_CPU(u64, processed_system_time);
+
+/* How much CPU time was spent blocked and how much was 'stolen'? */
+static DEFINE_PER_CPU(u64, processed_stolen_time);
+static DEFINE_PER_CPU(u64, processed_blocked_time);
+
+/* Current runstate of each CPU (updated automatically by the hypervisor). */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
 
 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
 #define NS_PER_TICK (1000000000LL/HZ)
@@ -477,14 +487,45 @@
 
 EXPORT_SYMBOL(do_settimeofday);
 
-#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+static void sync_xen_wallclock(unsigned long dummy);
+static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0);
+static void sync_xen_wallclock(unsigned long dummy)
+{
+       time_t sec;
+       s64 nsec;
+       dom0_op_t op;
+
+       if (!ntp_synced() || independent_wallclock ||
+           !(xen_start_info->flags & SIF_INITDOMAIN))
+               return;
+
+       write_seqlock_irq(&xtime_lock);
+
+       sec  = xtime.tv_sec;
+       nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
+       __normalize_time(&sec, &nsec);
+
+       op.cmd = DOM0_SETTIME;
+       op.u.settime.secs        = sec;
+       op.u.settime.nsecs       = nsec;
+       op.u.settime.system_time = processed_system_time;
+       HYPERVISOR_dom0_op(&op);
+
+       update_wallclock();
+
+       write_sequnlock_irq(&xtime_lock);
+
+       /* Once per minute. */
+       mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ);
+}
+
 static int set_rtc_mmss(unsigned long nowtime)
 {
        int retval;
 
        WARN_ON(irqs_disabled());
 
-       if (!(xen_start_info->flags & SIF_INITDOMAIN))
+       if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN))
                return 0;
 
        /* gets recalled with irq locally disabled */
@@ -497,12 +538,6 @@
 
        return retval;
 }
-#else
-static int set_rtc_mmss(unsigned long nowtime)
-{
-       return 0;
-}
-#endif
 
 /* monotonic_clock(): returns # of nanoseconds passed since time_init()
  *             Note: This function is required to return accurate
@@ -567,19 +602,37 @@
 
 irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
-       s64 delta, delta_cpu;
+       s64 delta, delta_cpu, stolen, blocked;
+       u64 sched_time;
        int i, cpu = smp_processor_id();
        struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
 
        write_seqlock(&xtime_lock);
 
        do {
                get_time_values_from_xen();
 
+               /* Obtain a consistent snapshot of elapsed wallclock cycles. */
                delta = delta_cpu = 
                        shadow->system_timestamp + get_nsec_offset(shadow);
                delta     -= processed_system_time;
                delta_cpu -= per_cpu(processed_system_time, cpu);
+
+               /*
+                * Obtain a consistent snapshot of stolen/blocked cycles. We
+                * can use state_entry_time to detect if we get preempted here.
+                */
+               do {
+                       sched_time = runstate->state_entry_time;
+                       barrier();
+                       stolen = runstate->time[RUNSTATE_runnable] +
+                               runstate->time[RUNSTATE_offline] -
+                               per_cpu(processed_stolen_time, cpu);
+                       blocked = runstate->time[RUNSTATE_blocked] -
+                               per_cpu(processed_blocked_time, cpu);
+                       barrier();
+               } while (sched_time != runstate->state_entry_time);
        }
        while (!time_values_up_to_date(cpu));
 
@@ -612,18 +665,67 @@
        write_sequnlock(&xtime_lock);
 
        /*
-         * Local CPU jiffy work. No need to hold xtime_lock, and I'm not sure
-         * if there is risk of deadlock if we do (since update_process_times
-         * may do scheduler rebalancing work and thus acquire runqueue locks).
-         */
-       while (delta_cpu >= NS_PER_TICK) {
-               delta_cpu -= NS_PER_TICK;
-               per_cpu(processed_system_time, cpu) += NS_PER_TICK;
-               update_process_times(user_mode(regs));
-               profile_tick(CPU_PROFILING, regs);
-       }
+        * Account stolen ticks.
+        * HACK: Passing NULL to account_steal_time()
+        * ensures that the ticks are accounted as stolen.
+        */
+       if (stolen > 0) {
+               delta_cpu -= stolen;
+               do_div(stolen, NS_PER_TICK);
+               per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK;
+               per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK;
+               account_steal_time(NULL, (cputime_t)stolen);
+       }
+
+       /*
+        * Account blocked ticks.
+        * HACK: Passing idle_task to account_steal_time()
+        * ensures that the ticks are accounted as idle/wait.
+        */
+       if (blocked > 0) {
+               delta_cpu -= blocked;
+               do_div(blocked, NS_PER_TICK);
+               per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK;
+               per_cpu(processed_system_time, cpu)  += blocked * NS_PER_TICK;
+               account_steal_time(idle_task(cpu), (cputime_t)blocked);
+       }
+
+       /* Account user/system ticks. */
+       if (delta_cpu > 0) {
+               do_div(delta_cpu, NS_PER_TICK);
+               per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
+               if (user_mode(regs))
+                       account_user_time(current, (cputime_t)delta_cpu);
+               else
+                       account_system_time(current, HARDIRQ_OFFSET,
+                                           (cputime_t)delta_cpu);
+       }
+
+       /* Local timer processing (see update_process_times()). */
+       run_local_timers();
+       if (rcu_pending(cpu))
+               rcu_check_callbacks(cpu, user_mode(regs));
+       scheduler_tick();
+       run_posix_cpu_timers(current);
 
        return IRQ_HANDLED;
+}
+
+static void init_missing_ticks_accounting(int cpu)
+{
+       struct vcpu_register_runstate_memory_area area;
+       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+
+       memset(runstate, 0, sizeof(*runstate));
+
+       area.addr.v = runstate;
+       HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
+
+       per_cpu(processed_blocked_time, cpu) =
+               runstate->time[RUNSTATE_blocked];
+       per_cpu(processed_stolen_time, cpu) =
+               runstate->time[RUNSTATE_runnable] +
+               runstate->time[RUNSTATE_offline];
 }
 
 /* not static: needed by APM */
@@ -691,6 +793,7 @@
 void notify_arch_cmos_timer(void)
 {
        mod_timer(&sync_cmos_timer, jiffies + 1);
+       mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
 }
 
 static long clock_cmos_diff, sleep_start;
@@ -814,6 +917,7 @@
 
        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
+       init_missing_ticks_accounting(0);
 
        update_wallclock();
 
@@ -891,6 +995,7 @@
 
        processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
        per_cpu(processed_system_time, 0) = processed_system_time;
+       init_missing_ticks_accounting(0);
 
        update_wallclock();
 }
@@ -909,6 +1014,7 @@
                /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */
                per_cpu(processed_system_time, cpu) = 
                        per_cpu(shadow_time, 0).system_timestamp;
+               init_missing_ticks_accounting(cpu);
        } while (read_seqretry(&xtime_lock, seq));
 
        sprintf(timer_name[cpu], "timer%d", cpu);
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile
--- a/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile  Wed Mar  1 19:47:25 2006
@@ -2,6 +2,4 @@
 # Makefile for the linux kernel.
 #
 
-obj-y                          := setup.o topology.o
-  
-topology-y                     := ../mach-default/topology.o
+obj-y                          := setup.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Wed Mar  1 19:47:25 2006
@@ -454,6 +454,7 @@
 
 static int disable_nx __initdata = 0;
 u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
+EXPORT_SYMBOL(__supported_pte_mask);
 
 /*
  * noexec = on|off
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/Kconfig
--- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig  Wed Mar  1 19:47:25 2006
@@ -381,21 +381,6 @@
          as it is off-chip.  You can find the HPET spec at
          <http://www.intel.com/hardwaredesign/hpetspec.htm>.
 
-config X86_PM_TIMER
-       bool "PM timer" if EMBEDDED
-       depends on ACPI && !X86_64_XEN
-       default y
-       help
-         Support the ACPI PM timer for time keeping. This is slow,
-         but is useful on some chipsets without HPET on systems with more
-         than one CPU. On a single processor or single socket multi core
-         system it is normally not required.
-         When the PM timer is active 64bit vsyscalls are disabled
-         and should not be enabled (/proc/sys/kernel/vsyscall64 should
-         not be changed).
-         The kernel selects the PM timer only as a last resort, so it is
-         useful to enable just in case.
-
 config HPET_EMULATE_RTC
        bool "Provide RTC interrupt"
        depends on HPET_TIMER && RTC=y
@@ -640,6 +625,7 @@
 
 config KPROBES
        bool "Kprobes (EXPERIMENTAL)"
+       depends on EXPERIMENTAL && MODULES
        help
          Kprobes allows you to trap at almost any kernel address and
          execute a callback function.  register_kprobe() establishes
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile  Wed Mar  1 19:47:25 2006
@@ -45,7 +45,7 @@
 
 bootflag-y                     += ../../i386/kernel/bootflag.o
 cpuid-$(subst m,y,$(CONFIG_X86_CPUID))  += ../../i386/kernel/cpuid.o
-topology-y                     += ../../i386/mach-default/topology.o
+topology-y                     += ../../i386/kernel/topology.o
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../i386/kernel/microcode.o
 intel_cacheinfo-y              += ../../i386/kernel/cpu/intel_cacheinfo.o
 quirks-y                       += ../../i386/kernel/quirks.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c        Wed Mar  1 19:47:25 2006
@@ -114,6 +114,8 @@
        irq_exit();
 }
 
+int __initdata unsync_tsc_on_multicluster;
+
 /*
  * This interrupt should _never_ happen with our APIC/SMP architecture
  */
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S       Wed Mar  1 19:47:25 2006
@@ -51,6 +51,7 @@
 #include <asm/page.h>
 #include <asm/errno.h>
 #include <xen/interface/arch-x86_64.h>
+#include <xen/interface/features.h>
 
 #include "irq_vectors.h"
 
@@ -146,16 +147,19 @@
          */
        .macro HYPERVISOR_IRET flag
        testb $3,1*8(%rsp)
-       jnz   1f
+       jnz   2f
        testl $NMI_MASK,2*8(%rsp)
+       jnz   2f
+
+       testb $1,(xen_features+XENFEAT_supervisor_mode_kernel)
        jnz   1f
 
        /* Direct iret to kernel space. Correct CS and SS. */
        orb   $3,1*8(%rsp)
        orb   $3,4*8(%rsp)
-       iretq
-
-1:     /* Slow iret via hypervisor. */
+1:     iretq
+
+2:     /* Slow iret via hypervisor. */
        andl  $~NMI_MASK, 16(%rsp)
        pushq $\flag
        jmp  hypercall_page + (__HYPERVISOR_iret * 32)
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c     Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c     Wed Mar  1 19:47:25 2006
@@ -51,6 +51,8 @@
 int disable_timer_pin_1 __initdata;
 
 #ifndef CONFIG_XEN
+int timer_over_8254 __initdata = 1;
+
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 #endif
@@ -300,6 +302,22 @@
 
 __setup("noapic", disable_ioapic_setup);
 __setup("apic", enable_ioapic_setup);
+
+#ifndef CONFIG_XEN
+static int __init setup_disable_8254_timer(char *s)
+{
+       timer_over_8254 = -1;
+       return 1;
+}
+static int __init setup_enable_8254_timer(char *s)
+{
+       timer_over_8254 = 2;
+       return 1;
+}
+
+__setup("disable_8254_timer", setup_disable_8254_timer);
+__setup("enable_8254_timer", setup_enable_8254_timer);
+#endif /* !CONFIG_XEN */
 
 #include <asm/pci-direct.h>
 #include <linux/pci_ids.h>
@@ -360,27 +378,20 @@
                                        /* RED-PEN skip them on mptables too? */
                                        return;
                                case PCI_VENDOR_ID_ATI:
+
+                               /* This should be actually default, but
+                                  for 2.6.16 let's do it for ATI only where
+                                  it's really needed. */
 #ifndef CONFIG_XEN
-                                       if (apic_runs_main_timer != 0)
-                                               break;
-#ifdef CONFIG_ACPI
-                                       /* Don't do this for laptops right
-                                          right now because their timer
-                                          doesn't necessarily tick in C2/3 */
-                                       if (acpi_fadt.revision >= 3 &&
-                       (acpi_fadt.plvl2_lat + acpi_fadt.plvl3_lat) < 1100) {
-                                               printk(KERN_INFO
-"ATI board detected, but seems to be a laptop. Timer might be shakey, 
sorry\n");
-                                               break;
-                                       }
-#endif                                 
+                                       if (timer_over_8254 == 1) {     
+                                               timer_over_8254 = 0;    
                                        printk(KERN_INFO
-            "ATI board detected. Using APIC/PM timer.\n");
-                                       apic_runs_main_timer = 1;
-                                       nohpet = 1;
+               "ATI board detected. Disabling timer routing over 8254.\n");
+                                       }       
 #endif
                                        return;
                                } 
+
 
                                /* No multi-function device? */
                                type = read_pci_config_byte(num,slot,func,
@@ -1848,6 +1859,8 @@
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
  * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
  * fanatically on his truly buggy board.
+ *
+ * FIXME: really need to revamp this for modern platforms only.
  */
 static inline void check_timer(void)
 {
@@ -1870,7 +1883,8 @@
         */
        apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
        init_8259A(1);
-       enable_8259A_irq(0);
+       if (timer_over_8254 > 0)
+               enable_8259A_irq(0);
 
        pin1  = find_isa_irq_pin(0, mp_INT);
        apic1 = find_isa_irq_apic(0, mp_INT);
@@ -1925,7 +1939,7 @@
        }
        printk(" failed.\n");
 
-       if (nmi_watchdog) {
+       if (nmi_watchdog == NMI_IO_APIC) {
                printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
                nmi_watchdog = 0;
        }
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c       Wed Mar  1 19:47:25 2006
@@ -462,6 +462,12 @@
                else if(!memcmp(from, "elfcorehdr=", 11))
                        elfcorehdr_addr = memparse(from+11, &from);
 #endif
+
+#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
+               else if (!memcmp(from, "additional_cpus=", 16))
+                       setup_additional_cpus(from+16);
+#endif
+
        next_char:
                c = *(from++);
                if (!c)
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/acpi/Kconfig
--- a/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Mar  1 19:47:25 2006
@@ -247,7 +247,7 @@
          Enter the full path name to the file wich includes the AmlCode declaration.
 
 config ACPI_BLACKLIST_YEAR
-       int "Disable ACPI for systems before Jan 1st this year" if X86
+       int "Disable ACPI for systems before Jan 1st this year" if X86_32
        default 0
        help
          enter a 4-digit year, eg. 2001 to disable ACPI by default
@@ -285,9 +285,9 @@
          dump your ACPI DSDT table using /proc/acpi/dsdt.
 
 config X86_PM_TIMER
-       bool "Power Management Timer Support"
-       depends on X86
-       depends on !X86_64
+       bool "Power Management Timer Support" if EMBEDDED
+       depends on X86
+       depends on !XEN
        default y
        help
          The Power Management Timer is available on all ACPI-capable,
@@ -298,9 +298,8 @@
          voltage scaling, unlike the commonly used Time Stamp Counter
          (TSC) timing source.
 
-         So, if you see messages like 'Losing too many ticks!' in the
-         kernel logs, and/or you are using this on a notebook which
-         does not yet have an HPET, you should say "Y" here.
+         You should nearly always say Y here because many modern
+         systems require this timer. 
 
 config ACPI_CONTAINER
        tristate "ACPI0004,PNP0A05 and PNP0A06 Container Driver (EXPERIMENTAL)"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/video/Kconfig
--- a/linux-2.6-xen-sparse/drivers/video/Kconfig        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/video/Kconfig        Wed Mar  1 19:47:25 2006
@@ -520,7 +520,7 @@
 config FB_GBE_MEM
        int "Video memory size in MB"
        depends on FB_GBE
-       default 8
+       default 4
        help
          This is the amount of memory reserved for the framebuffer,
          which can be any value between 1MB and 8MB.
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/Kconfig
--- a/linux-2.6-xen-sparse/drivers/xen/Kconfig  Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/Kconfig  Wed Mar  1 19:47:25 2006
@@ -68,7 +68,7 @@
        default n
 
 config XEN_BLKDEV_BACKEND
-       bool "Block-device backend driver"
+       tristate "Block-device backend driver"
        default y
        help
          The block-device backend driver allows the kernel to export its
@@ -76,7 +76,7 @@
          interface.
 
 config XEN_BLKDEV_TAP_BE
-        bool "Block Tap support for backend driver (DANGEROUS)"
+        tristate "Block Tap support for backend driver (DANGEROUS)"
         depends on XEN_BLKDEV_BACKEND
         default n
         help
@@ -89,7 +89,7 @@
           modified to use grant tables.
 
 config XEN_NETDEV_BACKEND
-       bool "Network-device backend driver"
+       tristate "Network-device backend driver"
        default y
        help
          The network-device backend driver allows the kernel to export its
@@ -109,8 +109,16 @@
          are unsure; or if you experience network hangs when this option is
          enabled; then you must say N here.
 
+config XEN_NETDEV_LOOPBACK
+       tristate "Network-device loopback driver"
+       depends on XEN_NETDEV_BACKEND
+       default y
+       help
+         A two-interface loopback device to emulate a local netfront-netback
+         connection.
+
 config XEN_TPMDEV_BACKEND
-       bool "TPM-device backend driver"
+       tristate "TPM-device backend driver"
        default n
        help
          The TPM-device backend driver
@@ -145,7 +153,7 @@
          (domain 0), then you almost certainly want to say Y here.
 
 config XEN_BLKDEV_TAP
-       bool "Block device tap driver"
+       tristate "Block device tap driver"
        default n
        help
          This driver allows a VM to interact on block device channels
@@ -154,7 +162,7 @@
          space.  Odds are that you want to say N here.
 
 config XEN_TPMDEV_FRONTEND
-       bool "TPM-device frontend driver"
+       tristate "TPM-device frontend driver"
        default n
        select TCG_TPM
        select TCG_XEN
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/blkback/Makefile
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Wed Mar  1 19:47:25 2006
@@ -1,2 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o
 
-obj-y  := blkback.o xenbus.o interface.o vbd.o
+blkbk-y        := blkback.o xenbus.o interface.o vbd.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c        Wed Mar  1 19:47:25 2006
@@ -29,14 +29,10 @@
  * 64 should be enough to keep us competitive with Linux.
  */
 static int blkif_reqs = 64;
+module_param_named(reqs, blkif_reqs, int, 0);
+MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
+
 static int mmap_pages;
-
-static int __init set_blkif_reqs(char *str)
-{
-       get_option(&str, &blkif_reqs);
-       return 1;
-}
-__setup("blkif_reqs=", set_blkif_reqs);
 
 /* Run-time switchable: /sys/module/blkback/parameters/ */
 static unsigned int log_stats = 0;
@@ -574,10 +570,20 @@
                list_add_tail(&pending_reqs[i].free_list, &pending_free);
     
        blkif_xenbus_init();
+       __unsafe(THIS_MODULE);
        return 0;
 }
 
-__initcall(blkif_init);
+module_init(blkif_init);
+
+static void blkif_exit(void)
+{
+       BUG();
+}
+
+module_exit(blkif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/core/skbuff.c
--- a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c    Wed Mar  1 19:47:25 2006
@@ -16,6 +16,7 @@
 
 /* Referenced in netback.c. */
 /*static*/ kmem_cache_t *skbuff_cachep;
+EXPORT_SYMBOL(skbuff_cachep);
 
 #define MAX_SKBUFF_ORDER 4
 static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1];
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/core/smpboot.c
--- a/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c   Wed Mar  1 19:47:25 2006
@@ -150,6 +150,11 @@
 {
        vcpu_guest_context_t ctxt;
        struct task_struct *idle = idle_task(vcpu);
+#ifdef __x86_64__
+       struct desc_ptr *gdt_descr = &cpu_gdt_descr[vcpu];
+#else
+       struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, vcpu);
+#endif
 
        if (vcpu == 0)
                return;
@@ -171,8 +176,8 @@
 
        ctxt.ldt_ents = 0;
 
-       ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[vcpu].address);
-       ctxt.gdt_ents      = cpu_gdt_descr[vcpu].size / 8;
+       ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
+       ctxt.gdt_ents      = gdt_descr->size / 8;
 
 #ifdef __i386__
        ctxt.user_regs.cs = __KERNEL_CS;
@@ -210,6 +215,11 @@
 {
        int cpu;
        struct task_struct *idle;
+#ifdef __x86_64__
+       struct desc_ptr *gdt_descr;
+#else
+       struct Xgt_desc_struct *gdt_descr;
+#endif
 
        cpu_data[0] = boot_cpu_data;
 
@@ -225,6 +235,22 @@
        for_each_cpu_mask (cpu, cpu_possible_map) {
                if (cpu == 0)
                        continue;
+
+#ifdef __x86_64__
+               gdt_descr = &cpu_gdt_descr[cpu];
+#else
+               gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+#endif
+               gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+               if (unlikely(!gdt_descr->address)) {
+                       printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+                       continue;
+               }
+               gdt_descr->size = GDT_SIZE;
+               memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
+               make_page_readonly(
+                       (void *)gdt_descr->address,
+                       XENFEAT_writable_descriptor_tables);
 
                cpu_data[cpu] = boot_cpu_data;
                cpu_2_logical_apicid[cpu] = cpu;
@@ -241,17 +267,6 @@
 #endif
 
                irq_ctx_init(cpu);
-
-               cpu_gdt_descr[cpu].address =
-                       __get_free_page(GFP_KERNEL|__GFP_ZERO);
-               BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE);
-               cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
-               memcpy((void *)cpu_gdt_descr[cpu].address,
-                      (void *)cpu_gdt_descr[0].address,
-                      cpu_gdt_descr[0].size);
-               make_page_readonly(
-                       (void *)cpu_gdt_descr[cpu].address,
-                       XENFEAT_writable_descriptor_tables);
 
 #ifdef CONFIG_HOTPLUG_CPU
                if (xen_start_info->flags & SIF_INITDOMAIN)
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/net_driver_util.c
--- a/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c        Wed Mar  1 19:47:25 2006
@@ -30,6 +30,7 @@
 
 #include <linux/if_ether.h>
 #include <linux/err.h>
+#include <linux/module.h>
 #include <xen/net_driver_util.h>
 
 
@@ -54,7 +55,7 @@
        kfree(macstr);
        return 0;
 }
-
+EXPORT_SYMBOL(xen_net_read_mac);
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/Makefile
--- a/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Wed Mar  1 19:47:25 2006
@@ -1,2 +1,5 @@
+obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o
+obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o
 
-obj-y  := netback.o xenbus.o interface.o loopback.o
+netbk-y   := netback.o xenbus.o interface.o
+netloop-y := loopback.o
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/loopback.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c       Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c       Wed Mar  1 19:47:25 2006
@@ -178,6 +178,23 @@
        return err;
 }
 
+static void __init clean_loopback(int i)
+{
+       struct net_device *dev1, *dev2;
+       char dev_name[IFNAMSIZ];
+
+       sprintf(dev_name, "vif0.%d", i);
+       dev1 = dev_get_by_name(dev_name);
+       sprintf(dev_name, "veth%d", i);
+       dev2 = dev_get_by_name(dev_name);
+       if (dev1 && dev2) {
+               unregister_netdev(dev2);
+               unregister_netdev(dev1);
+               free_netdev(dev2);
+               free_netdev(dev1);
+       }
+}
+
 static int __init loopback_init(void)
 {
        int i, err = 0;
@@ -190,6 +207,18 @@
 }
 
 module_init(loopback_init);
+
+static void __exit loopback_exit(void)
+{
+       int i;
+
+       for (i = nloopbacks; i-- > 0; )
+               clean_loopback(i);
+}
+
+module_exit(loopback_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/netback.c
--- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c        Wed Mar  1 19:47:25 2006
@@ -505,14 +505,12 @@
                        /* Still too big to send right now? Set a callback. */
                        if (txreq.size > netif->remaining_credit) {
                                netif->remaining_credit = 0;
-                               netif->credit_timeout.expires  = 
-                                       next_credit;
                                netif->credit_timeout.data     =
                                        (unsigned long)netif;
                                netif->credit_timeout.function =
                                        tx_credit_callback;
-                               add_timer_on(&netif->credit_timeout,
-                                            smp_processor_id());
+                               __mod_timer(&netif->credit_timeout,
+                                           next_credit);
                                break;
                        }
                }
@@ -811,6 +809,8 @@
                &netif_be_dbg);
 #endif
 
+       __unsafe(THIS_MODULE);
+
        return 0;
 }
 
@@ -821,6 +821,8 @@
 
 module_init(netback_init);
 module_exit(netback_cleanup);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c      Wed Mar  1 19:47:25 2006
@@ -114,6 +114,7 @@
 
        /* Receive-ring batched refills. */
 #define RX_MIN_TARGET 8
+#define RX_DFL_MIN_TARGET 64
 #define RX_MAX_TARGET NET_RX_RING_SIZE
        int rx_min_target, rx_max_target, rx_target;
        struct sk_buff_head rx_batch;
@@ -1102,8 +1103,8 @@
        spin_lock_init(&np->rx_lock);
 
        skb_queue_head_init(&np->rx_batch);
-       np->rx_target     = RX_MIN_TARGET;
-       np->rx_min_target = RX_MIN_TARGET;
+       np->rx_target     = RX_DFL_MIN_TARGET;
+       np->rx_min_target = RX_DFL_MIN_TARGET;
        np->rx_max_target = RX_MAX_TARGET;
 
        init_timer(&np->rx_refill_timer);
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Wed Mar  1 19:47:25 2006
@@ -54,9 +54,11 @@
 void tpmif_disconnect_complete(tpmif_t * tpmif);
 tpmif_t *tpmif_find(domid_t domid, long int instance);
 void tpmif_interface_init(void);
+void tpmif_interface_exit(void);
 void tpmif_schedule_work(tpmif_t * tpmif);
 void tpmif_deschedule_work(tpmif_t * tpmif);
 void tpmif_xenbus_init(void);
+void tpmif_xenbus_exit(void);
 int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
 irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
 int tpmif_vtpm_open(tpmif_t *tpmif, domid_t domain, u32 instance);
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c      Wed Mar  1 19:47:25 2006
@@ -186,6 +186,12 @@
                                         0, 0, NULL, NULL);
 }
 
+void __init
+tpmif_interface_exit(void)
+{
+       kmem_cache_destroy(tpmif_cachep);
+}
+
 /*
  * Local variables:
  *  c-file-style: "linux"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c        Wed Mar  1 19:47:25 2006
@@ -1092,7 +1092,20 @@
        return 0;
 }
 
-__initcall(tpmback_init);
+module_init(tpmback_init);
+
+static void __exit
+tpmback_exit(void)
+{
+
+       tpmif_xenbus_exit();
+       tpmif_interface_exit();
+       misc_deregister(&ibmvtpms_miscdevice);
+}
+
+module_exit(tpmback_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Wed Mar  1 19:47:25 2006
@@ -317,6 +317,11 @@
        xenbus_register_backend(&tpmback);
 }
 
+void tpmif_xenbus_exit(void)
+{
+       xenbus_unregister_driver(&tpmback);
+}
+
 /*
  * Local variables:
  *  c-file-style: "linux"
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c
--- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c      Wed Mar  1 19:47:25 2006
@@ -480,6 +480,11 @@
        xenbus_register_frontend(&tpmfront);
 }
 
+static void __exit exit_tpm_xenbus(void)
+{
+       xenbus_unregister_driver(&tpmfront);
+}
+
 
 static int
 tpm_allocate_buffers(struct tpm_private *tp)
@@ -700,7 +705,18 @@
        return 0;
 }
 
-__initcall(tpmif_init);
+module_init(tpmif_init);
+
+static void __exit
+tpmif_exit(void)
+{
+       exit_tpm_xenbus();
+       gnttab_free_grant_references(gref_head);
+}
+
+module_exit(tpmif_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
 
 /*
  * Local variables:
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h
--- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Mar  1 19:47:25 2006
@@ -23,11 +23,13 @@
        unsigned short pad;
 } __attribute__ ((packed));
 
-extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS];
+extern struct Xgt_desc_struct idt_descr;
+DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
+
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
-       return ((struct desc_struct *)cpu_gdt_descr[cpu].address);
+       return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
 }
 
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h        Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h        Wed Mar  1 19:47:25 2006
@@ -18,8 +18,6 @@
 #define pcibios_assign_all_busses()    0
 #endif
 #define pcibios_scan_all_fns(a, b)     0
-
-extern int no_iommu, force_iommu;
 
 extern unsigned long pci_mem_start;
 #define PCIBIOS_MIN_IO         0x1000
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h    Wed Mar  1 19:47:25 2006
@@ -169,7 +169,7 @@
 #define PGDIR_SIZE     (1UL << PGDIR_SHIFT)
 #define PGDIR_MASK     (~(PGDIR_SIZE-1))
 
-#define USER_PTRS_PER_PGD      (TASK_SIZE/PGDIR_SIZE)
+#define USER_PTRS_PER_PGD      ((TASK_SIZE-1)/PGDIR_SIZE+1)
 #define FIRST_USER_ADDRESS     0
 
 #ifndef __ASSEMBLY__
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/linux/mm.h
--- a/linux-2.6-xen-sparse/include/linux/mm.h   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/linux/mm.h   Wed Mar  1 19:47:25 2006
@@ -1064,7 +1064,11 @@
 void drop_pagecache(void);
 void drop_slab(void);
 
+#ifndef CONFIG_MMU
+#define randomize_va_space 0
+#else
 extern int randomize_va_space;
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/mm/page_alloc.c
--- a/linux-2.6-xen-sparse/mm/page_alloc.c      Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/mm/page_alloc.c      Wed Mar  1 19:47:25 2006
@@ -1017,7 +1017,7 @@
                if (page)
                        goto got_pg;
 
-               out_of_memory(gfp_mask, order);
+               out_of_memory(zonelist, gfp_mask, order);
                goto restart;
        }
 
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/net/core/skbuff.c
--- a/linux-2.6-xen-sparse/net/core/skbuff.c    Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/net/core/skbuff.c    Wed Mar  1 19:47:25 2006
@@ -434,6 +434,9 @@
        C(pkt_type);
        C(ip_summed);
        C(priority);
+#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+       C(ipvs_property);
+#endif
        C(protocol);
        n->destructor = NULL;
 #ifdef CONFIG_NETFILTER
@@ -441,13 +444,6 @@
        C(nfct);
        nf_conntrack_get(skb->nfct);
        C(nfctinfo);
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-       C(nfct_reasm);
-       nf_conntrack_get_reasm(skb->nfct_reasm);
-#endif
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
-       C(ipvs_property);
-#endif
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        C(nfct_reasm);
        nf_conntrack_get_reasm(skb->nfct_reasm);
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/Makefile
--- a/tools/examples/Makefile   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/Makefile   Wed Mar  1 19:47:25 2006
@@ -26,10 +26,11 @@
 XEN_SCRIPTS += network-nat vif-nat
 XEN_SCRIPTS += block
 XEN_SCRIPTS += block-enbd block-nbd
-XEN_SCRIPTS += vtpm
-XEN_SCRIPT_DATA = xen-script-common.sh
+XEN_SCRIPTS += vtpm vtpm-delete
+XEN_SCRIPTS += xen-hotplug-cleanup
+XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
 XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
-XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh
+XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh
 
 XEN_HOTPLUG_DIR = /etc/hotplug
 XEN_HOTPLUG_SCRIPTS = xen-backend.agent
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vif-common.sh
--- a/tools/examples/vif-common.sh      Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vif-common.sh      Wed Mar  1 19:47:25 2006
@@ -125,7 +125,7 @@
 #
 function ip_of()
 {
-  ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed 's,/.*,,' | head -1
+  ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed -n '1 s,/.*,,p'
 }
 
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm
--- a/tools/examples/vtpm       Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vtpm       Wed Mar  1 19:47:25 2006
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 dir=$(dirname "$0")
-. "$dir/vtpm-common.sh"
+. "$dir/vtpm-hotplug-common.sh"
 
 vtpm_fatal_error=0
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-common.sh
--- a/tools/examples/vtpm-common.sh     Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vtpm-common.sh     Wed Mar  1 19:47:25 2006
@@ -17,21 +17,8 @@
 #
 
 dir=$(dirname "$0")
-. "$dir/xen-hotplug-common.sh"
-
-findCommand "$@"
-if [ "$command" != "online" ]  &&
-   [ "$command" != "offline" ] &&
-   [ "$command" != "add" ]     &&
-   [ "$command" != "remove" ]
-then
-       log err "Invalid command: $command"
-       exit 1
-fi
-
-
-XENBUS_PATH="${XENBUS_PATH:?}"
-
+. "$dir/logging.sh"
+. "$dir/locking.sh"
 
 VTPMDB="/etc/xen/vtpm.db"
 
@@ -58,7 +45,11 @@
        function vtpm_resume() {
                true
        }
+       function vtpm_delete() {
+               true
+       }
 fi
+
 
 #Find the instance number for the vtpm given the name of the domain
 # Parameters
@@ -66,7 +57,7 @@
 # Return value
 #  Returns '0' if instance number could not be found, otherwise
 #  it returns the instance number in the variable 'instance'
-function find_instance () {
+function vtpmdb_find_instance () {
        local vmname=$1
        local ret=0
        instance=`cat $VTPMDB |                    \
@@ -80,18 +71,17 @@
                     }                             \
                   }'`
        if [ "$instance" != "" ]; then
-               ret=1
-       fi
-       return $ret
+               ret=$instance
+       fi
+       echo "$ret"
 }
 
 
 # Check whether a particular instance number is still available
-# returns '1' if it is available
-function is_free_instancenum () {
+# returns "0" if it is not available, "1" otherwise.
+function vtpmdb_is_free_instancenum () {
        local instance=$1
        local avail=1
-
        #Allowed instance number range: 1-255
        if [ $instance -eq 0 -o $instance -gt 255 ]; then
                avail=0
@@ -110,13 +100,13 @@
                        fi
                done
        fi
-       return $avail
+       echo "$avail"
 }
 
 
 # Get an available instance number given the database
 # Returns an unused instance number
-function get_free_instancenum () {
+function vtpmdb_get_free_instancenum () {
        local ctr
        local instances
        local don
@@ -145,12 +135,12 @@
                fi
                let ctr=ctr+1
        done
-       let instance=$ctr
+       echo "$ctr"
 }
 
 
 # Add a domain name and instance number to the DB file
-function add_instance () {
+function vtpmdb_add_instance () {
        local vmname=$1
        local inst=$2
 
@@ -159,8 +149,8 @@
                echo "#1st column: domain name" >> $VTPMDB
                echo "#2nd column: TPM instance number" >> $VTPMDB
        fi
-       validate_entry $vmname $inst
-       if [ $? -eq 0 ]; then
+       res=$(vtpmdb_validate_entry $vmname $inst)
+       if [ $res -eq 0 ]; then
                echo "$vmname $inst" >> $VTPMDB
        fi
 }
@@ -168,11 +158,10 @@
 
 #Validate whether an entry is the same as passed to this
 #function
-function validate_entry () {
+function vtpmdb_validate_entry () {
        local rc=0
        local vmname=$1
        local inst=$2
-       local res
 
        res=`cat $VTPMDB |             \
             gawk -vvmname=$vmname     \
@@ -197,13 +186,15 @@
        elif [ "$res" == "2" ]; then
                let rc=2
        fi
-       return $rc
+       echo "$rc"
 }
 
 
 #Remove an entry from the vTPM database given its domain name
-function remove_entry () {
+#and instance number
+function vtpmdb_remove_entry () {
        local vmname=$1
+       local instance=$2
        local VTPMDB_TMP="$VTPMDB".tmp
        `cat $VTPMDB |             \
         gawk -vvmname=$vmname     \
@@ -214,6 +205,7 @@
         '} > $VTPMDB_TMP`
        if [ -e $VTPMDB_TMP ]; then
                mv -f $VTPMDB_TMP $VTPMDB
+               vtpm_delete $instance
        else
                log err "Error creating temporary file '$VTPMDB_TMP'."
        fi
@@ -222,7 +214,7 @@
 
 # Find the reason for the creation of this device:
 # Set global REASON variable to 'resume' or 'create'
-function get_create_reason () {
+function vtpm_get_create_reason () {
        local resume=$(xenstore-read $XENBUS_PATH/resume)
        if [ "$resume" == "True" ]; then
                REASON="resume"
@@ -230,6 +222,7 @@
                REASON="create"
        fi
 }
+
 
 #Create a vTPM instance
 # If no entry in the TPM database is found, the instance is
@@ -237,26 +230,23 @@
 function vtpm_create_instance () {
        local domname=$(xenstore_read "$XENBUS_PATH"/domain)
        local res
-       set +e
-       get_create_reason
+       local instance
+       vtpm_get_create_reason
 
        claim_lock vtpmdb
-
-       find_instance $domname
-       res=$?
-       if [ $res -eq 0 ]; then
+       instance=$(vtpmdb_find_instance $domname)
+       if [ "$instance" == "0" ]; then
                #Try to give the preferred instance to the domain
                instance=$(xenstore_read "$XENBUS_PATH"/pref_instance)
                if [ "$instance" != "" ]; then
-                       is_free_instancenum $instance
-                       res=$?
+                       res=$(vtpmdb_is_free_instancenum $instance)
                        if [ $res -eq 0 ]; then
-                               get_free_instancenum
+                               instance=$(vtpmdb_get_free_instancenum)
                        fi
                else
-                       get_free_instancenum
+                       instance=$(vtpmdb_get_free_instancenum)
                fi
-               add_instance $domname $instance
+               vtpmdb_add_instance $domname $instance
                if [ "$REASON" == "create" ]; then
                        vtpm_create $instance
                elif [ "$REASON" == "resume" ]; then
@@ -279,25 +269,40 @@
                true
        fi
        xenstore_write $XENBUS_PATH/instance $instance
-       set -e
-}
-
-
-#Remove an instance
+}
+
+
+#Remove an instance when a VM is terminating or suspending.
+#Since it is assumed that the VM will appear again, the
+#entry is kept in the VTPMDB file.
 function vtpm_remove_instance () {
        local domname=$(xenstore_read "$XENBUS_PATH"/domain)
-       set +e
-       find_instance $domname
-       res=$?
-       if [ $res -eq 0 ]; then
-               #Something is really wrong with the DB
-               log err "vTPM DB file $VTPMDB has no entry for '$domname'"
-       else
+
+       claim_lock vtpmdb
+
+       instance=$(vtpmdb_find_instance $domname)
+
+       if [ "$instance" != "0" ]; then
                if [ "$REASON" == "suspend" ]; then
                        vtpm_suspend $instance
                fi
        fi
-       set -e
-}
-
-
+
+       release_lock vtpmdb
+}
+
+
+#Remove an entry in the VTPMDB file given the domain's name
+#1st parameter: The name of the domain
+function vtpm_delete_instance () {
+       local rc
+
+       claim_lock vtpmdb
+
+       instance=$(vtpmdb_find_instance $1)
+       if [ "$instance" != "0" ]; then
+               vtpmdb_remove_entry $1 $instance
+       fi
+
+       release_lock vtpmdb
+}
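
The helpers above are renamed with a vtpmdb_ prefix and switch from signalling results through shell return codes to printing them on stdout, so callers capture values with command substitution (for example instance=$(vtpmdb_find_instance $domname)) rather than inspecting $?. Database updates are serialised with the claim_lock/release_lock helpers now sourced from locking.sh.
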
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-backend.agent
--- a/tools/examples/xen-backend.agent  Wed Mar  1 17:01:54 2006
+++ b/tools/examples/xen-backend.agent  Wed Mar  1 19:47:25 2006
@@ -18,12 +18,7 @@
   add)
     ;;
   remove)
-    # remove device frontend store entries
-    xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true
-
-    # remove device backend store entries
-    xenstore-rm -t "$XENBUS_PATH"       || true
-    xenstore-rm -t "error/$XENBUS_PATH" || true
+    /etc/xen/scripts/xen-hotplug-cleanup
     ;;
   online)
     ;;
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-backend.rules
--- a/tools/examples/xen-backend.rules  Wed Mar  1 17:01:54 2006
+++ b/tools/examples/xen-backend.rules  Wed Mar  1 19:47:25 2006
@@ -2,6 +2,4 @@
 SUBSYSTEM=="xen-backend", KERNEL=="vtpm*", RUN+="/etc/xen/scripts/vtpm $env{ACTION}"
 SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="online", RUN+="$env{script} online"
 SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="offline", RUN+="$env{script} offline"
-SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/bin/bash -c '/usr/bin/xenstore-rm -t $$(/usr/bin/xenstore-read $env{XENBUS_PATH}/frontend)'"
-SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t $env{XENBUS_PATH}"
-SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t error/$env{XENBUS_PATH}"
+SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/etc/xen/scripts/xen-hotplug-cleanup"
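
Both the hotplug agent and the udev rules now delegate removal-time cleanup to the single xen-hotplug-cleanup script rather than repeating the three xenstore-rm invocations inline.
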
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-hotplug-common.sh
--- a/tools/examples/xen-hotplug-common.sh      Wed Mar  1 17:01:54 2006
+++ b/tools/examples/xen-hotplug-common.sh      Wed Mar  1 19:47:25 2006
@@ -17,19 +17,15 @@
 
 
 dir=$(dirname "$0")
+. "$dir/logging.sh"
 . "$dir/xen-script-common.sh"
+. "$dir/locking.sh"
 
 exec 2>>/var/log/xen-hotplug.log
 
 export PATH="/sbin:/bin:/usr/bin:/usr/sbin:$PATH"
 export LANG="POSIX"
 unset $(set | grep ^LC_ | cut -d= -f1)
-
-log() {
-  local level="$1"
-  shift
-  logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
-}
 
 fatal() {
   xenstore_write "$XENBUS_PATH"/hotplug-status error
@@ -93,87 +89,4 @@
 }
 
 
-#
-# Serialisation
-#
-
-LOCK_SLEEPTIME=1
-LOCK_SPINNING_RETRIES=5
-LOCK_RETRIES=10
-LOCK_BASEDIR=/var/run/xen-hotplug
-
-
-claim_lock()
-{
-  local lockdir="$LOCK_BASEDIR/$1"
-  mkdir -p "$LOCK_BASEDIR"
-  _claim_lock "$lockdir"
-}
-
-
-release_lock()
-{
-  _release_lock "$LOCK_BASEDIR/$1"
-}
-
-
-_claim_lock()
-{
-  local lockdir="$1"
-  local owner=$(_lock_owner "$lockdir")
-  local retries=0
-
-  while [ $retries -lt $LOCK_RETRIES ]
-  do
-    mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR &&
-      _update_lock_info "$lockdir" && return
-
-    local new_owner=$(_lock_owner "$lockdir")
-    if [ "$new_owner" != "$owner" ]
-    then
-      owner="$new_owner"
-      retries=0
-    fi
-
-    if [ $retries -gt $LOCK_SPINNING_RETRIES ]
-    then
-      sleep $LOCK_SLEEPTIME
-    else
-      sleep 0
-    fi
-    retries=$(($retries + 1))
-  done
-  _steal_lock "$lockdir"
-}
-
-
-_release_lock()
-{
-  trap sigerr ERR
-  rm -rf "$1" 2>/dev/null || true
-}
-
-
-_steal_lock()
-{
-  local lockdir="$1"
-  local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown")
-  log err "Forced to steal lock on $lockdir from $owner!"
-  _release_lock "$lockdir"
-  _claim_lock "$lockdir"
-}
-
-
-_lock_owner()
-{
-  cat "$1/owner" 2>/dev/null || echo "unknown"
-}
-
-
-_update_lock_info()
-{
-  echo "$$: $0" >"$1/owner"
-}
-
-
 log debug "$@" "XENBUS_PATH=$XENBUS_PATH"
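
The log() helper and the lock-directory serialisation code move out of xen-hotplug-common.sh into the new logging.sh and locking.sh, which lets vtpm-common.sh (above) use them without sourcing the full hotplug environment.
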
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/hvmloader/Makefile
--- a/tools/firmware/hvmloader/Makefile Wed Mar  1 17:01:54 2006
+++ b/tools/firmware/hvmloader/Makefile Wed Mar  1 19:47:25 2006
@@ -19,7 +19,7 @@
 #
 
 XEN_ROOT = ../../..
-include $(XEN_ROOT)/tools/Rules.mk
+include $(XEN_ROOT)/Config.mk
 
 # The HVM loader is started in 32-bit mode at the address below:
 LOADADDR = 0x100000
@@ -29,9 +29,13 @@
 
 OBJECTS         = hvmloader.o acpi_madt.o 
 
-CC       = gcc
+# Disable PIE/SSP if GCC supports them. They can break us.
+CFLAGS  += $(call test-gcc-flag,$(CC),-nopie)
+CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector)
+CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector-all)
+
 OBJCOPY  = objcopy
-CFLAGS   = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
+CFLAGS  += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
 CFLAGS  += -m32 -march=i686
 LDFLAGS  = -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,$(LOADADDR)
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/vgabios/Makefile
--- a/tools/firmware/vgabios/Makefile   Wed Mar  1 17:01:54 2006
+++ b/tools/firmware/vgabios/Makefile   Wed Mar  1 19:47:25 2006
@@ -1,6 +1,4 @@
 CC      = gcc
-CFLAGS  = -g -O2 -Wall -Wstrict-prototypes
-LDFLAGS = 
 
 GCC = gcc
 BCC = bcc
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/vmxassist/Makefile
--- a/tools/firmware/vmxassist/Makefile Wed Mar  1 17:01:54 2006
+++ b/tools/firmware/vmxassist/Makefile Wed Mar  1 19:47:25 2006
@@ -19,7 +19,7 @@
 #
 
 XEN_ROOT = ../../..
-include $(XEN_ROOT)/tools/Rules.mk
+include $(XEN_ROOT)/Config.mk
 
 # The emulator code lives in ROM space
 TEXTADDR=0x000D0000
@@ -27,11 +27,14 @@
 DEFINES=-DDEBUG -DTEXTADDR=$(TEXTADDR)
 XENINC=-I$(XEN_ROOT)/tools/libxc
 
-LD       = ld
-CC       = gcc
+# Disable PIE/SSP if GCC supports them. They can break us.
+CFLAGS  += $(call test-gcc-flag,$(CC),-nopie)
+CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector)
+CFLAGS  += $(call test-gcc-flag,$(CC),-fno-stack-protector-all)
+
 CPP      = cpp -P
 OBJCOPY  = objcopy -p -O binary -R .note -R .comment -R .bss -S --gap-fill=0
-CFLAGS   = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
+CFLAGS  += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float
 CFLAGS  += -m32 -march=i686
 LDFLAGS  = -m elf_i386
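
Both firmware Makefiles now pull in the top-level Config.mk and probe the compiler with its test-gcc-flag helper (assumed to be defined there), adding -nopie and the -fno-stack-protector variants only when the installed gcc accepts them; compilers that default to PIE or stack-protector would otherwise emit relocations and canary checks these freestanding images cannot run.
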
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/Makefile
--- a/tools/ioemu/Makefile      Wed Mar  1 17:01:54 2006
+++ b/tools/ioemu/Makefile      Wed Mar  1 19:47:25 2006
@@ -1,6 +1,9 @@
+XEN_ROOT=../..
+include $(XEN_ROOT)/tools/Rules.mk
+
 -include config-host.mak
 
-CFLAGS=-Wall -O2 -g -fno-strict-aliasing 
+CFLAGS+=-Wall -O2 -g -fno-strict-aliasing 
 ifdef CONFIG_DARWIN
 CFLAGS+= -mdynamic-no-pic
 endif
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/hw/ide.c
--- a/tools/ioemu/hw/ide.c      Wed Mar  1 17:01:54 2006
+++ b/tools/ioemu/hw/ide.c      Wed Mar  1 19:47:25 2006
@@ -669,9 +669,6 @@
     }
     if (s->io_buffer_index >= s->io_buffer_size && s->nsector == 0) {
         s->status = READY_STAT | SEEK_STAT;
-        s->bmdma->status &= ~BM_STATUS_DMAING;
-        s->bmdma->status |= BM_STATUS_INT;
-        ide_set_irq(s);
 #ifdef DEBUG_IDE_ATAPI
         printf("dma status=0x%x\n", s->status);
 #endif
@@ -738,9 +735,6 @@
             if (n == 0) {
                 /* end of transfer */
                 s->status = READY_STAT | SEEK_STAT;
-                s->bmdma->status &= ~BM_STATUS_DMAING;
-                s->bmdma->status |= BM_STATUS_INT;
-                ide_set_irq(s);
                 return 0;
             }
             if (n > MAX_MULT_SECTORS)
@@ -987,9 +981,6 @@
     if (s->packet_transfer_size <= 0) {
         s->status = READY_STAT;
         s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
-        s->bmdma->status &= ~BM_STATUS_DMAING;
-        s->bmdma->status |= BM_STATUS_INT;
-        ide_set_irq(s);
 #ifdef DEBUG_IDE_ATAPI
         printf("dma status=0x%x\n", s->status);
 #endif
@@ -2025,6 +2016,17 @@
     }
 }
 
+static void ide_dma_finish(BMDMAState *bm)
+{
+    IDEState *s = bm->ide_if;
+
+    bm->status &= ~BM_STATUS_DMAING;
+    bm->status |= BM_STATUS_INT;
+    bm->dma_cb = NULL;
+    bm->ide_if = NULL;
+    ide_set_irq(s);
+}
+
 /* XXX: full callback usage to prepare non blocking I/Os support -
    error handling */
 #ifdef DMA_MULTI_THREAD
@@ -2070,9 +2072,8 @@
         cur_addr += 8;
     }
     /* end of transfer */
- the_end:
-    bm->dma_cb = NULL;
-    bm->ide_if = NULL;
+the_end:
+    ide_dma_finish(bm);
 }
 
 static void ide_dma_start(IDEState *s, IDEDMAFunc *dma_cb)
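
With ide_dma_finish() above, the interrupt bookkeeping that was previously duplicated in the ATAPI and sector-transfer callbacks happens exactly once, after the PRD walk. A minimal sketch of the resulting control flow (the loop body and transfer_next_chunk() are hypothetical, for illustration only):

    static void dma_loop_sketch(BMDMAState *bm)
    {
        /* The transfer callback now only moves data and reports
         * completion; it no longer touches bm->status or the IRQ line. */
        while (transfer_next_chunk(bm))
            ;
        /* Single completion point: clear BM_STATUS_DMAING, set
         * BM_STATUS_INT, detach the callback and raise the IRQ once. */
        ide_dma_finish(bm);
    }
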
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/hw/pcnet.c
--- a/tools/ioemu/hw/pcnet.c    Wed Mar  1 17:01:54 2006
+++ b/tools/ioemu/hw/pcnet.c    Wed Mar  1 19:47:25 2006
@@ -376,6 +376,10 @@
     if (s->recv_pos > 0)
         return 0;
 
+    pcnet_rdte_poll(s);
+    if (!(CSR_CRST(s) & 0x8000)) {
+        return 0;
+    }
     return sizeof(s->buffer)-16;
 }
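
The hunk above makes pcnet_can_receive() refuse a frame unless polling the receive ring turns up a descriptor the card owns; 0x8000 is taken here to be the OWN bit of the current-receive-status CSR. A condensed restatement (names as in the hunk):

    /* Sketch: without a card-owned RX descriptor, report no buffer
     * space so the frame is deferred instead of clobbering the ring. */
    static int can_receive_sketch(PCNetState *s)
    {
        pcnet_rdte_poll(s);              /* refresh CSR_CRST from the ring */
        if (!(CSR_CRST(s) & 0x8000))     /* OWN bit clear: card owns none  */
            return 0;
        return sizeof(s->buffer) - 16;   /* room for one frame             */
    }
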
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/target-i386-dm/Makefile
--- a/tools/ioemu/target-i386-dm/Makefile       Wed Mar  1 17:01:54 2006
+++ b/tools/ioemu/target-i386-dm/Makefile       Wed Mar  1 19:47:25 2006
@@ -1,7 +1,8 @@
+include config.mak
+override TARGET_ARCH=i386
+
 XEN_ROOT=../../..
 include $(XEN_ROOT)/tools/Rules.mk
-include config.mak
-override TARGET_ARCH=i386
 
 INSTALL_DIR := $(DESTDIR)/usr/$(LIBDIR)/xen/bin
 TARGET_PATH=$(SRC_PATH)/target-$(TARGET_ARCH)
@@ -12,7 +13,7 @@
 VPATH+=:$(SRC_PATH)/linux-user
 DEFINES+=-I$(SRC_PATH)/linux-user -I$(SRC_PATH)/linux-user/$(TARGET_ARCH)
 endif
-CFLAGS=-Wall -O2 -g -fno-strict-aliasing
+CFLAGS+=-Wall -O2 -g -fno-strict-aliasing
 LDFLAGS=-g
 LIBS=
 HELPER_CFLAGS=$(CFLAGS)
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c      Wed Mar  1 17:01:54 2006
+++ b/tools/libxc/xc_linux_build.c      Wed Mar  1 19:47:25 2006
@@ -45,6 +45,77 @@
 #ifdef __ia64__
 #define probe_aout9(image,image_size,load_funcs) 1
 #endif
+
+static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = {
+    [XENFEAT_writable_page_tables]       = "writable_page_tables",
+    [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables",
+    [XENFEAT_auto_translated_physmap]    = "auto_translated_physmap",
+    [XENFEAT_supervisor_mode_kernel]     = "supervisor_mode_kernel",
+    [XENFEAT_pae_pgdir_above_4gb]        = "pae_pgdir_above_4gb"
+};
+
+static inline void set_feature_bit (int nr, uint32_t *addr)
+{
+    addr[nr>>5] |= (1<<(nr&31));
+}
+
+static inline int test_feature_bit(int nr, uint32_t *addr)
+{
+    return !!(addr[nr>>5] & (1<<(nr&31)));
+}
+
+static int parse_features(
+    const char *feats,
+    uint32_t supported[XENFEAT_NR_SUBMAPS],
+    uint32_t required[XENFEAT_NR_SUBMAPS])
+{
+    const char *end, *p;
+    int i, req;
+
+    if ( (end = strchr(feats, ',')) == NULL )
+        end = feats + strlen(feats);
+
+    while ( feats < end )
+    {
+        p = strchr(feats, '|');
+        if ( (p == NULL) || (p > end) )
+            p = end;
+
+        req = (*feats == '!');
+        if ( req )
+            feats++;
+
+        for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ )
+        {
+            if ( feature_names[i] == NULL )
+                continue;
+
+            if ( strncmp(feature_names[i], feats, p-feats) == 0 )
+            {
+                set_feature_bit(i, supported);
+                if ( required && req )
+                    set_feature_bit(i, required);
+                break;
+            }
+        }
+
+        if ( i == XENFEAT_NR_SUBMAPS*32 )
+        {
+            ERROR("Unknown feature \"%.*s\".\n", (int)(p-feats), feats);
+            if ( req )
+            {
+                ERROR("Kernel requires an unknown hypervisor feature.\n");
+                return 0;
+            }
+        }
+
+        feats = p;
+        if ( *feats == '|' )
+            feats++;
+    }
+
+    return 1;
+}
 
 static int probeimageformat(char *image,
                             unsigned long image_size,
@@ -344,7 +415,8 @@
                        unsigned long shared_info_frame,
                        unsigned long flags,
                        unsigned int store_evtchn, unsigned long *store_mfn,
-                       unsigned int console_evtchn, unsigned long *console_mfn)
+                       unsigned int console_evtchn, unsigned long *console_mfn,
+                       uint32_t required_features[XENFEAT_NR_SUBMAPS])
 {
     unsigned long *page_array = NULL;
     struct load_funcs load_funcs;
@@ -483,7 +555,8 @@
                        unsigned long shared_info_frame,
                        unsigned long flags,
                        unsigned int store_evtchn, unsigned long *store_mfn,
-                       unsigned int console_evtchn, unsigned long *console_mfn)
+                       unsigned int console_evtchn, unsigned long *console_mfn,
+                       uint32_t required_features[XENFEAT_NR_SUBMAPS])
 {
     unsigned long *page_array = NULL;
     unsigned long count, i, hypercall_pfn;
@@ -515,8 +588,9 @@
     unsigned long vpt_start;
     unsigned long vpt_end;
     unsigned long v_end;
-    unsigned shadow_mode_enabled;
     unsigned long guest_store_mfn, guest_console_mfn, guest_shared_info_mfn;
+    unsigned long shadow_mode_enabled;
+    uint32_t supported_features[XENFEAT_NR_SUBMAPS] = { 0, };
 
     rc = probeimageformat(image, image_size, &load_funcs);
     if ( rc != 0 )
@@ -534,8 +608,6 @@
         goto error_out;
     }
 
-    shadow_mode_enabled = !!strstr(dsi.xen_guest_string,
-                                   "SHADOW=translate");
     /*
      * Why do we need this? The number of page-table frames depends on the 
      * size of the bootstrap address space. But the size of the address space 
@@ -637,6 +709,35 @@
     (load_funcs.loadimage)(image, image_size, xc_handle, dom, page_array,
                            &dsi);
 
+    /* Parse and validate kernel features. */
+    p = strstr(dsi.xen_guest_string, "FEATURES=");
+    if ( p != NULL )
+    {
+        if ( !parse_features(p + strlen("FEATURES="),
+                             supported_features,
+                             required_features) )
+        {
+            ERROR("Failed to parse guest kernel features.\n");
+            goto error_out;
+        }
+
+        fprintf(stderr, "Supported features  = { %08x }.\n",
+                supported_features[0]);
+        fprintf(stderr, "Required features   = { %08x }.\n",
+                required_features[0]);
+    }
+
+    for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ )
+    {
+        if ( (supported_features[i]&required_features[i]) != required_features[i] )
+        {
+            ERROR("Guest kernel does not support a required feature.\n");
+            goto error_out;
+        }
+    }
+
+    shadow_mode_enabled = test_feature_bit(XENFEAT_auto_translated_physmap, required_features);
+
     /* Load the initial ramdisk image. */
     if ( initrd_len != 0 )
     {
@@ -870,6 +971,7 @@
                    const char *image_name,
                    const char *ramdisk_name,
                    const char *cmdline,
+                   const char *features,
                    unsigned long flags,
                    unsigned int store_evtchn,
                    unsigned long *store_mfn,
@@ -886,6 +988,16 @@
     char         *image = NULL;
     unsigned long image_size, initrd_size=0;
     unsigned long vstartinfo_start, vkern_entry, vstack_start;
+    uint32_t      features_bitmap[XENFEAT_NR_SUBMAPS] = { 0, };
+
+    if ( features != NULL )
+    {
+        if ( !parse_features(features, features_bitmap, NULL) )
+        {
+            PERROR("Failed to parse configured features\n");
+            goto error_out;
+        }
+    }
 
     if ( (nr_pages = get_tot_pages(xc_handle, domid)) < 0 )
     {
@@ -940,7 +1052,8 @@
                      &vstack_start, ctxt, cmdline,
                      op.u.getdomaininfo.shared_info_frame,
                      flags, store_evtchn, store_mfn,
-                     console_evtchn, console_mfn) < 0 )
+                     console_evtchn, console_mfn,
+                     features_bitmap) < 0 )
     {
         ERROR("Error constructing guest OS");
         goto error_out;
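
The FEATURES= clause consumed by parse_features() comes from the kernel's __xen_guest signature: feature names are separated by '|' and a '!' prefix marks a feature the kernel requires rather than merely supports. An illustrative call against the functions above:

    uint32_t supported[XENFEAT_NR_SUBMAPS] = { 0, };
    uint32_t required[XENFEAT_NR_SUBMAPS]  = { 0, };

    /* Sets writable_page_tables in `supported` only, and
     * pae_pgdir_above_4gb in both `supported` and `required`. */
    parse_features("writable_page_tables|!pae_pgdir_above_4gb",
                   supported, required);
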
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Wed Mar  1 17:01:54 2006
+++ b/tools/libxc/xenguest.h    Wed Mar  1 19:47:25 2006
@@ -47,6 +47,7 @@
                    const char *image_name,
                    const char *ramdisk_name,
                    const char *cmdline,
+                   const char *features,
                    unsigned long flags,
                    unsigned int store_evtchn,
                    unsigned long *store_mfn,
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/pygrub/src/pygrub
--- a/tools/pygrub/src/pygrub   Wed Mar  1 17:01:54 2006
+++ b/tools/pygrub/src/pygrub   Wed Mar  1 19:47:25 2006
@@ -94,11 +94,17 @@
             return struct.unpack("<L", buf[poff+8:poff+12])[0] * SECTOR_SIZE
     return -1
 
-def get_config(fn):
+def get_config(fn, isconfig = False):
     if not os.access(fn, os.R_OK):
         raise RuntimeError, "Unable to access %s" %(fn,)
 
     cf = grub.GrubConf.GrubConfigFile()
+
+    if isconfig:
+        # set the config file and parse it
+        cf.filename = fn
+        cf.parse()
+        return cf
 
     offset = 0
     if is_disk_image(fn):
@@ -130,9 +136,7 @@
         # then parse the grub config
         cf.parse(buf)
     else:
-        # set the config file and parse it
-        cf.filename = fn
-        cf.parse()
+        raise RuntimeError, "Unable to read filesystem" 
     
     return cf
 
@@ -214,7 +218,8 @@
 
     try:
         opts, args = getopt.gnu_getopt(sys.argv[1:], 'qh::',
-                                   ["quiet", "help", "output=", "entry="])
+                                   ["quiet", "help", "output=", "entry=",
+                                    "isconfig"])
     except getopt.GetoptError:
         usage()
         sys.exit(1)
@@ -227,6 +232,7 @@
     output = None
     entry = None
     interactive = True
+    isconfig = False
     for o, a in opts:
         if o in ("-q", "--quiet"):
             interactive = False
@@ -239,13 +245,15 @@
             entry = a
             # specifying the entry to boot implies non-interactive
             interactive = False
+        elif o in ("--isconfig",):
+            isconfig = True
 
     if output is None or output == "-":
         fd = sys.stdout.fileno()
     else:
         fd = os.open(output, os.O_WRONLY)
 
-    cf = get_config(file)
+    cf = get_config(file, isconfig)
     if interactive:
         curses.wrapper(run_main)
     else:
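
The new --isconfig switch makes pygrub treat its argument as a grub.conf to parse directly rather than as a disk image, and a file that is neither a disk image nor a readable filesystem now raises an error instead of being silently parsed as a config.
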
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Mar  1 19:47:25 2006
@@ -326,27 +326,29 @@
                                   PyObject *kwds)
 {
     uint32_t dom;
-    char *image, *ramdisk = NULL, *cmdline = "";
+    char *image, *ramdisk = NULL, *cmdline = "", *features = NULL;
     int flags = 0;
     int store_evtchn, console_evtchn;
     unsigned long store_mfn = 0;
     unsigned long console_mfn = 0;
 
-    static char *kwd_list[] = { "dom", "store_evtchn", 
-                                "console_evtchn", "image", 
+    static char *kwd_list[] = { "dom", "store_evtchn",
+                                "console_evtchn", "image",
                                /* optional */
-                               "ramdisk", "cmdline", "flags", NULL };
-
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssi", kwd_list,
+                               "ramdisk", "cmdline", "flags",
+                               "features", NULL };
+
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssis", kwd_list,
                                       &dom, &store_evtchn,
-                                     &console_evtchn, &image, 
+                                     &console_evtchn, &image,
                                      /* optional */
-                                     &ramdisk, &cmdline, &flags) )
+                                     &ramdisk, &cmdline, &flags,
+                                     &features) )
         return NULL;
 
     if ( xc_linux_build(self->xc_handle, dom, image,
-                        ramdisk, cmdline, flags,
-                        store_evtchn, &store_mfn, 
+                        ramdisk, cmdline, features, flags,
+                        store_evtchn, &store_mfn,
                        console_evtchn, &console_mfn) != 0 ) {
         if (!errno)
              errno = EINVAL;
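
In the PyArg_ParseTupleAndKeywords format string, everything after '|' is optional, so extending "iiis|ssi" to "iiis|ssis" adds features as an optional keyword string (defaulting to NULL) that is forwarded straight to xc_linux_build(). A Python caller would pass it as, say, features="writable_page_tables" alongside the existing keywords.
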
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/XendBootloader.py
--- a/tools/python/xen/xend/XendBootloader.py   Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xend/XendBootloader.py   Wed Mar  1 19:47:25 2006
@@ -1,7 +1,7 @@
 #
 # XendBootloader.py - Framework to run a boot loader for picking the kernel
 #
-# Copyright 2005 Red Hat, Inc.
+# Copyright 2005-2006 Red Hat, Inc.
 # Jeremy Katz <katzj@xxxxxxxxxx>
 #
 # This software may be freely redistributed under the terms of the GNU
@@ -13,12 +13,11 @@
 #
 
 import os, select, errno
+import random
 import sxp
 
 from XendLogging import log
 from XendError import VmError
-
-BL_FIFO = "/var/lib/xen/xenbl"
 
 def bootloader(blexec, disk, quiet = 0, vcpus = None, entry = None):
     """Run the boot loader executable on the given disk and return a
@@ -38,14 +37,18 @@
         log.error(msg)
         raise VmError(msg)
 
-    os.mkfifo(BL_FIFO, 0600)
+    while True:
+        fifo = "/var/lib/xen/xenbl.%s" %(random.randint(0, 32000),)
+        if not os.path.exists(fifo):
+            break
+    os.mkfifo(fifo, 0600)
 
     child = os.fork()
     if (not child):
         args = [ blexec ]
         if quiet:
             args.append("-q")
-        args.append("--output=%s" %(BL_FIFO,))
+        args.append("--output=%s" %(fifo,))
         if entry is not None:
             args.append("--entry=%s" %(entry,))
         args.append(disk)
@@ -59,7 +62,7 @@
 
     while 1:
         try:
-            r = os.open(BL_FIFO, os.O_RDONLY)
+            r = os.open(fifo, os.O_RDONLY)
         except OSError, e:
             if e.errno == errno.EINTR:
                 continue
@@ -74,7 +77,7 @@
         
     os.waitpid(child, 0)
     os.close(r)
-    os.unlink(BL_FIFO)
+    os.unlink(fifo)
 
     if len(ret) == 0:
         msg = "Boot loader didn't return any data!"
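
Replacing the single well-known FIFO with a randomly numbered /var/lib/xen/xenbl.N avoids the collision when two domains run the bootloader concurrently: previously the second mkfifo on the shared path would fail, or the two xend threads would read each other's output.
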
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xend/XendDomainInfo.py   Wed Mar  1 19:47:25 2006
@@ -1502,15 +1502,14 @@
         if not self.info['bootloader']:
             return
         # if we're restarting with a bootloader, we need to run it
-        # FIXME: this assumes the disk is the first device and
-        # that we're booting from the first disk
         blcfg = None
         config = self.sxpr()
         # FIXME: this assumes that we want to use the first disk
-        dev = sxp.child_value(config, "device")
-        if dev:
-            disk = sxp.child_value(dev, "uname")
-            fn = blkdev_uname_to_file(disk)
+        for dev in sxp.children(config, "device"):
+            disk = sxp.child(dev, "vbd")
+            if disk is None:
+                continue
+            fn = blkdev_uname_to_file(sxp.child_value(disk, "uname"))
             blcfg = bootloader(self.info['bootloader'], fn, 1,
                                self.info['vcpus'])
         if blcfg is None:
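
The restart path now walks the device entries looking for a vbd child rather than blindly taking the first device stanza, so domains whose first device is, say, a vif restart correctly; the FIXME remains because it still boots from the first disk it finds.
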
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/image.py
--- a/tools/python/xen/xend/image.py    Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xend/image.py    Wed Mar  1 19:47:25 2006
@@ -68,6 +68,7 @@
         self.kernel = None
         self.ramdisk = None
         self.cmdline = None
+        self.features = None
 
         self.configure(imageConfig, deviceConfig)
 
@@ -89,6 +90,7 @@
         if args:
             self.cmdline += " " + args
         self.ramdisk = get_cfg("ramdisk", '')
+        self.features = get_cfg("features", '')
         
         self.vm.storeVm(("image/ostype", self.ostype),
                         ("image/kernel", self.kernel),
@@ -175,13 +177,15 @@
         log.debug("cmdline        = %s", self.cmdline)
         log.debug("ramdisk        = %s", self.ramdisk)
         log.debug("vcpus          = %d", self.vm.getVCpuCount())
+        log.debug("features       = %s", self.features)
 
         return xc.linux_build(dom            = self.vm.getDomid(),
                               image          = self.kernel,
                               store_evtchn   = store_evtchn,
                               console_evtchn = console_evtchn,
                               cmdline        = self.cmdline,
-                              ramdisk        = self.ramdisk)
+                              ramdisk        = self.ramdisk,
+                              features       = self.features)
 
 class HVMImageHandler(ImageHandler):
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/server/netif.py
--- a/tools/python/xen/xend/server/netif.py     Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xend/server/netif.py     Wed Mar  1 19:47:25 2006
@@ -113,7 +113,8 @@
                            script.replace(xroot.network_script_dir + os.sep,
                                           "")])
         if ip:
-            result.append(['ip', ip.split(" ")])
+            for i in ip.split(" "):
+                result.append(['ip', i])
         if bridge:
             result.append(['bridge', bridge])
         if mac:
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xm/create.py
--- a/tools/python/xen/xm/create.py     Wed Mar  1 17:01:54 2006
+++ b/tools/python/xen/xm/create.py     Wed Mar  1 19:47:25 2006
@@ -137,6 +137,10 @@
           fn=set_value, default='',
           use="Path to ramdisk.")
 
+gopts.var('features', val='FEATURES',
+          fn=set_value, default='',
+          use="Features to enable in guest kernel")
+
 gopts.var('builder', val='FUNCTION',
           fn=set_value, default='linux',
           use="Function to use to build the domain.")
@@ -445,6 +449,8 @@
         config_image.append(['root', cmdline_root])
     if vals.extra:
         config_image.append(['args', vals.extra])
+    if vals.features:
+        config_image.append(['features', vals.features])
 
     if vals.builder == 'hvm':
         configure_hvm(config_image, vals)
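
End to end: a features= option is stored in the image sexpr here, read back by image.py, passed as the features keyword to xc.linux_build(), and finally parsed against the guest kernel's advertised feature bitmap in xc_linux_build().
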
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/tests/Makefile
--- a/tools/tests/Makefile      Wed Mar  1 17:01:54 2006
+++ b/tools/tests/Makefile      Wed Mar  1 19:47:25 2006
@@ -4,13 +4,12 @@
 
 TARGET := test_x86_emulator
 
-CC     := gcc
-CFLAGS := -O2 -Wall -Werror -D__TEST_HARNESS__
+HOSTCFLAGS += -D__TEST_HARNESS__
 
 all: $(TARGET)
 
 $(TARGET): x86_emulate.o test_x86_emulator.o
-       $(CC) -o $@ $^
+       $(HOSTCC) -o $@ $^
 
 clean:
        rm -rf $(TARGET) *.o *~ core
@@ -18,7 +17,7 @@
 install:
 
 x86_emulate.o: $(XEN_ROOT)/xen/arch/x86/x86_emulate.c
-       $(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
+       $(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
 
 %.o: %.c
-       $(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
+       $(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $<
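
The test harness now builds with HOSTCC/HOSTCFLAGS from the common build rules instead of a hard-wired gcc -O2, so the emulator tests keep building natively even when the rest of the tree is configured for a cross compiler.
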
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xenstore/xs.c
--- a/tools/xenstore/xs.c       Wed Mar  1 17:01:54 2006
+++ b/tools/xenstore/xs.c       Wed Mar  1 19:47:25 2006
@@ -31,7 +31,6 @@
 #include <signal.h>
 #include <stdint.h>
 #include <errno.h>
-#include <sys/ioctl.h>
 #include <pthread.h>
 #include "xs.h"
 #include "list.h"
@@ -343,7 +342,6 @@
                free(ret);
                saved_errno = EBADF;
                goto close_fd;
-               
        }
        return ret;
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/configure.ac
--- a/tools/xm-test/configure.ac        Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/configure.ac        Wed Mar  1 19:47:25 2006
@@ -93,6 +93,7 @@
     tests/unpause/Makefile
     tests/vcpu-pin/Makefile
     tests/vcpu-disable/Makefile
+    tests/vtpm/Makefile
     tests/enforce_dom0_cpus/Makefile
     lib/XmTestReport/xmtest.py
     lib/XmTestLib/config.py
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/lib/XmTestLib/Network.py
--- a/tools/xm-test/lib/XmTestLib/Network.py    Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/lib/XmTestLib/Network.py    Wed Mar  1 19:47:25 2006
@@ -22,6 +22,7 @@
 import sys;
 import os;
 import atexit;
+import random;
 
 from Test import *
 from Xm import *
@@ -53,12 +54,22 @@
         if rc == 0:
             SKIP("Zeroconf address found: " + out)
 
+        # Randomize one octet of the IP addresses we choose, so that
+        # multiple machines running network tests don't interfere 
+        # with each other. 
+        self.subnet = random.randint(1,254)
+
     def calc_ip_address(self, dom, interface):
         # Generate an IP address from the dom# and eth#:
-        #      169.254.(eth#+153).(dom#+10)
+        #      169.254.(self.subnet).(eth# * 16 + dom# + 1)
         ethnum = int(interface[len("eth"):])
+        if (ethnum > 15):
+            raise NetworkError("ethnum > 15 : " + interface)
         domnum = int(dom[len("dom"):])
-        return "169.254."+ str(ethnum+153) + "." + str(domnum+10)
+        if (domnum > 14):
+            raise NetworkError("domnum > 14 : " + dom)
+
+        return "169.254."+ str(self.subnet) + "." + str(ethnum*16+domnum+1)
 
     def ip(self, dom, interface, todomname=None, toeth=None, bridge=None):
         newip = self.calc_ip_address(dom, interface)
@@ -96,4 +107,4 @@
         return newip
 
     def mask(self, dom, interface):
-        return "255.255.255.0"
+        return "255.255.255.240"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/lib/XmTestLib/XenDomain.py
--- a/tools/xm-test/lib/XmTestLib/XenDomain.py  Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/lib/XmTestLib/XenDomain.py  Wed Mar  1 19:47:25 2006
@@ -99,6 +99,7 @@
         # These options need to be lists
         self.defaultOpts["disk"] = []
         self.defaultOpts["vif"]  = []
+        self.defaultOpts["vtpm"] = []
 
         self.opts = self.defaultOpts
 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/Makefile.am
--- a/tools/xm-test/tests/Makefile.am   Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/tests/Makefile.am   Wed Mar  1 19:47:25 2006
@@ -23,6 +23,7 @@
                unpause         \
                vcpu-disable    \
                vcpu-pin        \
+               vtpm            \
                enforce_dom0_cpus       \
                save restore migrate
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/Rules.mk
--- a/xen/Rules.mk      Wed Mar  1 17:01:54 2006
+++ b/xen/Rules.mk      Wed Mar  1 19:47:25 2006
@@ -45,7 +45,7 @@
 
 include $(BASEDIR)/arch/$(TARGET_ARCH)/Rules.mk
 
-CFLAGS += -g
+CFLAGS += -g -D__XEN__
 
 ifneq ($(debug),y)
 CFLAGS += -DNDEBUG
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/ia64/vmx/vmx_hypercall.c
--- a/xen/arch/ia64/vmx/vmx_hypercall.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/ia64/vmx/vmx_hypercall.c Wed Mar  1 19:47:25 2006
@@ -57,45 +57,7 @@
     vcpu_set_gr(vcpu, 8, ret, 0);
     vmx_vcpu_increment_iip(vcpu);
 }
-/* turn off temporarily, we will merge hypercall parameter convention with xeno, when
-    VTI domain need to call hypercall */
-#if 0
-unsigned long __hypercall_create_continuation(
-    unsigned int op, unsigned int nr_args, ...)
-{
-    struct mc_state *mcs = &mc_state[smp_processor_id()];
-    VCPU *vcpu = current;
-    struct cpu_user_regs *regs = vcpu_regs(vcpu);
-    unsigned int i;
-    va_list args;
-
-    va_start(args, nr_args);
-    if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) {
-       panic("PREEMPT happen in multicall\n"); // Not support yet
-    } else {
-       vcpu_set_gr(vcpu, 15, op, 0);
-       for ( i = 0; i < nr_args; i++) {
-           switch (i) {
-           case 0: vcpu_set_gr(vcpu, 16, va_arg(args, unsigned long), 0);
-                   break;
-           case 1: vcpu_set_gr(vcpu, 17, va_arg(args, unsigned long), 0);
-                   break;
-           case 2: vcpu_set_gr(vcpu, 18, va_arg(args, unsigned long), 0);
-                   break;
-           case 3: vcpu_set_gr(vcpu, 19, va_arg(args, unsigned long), 0);
-                   break;
-           case 4: vcpu_set_gr(vcpu, 20, va_arg(args, unsigned long), 0);
-                   break;
-           default: panic("Too many args for hypercall continuation\n");
-                   break;
-           }
-       }
-    }
-    vcpu->arch.hypercall_continuation = 1;
-    va_end(args);
-    return op;
-}
-#endif
+
 void hyper_dom_mem_op(void)
 {
     VCPU *vcpu=current;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/ia64/xen/process.c
--- a/xen/arch/ia64/xen/process.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/ia64/xen/process.c       Wed Mar  1 19:47:25 2006
@@ -801,30 +801,48 @@
        reflect_interruption(isr,regs,vector);
 }
 
-unsigned long __hypercall_create_continuation(
-       unsigned int op, unsigned int nr_args, ...)
+unsigned long hypercall_create_continuation(
+       unsigned int op, const char *format, ...)
 {
     struct mc_state *mcs = &mc_state[smp_processor_id()];
     struct vcpu *v = current;
+    const char *p = format;
+    unsigned long arg;
     unsigned int i;
     va_list args;
 
-    va_start(args, nr_args);
+    va_start(args, format);
     if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) {
        panic("PREEMPT happen in multicall\n"); // Not support yet
     } else {
        vcpu_set_gr(v, 2, op, 0);
-       for ( i = 0; i < nr_args; i++) {
+       for ( i = 0; *p != '\0'; i++) {
+            switch ( *p++ )
+            {
+            case 'i':
+                arg = (unsigned long)va_arg(args, unsigned int);
+                break;
+            case 'l':
+                arg = (unsigned long)va_arg(args, unsigned long);
+                break;
+            case 'p':
+            case 'h':
+                arg = (unsigned long)va_arg(args, void *);
+                break;
+            default:
+                arg = 0;
+                BUG();
+            }
            switch (i) {
-           case 0: vcpu_set_gr(v, 14, va_arg(args, unsigned long), 0);
+           case 0: vcpu_set_gr(v, 14, arg, 0);
                    break;
-           case 1: vcpu_set_gr(v, 15, va_arg(args, unsigned long), 0);
+           case 1: vcpu_set_gr(v, 15, arg, 0);
                    break;
-           case 2: vcpu_set_gr(v, 16, va_arg(args, unsigned long), 0);
+           case 2: vcpu_set_gr(v, 16, arg, 0);
                    break;
-           case 3: vcpu_set_gr(v, 17, va_arg(args, unsigned long), 0);
+           case 3: vcpu_set_gr(v, 17, arg, 0);
                    break;
-           case 4: vcpu_set_gr(v, 18, va_arg(args, unsigned long), 0);
+           case 4: vcpu_set_gr(v, 18, arg, 0);
                    break;
            default: panic("Too many args for hypercall continuation\n");
                    break;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/Makefile     Wed Mar  1 19:47:25 2006
@@ -33,6 +33,10 @@
  endif
 endif
 
+ifneq ($(supervisor_mode_kernel),y)
+OBJS := $(subst x86_32/supervisor_mode_kernel.o,,$(OBJS))
+endif
+
 OBJS := $(subst $(TARGET_SUBARCH)/asm-offsets.o,,$(OBJS))
 OBJS := $(subst $(TARGET_SUBARCH)/xen.lds.o,,$(OBJS))
 
@@ -44,7 +48,7 @@
 
 $(TARGET): $(TARGET)-syms boot/mkelf32
        ./boot/mkelf32 $(TARGET)-syms $(TARGET) 0x100000 \
-       `nm $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'`
+       `$(NM) $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'`
 
 $(CURDIR)/arch.o: $(OBJS)
        $(LD) $(LDFLAGS) -r -o $@ $(OBJS)
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/Rules.mk
--- a/xen/arch/x86/Rules.mk     Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/Rules.mk     Wed Mar  1 19:47:25 2006
@@ -6,6 +6,7 @@
 # 'make clean' before rebuilding.
 #
 pae ?= n
+supervisor_mode_kernel ?= n
 
 CFLAGS  += -nostdinc -fno-builtin -fno-common -fno-strict-aliasing
 CFLAGS  += -iwithprefix include -Wall -Werror -Wno-pointer-arith -pipe
@@ -32,6 +33,9 @@
 CFLAGS  += -DCONFIG_X86_PAE=1
 endif
 endif
+ifeq ($(supervisor_mode_kernel),y)
+CFLAGS  += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1
+endif
 
 ifeq ($(TARGET_SUBARCH),x86_64)
 CFLAGS  += -m64 -mno-red-zone -fpic -fno-reorder-blocks
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/boot/mkelf32.c
--- a/xen/arch/x86/boot/mkelf32.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/boot/mkelf32.c       Wed Mar  1 19:47:25 2006
@@ -244,7 +244,7 @@
 
     inimage  = argv[1];
     outimage = argv[2];
-    loadbase = strtoul(argv[3], NULL, 16);
+    loadbase = strtoull(argv[3], NULL, 16);
     final_exec_addr = strtoul(argv[4], NULL, 16);
 
     infd = open(inimage, O_RDONLY);
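
strtoull() matters here because mkelf32 may itself be built 32-bit, where unsigned long is 32 bits and strtoul() saturates a 64-bit Xen link address at ULONG_MAX. Illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        const char *s = "ffff830000000000";   /* a 64-bit load address */
        /* On an LP32 host the first line prints ffffffff (saturated);
         * strtoull() preserves the full value on either host size.    */
        printf("%lx\n",  strtoul(s, NULL, 16));
        printf("%llx\n", strtoull(s, NULL, 16));
        return 0;
    }
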
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c   Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/dom0_ops.c   Wed Mar  1 19:47:25 2006
@@ -181,10 +181,13 @@
     {
         dom0_physinfo_t *pi = &op->u.physinfo;
 
-        pi->threads_per_core = smp_num_siblings;
-        pi->cores_per_socket = boot_cpu_data.x86_max_cores;
+        pi->threads_per_core =
+            cpus_weight(cpu_sibling_map[0]);
+        pi->cores_per_socket =
+            cpus_weight(cpu_core_map[0]) / pi->threads_per_core;
         pi->sockets_per_node = 
-            num_online_cpus() / (pi->threads_per_core * pi->cores_per_socket);
+            num_online_cpus() / cpus_weight(cpu_core_map[0]);
+
         pi->nr_nodes         = 1;
         pi->total_pages      = total_pages;
         pi->free_pages       = avail_domheap_pages();
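
Worked example of the new derivation: on a two-socket host with dual-core, hyperthreaded processors (8 logical CPUs online), cpus_weight(cpu_sibling_map[0]) = 2 gives threads_per_core, cpus_weight(cpu_core_map[0]) = 4 gives cores_per_socket = 4/2 = 2, and sockets_per_node = 8/4 = 2. Deriving the counts from CPU0's sibling and core maps reflects what is actually online, where smp_num_siblings and x86_max_cores report cpuid maxima.
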
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/domain.c     Wed Mar  1 19:47:25 2006
@@ -351,17 +351,17 @@
 
     if ( !(c->flags & VGCF_HVM_GUEST) )
     {
-        fixup_guest_selector(c->user_regs.ss);
-        fixup_guest_selector(c->kernel_ss);
-        fixup_guest_selector(c->user_regs.cs);
+        fixup_guest_stack_selector(c->user_regs.ss);
+        fixup_guest_stack_selector(c->kernel_ss);
+        fixup_guest_code_selector(c->user_regs.cs);
 
 #ifdef __i386__
-        fixup_guest_selector(c->event_callback_cs);
-        fixup_guest_selector(c->failsafe_callback_cs);
+        fixup_guest_code_selector(c->event_callback_cs);
+        fixup_guest_code_selector(c->failsafe_callback_cs);
 #endif
 
         for ( i = 0; i < 256; i++ )
-            fixup_guest_selector(c->trap_ctxt[i].cs);
+            fixup_guest_code_selector(c->trap_ctxt[i].cs);
     }
     else if ( !hvm_enabled )
       return -EINVAL;
@@ -784,6 +784,11 @@
 
     context_saved(prev);
 
+    /* Update per-VCPU guest runstate shared memory area (if registered). */
+    if ( next->runstate_guest != NULL )
+        __copy_to_user(next->runstate_guest, &next->runstate,
+                       sizeof(next->runstate));
+
     schedule_tail(next);
     BUG();
 }
@@ -820,56 +825,77 @@
     flush_tlb_mask(v->vcpu_dirty_cpumask);
 }
 
-unsigned long __hypercall_create_continuation(
-    unsigned int op, unsigned int nr_args, ...)
+#define next_arg(fmt, args) ({                                              \
+    unsigned long __arg;                                                    \
+    switch ( *(fmt)++ )                                                     \
+    {                                                                       \
+    case 'i': __arg = (unsigned long)va_arg(args, unsigned int);  break;    \
+    case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break;    \
+    case 'p': __arg = (unsigned long)va_arg(args, void *);        break;    \
+    case 'h': __arg = (unsigned long)va_arg(args, void *);        break;    \
+    default:  __arg = 0; BUG();                                             \
+    }                                                                       \
+    __arg;                                                                  \
+})
+
+unsigned long hypercall_create_continuation(
+    unsigned int op, const char *format, ...)
 {
     struct mc_state *mcs = &mc_state[smp_processor_id()];
     struct cpu_user_regs *regs;
+    const char *p = format;
+    unsigned long arg;
     unsigned int i;
     va_list args;
 
-    va_start(args, nr_args);
+    va_start(args, format);
 
     if ( test_bit(_MCSF_in_multicall, &mcs->flags) )
     {
         __set_bit(_MCSF_call_preempted, &mcs->flags);
 
-        for ( i = 0; i < nr_args; i++ )
-            mcs->call.args[i] = va_arg(args, unsigned long);
+        for ( i = 0; *p != '\0'; i++ )
+            mcs->call.args[i] = next_arg(p, args);
     }
     else
     {
         regs       = guest_cpu_user_regs();
 #if defined(__i386__)
         regs->eax  = op;
-        regs->eip -= 2;  /* re-execute 'int 0x82' */
-
-        for ( i = 0; i < nr_args; i++ )
-        {
+
+        if ( supervisor_mode_kernel )
+            regs->eip &= ~31; /* re-execute entire hypercall entry stub */
+        else
+            regs->eip -= 2;   /* re-execute 'int 0x82' */
+
+        for ( i = 0; *p != '\0'; i++ )
+        {
+            arg = next_arg(p, args);
             switch ( i )
             {
-            case 0: regs->ebx = va_arg(args, unsigned long); break;
-            case 1: regs->ecx = va_arg(args, unsigned long); break;
-            case 2: regs->edx = va_arg(args, unsigned long); break;
-            case 3: regs->esi = va_arg(args, unsigned long); break;
-            case 4: regs->edi = va_arg(args, unsigned long); break;
-            case 5: regs->ebp = va_arg(args, unsigned long); break;
+            case 0: regs->ebx = arg; break;
+            case 1: regs->ecx = arg; break;
+            case 2: regs->edx = arg; break;
+            case 3: regs->esi = arg; break;
+            case 4: regs->edi = arg; break;
+            case 5: regs->ebp = arg; break;
             }
         }
 #elif defined(__x86_64__)
         regs->rax  = op;
         regs->rip -= 2;  /* re-execute 'syscall' */
 
-        for ( i = 0; i < nr_args; i++ )
-        {
+        for ( i = 0; *p != '\0'; i++ )
+        {
+            arg = next_arg(p, args);
             switch ( i )
             {
-            case 0: regs->rdi = va_arg(args, unsigned long); break;
-            case 1: regs->rsi = va_arg(args, unsigned long); break;
-            case 2: regs->rdx = va_arg(args, unsigned long); break;
-            case 3: regs->r10 = va_arg(args, unsigned long); break;
-            case 4: regs->r8  = va_arg(args, unsigned long); break;
-            case 5: regs->r9  = va_arg(args, unsigned long); break;
+            case 0: regs->rdi = arg; break;
+            case 1: regs->rsi = arg; break;
+            case 2: regs->rdx = arg; break;
+            case 3: regs->r10 = arg; break;
+            case 4: regs->r8  = arg; break;
+            case 5: regs->r9  = arg; break;
             }
         }
 #endif
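
hypercall_create_continuation() now takes a printf-style format rather than a bare argument count: 'i' re-packs an unsigned int, 'l' an unsigned long, and 'p'/'h' a pointer or handle, so 32-bit arguments are widened deliberately before being written back into the guest's argument registers. A hypothetical preemptible hypercall would use it like this (do_example_op and __HYPERVISOR_example_op are illustrative names only):

    static long do_example_op(unsigned int cmd, void *arg, unsigned long left)
    {
        while ( left != 0 )
        {
            if ( hypercall_preempt_check() )
                /* Re-queue with the amount of work still outstanding. */
                return hypercall_create_continuation(
                    __HYPERVISOR_example_op, "ipl", cmd, arg, left);
            left--;  /* ...one unit of work on arg... */
        }
        return 0;
    }
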
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/domain_build.c       Wed Mar  1 19:47:25 2006
@@ -27,6 +27,9 @@
 #include <asm/shadow.h>
 
 #include <public/version.h>
+
+extern unsigned long initial_images_nrpages(void);
+extern void discard_initial_images(void);
 
 static long dom0_nrpages;
 
@@ -181,7 +184,8 @@
         {
             printk("Unknown kernel feature \"%.*s\".\n",
                    (int)(p-feats), feats);
-            panic("Domain 0 requires an unknown hypervisor feature.\n");
+            if ( req )
+                panic("Domain 0 requires an unknown hypervisor feature.\n");
         }
 
         feats = p;
@@ -248,9 +252,6 @@
     uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 };
     uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 };
 
-    extern void translate_l2pgtable(
-        struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn);
-
     /* Sanity! */
     BUG_ON(d->domain_id != 0);
     BUG_ON(d->vcpu[0] == NULL);
@@ -271,18 +272,14 @@
      */
     if ( dom0_nrpages == 0 )
     {
-        dom0_nrpages = avail_domheap_pages() +
-            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
-            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT);
+        dom0_nrpages = avail_domheap_pages() + initial_images_nrpages();
         dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT));
         dom0_nrpages = -dom0_nrpages;
     }
 
     /* Negative memory specification means "all memory - specified amount". */
     if ( dom0_nrpages < 0 )
-        nr_pages = avail_domheap_pages() +
-            ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) +
-            ((image_len  + PAGE_SIZE - 1) >> PAGE_SHIFT) +
+        nr_pages = avail_domheap_pages() + initial_images_nrpages() +
             dom0_nrpages;
     else
         nr_pages = dom0_nrpages;
@@ -704,16 +701,12 @@
         hypercall_page_initialise((void *)hypercall_page);
     }
 
-    init_domheap_pages(
-        _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK);
-
-    /* Copy the initial ramdisk and free temporary buffer. */
+    /* Copy the initial ramdisk. */
     if ( initrd_len != 0 )
-    {
         memcpy((void *)vinitrd_start, initrd_start, initrd_len);
-        init_domheap_pages(
-            _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK);
-    }
+
+    /* Free temporary buffers. */
+    discard_initial_images();
 
     /* Set up start info area. */
     si = (start_info_t *)vstartinfo_start;
@@ -790,6 +783,25 @@
     {
         shadow_mode_enable(d, SHM_enable);
         update_pagetables(v);
+    }
+
+    if ( supervisor_mode_kernel )
+    {
+        v->arch.guest_context.kernel_ss &= ~3;
+        v->arch.guest_context.user_regs.ss &= ~3;
+        v->arch.guest_context.user_regs.es &= ~3;
+        v->arch.guest_context.user_regs.ds &= ~3;
+        v->arch.guest_context.user_regs.fs &= ~3;
+        v->arch.guest_context.user_regs.gs &= ~3;
+        printk("Dom0 runs in ring 0 (supervisor mode)\n");
+        if ( !test_bit(XENFEAT_supervisor_mode_kernel,
+                       dom0_features_supported) )
+            panic("Dom0 does not support supervisor-mode execution\n");
+    }
+    else
+    {
+        if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) )
+            panic("Dom0 requires supervisor-mode execution\n");
     }
 
     rc = 0;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c    Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/hvm.c    Wed Mar  1 19:47:25 2006
@@ -25,6 +25,7 @@
 #include <xen/sched.h>
 #include <xen/irq.h>
 #include <xen/softirq.h>
+#include <xen/domain.h>
 #include <xen/domain_page.h>
 #include <asm/current.h>
 #include <asm/io.h>
@@ -59,9 +60,9 @@
 
     for ( i = 0; i < nr_pfn; i++ )
     {
-        if ( pfn + i >= 0xfffff ) 
+        if ( pfn + i >= 0xfffff )
             break;
-        
+
         __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val));
     }
 }
@@ -217,7 +218,7 @@
     global_iodata_t *spg;
     u16   *virq_line, irqs;
     struct hvm_virpic *pic = &v->domain->arch.hvm_domain.vpic;
-    
+
     spg = &get_sp(v->domain)->sp_global;
     virq_line  = &spg->pic_clear_irr;
     if ( *virq_line ) {
@@ -312,6 +313,52 @@
 }
 
 /*
+ * only called in HVM domain BSP context
+ * when booting, vcpuid is always equal to apic_id
+ */
+int hvm_bringup_ap(int vcpuid, int trampoline_vector)
+{
+    struct vcpu *bsp = current, *v;
+    struct domain *d = bsp->domain;
+    struct vcpu_guest_context *ctxt;
+    int rc = 0;
+
+    /* current must be HVM domain BSP */
+    if ( !(HVM_DOMAIN(bsp) && bsp->vcpu_id == 0) ) {
+        printk("Not calling hvm_bringup_ap from BSP context.\n");
+        domain_crash_synchronous();
+    }
+
+    if ( (v = d->vcpu[vcpuid]) == NULL )
+        return -ENOENT;
+
+    if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL ) {
+        printk("Failed to allocate memory in hvm_bringup_ap.\n");
+        return -ENOMEM;
+    }
+
+    hvm_init_ap_context(ctxt, vcpuid, trampoline_vector);
+
+    LOCK_BIGLOCK(d);
+    rc = -EEXIST;
+    if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) )
+        rc = boot_vcpu(d, vcpuid, ctxt);
+    UNLOCK_BIGLOCK(d);
+
+    if ( rc != 0 )
+        printk("AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc);
+    else {
+        if ( test_and_clear_bit(_VCPUF_down, &d->vcpu[vcpuid]->vcpu_flags) )
+            vcpu_wake(d->vcpu[vcpuid]);
+        printk("AP %d bringup suceeded.\n", vcpuid);
+    }
+
+    xfree(ctxt);
+
+    return rc;
+}
+
+/*
  * Local variables:
  * mode: C
  * c-set-style: "BSD"
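
hvm_bringup_ap() gives the HVM guest's BSP a path for onlining a secondary vcpu: it fills a fresh vcpu_guest_context via hvm_init_ap_context() for the given trampoline vector, initialises the vcpu with boot_vcpu() under the domain's big lock, and wakes the vcpu if it was still flagged down.
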
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/emulate.c
--- a/xen/arch/x86/hvm/svm/emulate.c    Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/emulate.c    Wed Mar  1 19:47:25 2006
@@ -86,7 +86,7 @@
     case 0x7:
         value = regs->edi;
         break;
-#if X86_64
+#if __x86_64__
     case 0x8:
         value = regs->r8;
         break;
@@ -318,20 +318,14 @@
 
 
 /* Get the register/mode number of src register in ModRM register. */
-unsigned int decode_dest_reg(u8 m)
-{
-#if __x86_64__
-    ASSERT(0); /* Need to adjust for REX prefix if applicable */
-#endif
-    return (m >> 3) & 7;
-}
-
-unsigned int decode_src_reg(u8 m)
-{
-#if __x86_64__
-    ASSERT(0); /* Need to adjust for REX prefix if applicable */
-#endif
-    return m & 7;
+unsigned int decode_dest_reg(u8 prefix, u8 m)
+{
+    return DECODE_MODRM_REG(prefix, m);
+}
+
+unsigned int decode_src_reg(u8 prefix, u8 m)
+{
+    return DECODE_MODRM_RM(prefix, m);
 }
 
 
@@ -431,7 +425,7 @@
  * The caller can either pass a NULL pointer to the guest_eip_buf, or a pointer
  * to enough bytes to satisfy the instruction including prefix bytes.
  */
-unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
+int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
         enum instruction_index *list, unsigned int list_count, 
         u8 *guest_eip_buf, enum instruction_index *match)
 {
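
decode_dest_reg()/decode_src_reg() now take the instruction's (possibly REX) prefix byte and defer to the DECODE_MODRM_REG/RM macros, whose definitions are outside this hunk. Plausibly they extend the 3-bit ModRM fields with REX.R and REX.B to reach r8-r15, along these lines (hypothetical sketch, not the tree's macros):

    #define IS_REX(p)       (((p) & 0xf0) == 0x40)   /* REX is 0x40..0x4f */
    /* reg field: ModRM bits 5..3, extended by REX.R (prefix bit 2). */
    #define MODRM_REG(p, m) ((((m) >> 3) & 7) | ((IS_REX(p) && ((p) & 4)) ? 8 : 0))
    /* r/m field: ModRM bits 2..0, extended by REX.B (prefix bit 0). */
    #define MODRM_RM(p, m)  (((m) & 7) | ((IS_REX(p) && ((p) & 1)) ? 8 : 0))
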
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/intr.c
--- a/xen/arch/x86/hvm/svm/intr.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/intr.c       Wed Mar  1 19:47:25 2006
@@ -80,12 +80,7 @@
 {
     struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
 
-    switch(type)
-    {
-    case VLAPIC_DELIV_MODE_EXT:
-    case VLAPIC_DELIV_MODE_FIXED:
-    case VLAPIC_DELIV_MODE_LPRI:
-        if ( is_pit_irq(v, vector, type) ) {
+    if ( is_pit_irq(v, vector, type) ) {
             if ( !vpit->first_injected ) {
                 vpit->first_injected = 1;
                 vpit->pending_intr_nr = 0;
@@ -95,12 +90,15 @@
             }
             vpit->inject_point = NOW();
             svm_set_tsc_shift (v, vpit);
-        }
+    }
+
+    switch(type)
+    {
+    case VLAPIC_DELIV_MODE_EXT:
         break;
 
     default:
-        printk("Not support interrupt type: %d\n", type);
-        break;
+        vlapic_post_injection(v, vector, type);
     }
 }
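
Timer (PIT) interrupts are now special-cased up front, and fixed/lowest-priority deliveries fall through to vlapic_post_injection() instead of tripping the old "Not support interrupt type" message, so only external-interrupt mode skips the local APIC bookkeeping.
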
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c        Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/svm.c        Wed Mar  1 19:47:25 2006
@@ -164,7 +164,7 @@
 }
 
 static inline void svm_inject_exception(struct vmcb_struct *vmcb, 
-                                        int trap, int error_code)
+                                        int trap, int ev, int error_code)
 {
     eventinj_t event;
 
@@ -172,7 +172,7 @@
     event.fields.v = 1;
     event.fields.type = EVENTTYPE_EXCEPTION;
     event.fields.vector = trap;
-    event.fields.ev = 1;
+    event.fields.ev = ev;
     event.fields.errorcode = error_code;
 
     ASSERT(vmcb->eventinj.fields.v == 0);
@@ -237,61 +237,16 @@
 }
 
 #ifdef __x86_64__
-static struct svm_msr_state percpu_msr[NR_CPUS];
-
-static u32 msr_data_index[VMX_MSR_COUNT] =
-{
-    MSR_LSTAR, MSR_STAR, MSR_CSTAR,
-    MSR_SYSCALL_MASK, MSR_EFER,
-};
 
 void svm_save_segments(struct vcpu *v)
 {
-    rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_svm.msr_content.shadow_gs);
-}
-
-/*
- * To avoid MSR save/restore at every VM exit/entry time, we restore
- * the x86_64 specific MSRs at domain switch time. Since those MSRs are
- * are not modified once set for generic domains, we don't save them,
- * but simply reset them to the values set at percpu_traps_init().
- */
+}
 void svm_load_msrs(void)
 {
-    struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
-    int i;
-
-    while ( host_state->flags )
-    {
-        i = find_first_set_bit(host_state->flags);
-        wrmsrl(msr_data_index[i], host_state->msr_items[i]);
-        clear_bit(i, &host_state->flags);
-    }
-}
-
-static void svm_save_init_msrs(void)
-{
-    struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
-    int i;
-
-    for ( i = 0; i < SVM_MSR_COUNT; i++ )
-        rdmsrl(msr_data_index[i], host_state->msr_items[i]);
-}
-
-#define CASE_READ_MSR(address)                               \
-    case MSR_ ## address:                                    \
-    msr_content = msr->msr_items[SVM_INDEX_MSR_ ## address]; \
-    break
-
-#define CASE_WRITE_MSR(address)                              \
-    case MSR_ ## address:                                    \
-    msr->msr_items[SVM_INDEX_MSR_ ## address] = msr_content; \
-    if (!test_bit(SVM_INDEX_MSR_ ## address, &msr->flags))   \
-    {                                                        \
-        set_bit(SVM_INDEX_MSR_ ## address, &msr->flags);     \
-    }                                                        \
-    break
-
+}
+void svm_restore_msrs(struct vcpu *v)
+{
+}
 
 #define IS_CANO_ADDRESS(add) 1
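
The IS_CANO_ADDRESS stub still accepts everything, so the non-canonical #GP path below can never fire. A real predicate would check that bits 63:48 are a sign extension of bit 47; a hedged sketch of such a check (not what this changeset installs):

    #include <stdio.h>
    #include <stdint.h>

    /* An x86-64 virtual address is canonical iff sign-extending its low
     * 48 bits reproduces the original value. */
    static int is_canonical(uint64_t va)
    {
        return ((int64_t)(va << 16) >> 16) == (int64_t)va;
    }

    int main(void)
    {
        printf("%d\n", is_canonical(0x00007fffffffffffULL)); /* 1 */
        printf("%d\n", is_canonical(0x0000800000000000ULL)); /* 0 */
        return 0;
    }
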
 
@@ -299,47 +254,45 @@
 {
     u64 msr_content = 0;
     struct vcpu *vc = current;
-    struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
+    //    struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
     struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb;
 
     switch (regs->ecx)
     {
     case MSR_EFER:
-        msr_content = msr->msr_items[SVM_INDEX_MSR_EFER];
-        HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content %llx\n", 
-                (unsigned long long)msr_content);
-
-        if (test_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state))
-            msr_content |= 1 << _EFER_LME;
-
-        if (SVM_LONG_GUEST(vc))
-            msr_content |= 1 << _EFER_LMA;
-
+        // msr_content = msr->msr_items[SVM_INDEX_MSR_EFER];
+        msr_content = vmcb->efer;      
+        msr_content &= ~EFER_SVME;
         break;
 
     case MSR_FS_BASE:
-        if (!(SVM_LONG_GUEST(vc)))
-            /* XXX should it be GP fault */
-            domain_crash_synchronous();
-        
         msr_content = vmcb->fs.base;
         break;
 
     case MSR_GS_BASE:
-        if (!(SVM_LONG_GUEST(vc)))
-            domain_crash_synchronous();
-
         msr_content = vmcb->gs.base;
         break;
 
     case MSR_SHADOW_GS_BASE:
-        msr_content = msr->shadow_gs;
-        break;
-
-    CASE_READ_MSR(STAR);
-    CASE_READ_MSR(LSTAR);
-    CASE_READ_MSR(CSTAR);
-    CASE_READ_MSR(SYSCALL_MASK);
+        msr_content = vmcb->kerngsbase;
+        break;
+
+    case MSR_STAR:
+         msr_content = vmcb->star;
+         break;
+ 
+    case MSR_LSTAR:
+         msr_content = vmcb->lstar;
+         break;
+ 
+    case MSR_CSTAR:
+         msr_content = vmcb->cstar;
+         break;
+ 
+    case MSR_SYSCALL_MASK:
+         msr_content = vmcb->sfmask;
+         break;
+
     default:
         return 0;
     }
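
Every MSR is now read straight out of the VMCB instead of a per-CPU shadow array, with EFER_SVME masked off so the guest never observes the hypervisor-owned enable bit that VMRUN requires. The masking in isolation:

    #include <stdio.h>
    #include <stdint.h>

    #define EFER_LME  (1ULL << 8)
    #define EFER_LMA  (1ULL << 10)
    #define EFER_SVME (1ULL << 12)  /* bit positions per the x86 EFER layout */

    /* Guest-visible EFER, as in the MSR_EFER read case above. */
    static uint64_t guest_efer(uint64_t vmcb_efer)
    {
        return vmcb_efer & ~EFER_SVME;
    }

    int main(void)
    {
        uint64_t hw = EFER_LME | EFER_LMA | EFER_SVME;
        printf("guest sees %#llx\n", (unsigned long long)guest_efer(hw));
        return 0;
    }
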
@@ -356,8 +309,6 @@
 {
     u64 msr_content = regs->eax | ((u64)regs->edx << 32); 
     struct vcpu *vc = current;
-    struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content;
-    struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()];
     struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb;
 
     HVM_DBG_LOG(DBG_LEVEL_1, "mode_do_msr_write msr %lx msr_content %lx\n", 
@@ -373,26 +324,20 @@
                     || !test_bit(SVM_CPU_STATE_PAE_ENABLED,
                                  &vc->arch.hvm_svm.cpu_state))
             {
-                svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+                svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
             }
         }
 
         if (msr_content & EFER_LME)
             set_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state);
 
+        /* We have already recorded that we want LME, so it will be set 
+         * next time CR0 gets updated. So we clear that bit and continue.
+         */
+        if ((msr_content ^ vmcb->efer) & EFER_LME)
+            msr_content &= ~EFER_LME;  
         /* No update for LME/LMA since it have no effect */
-        msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content;
-        if (msr_content & ~(EFER_LME | EFER_LMA))
-        {
-            msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content;
-            if (!test_bit(SVM_INDEX_MSR_EFER, &msr->flags))
-            { 
-                rdmsrl(MSR_EFER, host_state->msr_items[SVM_INDEX_MSR_EFER]);
-                set_bit(SVM_INDEX_MSR_EFER, &host_state->flags);
-                set_bit(SVM_INDEX_MSR_EFER, &msr->flags);  
-                wrmsrl(MSR_EFER, msr_content);
-            }
-        }
+        vmcb->efer = msr_content | EFER_SVME;
         break;
 
     case MSR_FS_BASE:
@@ -403,63 +348,42 @@
         if (!IS_CANO_ADDRESS(msr_content))
         {
             HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write\n");
-            svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+            svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
         }
 
         if (regs->ecx == MSR_FS_BASE)
-           vmcb->fs.base = msr_content;
+            vmcb->fs.base = msr_content;
         else 
-           vmcb->gs.base = msr_content;
+            vmcb->gs.base = msr_content;
         break;
 
     case MSR_SHADOW_GS_BASE:
-        if (!(SVM_LONG_GUEST(vc)))
-            domain_crash_synchronous();
-
-        vc->arch.hvm_svm.msr_content.shadow_gs = msr_content;
-        wrmsrl(MSR_SHADOW_GS_BASE, msr_content);
-        break;
-
-    CASE_WRITE_MSR(STAR);
-    CASE_WRITE_MSR(LSTAR);
-    CASE_WRITE_MSR(CSTAR);
-    CASE_WRITE_MSR(SYSCALL_MASK);
+         vmcb->kerngsbase = msr_content;
+         break;
+ 
+    case MSR_STAR:
+         vmcb->star = msr_content;
+         break;
+ 
+    case MSR_LSTAR:
+         vmcb->lstar = msr_content;
+         break;
+ 
+    case MSR_CSTAR:
+         vmcb->cstar = msr_content;
+         break;
+ 
+    case MSR_SYSCALL_MASK:
+         vmcb->sfmask = msr_content;
+         break;
+
     default:
         return 0;
     }
     return 1;
 }
 
-void
-svm_restore_msrs(struct vcpu *v)
-{
-    int i = 0;
-    struct svm_msr_state *guest_state;
-    struct svm_msr_state *host_state;
-    unsigned long guest_flags;
-
-    guest_state = &v->arch.hvm_svm.msr_content;;
-    host_state = &percpu_msr[smp_processor_id()];
-
-    wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs);
-    guest_flags = guest_state->flags;
-    if (!guest_flags)
-        return;
-
-    while (guest_flags){
-        i = find_first_set_bit(guest_flags);
-
-        HVM_DBG_LOG(DBG_LEVEL_2,
-                    "restore guest's index %d msr %lx with %lx\n",
-                    i, (unsigned long) msr_data_index[i], (unsigned long) guest_state->msr_items[i]);
-        set_bit(i, &host_state->flags);
-        wrmsrl(msr_data_index[i], guest_state->msr_items[i]);
-        clear_bit(i, &guest_flags);
-    }
-}
 #else
-#define        svm_save_init_msrs()    ((void)0)
-
 static inline int long_mode_do_msr_read(struct cpu_user_regs *regs)
 {
     return 0;
@@ -497,9 +421,28 @@
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode;
-
-    mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
+    /* check which operating mode the guest is running */
+    if( vmcb->efer & EFER_LMA )
+        mode = vmcb->cs.attributes.fields.l ? 8 : 4;
+    else
+        mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4;
     return svm_instrlen(guest_cpu_user_regs(), mode);
+}
+
+unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num)
+{
+    switch ( num )
+    {
+    case 0:
+        return v->arch.hvm_svm.cpu_shadow_cr0;
+    case 2:
+        return v->arch.hvm_svm.cpu_cr2;
+    case 3:
+        return v->arch.hvm_svm.cpu_cr3;
+    default:
+        BUG();
+    }
+    return 0;                   /* dummy */
 }
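
svm_instruction_length() now derives the decoder's default width from EFER.LMA and CS.L instead of assuming a legacy-mode guest. The same decision tree as a stand-alone function:

    #include <stdio.h>

    #define X86_EFLAGS_VM (1u << 17)
    #define X86_CR0_PE    (1u << 0)
    #define EFER_LMA      (1u << 10)

    /* 8 = 64-bit mode, 2 = real or virtual-8086 mode, 4 = everything else. */
    static int guest_mode(unsigned efer, int cs_l, unsigned eflags, unsigned cr0)
    {
        if (efer & EFER_LMA)
            return cs_l ? 8 : 4;
        return ((eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE)) ? 2 : 4;
    }

    int main(void)
    {
        printf("%d\n", guest_mode(EFER_LMA, 1, 0, X86_CR0_PE)); /* 8 */
        printf("%d\n", guest_mode(0, 0, 0, 0));                 /* 2 */
        return 0;
    }
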
 
 int start_svm(void)
@@ -519,8 +462,6 @@
     asidpool_init(smp_processor_id());    
     printk("AMD SVM Extension is enabled for cpu %d.\n", smp_processor_id());
     
-    svm_save_init_msrs();
-
     /* Setup HVM interfaces */
     hvm_funcs.disable = stop_svm;
 
@@ -542,6 +483,7 @@
     hvm_funcs.realmode = svm_realmode;
     hvm_funcs.paging_enabled = svm_paging_enabled;
     hvm_funcs.instruction_length = svm_instruction_length;
+    hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg;
 
     hvm_enabled = 1;    
 
@@ -631,8 +573,17 @@
 }
 
 #if defined (__x86_64__)
-void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *c )
-{
+void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v )
+{
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+    regs->rip    = vmcb->rip;
+    regs->rsp    = vmcb->rsp;
+    regs->rflags = vmcb->rflags;
+    regs->cs     = vmcb->cs.sel;
+    regs->ds     = vmcb->ds.sel;
+    regs->es     = vmcb->es.sel;
+    regs->ss     = vmcb->ss.sel;
 }
 #elif defined (__i386__)
 void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v)
@@ -810,7 +761,8 @@
     vpit = &v->domain->arch.hvm_domain.vpit;
     kill_timer(&vpit->pit_timer);
     kill_timer(&v->arch.hvm_svm.hlt_timer);
-    if ( hvm_apic_support(v->domain) ) {
+    if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) 
+    {
         kill_timer( &(VLAPIC(v)->vlapic_timer) );
         xfree( VLAPIC(v) );
     }
@@ -819,8 +771,29 @@
 
 void arch_svm_do_resume(struct vcpu *v) 
 {
-    svm_do_resume(v);
-    reset_stack_and_jump(svm_asm_do_resume);
+    /* pinning VCPU to a different core? */
+    if ( v->arch.hvm_svm.launch_core == smp_processor_id()) {
+        svm_do_resume( v );
+        reset_stack_and_jump( svm_asm_do_resume );
+    }
+    else {
+        printk("VCPU core pinned: %d to %d\n", v->arch.hvm_svm.launch_core, 
smp_processor_id() );
+        v->arch.hvm_svm.launch_core = smp_processor_id();
+        svm_migrate_timers( v );
+        svm_do_resume( v );
+        reset_stack_and_jump( svm_asm_do_resume );
+    }
+}
+
+
+void svm_migrate_timers(struct vcpu *v)
+{
+    struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit);
+
+    migrate_timer( &vpit->pit_timer, v->processor );
+    migrate_timer( &v->arch.hvm_svm.hlt_timer, v->processor );
+    if ( hvm_apic_support(v->domain) && VLAPIC( v ))
+        migrate_timer( &(VLAPIC(v)->vlapic_timer ), v->processor );
 }
 
 
@@ -860,9 +833,9 @@
        /* No support for APIC */
         if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000)
         { 
-            unsigned long inst_len;
-           inst_len = svm_instruction_length(v);
-            if (inst_len == (unsigned long)-1)
+            int inst_len;
+            inst_len = svm_instruction_length(v);
+            if (inst_len == -1)
             {
                 printf("%s: INST_LEN - Unable to decode properly.\n", 
__func__);
                 domain_crash_synchronous();
@@ -914,6 +887,14 @@
 
     eip = vmcb->rip;
     error_code = vmcb->exitinfo1;
+
+    if (vmcb->idtr.limit == 0) {
+        printf("Huh? We got a GP Fault with an invalid IDTR!\n");
+        svm_dump_vmcb(__func__, vmcb);
+        svm_dump_regs(__func__, regs);
+        svm_dump_inst(vmcb->rip); 
+        __hvm_bug(regs);
+    }
 
     HVM_DBG_LOG(DBG_LEVEL_1,
                 "svm_general_protection_fault: eip = %lx, erro_code = %lx",
@@ -927,7 +908,7 @@
 
     
     /* Reflect it back into the guest */
-    svm_inject_exception(vmcb, TRAP_gp_fault, error_code);
+    svm_inject_exception(vmcb, TRAP_gp_fault, 1, error_code);
 }
 
 /* Reserved bits: [31:14], [12:1] */
@@ -939,7 +920,7 @@
     unsigned int eax, ebx, ecx, edx;
     unsigned long eip;
     struct vcpu *v = current;
-    unsigned int inst_len;
+    int inst_len;
 
     ASSERT(vmcb);
 
@@ -956,21 +937,29 @@
 
     if (input == 1)
     {
+#ifndef __x86_64__
         if ( hvm_apic_support(v->domain) &&
                 !vlapic_global_enabled((VLAPIC(v))) )
+#endif
             clear_bit(X86_FEATURE_APIC, &edx);
            
-#ifdef __x86_64__
+#if CONFIG_PAGING_LEVELS < 3
+        clear_bit(X86_FEATURE_PAE, &edx);
+        clear_bit(X86_FEATURE_PSE, &edx);
+        clear_bit(X86_FEATURE_PSE36, &edx);
+#else
         if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 )
-#endif
         {
+            if ( !v->domain->arch.hvm_domain.pae_enabled )
+                clear_bit(X86_FEATURE_PAE, &edx);
             clear_bit(X86_FEATURE_PSE, &edx);
-            clear_bit(X86_FEATURE_PAE, &edx);
             clear_bit(X86_FEATURE_PSE36, &edx);
         }
+#endif
        
         /* Clear out reserved bits. */
         ecx &= ~SVM_VCPU_CPUID_L1_RESERVED; /* mask off reserved bits */
+        clear_bit(X86_FEATURE_MWAIT & 31, &ecx);
     }
 #ifdef __i386__
     else if ( input == 0x80000001 )
@@ -991,6 +980,7 @@
             eip, input, eax, ebx, ecx, edx);
 
     inst_len = __get_instruction_length(vmcb, INSTR_CPUID, NULL);
+    ASSERT(inst_len > 0);
     __update_guest_eip(vmcb, inst_len);
 }
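
The CPUID intercept now hides PAE/PSE/PSE36 from 2-level guests (PAE only when the domain's PAE support is disabled) and always hides MONITOR/MWAIT; the `& 31` maps Linux's word-based feature numbers onto a bit index within the 32-bit ecx. A model of the masking, not the Xen helpers:

    #include <stdio.h>

    /* CPUID leaf 1 feature bits, standard positions. */
    #define FEAT_EDX_PSE    (1u << 3)
    #define FEAT_EDX_PAE    (1u << 6)
    #define FEAT_EDX_PSE36  (1u << 17)
    #define FEAT_ECX_MWAIT  (1u << 3)

    static void mask_leaf1(unsigned *ecx, unsigned *edx, int pae_enabled)
    {
        if (!pae_enabled)
            *edx &= ~FEAT_EDX_PAE;
        *edx &= ~(FEAT_EDX_PSE | FEAT_EDX_PSE36);
        *ecx &= ~FEAT_ECX_MWAIT;   /* MONITOR/MWAIT not virtualized */
    }

    int main(void)
    {
        unsigned ecx = FEAT_ECX_MWAIT, edx = FEAT_EDX_PAE | FEAT_EDX_PSE;
        mask_leaf1(&ecx, &edx, 0);
        printf("ecx=%#x edx=%#x\n", ecx, edx);   /* both 0 */
        return 0;
    }
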
 
@@ -1083,9 +1073,11 @@
     unsigned long *reg_p = 0;
     unsigned int gpreg = 0;
     unsigned long eip;
-    unsigned int inst_len; 
+    int inst_len; 
+    int index;
     struct vmcb_struct *vmcb;
     u8 buffer[MAX_INST_LEN];
+    u8 prefix = 0;
 
     vmcb = v->arch.hvm_svm.vmcb;
     
@@ -1093,13 +1085,15 @@
 
     eip = vmcb->rip;
     inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
-
-    ASSERT(buffer[0] == 0x0f && (buffer[1] & 0xFD) == 0x21);
-
-    gpreg = decode_src_reg(buffer[2]);
-#if DEBUG
-    ASSERT(reg == decode_dest_reg(buffer[2]));
-#endif
+    index = skip_prefix_bytes(buffer, sizeof(buffer));
+    
+    ASSERT(buffer[index+0] == 0x0f && (buffer[index+1] & 0xFD) == 0x21);
+
+    if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
+        prefix = buffer[index-1];
+
+    gpreg = decode_src_reg(prefix, buffer[index + 2]);
+    ASSERT(reg == decode_dest_reg(prefix, buffer[index + 2]));
 
     HVM_DBG_LOG(DBG_LEVEL_1, "svm_dr_access : eip=%lx, reg=%d, gpreg = %x",
             eip, reg, gpreg);
@@ -1120,6 +1114,7 @@
         __hvm_bug(regs);
         break;
     }
+    ASSERT(inst_len > 0);
     __update_guest_eip(vmcb, inst_len);
 }
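
MOV-DR emulation now skips prefix bytes and feeds a trailing REX prefix into the register decode, because in long mode REX.R/REX.B extend the ModRM reg/rm fields to 16 registers. A self-contained sketch of that decode (skip_prefix_bytes() and decode_src_reg() themselves live elsewhere in the tree):

    #include <stdio.h>
    #include <stdint.h>

    /* Legacy prefixes, plus REX (0x40-0x4f) in 64-bit mode. */
    static int is_prefix_byte(uint8_t b)
    {
        switch (b) {
        case 0x66: case 0x67: case 0xf0: case 0xf2: case 0xf3:
        case 0x2e: case 0x36: case 0x3e: case 0x26: case 0x64: case 0x65:
            return 1;
        }
        return (b & 0xf0) == 0x40;
    }

    static int skip_prefixes(const uint8_t *buf, int len)
    {
        int i = 0;
        while (i < len && is_prefix_byte(buf[i]))
            i++;
        return i;
    }

    int main(void)
    {
        /* "mov %dr7,%r8" = 41 0f 21 f8: REX.B, then 0f 21 /r, ModRM 0xf8. */
        uint8_t insn[] = { 0x41, 0x0f, 0x21, 0xf8 };
        int idx = skip_prefixes(insn, sizeof(insn));
        uint8_t rex   = (idx > 0 && (insn[idx-1] & 0xf0) == 0x40) ? insn[idx-1] : 0;
        uint8_t modrm = insn[idx + 2];
        int gpreg = (modrm & 7)        | ((rex & 1) << 3);  /* rm  + REX.B */
        int dreg  = ((modrm >> 3) & 7) | ((rex & 4) << 1);  /* reg + REX.R */
        printf("gpr %d, dr %d\n", gpreg, dreg);             /* gpr 8, dr 7 */
        return 0;
    }
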
 
@@ -1335,13 +1330,13 @@
     }
 }
 
-
 static int svm_set_cr0(unsigned long value)
 {
     struct vcpu *v = current;
     unsigned long mfn;
     int paging_enabled;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    unsigned long crn;
 
     ASSERT(vmcb);
 
@@ -1377,7 +1372,7 @@
                     &v->arch.hvm_svm.cpu_state))
         {
             HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable\n");
-            svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+            svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
         }
 
         if (test_bit(SVM_CPU_STATE_LME_ENABLED, &v->arch.hvm_svm.cpu_state))
@@ -1386,14 +1381,7 @@
             HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode\n");
             set_bit(SVM_CPU_STATE_LMA_ENABLED,
                     &v->arch.hvm_svm.cpu_state);
-#if 0
-            __vmread(VM_ENTRY_CONTROLS, &vm_entry_value);
-            vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE;
-            __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value);
-#else
-           printk("Cannot yet set SVM_CPU_STATE_LMA_ENABLED\n");
-           domain_crash_synchronous();
-#endif
+            vmcb->efer |= (EFER_LMA | EFER_LME);
 
 #if CONFIG_PAGING_LEVELS >= 4 
             if (!shadow_set_guest_paging_levels(v->domain, 4)) 
@@ -1404,8 +1392,9 @@
 #endif
         }
         else
+#endif  /* __x86_64__ */
         {
-#if CONFIG_PAGING_LEVELS >= 4
+#if CONFIG_PAGING_LEVELS >= 3
             if (!shadow_set_guest_paging_levels(v->domain, 2))
             {
                 printk("Unsupported guest paging levels\n");
@@ -1414,33 +1403,18 @@
 #endif
         }
 
-#if 0
-        unsigned long crn;
-
         /* update CR4's PAE if needed */
-        __vmread(GUEST_CR4, &crn);
+        crn = vmcb->cr4;
         if ((!(crn & X86_CR4_PAE)) 
                 && test_bit(SVM_CPU_STATE_PAE_ENABLED, 
                     &v->arch.hvm_svm.cpu_state))
         {
             HVM_DBG_LOG(DBG_LEVEL_1, "enable PAE on cr4\n");
-            __vmwrite(GUEST_CR4, crn | X86_CR4_PAE);
-        }
-#else
-       printk("Cannot yet set SVM_CPU_STATE_PAE_ENABLED\n");
-       domain_crash_synchronous(); 
-#endif
-#elif defined(__i386__)
-       {
-            unsigned long old_base_mfn;
-            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-            if (old_base_mfn)
-                put_page(mfn_to_page(old_base_mfn));
-       }
-#endif
+            vmcb->cr4 |= X86_CR4_PAE;
+        }
 
         /* Now arch.guest_table points to machine physical. */
-        v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+        v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
         update_pagetables(v);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", 
@@ -1461,7 +1435,7 @@
      */
     if ((value & X86_CR0_PE) == 0) {
        if (value & X86_CR0_PG) {
-            svm_inject_exception(vmcb, TRAP_gp_fault, 0);
+            svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
             return 0;
         }
 
@@ -1471,7 +1445,6 @@
 
     return 1;
 }
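
Both mk_pagetable() call sites in this file gain a (u64) cast: on a 32-bit hypervisor, `mfn << PAGE_SHIFT` is evaluated in 32-bit arithmetic and silently truncates machine addresses at 4GB, which PAE machines can exceed. The difference in isolation:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        uint32_t mfn = 0x123456;   /* a frame above the 4GB boundary */

        uint32_t wrong = mfn << PAGE_SHIFT;            /* wraps at 32 bits */
        uint64_t right = (uint64_t)mfn << PAGE_SHIFT;  /* keeps all bits   */

        printf("truncated %#x vs full %#llx\n",
               wrong, (unsigned long long)right);
        return 0;
    }
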
-
 
 /*
  * Read from control registers. CR0 and CR4 are read from the shadow.
@@ -1497,7 +1470,7 @@
         value = (unsigned long) v->arch.hvm_svm.cpu_cr3;
         break;
     case 4:
-        value = vmcb->cr4;
+        value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr4;
         break;
     case 8:
 #if 0
@@ -1579,7 +1552,7 @@
             }
 
             old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-            v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+            v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT);
 
             if (old_base_mfn)
                 put_page(mfn_to_page(old_base_mfn));
@@ -1596,12 +1569,19 @@
 
     case 4:         
         /* CR4 */
-        if (value & X86_CR4_PAE)
-            __hvm_bug(regs);    /* not implemented */
-
-        old_cr = vmcb->cr4;
-        
-        vmcb->cr4 = value;
+        if (value & X86_CR4_PAE) {
+            set_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
+        } else {
+            if (test_bit(SVM_CPU_STATE_LMA_ENABLED,
+                         &v->arch.hvm_svm.cpu_state)) {
+                svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0);
+            }
+            clear_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state);
+        }
+
+        old_cr = v->arch.hvm_svm.cpu_shadow_cr4;
+        v->arch.hvm_svm.cpu_shadow_cr4 = value;
+        vmcb->cr4 = value | SVM_CR4_HOST_MASK;
   
         /*
          * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates
@@ -1630,10 +1610,12 @@
         struct cpu_user_regs *regs)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int inst_len = 0;
+    int inst_len = 0;
+    int index;
     unsigned int gpreg;
     unsigned long value;
-    u8 buffer[6];   
+    u8 buffer[MAX_INST_LEN];   
+    u8 prefix = 0;
     int result = 1;
     enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW};
     enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW};
@@ -1642,29 +1624,41 @@
     ASSERT(vmcb);
 
     inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer));
+    /* get index to first actual instruction byte - as we will need to know where the
+     * prefix lives later on
+     */
+    index = skip_prefix_bytes(buffer, sizeof(buffer));
     
     if (type == TYPE_MOV_TO_CR) 
     {
         inst_len = __get_instruction_length_from_list(vmcb, list_a, 
-                ARR_SIZE(list_a), buffer, &match);
+                ARR_SIZE(list_a), &buffer[index], &match);
     }
     else
     {
         inst_len = __get_instruction_length_from_list(vmcb, list_b, 
-                ARR_SIZE(list_b), buffer, &match);
-    }
+                ARR_SIZE(list_b), &buffer[index], &match);
+    }
+
+    ASSERT(inst_len > 0);
+
+    inst_len += index;
+
+    /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */
+    if (index > 0 && (buffer[index-1] & 0xF0) == 0x40)
+        prefix = buffer[index-1];
 
     HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip);
 
     switch (match) 
     {
     case INSTR_MOV2CR:
-        gpreg = decode_src_reg(buffer[2]);
+        gpreg = decode_src_reg(prefix, buffer[index+2]);
         result = mov_to_cr(gpreg, cr, regs);
         break;
 
     case INSTR_MOVCR2:
-        gpreg = decode_src_reg(buffer[2]);
+        gpreg = decode_src_reg(prefix, buffer[index+2]);
         mov_from_cr(cr, gpreg, regs);
         break;
 
@@ -1680,7 +1674,7 @@
         if (svm_dbg_on)
             svm_dump_inst(svm_rip2pointer(vmcb));
         
-        gpreg = decode_src_reg(buffer[2]);
+        gpreg = decode_src_reg(prefix, buffer[index+2]);
         value = get_reg(gpreg, regs, vmcb) & 0xF;
 
         if (svm_dbg_on)
@@ -1698,7 +1692,7 @@
     case INSTR_SMSW:
         svm_dump_inst(svm_rip2pointer(vmcb));
         value = v->arch.hvm_svm.cpu_shadow_cr0;
-        gpreg = decode_src_reg(buffer[2]);
+        gpreg = decode_src_reg(prefix, buffer[index+2]);
         set_reg(gpreg, value, regs, vmcb);
 
         if (svm_dbg_on)
@@ -1721,7 +1715,7 @@
 static inline void svm_do_msr_access(struct vcpu *v, struct cpu_user_regs *regs)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int  inst_len;
+    int  inst_len;
     int64_t tsc_sum;
 
     ASSERT(vmcb);
@@ -1813,7 +1807,9 @@
         next_wakeup = next_pit;
     if ( next_wakeup != - 1 )
         set_timer(&current->arch.hvm_svm.hlt_timer, next_wakeup);
+/* temporary workaround for 8828/8822 evtchn patches causing SVM failure.
     hvm_safe_block();
+*/
 }
 
 
@@ -1860,7 +1856,7 @@
     struct vcpu *v = current;
     u8 opcode[MAX_INST_SIZE], prefix, length = MAX_INST_SIZE;
     unsigned long g_vaddr;
-    unsigned int inst_len;
+    int inst_len;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
     ASSERT(vmcb);
@@ -1877,6 +1873,7 @@
     if (invlpga)
     {
         inst_len = __get_instruction_length(vmcb, INSTR_INVLPGA, opcode);
+        ASSERT(inst_len > 0);
         __update_guest_eip(vmcb, inst_len);
 
         /* 
@@ -1890,6 +1887,7 @@
         /* What about multiple prefix codes? */
         prefix = (is_prefix(opcode[0])?opcode[0]:0);
         inst_len = __get_instruction_length(vmcb, INSTR_INVLPG, opcode);
+        ASSERT(inst_len > 0);
 
         inst_len--;
         length -= inst_len;
@@ -1941,7 +1939,10 @@
     v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET;
 
     vmcb->cr2 = 0;
-    vmcb->cr4 = 0;
+    vmcb->efer = EFER_SVME;
+
+    vmcb->cr4 = SVM_CR4_HOST_MASK;
+    v->arch.hvm_svm.cpu_shadow_cr4 = 0;
 
     /* This will jump to ROMBIOS */
     vmcb->rip = 0xFFF0;
@@ -2011,12 +2012,13 @@
 static int svm_do_vmmcall(struct vcpu *v, struct cpu_user_regs *regs)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    unsigned int inst_len;
+    int inst_len;
 
     ASSERT(vmcb);
     ASSERT(regs);
 
     inst_len = __get_instruction_length(vmcb, INSTR_VMCALL, NULL);
+    ASSERT(inst_len > 0);
 
     /* VMMCALL sanity check */
     if (vmcb->cpl > get_vmmcall_cpl(regs->edi))
@@ -2470,7 +2472,7 @@
         {
             v->arch.hvm_svm.injecting_event = 1;
             /* Inject #PG using Interruption-Information Fields */
-            svm_inject_exception(vmcb, TRAP_page_fault, regs.error_code);
+            svm_inject_exception(vmcb, TRAP_page_fault, 1, regs.error_code);
 
             v->arch.hvm_svm.cpu_cr2 = va;
             vmcb->cr2 = va;
@@ -2665,26 +2667,23 @@
 {
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
-    int core = smp_processor_id();
-    int oldcore = v->arch.hvm_svm.core; 
-    /* 
-     * if need to assign new asid or if switching cores, 
-     * then retire asid for old core, and assign new for new core.
-     */
-    if( v->arch.hvm_svm.core != core ) {
-        if (svm_dbg_on)
-            printk("old core %d new core 
%d\n",(int)v->arch.hvm_svm.core,(int)core);
-        v->arch.hvm_svm.core = core;
-    }
-    if( test_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags) ||
-          (oldcore != core)) {
-        if(!asidpool_assign_next(vmcb, 1, 
-                   oldcore, core)) {
+
+   /*
+    * if need to assign new asid, or if switching cores,
+    * retire asid for the old core, and assign a new asid to the current core.
+    */
+    if ( test_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags ) ||
+       ( v->arch.hvm_svm.asid_core != v->arch.hvm_svm.launch_core )) {
+        /* recycle asid */
+        if ( !asidpool_assign_next( vmcb, 1,
+            v->arch.hvm_svm.asid_core, v->arch.hvm_svm.launch_core )) {
             /* If we get here, we have a major problem */
             domain_crash_synchronous();
         }
-    }
-    clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
+
+        v->arch.hvm_svm.asid_core = v->arch.hvm_svm.launch_core;
+        clear_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags );
+    }
 }
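
ASID recycling is now keyed on asid_core versus launch_core rather than a single core field, so a vcpu migrated between cores retires its old ASID exactly once. A toy model of the reassignment condition (the per-core counters stand in for the real asidpool):

    #include <stdio.h>

    #define NR_CORES 4

    static int next_asid[NR_CORES] = { 1, 1, 1, 1 };  /* toy allocator */

    struct toy_vcpu { int asid, asid_core, launch_core; };

    /* Reassign when explicitly flagged or when the vcpu changed cores. */
    static void pre_asid_update(struct toy_vcpu *v, int flagged)
    {
        if (flagged || v->asid_core != v->launch_core) {
            v->asid = next_asid[v->launch_core]++;
            v->asid_core = v->launch_core;
        }
    }

    int main(void)
    {
        struct toy_vcpu v = { .asid = 1, .asid_core = 0, .launch_core = 2 };
        pre_asid_update(&v, 0);
        printf("asid %d on core %d\n", v.asid, v.asid_core);
        return 0;
    }
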
 
 /*
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/vmcb.c       Wed Mar  1 19:47:25 2006
@@ -190,7 +190,6 @@
     unsigned long eflags;
     unsigned long shadow_cr;
     struct vmcb_struct *vmcb = arch_svm->vmcb;
-    struct Xgt_desc_struct desc;
 
     /* Allows IRQs to be shares */
     vmcb->vintr.fields.intr_masking = 1;
@@ -224,9 +223,9 @@
     vmcb->fs.base = 0;
     vmcb->gs.base = 0;
 
-    __asm__ __volatile__ ("sidt  (%0) \n" :: "a"(&desc) : "memory");
-    vmcb->idtr.base = desc.address;
-    vmcb->idtr.limit = desc.size;
+    /* Guest Interrupt descriptor table */
+    vmcb->idtr.base = 0;
+    vmcb->idtr.limit = 0;
 
     /* Set up segment attributes */
     attrib.bytes = 0;
@@ -248,15 +247,11 @@
     attrib.fields.type = 0xb;   /* type=0xb -> executable/readable, accessed */
     vmcb->cs.attributes = attrib;
 
-    /* Global descriptor table */
-    //NMERGE7500 - can probably remove access to gdtr
-    vmcb->gdtr.base = regs->edx;
-    regs->edx = 0;
-    ASSERT(regs->eax <= 0xFFFF); /* Make sure we're in the limit */
-    vmcb->gdtr.limit = regs->eax;
-    regs->eax = 0;
-
-    /* Local Descriptor Table */
+    /* Guest Global descriptor table */
+    vmcb->gdtr.base = 0;
+    vmcb->gdtr.limit = 0;
+
+    /* Guest Local Descriptor Table */
     attrib.fields.s = 0; /* not code or data segement */
     attrib.fields.type = 0x2; /* LDT */
     attrib.fields.db = 0; /* 16-bit */
@@ -279,11 +274,10 @@
     /* CR3 is set in svm_final_setup_guest */
 
     __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) :); 
-    shadow_cr = crn;
-    vmcb->cr4 = shadow_cr;
-
-//MERGE7500 - should write a 0 instead to rsp?
-    vmcb->rsp = regs->esp;
+    arch_svm->cpu_shadow_cr4 = crn & ~(X86_CR4_PGE | X86_CR4_PSE);
+    vmcb->cr4 = crn | SVM_CR4_HOST_MASK;
+
+    vmcb->rsp = 0;
     vmcb->rip = regs->eip;
 
     eflags = regs->eflags & ~HVM_EFLAGS_RESERVED_0; /* clear 0s */
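
The construction above keeps two CR4 values: a guest-visible shadow with the host-only PGE/PSE bits stripped, and the value actually loaded into the VMCB with the must-stay-set host bits OR-ed back in. A sketch of the split; SVM_CR4_HOST_MASK is replaced here by an assumed stand-in:

    #include <stdio.h>

    #define X86_CR4_PSE (1u << 4)
    #define X86_CR4_PAE (1u << 5)
    #define X86_CR4_PGE (1u << 7)

    /* Assumed stand-in for SVM_CR4_HOST_MASK: bits the hypervisor keeps
     * set in hardware no matter what the guest writes. */
    #define HOST_CR4_MASK X86_CR4_PAE

    struct cr4_pair { unsigned shadow, hw; };

    static struct cr4_pair split_cr4(unsigned host_cr4)
    {
        struct cr4_pair p;
        p.shadow = host_cr4 & ~(X86_CR4_PGE | X86_CR4_PSE); /* guest view   */
        p.hw     = host_cr4 | HOST_CR4_MASK;                /* loaded value */
        return p;
    }

    int main(void)
    {
        struct cr4_pair p = split_cr4(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE);
        printf("shadow %#x hw %#x\n", p.shadow, p.hw);
        return 0;
    }
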
@@ -306,7 +300,7 @@
 {
     if(arch_svm->vmcb != NULL)
     {
-        asidpool_retire(arch_svm->vmcb, arch_svm->core);
+        asidpool_retire(arch_svm->vmcb, arch_svm->asid_core);
          free_vmcb(arch_svm->vmcb);
     }
     if(arch_svm->iopm != NULL) {
@@ -404,18 +398,17 @@
 
 void svm_do_launch(struct vcpu *v)
 {
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    int core = smp_processor_id();
+    ASSERT(vmcb);
+
     /* Update CR3, GDT, LDT, TR */
-    struct vmcb_struct *vmcb;
-    int core = smp_processor_id();
-    vmcb = v->arch.hvm_svm.vmcb;
-    ASSERT(vmcb);
-
     svm_stts(v);
 
-    /* current core is the one we will perform the vmrun on */
-    v->arch.hvm_svm.core = core;
+    /* current core is the one we intend to perform the VMRUN on */
+    v->arch.hvm_svm.launch_core = v->arch.hvm_svm.asid_core = core;
     clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags);
-    if ( !asidpool_assign_next(vmcb, 0, core, core) )
+    if ( !asidpool_assign_next( vmcb, 0, core, core ))
         BUG();
 
     if (v->vcpu_id == 0)
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/x86_64/exits.S
--- a/xen/arch/x86/hvm/svm/x86_64/exits.S       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/svm/x86_64/exits.S       Wed Mar  1 19:47:25 2006
@@ -107,8 +107,6 @@
         movq %rax, VMCB_rax(%rcx)
         movq VCPU_svm_hsa_pa(%rbx), %rax
         VMSAVE
-       /* XXX FPU SAVE */
-       /* XXX DO TSC OFFSET */
 
         movq VCPU_svm_vmcb_pa(%rbx), %rax
         popq %r15
@@ -137,9 +135,7 @@
         VMSAVE
         /* rax is the only register we're allowed to touch here... */
 
-       /* XXX FPU SAVE */
         GET_CURRENT(%rax)
-       /* XXX DO TSC OFFSET */
         movq VCPU_svm_hsa_pa(%rax), %rax
         VMLOAD
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vlapic.c
--- a/xen/arch/x86/hvm/vlapic.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/vlapic.c Wed Mar  1 19:47:25 2006
@@ -225,27 +225,35 @@
         break;
 
     case VLAPIC_DELIV_MODE_INIT:
-        if (!level && trig_mode == 1) {        //Deassert
+        if ( !level && trig_mode == 1 ) {        //Deassert
             printk("This hvm_vlapic is for P4, no work for De-assert init\n");
         } else {
             /* FIXME How to check the situation after vcpu reset? */
-            vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
-            if (vlapic->vcpu) {
-                vcpu_pause(vlapic->vcpu);
+            if ( test_and_clear_bit(_VCPUF_initialised, &v->vcpu_flags) ) {
+                printk("Reset hvm vcpu not supported yet\n");
+                domain_crash_synchronous();
             }
+            v->arch.hvm_vcpu.init_sipi_sipi_state =
+                HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI;
+            result = 1;
         }
         break;
 
     case VLAPIC_DELIV_MODE_STARTUP:
-        if (vlapic->init_sipi_sipi_state != VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI)
+        if ( v->arch.hvm_vcpu.init_sipi_sipi_state ==
+                HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM )
             break;
-        vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_NORM;
-        if (!vlapic->vcpu) {
-            /* XXX Call hvm_bringup_ap here */
-             result = 0;
-        }else{
-            //hvm_vcpu_reset(vlapic->vcpu);
-        }
+
+        v->arch.hvm_vcpu.init_sipi_sipi_state =
+                HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM;
+
+        if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) ) {
+            printk("SIPI for initialized vcpu vcpuid %x\n", v->vcpu_id);
+            domain_crash_synchronous();
+        }
+
+        if ( hvm_bringup_ap(v->vcpu_id, vector) != 0 )
+            result = 0;
         break;
 
     default:
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vmx/io.c
--- a/xen/arch/x86/hvm/vmx/io.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/vmx/io.c Wed Mar  1 19:47:25 2006
@@ -113,13 +113,15 @@
     struct hvm_virpit *vpit = &plat->vpit;
     struct hvm_virpic *pic= &plat->vpic;
 
-    hvm_pic_assist(v);
-    __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control);
-    if ( vpit->pending_intr_nr ) {
+    if ( v->vcpu_id == 0 )
+        hvm_pic_assist(v);
+
+    if ( (v->vcpu_id == 0) && vpit->pending_intr_nr ) {
         pic_set_irq(pic, 0, 0);
         pic_set_irq(pic, 0, 1);
     }
 
+    __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control);
     __vmread(VM_ENTRY_INTR_INFO_FIELD, &intr_fields);
 
     if (intr_fields & INTR_INFO_VALID_MASK) {
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Wed Mar  1 19:47:25 2006
@@ -448,6 +448,37 @@
     return 0;                   /* dummy */
 }
 
+/* SMP VMX guest support */
+void vmx_init_ap_context(struct vcpu_guest_context *ctxt,
+                         int vcpuid, int trampoline_vector)
+{
+    int i;
+
+    memset(ctxt, 0, sizeof(*ctxt));
+
+    /*
+     * Initial register values:
+     */
+    ctxt->user_regs.eip = VMXASSIST_BASE;
+    ctxt->user_regs.edx = vcpuid;
+    ctxt->user_regs.ebx = trampoline_vector;
+
+    ctxt->flags = VGCF_HVM_GUEST;
+
+    /* Virtual IDT is empty at start-of-day. */
+    for ( i = 0; i < 256; i++ )
+    {
+        ctxt->trap_ctxt[i].vector = i;
+        ctxt->trap_ctxt[i].cs     = FLAT_KERNEL_CS;
+    }
+
+    /* No callback handlers. */
+#if defined(__i386__)
+    ctxt->event_callback_cs     = FLAT_KERNEL_CS;
+    ctxt->failsafe_callback_cs  = FLAT_KERNEL_CS;
+#endif
+}
+
 void do_nmi(struct cpu_user_regs *);
 
 static int check_vmx_controls(ctrls, msr)
@@ -544,6 +575,8 @@
     hvm_funcs.paging_enabled = vmx_paging_enabled;
     hvm_funcs.instruction_length = vmx_instruction_length;
     hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg;
+
+    hvm_funcs.init_ap_context = vmx_init_ap_context;
 
     hvm_enabled = 1;
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/mm.c Wed Mar  1 19:47:25 2006
@@ -97,11 +97,11 @@
 #include <xen/domain_page.h>
 #include <xen/event.h>
 #include <xen/iocap.h>
+#include <xen/guest_access.h>
 #include <asm/shadow.h>
 #include <asm/page.h>
 #include <asm/flushtlb.h>
 #include <asm/io.h>
-#include <asm/uaccess.h>
 #include <asm/ldt.h>
 #include <asm/x86_emulate.h>
 #include <public/memory.h>
@@ -475,7 +475,8 @@
     {
         MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
                 " for dom%d",
-                mfn, get_gpfn_from_mfn(mfn), l1e_get_intpte(l1e), d->domain_id);
+                mfn, get_gpfn_from_mfn(mfn),
+                l1e_get_intpte(l1e), d->domain_id);
     }
 
     return okay;
@@ -515,7 +516,6 @@
 
 
 #if CONFIG_PAGING_LEVELS >= 3
-
 static int 
 get_page_from_l3e(
     l3_pgentry_t l3e, unsigned long pfn,
@@ -545,11 +545,9 @@
 #endif
     return rc;
 }
-
 #endif /* 3 level */
 
 #if CONFIG_PAGING_LEVELS >= 4
-
 static int 
 get_page_from_l4e(
     l4_pgentry_t l4e, unsigned long pfn, 
@@ -579,7 +577,6 @@
 
     return rc;
 }
-
 #endif /* 4 level */
 
 
@@ -649,27 +646,22 @@
 
 
 #if CONFIG_PAGING_LEVELS >= 3
-
 static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn)
 {
     if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && 
          (l3e_get_pfn(l3e) != pfn) )
         put_page_and_type(mfn_to_page(l3e_get_pfn(l3e)));
 }
-
 #endif
 
 #if CONFIG_PAGING_LEVELS >= 4
-
 static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn)
 {
     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && 
          (l4e_get_pfn(l4e) != pfn) )
         put_page_and_type(mfn_to_page(l4e_get_pfn(l4e)));
 }
-
 #endif
-
 
 static int alloc_l1_table(struct page_info *page)
 {
@@ -1569,43 +1561,71 @@
     int okay;
     unsigned long old_base_mfn;
 
+    ASSERT(writable_pagetable_in_sync(d));
+
     if ( shadow_mode_refcounts(d) )
+    {
         okay = get_page_from_pagenr(mfn, d);
+        if ( unlikely(!okay) )
+        {
+            MEM_LOG("Error while installing new baseptr %lx", mfn);
+            return 0;
+        }
+    }
     else
+    {
         okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
-
-    if ( likely(okay) )
-    {
-        invalidate_shadow_ldt(v);
-
-        old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
-        v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
-        update_pagetables(v); /* update shadow_table and monitor_table */
-
-        write_ptbase(v);
-
+        if ( unlikely(!okay) )
+        {
+            /* Switch to idle pagetable: this VCPU has no active p.t. now. */
+            old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+            v->arch.guest_table = mk_pagetable(0);
+            update_pagetables(v);
+            write_cr3(__pa(idle_pg_table));
+            if ( old_base_mfn != 0 )
+                put_page_and_type(mfn_to_page(old_base_mfn));
+
+            /* Retry the validation with no active p.t. for this VCPU. */
+            okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d);
+            if ( !okay )
+            {
+                /* Failure here is unrecoverable: the VCPU has no pagetable! */
+                MEM_LOG("Fatal error while installing new baseptr %lx", mfn);
+                domain_crash(d);
+                percpu_info[v->processor].deferred_ops = 0;
+                return 0;
+            }
+        }
+    }
+
+    invalidate_shadow_ldt(v);
+
+    old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
+    v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT);
+    update_pagetables(v); /* update shadow_table and monitor_table */
+
+    write_ptbase(v);
+
+    if ( likely(old_base_mfn != 0) )
+    {
         if ( shadow_mode_refcounts(d) )
             put_page(mfn_to_page(old_base_mfn));
         else
             put_page_and_type(mfn_to_page(old_base_mfn));
-
-        /* CR3 also holds a ref to its shadow... */
-        if ( shadow_mode_enabled(d) )
-        {
-            if ( v->arch.monitor_shadow_ref )
-                put_shadow_ref(v->arch.monitor_shadow_ref);
-            v->arch.monitor_shadow_ref =
-                pagetable_get_pfn(v->arch.monitor_table);
-            ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
-            get_shadow_ref(v->arch.monitor_shadow_ref);
-        }
-    }
-    else
-    {
-        MEM_LOG("Error while installing new baseptr %lx", mfn);
-    }
-
-    return okay;
+    }
+
+    /* CR3 also holds a ref to its shadow... */
+    if ( shadow_mode_enabled(d) )
+    {
+        if ( v->arch.monitor_shadow_ref )
+            put_shadow_ref(v->arch.monitor_shadow_ref);
+        v->arch.monitor_shadow_ref =
+            pagetable_get_pfn(v->arch.monitor_table);
+        ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref)));
+        get_shadow_ref(v->arch.monitor_shadow_ref);
+    }
+
+    return 1;
 }
 
 static void process_deferred_ops(unsigned int cpu)
@@ -1625,7 +1645,7 @@
         else
             local_flush_tlb();
     }
-        
+
     if ( deferred_ops & DOP_RELOAD_LDT )
         (void)map_ldt_shadow_page(0);
 
@@ -1752,9 +1772,9 @@
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall4_create_continuation(
-                __HYPERVISOR_mmuext_op, uops,
-                (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_mmuext_op, "pipi",
+                uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
             break;
         }
 
@@ -2018,9 +2038,9 @@
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall4_create_continuation(
-                __HYPERVISOR_mmu_update, ureqs, 
-                (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_mmu_update, "pipi",
+                ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
             break;
         }
 
@@ -2769,7 +2789,7 @@
 }
 
 
-long arch_memory_op(int op, void *arg)
+long arch_memory_op(int op, GUEST_HANDLE(void) arg)
 {
     struct xen_reserved_phys_area xrpa;
     unsigned long pfn;
@@ -2779,7 +2799,7 @@
     switch ( op )
     {
     case XENMEM_reserved_phys_area:
-        if ( copy_from_user(&xrpa, arg, sizeof(xrpa)) )
+        if ( copy_from_guest(&xrpa, arg, 1) )
             return -EFAULT;
 
         /* No guest has more than one reserved area. */
@@ -2813,7 +2833,7 @@
 
         put_domain(d);
 
-        if ( copy_to_user(arg, &xrpa, sizeof(xrpa)) )
+        if ( copy_to_guest(arg, &xrpa, 1) )
             return -EFAULT;
 
         break;
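
copy_from_guest()/copy_to_guest() count typed elements on a guest handle rather than raw bytes, so the copy size can no longer disagree with the destination type. A toy model of the idea (the struct fields and macro shape are illustrative, not Xen's generated GUEST_HANDLE code):

    #include <stdio.h>
    #include <string.h>

    struct xrpa { unsigned long first_gpfn, nr_gpfns; };  /* illustrative */

    /* The pointee type travels with the handle. */
    struct xrpa_handle { struct xrpa *p; };

    #define copy_from_guest(dst, hnd, nr) \
        (memcpy((dst), (hnd).p, (nr) * sizeof(*(dst))), 0)

    int main(void)
    {
        struct xrpa src = { 0x100, 16 }, dst;
        struct xrpa_handle h = { &src };

        if (copy_from_guest(&dst, h, 1) == 0)       /* 1 element, not bytes */
            printf("nr_gpfns %lu\n", dst.nr_gpfns);
        return 0;
    }
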
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/setup.c      Wed Mar  1 19:47:25 2006
@@ -144,6 +144,20 @@
 
 static struct e820entry e820_raw[E820MAX];
 
+static unsigned long initial_images_start, initial_images_end;
+
+unsigned long initial_images_nrpages(void)
+{
+    unsigned long s = initial_images_start + PAGE_SIZE - 1;
+    unsigned long e = initial_images_end;
+    return ((e >> PAGE_SHIFT) - (s >> PAGE_SHIFT));
+}
+
+void discard_initial_images(void)
+{
+    init_domheap_pages(initial_images_start, initial_images_end);
+}
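
initial_images_nrpages() rounds the start up and truncates the end down, so only pages lying wholly inside [start, end) are counted. The arithmetic worked in isolation:

    #include <stdio.h>

    #define PAGE_SIZE  4096UL
    #define PAGE_SHIFT 12

    static unsigned long nrpages(unsigned long start, unsigned long end)
    {
        unsigned long s = start + PAGE_SIZE - 1;   /* round start up */
        return (end >> PAGE_SHIFT) - (s >> PAGE_SHIFT);
    }

    int main(void)
    {
        /* 0x1800..0x5000 fully contains pages 2, 3 and 4. */
        printf("%lu\n", nrpages(0x1800, 0x5000));  /* 3 */
        return 0;
    }
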
+
 void __init __start_xen(multiboot_info_t *mbi)
 {
     char *cmdline;
@@ -152,7 +166,6 @@
     unsigned int initrdidx = 1;
     module_t *mod = (module_t *)__va(mbi->mods_addr);
     unsigned long nr_pages, modules_length;
-    unsigned long initial_images_start, initial_images_end;
     paddr_t s, e;
     int i, e820_warn = 0, e820_raw_nr = 0, bytes = 0;
     struct ns16550_defaults ns16550 = {
@@ -437,11 +450,7 @@
         set_in_cr4(X86_CR4_OSXMMEXCPT);
 
     if ( opt_nosmp )
-    {
         max_cpus = 0;
-        smp_num_siblings = 1;
-        boot_cpu_data.x86_max_cores = 1;
-    }
 
     smp_prepare_cpus(max_cpus);
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/shadow32.c
--- a/xen/arch/x86/shadow32.c   Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/shadow32.c   Wed Mar  1 19:47:25 2006
@@ -43,7 +43,8 @@
-static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn);
 #endif
 
-static void free_p2m_table(struct vcpu *v);
+static int alloc_p2m_table(struct domain *d);
+static void free_p2m_table(struct domain *d);
 
 /********
 
@@ -739,7 +740,7 @@
     mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn);
     memset(mpl2e, 0, PAGE_SIZE);
 
-    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
+    memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
            &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
            HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
 
@@ -760,6 +761,23 @@
 
     if ( v->vcpu_id == 0 )
         alloc_p2m_table(d);
+    else
+    {
+        unsigned long mfn;
+
+        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+        if ( mfn )
+        {
+            l2_pgentry_t *l2tab;
+
+            l2tab = map_domain_page(mfn);
+
+            mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+                l2tab[l2_table_offset(RO_MPT_VIRT_START)];
+
+            unmap_domain_page(l2tab);
+        }
+    }
 }
 
 /*
@@ -771,7 +789,7 @@
     unsigned long mfn;
 
     ASSERT( pagetable_get_paddr(v->arch.monitor_table) );
-    
+
     mpl2e = v->arch.monitor_vtable;
 
     /*
@@ -794,7 +812,7 @@
     }
 
     if ( v->vcpu_id == 0 )
-        free_p2m_table(v);
+        free_p2m_table(v->domain);
 
     /*
      * Then free monitor_table.
@@ -808,8 +826,8 @@
 }
 
 static int
-map_p2m_entry(
-    l1_pgentry_t *l1tab, unsigned long va, unsigned long gpa, unsigned long mfn)
+map_p2m_entry(l1_pgentry_t *l1tab, unsigned long va,
+              unsigned long gpa, unsigned long mfn)
 {
     unsigned long *l0tab = NULL;
     l1_pgentry_t l1e = { 0 };
@@ -820,27 +838,22 @@
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
-            goto fail;
-
-        if ( l0tab  )
-            unmap_domain_page(l0tab);
+            return 0;
+
         l0tab = map_domain_page(page_to_mfn(page));
-        memset(l0tab, 0, PAGE_SIZE );
+        memset(l0tab, 0, PAGE_SIZE);
+
         l1e = l1tab[l1_table_offset(va)] =
             l1e_from_page(page, __PAGE_HYPERVISOR);
     }
-    else if ( l0tab == NULL)
+    else
         l0tab = map_domain_page(l1e_get_pfn(l1e));
 
-    l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
-
-    if ( l0tab )
-        unmap_domain_page(l0tab);
+    l0tab[gpa & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn;
+
+    unmap_domain_page(l0tab);
 
     return 1;
-
-fail:
-    return 0;
 }
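
map_p2m_entry() stores the mfn in the phys-to-machine slot for the given gpfn; the leaf index is simply gpfn modulo the number of entries in a page, which is what `gpa & ((PAGE_SIZE / sizeof(mfn)) - 1)` computes. The index arithmetic in isolation (RO_MPT_VIRT_START taken as 0 purely for illustration):

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        unsigned long gpfn    = 0x12345;
        unsigned long entries = PAGE_SIZE / sizeof(unsigned long);

        unsigned long va   = gpfn * sizeof(unsigned long); /* slot address */
        unsigned long slot = gpfn & (entries - 1);         /* leaf index   */

        printf("va offset %#lx, leaf slot %lu of %lu\n", va, slot, entries);
        return 0;
    }
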
 
 int
@@ -853,7 +866,6 @@
     l1_pgentry_t *l1;
     struct page_info *l1page;
     unsigned long va = pfn << PAGE_SHIFT;
-    int error;
 
     if ( shadow_mode_external(d) )
     {
@@ -877,6 +889,7 @@
 
     if ( shadow_mode_external(d) )
     {
+        int error;
         l1_pgentry_t *l1tab = NULL;
         l2_pgentry_t l2e;
 
@@ -885,14 +898,13 @@
         ASSERT( l2e_get_flags(l2e) & _PAGE_PRESENT );
 
         l1tab = map_domain_page(l2e_get_pfn(l2e));
-        error = map_p2m_entry(l1tab, va, pfn, mfn);
-        if ( !error )
-            domain_crash_synchronous(); 
+        if ( !(error = map_p2m_entry(l1tab, va, pfn, mfn)) )
+            domain_crash(d);
 
         unmap_domain_page(l1tab);
         unmap_domain_page_with_cache(l2, l2cache);
 
-        return 1;
+        return error;
     }
 
     /*
@@ -926,7 +938,7 @@
     return 1;
 }
 
-int
+static int
 alloc_p2m_table(struct domain *d)
 {
     struct list_head *list_ent;
@@ -937,7 +949,7 @@
     l2_pgentry_t l2e = { 0 };
     struct page_info *page;
     unsigned long gpfn, mfn;
-    int error;
+    int error = 0;
 
     if ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) )
     {
@@ -955,6 +967,9 @@
         }
         else
             l1tab = map_domain_page(l2e_get_pfn(l2e));
+
+        if ( l2tab )
+            unmap_domain_page(l2tab);
     }
     else
     {
@@ -972,23 +987,23 @@
         page = list_entry(list_ent, struct page_info, list);
         mfn = page_to_mfn(page);
 
-        error = map_p2m_entry(l1tab, va, gpfn, mfn);
-        if ( !error )
-            domain_crash_synchronous(); 
+        if ( !(error = map_p2m_entry(l1tab, va, gpfn, mfn)) )
+        {
+            domain_crash(d);
+            break;
+        }
 
         list_ent = frame_table[mfn].list.next;
         va += sizeof(mfn);
     }
 
-    if (l2tab)
-        unmap_domain_page(l2tab);
     unmap_domain_page(l1tab);
 
-    return 1;
-}
-
-static void 
-free_p2m_table(struct vcpu *v)
+    return error;
+}
+
+static void
+free_p2m_table(struct domain *d)
 {
     unsigned long va;
     l2_pgentry_t *l2tab;
@@ -996,10 +1011,10 @@
     l2_pgentry_t l2e;
     l1_pgentry_t l1e;
 
-    ASSERT ( pagetable_get_pfn(v->arch.monitor_table) );
+    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 
     l2tab = map_domain_page(
-        pagetable_get_pfn(v->arch.monitor_table));
+        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 
     for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; )
     {
@@ -1015,11 +1030,13 @@
 
                 if ( l1e_get_flags(l1e) & _PAGE_PRESENT )
                     free_domheap_page(mfn_to_page(l1e_get_pfn(l1e)));
-                va += PAGE_SIZE; 
+                va += PAGE_SIZE;
             }
             unmap_domain_page(l1tab);
             free_domheap_page(mfn_to_page(l2e_get_pfn(l2e)));
         }
+        else
+            va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
     }
     unmap_domain_page(l2tab);
 }
@@ -1246,7 +1263,7 @@
 
     if ( shadow_mode_refcounts(d) )
     {
-        struct list_head *list_ent; 
+        struct list_head *list_ent;
         struct page_info *page;
 
         /*
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/shadow_public.c
--- a/xen/arch/x86/shadow_public.c      Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/shadow_public.c      Wed Mar  1 19:47:25 2006
@@ -31,7 +31,8 @@
 #include <xen/trace.h>
 #include <asm/shadow_64.h>
 
-static void free_p2m_table(struct vcpu *v);
+static int alloc_p2m_table(struct domain *d);
+static void free_p2m_table(struct domain *d);
 
 #define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16))
 
@@ -328,6 +329,23 @@
 
     if ( v->vcpu_id == 0 )
         alloc_p2m_table(d);
+    else
+    {
+        unsigned long mfn;
+
+        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+        if ( mfn )
+        {
+            l4_pgentry_t *l4tab;
+
+            l4tab = map_domain_page(mfn);
+
+            mpl4e[l4_table_offset(RO_MPT_VIRT_START)] =
+                l4tab[l4_table_offset(RO_MPT_VIRT_START)];
+
+            unmap_domain_page(l4tab);
+        }
+    }
 }
 
 void free_monitor_pagetable(struct vcpu *v)
@@ -338,7 +356,7 @@
      * free monitor_table.
      */
     if ( v->vcpu_id == 0 )
-        free_p2m_table(v);
+        free_p2m_table(v->domain);
 
     /*
      * Then free monitor_table.
@@ -397,13 +415,49 @@
             l2e_empty();
     mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = l2e_empty();
 
-    unmap_domain_page(mpl2e);
-
     v->arch.monitor_table = mk_pagetable(m3mfn << PAGE_SHIFT); /* < 4GB */
     v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e;
 
     if ( v->vcpu_id == 0 )
         alloc_p2m_table(d);
+    else
+    {
+        unsigned long mfn;
+
+        mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+        if ( mfn )
+        {
+            l3_pgentry_t *l3tab, l3e;
+            l2_pgentry_t *l2tab;
+
+            l3tab = map_domain_page(mfn);
+            l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
+
+            /*
+             * NB: when CONFIG_PAGING_LEVELS == 3,
+             * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
+             * alloc_monitor_pagetable should guarantee this.
+             */
+            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+                BUG();
+
+            l2tab = map_domain_page(l3e_get_pfn(l3e));
+
+            /*
+             * Just one l2 slot is used here, so at most 2M for p2m table:
+             *      ((4K * 512)/sizeof(unsigned long)) * 4K = 2G
+             * should be OK on PAE xen, since Qemu DM can only map 1.5G VMX
+             * guest memory.
+             */
+            mpl2e[l2_table_offset(RO_MPT_VIRT_START)] =
+                l2tab[l2_table_offset(RO_MPT_VIRT_START)];
+
+            unmap_domain_page(l2tab);
+            unmap_domain_page(l3tab);
+        }
+    }
+
+    unmap_domain_page(mpl2e);
 }
 
 void free_monitor_pagetable(struct vcpu *v)
@@ -413,7 +467,7 @@
      * free monitor_table.
      */
     if ( v->vcpu_id == 0 )
-        free_p2m_table(v);
+        free_p2m_table(v->domain);
 
     m3mfn = pagetable_get_pfn(v->arch.monitor_table);
     m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]);
@@ -1348,14 +1402,14 @@
 }
 
 static int
-map_p2m_entry(
-    pgentry_64_t *top_tab, unsigned long va, unsigned long gpa, unsigned long mfn)
+map_p2m_entry(pgentry_64_t *top_tab, unsigned long va,
+              unsigned long gpfn, unsigned long mfn)
 {
 #if CONFIG_PAGING_LEVELS >= 4
     pgentry_64_t l4e = { 0 };
+    pgentry_64_t *l3tab = NULL;
 #endif
 #if CONFIG_PAGING_LEVELS >= 3
-    pgentry_64_t *l3tab = NULL;
     pgentry_64_t l3e = { 0 };
 #endif
     l2_pgentry_t *l2tab = NULL;
@@ -1367,7 +1421,7 @@
 
 #if CONFIG_PAGING_LEVELS >= 4
     l4e = top_tab[l4_table_offset(va)];
-    if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) ) 
+    if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) )
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
@@ -1375,17 +1429,14 @@
 
         l3tab = map_domain_page(page_to_mfn(page));
         memset(l3tab, 0, PAGE_SIZE);
-        l4e = top_tab[l4_table_offset(va)] = 
+        l4e = top_tab[l4_table_offset(va)] =
             entry_from_page(page, __PAGE_HYPERVISOR);
-    } 
-    else if ( l3tab == NULL)
+    }
+    else
         l3tab = map_domain_page(entry_get_pfn(l4e));
 
     l3e = l3tab[l3_table_offset(va)];
-#else
-    l3e = top_tab[l3_table_offset(va)];
-#endif
-    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) ) 
+    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
@@ -1393,14 +1444,29 @@
 
         l2tab = map_domain_page(page_to_mfn(page));
         memset(l2tab, 0, PAGE_SIZE);
-        l3e = l3tab[l3_table_offset(va)] = 
+        l3e = l3tab[l3_table_offset(va)] =
             entry_from_page(page, __PAGE_HYPERVISOR);
-    } 
-    else if ( l2tab == NULL) 
+    }
+    else
         l2tab = map_domain_page(entry_get_pfn(l3e));
 
+    unmap_domain_page(l3tab);
+#else
+    l3e = top_tab[l3_table_offset(va)];
+
+    /*
+     * NB: when CONFIG_PAGING_LEVELS == 3,
+     * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
+     * alloc_monitor_pagetable should guarantee this.
+     */
+    if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) )
+        BUG();
+
+    l2tab = map_domain_page(entry_get_pfn(l3e));
+#endif
+
     l2e = l2tab[l2_table_offset(va)];
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) 
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
@@ -1408,14 +1474,16 @@
 
         l1tab = map_domain_page(page_to_mfn(page));
         memset(l1tab, 0, PAGE_SIZE);
-        l2e = l2tab[l2_table_offset(va)] = 
+        l2e = l2tab[l2_table_offset(va)] =
             l2e_from_page(page, __PAGE_HYPERVISOR);
-    } 
-    else if ( l1tab == NULL) 
+    }
+    else
         l1tab = map_domain_page(l2e_get_pfn(l2e));
 
+    unmap_domain_page(l2tab);
+
     l1e = l1tab[l1_table_offset(va)];
-    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) 
+    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) )
     {
         page = alloc_domheap_page(NULL);
         if ( !page )
@@ -1423,96 +1491,88 @@
 
         l0tab = map_domain_page(page_to_mfn(page));
         memset(l0tab, 0, PAGE_SIZE);
-        l1e = l1tab[l1_table_offset(va)] = 
+        l1e = l1tab[l1_table_offset(va)] =
             l1e_from_page(page, __PAGE_HYPERVISOR);
     }
-    else if ( l0tab == NULL) 
+    else
         l0tab = map_domain_page(l1e_get_pfn(l1e));
 
-    l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
-
-    if ( l2tab )
-    {
-        unmap_domain_page(l2tab);
-        l2tab = NULL;
-    }
-    if ( l1tab )
-    {
-        unmap_domain_page(l1tab);
-        l1tab = NULL;
-    }
-    if ( l0tab )
-    {
-        unmap_domain_page(l0tab);
-        l0tab = NULL;
-    }
+    unmap_domain_page(l1tab);
+
+    l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn;
+
+    unmap_domain_page(l0tab);
 
     return 1;
 
 nomem:
-
     return 0;
 }
 
 int
-set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn,
+set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn,
               struct domain_mmap_cache *l2cache,
               struct domain_mmap_cache *l1cache)
 {
-    unsigned long tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
-    pgentry_64_t *top;
-    unsigned long va = RO_MPT_VIRT_START + (pfn * sizeof (unsigned long));
+    unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table);
+    unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(unsigned long));
+    pgentry_64_t *top_tab;
     int error;
 
-    ASSERT(tabpfn != 0);
+    ASSERT(tabmfn != 0);
     ASSERT(shadow_lock_is_acquired(d));
 
-    top = map_domain_page_with_cache(tabpfn, l2cache);
-    error = map_p2m_entry(top, va, pfn, mfn);
-    unmap_domain_page_with_cache(top, l2cache);
-
-    if ( !error )
-         domain_crash_synchronous();
-        
-    return 1;
-}
-
-int
+    top_tab = map_domain_page_with_cache(tabmfn, l2cache);
+
+    if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
+        domain_crash(d);
+
+    unmap_domain_page_with_cache(top_tab, l2cache);
+
+    return error;
+}
+
+static int
 alloc_p2m_table(struct domain *d)
 {
     struct list_head *list_ent;
     unsigned long va = RO_MPT_VIRT_START; /*  phys_to_machine_mapping */
     pgentry_64_t *top_tab = NULL;
     unsigned long mfn;
-    int gpa;
-
-    ASSERT ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
+    int gpfn, error = 0;
+
+    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 
     top_tab = map_domain_page(
         pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 
-
     list_ent = d->page_list.next;
 
-    for ( gpa = 0; list_ent != &d->page_list; gpa++ ) 
+    for ( gpfn = 0; list_ent != &d->page_list; gpfn++ )
     {
         struct page_info *page;
+
         page = list_entry(list_ent, struct page_info, list);
         mfn = page_to_mfn(page);
 
-        map_p2m_entry(top_tab, va, gpa, mfn);
+        if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) )
+        {
+            domain_crash(d);
+            break;
+        }
+
         list_ent = frame_table[mfn].list.next;
         va += sizeof(mfn);
     }
 
     unmap_domain_page(top_tab);
 
-    return 1;
+    return error;
 }
 
 #if CONFIG_PAGING_LEVELS >= 3
 static void
-free_p2m_table(struct vcpu *v)
+free_p2m_table(struct domain *d)
 {
     unsigned long va;
     l1_pgentry_t *l1tab;
@@ -1520,27 +1580,35 @@
     l2_pgentry_t *l2tab;
     l2_pgentry_t l2e;
 #if CONFIG_PAGING_LEVELS >= 3
-    l3_pgentry_t *l3tab; 
+    l3_pgentry_t *l3tab;
     l3_pgentry_t l3e;
 #endif
 #if CONFIG_PAGING_LEVELS == 4
     int i3;
-    l4_pgentry_t *l4tab; 
+    l4_pgentry_t *l4tab;
     l4_pgentry_t l4e;
 #endif
 
-    ASSERT ( pagetable_get_pfn(v->arch.monitor_table) );
+    ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) );
 
 #if CONFIG_PAGING_LEVELS == 4
     l4tab = map_domain_page(
-        pagetable_get_pfn(v->arch.monitor_table));
+        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
 #endif
 #if CONFIG_PAGING_LEVELS == 3
     l3tab = map_domain_page(
-        pagetable_get_pfn(v->arch.monitor_table));
-
-    va = RO_MPT_VIRT_START;
-    l3e = l3tab[l3_table_offset(va)];
+        pagetable_get_pfn(d->vcpu[0]->arch.monitor_table));
+
+    l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)];
+
+    /*
+     * NB: when CONFIG_PAGING_LEVELS == 3,
+     * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here.
+     * alloc_monitor_pagetable should guarantee this.
+     */
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+        BUG();
+
     l2tab = map_domain_page(l3e_get_pfn(l3e));
 #endif
 
@@ -1555,8 +1623,8 @@
 
             for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ )
             {
-
                 l3e = l3tab[l3_table_offset(va)];
+
                 if ( l3e_get_flags(l3e) & _PAGE_PRESENT )
                 {
                     int i2;
@@ -1567,12 +1635,13 @@
                     {
 #endif
                         l2e = l2tab[l2_table_offset(va)];
+
                         if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
                         {
                             int i1;
 
                             l1tab = map_domain_page(l2e_get_pfn(l2e));
-                            
+
                             /*
                              * unsigned long phys_to_machine_mapping[]
                              */
@@ -1591,7 +1660,7 @@
                         else
                             va += PAGE_SIZE * L1_PAGETABLE_ENTRIES;
 
-#if CONFIG_PAGING_LEVELS == 4                    
+#if CONFIG_PAGING_LEVELS == 4
                     }
                     unmap_domain_page(l2tab);
                     free_domheap_page(mfn_to_page(l3e_get_pfn(l3e)));
@@ -1603,7 +1672,7 @@
             free_domheap_page(mfn_to_page(l4e_get_pfn(l4e)));
         }
         else
-            va += PAGE_SIZE * 
+            va += PAGE_SIZE *
                L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES * L3_PAGETABLE_ENTRIES;
 #endif
     }
@@ -1622,7 +1691,7 @@
     paddr_t pa, l1_pgentry_t gpte,
     struct domain_mmap_cache *cache)
 {
-    unsigned long sl1mfn;    
+    unsigned long sl1mfn;
     l1_pgentry_t *spl1e, spte;
 
     shadow_lock(d);
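
The renaming above (tabpfn -> tabmfn, pfn -> gpfn) brings the code in line
with the usual Xen terminology; a short summary, together with the return
convention the new error paths rely on:

    /*
     * gpfn - guest physical frame number: the guest's index into
     *        phys_to_machine_mapping[], i.e. into the p2m table.
     * mfn  - machine frame number: the real hardware frame backing it.
     *        The monitor table root is a machine frame, hence "tabmfn".
     *
     * map_p2m_entry() returns nonzero on success and 0 on failure, so
     * both call sites above crash the domain on a zero return and then
     * propagate the value to their own callers unchanged.
     */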
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/traps.c      Wed Mar  1 19:47:25 2006
@@ -951,6 +951,7 @@
             
         case 3: /* Write CR3 */
             LOCK_BIGLOCK(v->domain);
+            cleanup_writable_pagetable(v->domain);
             (void)new_guest_cr3(gmfn_to_mfn(v->domain, paddr_to_pfn(*reg)));
             UNLOCK_BIGLOCK(v->domain);
             break;
@@ -1002,7 +1003,6 @@
 #endif
         default:
             if ( (rdmsr_safe(regs->ecx, l, h) != 0) ||
-                 (regs->ecx != MSR_EFER) ||
                  (regs->eax != l) || (regs->edx != h) )
                 DPRINTK("Domain attempted WRMSR %p from "
                         "%08x:%08x to %08lx:%08lx.\n",
@@ -1033,8 +1033,8 @@
                 goto fail;
             break;
         default:
-            DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));
             /* Everyone can read the MSR space. */
+            /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/
             if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
                 goto fail;
             break;
@@ -1416,8 +1416,8 @@
     {
         if ( hypercall_preempt_check() )
         {
-            rc = hypercall1_create_continuation(
-                __HYPERVISOR_set_trap_table, traps);
+            rc = hypercall_create_continuation(
+                __HYPERVISOR_set_trap_table, "p", traps);
             break;
         }
 
@@ -1430,7 +1430,7 @@
         if ( cur.address == 0 )
             break;
 
-        fixup_guest_selector(cur.cs);
+        fixup_guest_code_selector(cur.cs);
 
         memcpy(&dst[cur.vector], &cur, sizeof(cur));
 
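Throughout this changeset the old hypercallN_create_continuation() helpers
are replaced by a single varargs hypercall_create_continuation(op, fmt, ...).
Judging from the call sites in this diff, each format character describes one
argument: 'i' int, 'l' unsigned long, 'p' kernel pointer, 'h' guest handle
(the letter meanings are an inference from usage, not spelled out here).
The pattern, as used later in xen/common/memory.c:

    /* Preempted: arrange for do_memory_op(cmd, arg) to be re-entered
     * where it left off.  "lh" = one long, one guest handle. */
    if ( preempted )
        return hypercall_create_continuation(
            __HYPERVISOR_memory_op, "lh",
            op | (rc << START_EXTENT_SHIFT), arg);
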
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/asm-offsets.c Wed Mar  1 19:47:25 2006
@@ -72,6 +72,13 @@
     DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
     BLANK();
 
+    OFFSET(TSS_ss0, struct tss_struct, ss0);
+    OFFSET(TSS_esp0, struct tss_struct, esp0);
+    OFFSET(TSS_ss1, struct tss_struct, ss1);
+    OFFSET(TSS_esp1, struct tss_struct, esp1);
+    DEFINE(TSS_sizeof, sizeof(struct tss_struct));
+    BLANK();
+
     OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa);
     OFFSET(VCPU_svm_hsa_pa,  struct vcpu, arch.hvm_svm.host_save_pa);
     OFFSET(VCPU_svm_vmcb, struct vcpu, arch.hvm_svm.vmcb);
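
The new TSS_* constants feed the ring-0 guest entry/exit code added to
entry.S below. OFFSET()/DEFINE()/BLANK() follow the usual Linux-derived
asm-offsets trick: this file is compiled to assembly only, and the build
greps the "->" markers to generate a header of constants usable from .S
files. A sketch of the macros as commonly defined (an assumption; their
definitions are not part of this diff):

    #include <stddef.h>                 /* offsetof */

    #define DEFINE(_sym, _val) \
        __asm__ __volatile__ ( "\n->" #_sym " %0 " #_val : : "i" (_val) )
    #define BLANK() \
        __asm__ __volatile__ ( "\n->" : : )
    #define OFFSET(_sym, _str, _mem) \
        DEFINE(_sym, offsetof(_str, _mem))

    /* So OFFSET(TSS_esp0, struct tss_struct, esp0) becomes, after the
     * build step, "#define TSS_esp0 <byte offset>" for use in entry.S. */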
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/entry.S       Wed Mar  1 19:47:25 2006
@@ -77,6 +77,13 @@
 restore_all_guest:
         testl $X86_EFLAGS_VM,UREGS_eflags(%esp)
         jnz  restore_all_vm86
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+        testl $2,UREGS_cs(%esp)
+        jnz   1f
+        call  restore_ring0_guest
+        jmp   restore_all_vm86
+1:
+#endif
 FLT1:   mov  UREGS_ds(%esp),%ds
 FLT2:   mov  UREGS_es(%esp),%es
 FLT3:   mov  UREGS_fs(%esp),%fs
@@ -157,6 +164,7 @@
         ALIGN
 ENTRY(hypercall)
         subl $4,%esp
+        FIXUP_RING0_GUEST_STACK
        SAVE_ALL(b)
         sti
         GET_CURRENT(%ebx)
@@ -294,6 +302,11 @@
         popl %eax
         shll $16,%eax                    # Bits 16-23: saved_upcall_mask
         movw UREGS_cs+4(%esp),%ax        # Bits  0-15: CS
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+        testw $2,%ax
+        jnz  FLT15
+        and  $~3,%ax                     # RPL 1 -> RPL 0
+#endif
 FLT15:  movl %eax,%gs:4(%esi) 
         test $0x00FF0000,%eax            # Bits 16-23: saved_upcall_mask
         setz %ch                         # %ch == !saved_upcall_mask
@@ -388,6 +401,7 @@
        pushl $TRAP_divide_error<<16
        ALIGN
 error_code:
+        FIXUP_RING0_GUEST_STACK
         SAVE_ALL_NOSEGREGS(a)
         SET_XEN_SEGMENTS(a)
         testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%esp)
@@ -505,6 +519,10 @@
        jmp error_code
 
 ENTRY(nmi)
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+        # NMI entry protocol is incompatible with guest kernel in ring 0.
+        iret
+#else
         # Save state but do not trash the segment registers!
         # We may otherwise be unable to reload them or copy them to ring 1. 
        pushl %eax
@@ -546,6 +564,7 @@
         movl  $(APIC_DM_FIXED | APIC_DEST_SELF | APIC_DEST_LOGICAL | \
                 TRAP_deferred_nmi),%ss:APIC_ICR(%eax)
         jmp   restore_all_xen
+#endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */
 
 ENTRY(setup_vm86_frame)
         # Copies the entire stack frame forwards by 16 bytes.
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/mm.c
--- a/xen/arch/x86/x86_32/mm.c  Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/mm.c  Wed Mar  1 19:47:25 2006
@@ -23,6 +23,7 @@
 #include <xen/init.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <xen/guest_access.h>
 #include <asm/current.h>
 #include <asm/page.h>
 #include <asm/flushtlb.h>
@@ -180,9 +181,18 @@
             page_set_owner(page, dom_xen);
         }
     }
-}
-
-long subarch_memory_op(int op, void *arg)
+
+    if ( supervisor_mode_kernel )
+    {
+        /* Guest kernel runs in ring 0, not ring 1. */
+        struct desc_struct *d;
+        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+        d[0].b &= ~_SEGMENT_DPL;
+        d[1].b &= ~_SEGMENT_DPL;
+    }
+}
+
+long subarch_memory_op(int op, GUEST_HANDLE(void) arg)
 {
     struct xen_machphys_mfn_list xmml;
     unsigned long mfn;
@@ -192,7 +202,7 @@
     switch ( op )
     {
     case XENMEM_machphys_mfn_list:
-        if ( copy_from_user(&xmml, arg, sizeof(xmml)) )
+        if ( copy_from_guest(&xmml, arg, 1) )
             return -EFAULT;
 
         max = min_t(unsigned int, xmml.max_extents, mpt_size >> 21);
@@ -201,11 +211,12 @@
         {
             mfn = l2e_get_pfn(idle_pg_table_l2[l2_linear_offset(
                 RDWR_MPT_VIRT_START + (i << 21))]) + l1_table_offset(i << 21);
-            if ( put_user(mfn, &xmml.extent_start[i]) )
+            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                 return -EFAULT;
         }
 
-        if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) )
+        xmml.nr_extents = i;
+        if ( copy_to_guest(arg, &xmml, 1) )
             return -EFAULT;
 
         break;
@@ -223,7 +234,7 @@
     int nr = smp_processor_id();
     struct tss_struct *t = &init_tss[nr];
 
-    fixup_guest_selector(ss);
+    fixup_guest_stack_selector(ss);
 
     current->arch.guest_context.kernel_ss = ss;
     current->arch.guest_context.kernel_sp = esp;
@@ -239,6 +250,10 @@
     unsigned long base, limit;
     u32 a = d->a, b = d->b;
     u16 cs;
+
+    /* Let a ring0 guest kernel set any descriptor it wants to. */
+    if ( supervisor_mode_kernel )
+        return 1;
 
     /* A not-present descriptor will always fault, so is safe. */
     if ( !(b & _SEGMENT_P) ) 
@@ -273,7 +288,7 @@
 
         /* Validate and fix up the target code selector. */
         cs = a >> 16;
-        fixup_guest_selector(cs);
+        fixup_guest_code_selector(cs);
         if ( !guest_gate_selector_okay(cs) )
             goto bad;
         a = d->a = (d->a & 0xffffU) | (cs << 16);
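
Clearing _SEGMENT_DPL drops the descriptor privilege level of the flat
ring-1 code/stack descriptors from 1 to 0, so the selectors a PV kernel
already loads remain usable when it runs at CPL 0. In outline (the mask
value is an assumption about _SEGMENT_DPL, not taken from this diff):

    /* DPL occupies bits 13:14 of the high descriptor word 'b'
     * (descriptor bits 45:46). */
    d[0].b &= ~(3u << 13);      /* code descriptor:  DPL 1 -> DPL 0 */
    d[1].b &= ~(3u << 13);      /* stack descriptor: DPL 1 -> DPL 0 */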
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/traps.c       Wed Mar  1 19:47:25 2006
@@ -256,8 +256,14 @@
      * We can't virtualise interrupt gates, as there's no way to get
      * the CPU to automatically clear the events_mask variable. Also we
      * must ensure that the CS is safe to poke into an interrupt gate.
-     */
-    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) )
+     *
+     * When running with supervisor_mode_kernel enabled a direct trap
+     * to the guest OS cannot be used because the INT instruction will
+     * switch to the Xen stack and we need to swap back to the guest
+     * kernel stack before passing control to the system call entry point.
+     */
+    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) ||
+         supervisor_mode_kernel )
     {
         v->arch.int80_desc.a = v->arch.int80_desc.b = 0;
         return;
@@ -278,8 +284,8 @@
 {
     struct vcpu *d = current;
 
-    fixup_guest_selector(event_selector);
-    fixup_guest_selector(failsafe_selector);
+    fixup_guest_code_selector(event_selector);
+    fixup_guest_code_selector(failsafe_selector);
 
     d->arch.guest_context.event_callback_cs     = event_selector;
     d->arch.guest_context.event_callback_eip    = event_address;
@@ -289,12 +295,51 @@
     return 0;
 }
 
-void hypercall_page_initialise(void *hypercall_page)
-{
+static void hypercall_page_initialise_ring0_kernel(void *hypercall_page)
+{
+    extern asmlinkage int hypercall(void);
     char *p;
     int i;
 
     /* Fill in all the transfer points with template machine code. */
+
+    for ( i = 0; i < NR_hypercalls; i++ )
+    {
+        p = (char *)(hypercall_page + (i * 32));
+
+        *(u8  *)(p+ 0) = 0x9c;      /* pushf */
+        *(u8  *)(p+ 1) = 0xfa;      /* cli */
+        *(u8  *)(p+ 2) = 0xb8;      /* mov $<i>,%eax */
+        *(u32 *)(p+ 3) = i;
+        *(u8  *)(p+ 7) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
+        *(u32 *)(p+ 8) = (u32)&hypercall;
+        *(u16 *)(p+12) = (u16)__HYPERVISOR_CS;
+        *(u8  *)(p+14) = 0xc3;      /* ret */
+    }
+
+    /*
+     * HYPERVISOR_iret is special because it doesn't return and expects a
+     * special stack frame. Guests jump at this transfer point instead of
+     * calling it.
+     */
+    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
+    *(u8  *)(p+ 0) = 0x50;      /* push %eax */
+    *(u8  *)(p+ 1) = 0x9c;      /* pushf */
+    *(u8  *)(p+ 2) = 0xfa;      /* cli */
+    *(u8  *)(p+ 3) = 0xb8;      /* mov $<i>,%eax */
+    *(u32 *)(p+ 4) = __HYPERVISOR_iret;
+    *(u8  *)(p+ 8) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
+    *(u32 *)(p+ 9) = (u32)&hypercall;
+    *(u16 *)(p+13) = (u16)__HYPERVISOR_CS;
+}
+
+static void hypercall_page_initialise_ring1_kernel(void *hypercall_page)
+{
+    char *p;
+    int i;
+
+    /* Fill in all the transfer points with template machine code. */
+
     for ( i = 0; i < (PAGE_SIZE / 32); i++ )
     {
         p = (char *)(hypercall_page + (i * 32));
@@ -314,6 +359,14 @@
     *(u8  *)(p+ 1) = 0xb8;    /* mov  $__HYPERVISOR_iret,%eax */
     *(u32 *)(p+ 2) = __HYPERVISOR_iret;
     *(u16 *)(p+ 6) = 0x82cd;  /* int  $0x82 */
+}
+
+void hypercall_page_initialise(void *hypercall_page)
+{
+    if ( supervisor_mode_kernel )
+        hypercall_page_initialise_ring0_kernel(hypercall_page);
+    else
+        hypercall_page_initialise_ring1_kernel(hypercall_page);
 }
 
 /*
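
Both stub flavours keep the same caller contract: the stub itself loads
%eax with the hypercall number, the remaining arguments are passed in
registers, and the stub returns with a near ret -- so guests invoke
hypercalls identically whether the transfer is int $0x82 (ring-1 kernels)
or an lcall through __HYPERVISOR_CS (ring-0 kernels). A guest-side sketch
in the style of the Linux _hypercallN macros (names and constraints here
are illustrative, not defined by this patch):

    static inline long hypercall2(unsigned int nr, long a1, long a2)
    {
        long res, ign1, ign2;
        /* hypercall_page (char *): the guest's mapping of the page
         * built above; each transfer point is 32 bytes apart. */
        asm volatile ( "call *%%eax"
                       : "=a" (res), "=b" (ign1), "=c" (ign2)
                       : "a" (hypercall_page + (nr * 32)),
                         "b" (a1), "c" (a2)
                       : "memory" );
        return res;
    }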
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_64/mm.c
--- a/xen/arch/x86/x86_64/mm.c  Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_64/mm.c  Wed Mar  1 19:47:25 2006
@@ -22,6 +22,7 @@
 #include <xen/init.h>
 #include <xen/mm.h>
 #include <xen/sched.h>
+#include <xen/guest_access.h>
 #include <asm/current.h>
 #include <asm/asm_defns.h>
 #include <asm/page.h>
@@ -182,7 +183,7 @@
     }
 }
 
-long subarch_memory_op(int op, void *arg)
+long subarch_memory_op(int op, GUEST_HANDLE(void) arg)
 {
     struct xen_machphys_mfn_list xmml;
     l3_pgentry_t l3e;
@@ -194,7 +195,7 @@
     switch ( op )
     {
     case XENMEM_machphys_mfn_list:
-        if ( copy_from_user(&xmml, arg, sizeof(xmml)) )
+        if ( copy_from_guest(&xmml, arg, 1) )
             return -EFAULT;
 
         for ( i = 0, v = RDWR_MPT_VIRT_START;
@@ -209,11 +210,12 @@
             if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                 break;
             mfn = l2e_get_pfn(l2e) + l1_table_offset(v);
-            if ( put_user(mfn, &xmml.extent_start[i]) )
+            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                 return -EFAULT;
         }
 
-        if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) )
+        xmml.nr_extents = i;
+        if ( copy_to_guest(arg, &xmml, 1) )
             return -EFAULT;
 
         break;
@@ -228,7 +230,7 @@
 
 long do_stack_switch(unsigned long ss, unsigned long esp)
 {
-    fixup_guest_selector(ss);
+    fixup_guest_stack_selector(ss);
     current->arch.guest_context.kernel_ss = ss;
     current->arch.guest_context.kernel_sp = esp;
     return 0;
@@ -315,7 +317,7 @@
 
     /* Validate and fix up the target code selector. */
     cs = a >> 16;
-    fixup_guest_selector(cs);
+    fixup_guest_code_selector(cs);
     if ( !guest_gate_selector_okay(cs) )
         goto bad;
     a = d->a = (d->a & 0xffffU) | (cs << 16);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/dom0_ops.c
--- a/xen/common/dom0_ops.c     Wed Mar  1 17:01:54 2006
+++ b/xen/common/dom0_ops.c     Wed Mar  1 19:47:25 2006
@@ -46,6 +46,7 @@
     struct vcpu   *v;
     u64 cpu_time = 0;
     int flags = DOMFLAGS_BLOCKED;
+    struct vcpu_runstate_info runstate;
     
     info->domain = d->domain_id;
     info->nr_online_vcpus = 0;
@@ -55,7 +56,8 @@
      * - domain is marked as running if any of its vcpus is running
      */
     for_each_vcpu ( d, v ) {
-        cpu_time += v->cpu_time;
+        vcpu_runstate_get(v, &runstate);
+        cpu_time += runstate.time[RUNSTATE_running];
         info->max_vcpu_id = v->vcpu_id;
         if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
         {
@@ -165,7 +167,15 @@
         domid_t        dom;
         struct vcpu   *v;
         unsigned int   i, cnt[NR_CPUS] = { 0 };
+        cpumask_t      cpu_exclude_map;
         static domid_t rover = 0;
+
+        /*
+         * Running the domain 0 kernel in ring 0 is not compatible
+         * with multiple guests.
+         */
+        if ( supervisor_mode_kernel )
+            return -EINVAL;
 
         dom = op->u.createdomain.domain;
         if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) )
@@ -195,18 +205,29 @@
         read_lock(&domlist_lock);
         for_each_domain ( d )
             for_each_vcpu ( d, v )
-                cnt[v->processor]++;
+                if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
+                    cnt[v->processor]++;
         read_unlock(&domlist_lock);
         
         /*
-         * If we're on a HT system, we only use the first HT for dom0, other 
-         * domains will all share the second HT of each CPU. Since dom0 is on 
-         * CPU 0, we favour high numbered CPUs in the event of a tie.
+         * If we're on a HT system, we only auto-allocate to a non-primary HT.
+         * We favour high numbered CPUs in the event of a tie.
          */
-        pro = smp_num_siblings - 1;
-        for ( i = pro; i < num_online_cpus(); i += smp_num_siblings )
+        pro = first_cpu(cpu_sibling_map[0]);
+        if ( cpus_weight(cpu_sibling_map[0]) > 1 )
+            pro = next_cpu(pro, cpu_sibling_map[0]);
+        cpu_exclude_map = cpu_sibling_map[0];
+        for_each_online_cpu ( i )
+        {
+            if ( cpu_isset(i, cpu_exclude_map) )
+                continue;
+            if ( (i == first_cpu(cpu_sibling_map[i])) &&
+                 (cpus_weight(cpu_sibling_map[i]) > 1) )
+                continue;
+            cpus_or(cpu_exclude_map, cpu_exclude_map, cpu_sibling_map[i]);
             if ( cnt[i] <= cnt[pro] )
                 pro = i;
+        }
 
         ret = -ENOMEM;
         if ( (d = domain_create(dom, pro)) == NULL )
@@ -485,6 +506,7 @@
     { 
         struct domain *d;
         struct vcpu   *v;
+        struct vcpu_runstate_info runstate;
 
         ret = -ESRCH;
         if ( (d = find_domain_by_id(op->u.getvcpuinfo.domain)) == NULL )
@@ -498,10 +520,12 @@
         if ( (v = d->vcpu[op->u.getvcpuinfo.vcpu]) == NULL )
             goto getvcpuinfo_out;
 
+        vcpu_runstate_get(v, &runstate);
+
         op->u.getvcpuinfo.online   = !test_bit(_VCPUF_down, &v->vcpu_flags);
         op->u.getvcpuinfo.blocked  = test_bit(_VCPUF_blocked, &v->vcpu_flags);
         op->u.getvcpuinfo.running  = test_bit(_VCPUF_running, &v->vcpu_flags);
-        op->u.getvcpuinfo.cpu_time = v->cpu_time;
+        op->u.getvcpuinfo.cpu_time = runstate.time[RUNSTATE_running];
         op->u.getvcpuinfo.cpu      = v->processor;
         op->u.getvcpuinfo.cpumap   = 0;
         memcpy(&op->u.getvcpuinfo.cpumap,
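
A worked example of the revised createdomain placement above, on a
hypothetical two-package hyperthreaded box with sibling pairs {0,1} and
{2,3} (dom0 on CPU 0):

    /* cpu_exclude_map starts as cpu_sibling_map[0] = {0,1}.
     * pro = first_cpu({0,1}) = 0; the pair has two threads, so pro = 1.
     *   i = 0, 1 : already excluded (dom0's siblings)  - skipped.
     *   i = 2    : primary thread of pair {2,3}        - skipped.
     *   i = 3    : considered; {2,3} joins the exclude map;
     *              chosen if cnt[3] <= cnt[1].
     * Net effect: auto-placement only ever picks secondary hyperthreads,
     * and (since later CPUs win ties via "<=") favours high CPU numbers,
     * as the updated comment states. Offline VCPUs no longer inflate
     * cnt[] thanks to the new _VCPUF_down test. */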
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/domain.c
--- a/xen/common/domain.c       Wed Mar  1 17:01:54 2006
+++ b/xen/common/domain.c       Wed Mar  1 19:47:25 2006
@@ -451,6 +451,41 @@
     case VCPUOP_is_up:
         rc = !test_bit(_VCPUF_down, &v->vcpu_flags);
         break;
+
+    case VCPUOP_get_runstate_info:
+    {
+        struct vcpu_runstate_info runstate;
+        vcpu_runstate_get(v, &runstate);
+        if ( copy_to_user(arg, &runstate, sizeof(runstate)) )
+            rc = -EFAULT;
+        break;
+    }
+
+    case VCPUOP_register_runstate_memory_area:
+    {
+        struct vcpu_register_runstate_memory_area area;
+
+        rc = -EINVAL;
+        if ( v != current )
+            break;
+
+        rc = -EFAULT;
+        if ( copy_from_user(&area, arg, sizeof(area)) )
+            break;
+
+        if ( !access_ok(area.addr.v, sizeof(*area.addr.v)) )
+            break;
+
+        rc = 0;
+        v->runstate_guest = area.addr.v;
+        __copy_to_user(v->runstate_guest, &v->runstate, sizeof(v->runstate));
+
+        break;
+    }
+
+    default:
+        rc = -ENOSYS;
+        break;
     }
 
     return rc;
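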
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/kernel.c
--- a/xen/common/kernel.c       Wed Mar  1 17:01:54 2006
+++ b/xen/common/kernel.c       Wed Mar  1 19:47:25 2006
@@ -195,6 +195,8 @@
                     (1U << XENFEAT_writable_page_tables) |
                     (1U << XENFEAT_auto_translated_physmap) |
                     (1U << XENFEAT_pae_pgdir_above_4gb);
+            if ( supervisor_mode_kernel )
+                fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
             break;
         default:
             return -EINVAL;
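
Guests can probe for this at boot via XENVER_get_features. A minimal
sketch, assuming the usual HYPERVISOR_xen_version() wrapper and the
xen_feature_info layout from public/version.h:

    xen_feature_info_t fi;

    fi.submap_idx = 0;
    if ( (HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0) &&
         (fi.submap & (1U << XENFEAT_supervisor_mode_kernel)) )
        /* the hypervisor is running this kernel in ring 0 */;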
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/keyhandler.c
--- a/xen/common/keyhandler.c   Wed Mar  1 17:01:54 2006
+++ b/xen/common/keyhandler.c   Wed Mar  1 19:47:25 2006
@@ -169,8 +169,6 @@
 }
 
 extern void dump_runq(unsigned char key);
-extern void print_sched_histo(unsigned char key);
-extern void reset_sched_histo(unsigned char key);
 #ifndef NDEBUG
 extern void audit_domains_key(unsigned char key);
 #endif
@@ -206,10 +204,6 @@
         'd', dump_registers, "dump registers"); 
     register_keyhandler(
         'h', show_handlers, "show this message");
-    register_keyhandler(
-        'l', print_sched_histo, "print sched latency histogram");
-    register_keyhandler(
-        'L', reset_sched_histo, "reset sched latency histogram");
     register_keyhandler(
         'q', dump_domains, "dump domain (and guest debug) info");
     register_keyhandler(
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/memory.c
--- a/xen/common/memory.c       Wed Mar  1 17:01:54 2006
+++ b/xen/common/memory.c       Wed Mar  1 19:47:25 2006
@@ -16,6 +16,7 @@
 #include <xen/event.h>
 #include <xen/shadow.h>
 #include <xen/iocap.h>
+#include <xen/guest_access.h>
 #include <asm/current.h>
 #include <asm/hardirq.h>
 #include <public/memory.h>
@@ -30,7 +31,7 @@
 static long
 increase_reservation(
     struct domain *d, 
-    unsigned long *extent_list, 
+    GUEST_HANDLE(xen_ulong) extent_list,
     unsigned int   nr_extents,
     unsigned int   extent_order,
     unsigned int   flags,
@@ -39,8 +40,8 @@
     struct page_info *page;
     unsigned long     i, mfn;
 
-    if ( (extent_list != NULL) &&
-         !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
+    if ( !guest_handle_is_null(extent_list) &&
+         !guest_handle_okay(extent_list, nr_extents) )
         return 0;
 
     if ( (extent_order != 0) &&
@@ -65,10 +66,10 @@
         }
 
         /* Inform the domain of the new page's machine address. */ 
-        if ( extent_list != NULL )
+        if ( !guest_handle_is_null(extent_list) )
         {
             mfn = page_to_mfn(page);
-            if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) )
+            if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) )
                 return i;
         }
     }
@@ -79,16 +80,16 @@
 static long
 populate_physmap(
     struct domain *d, 
-    unsigned long *extent_list, 
-    unsigned int   nr_extents,
-    unsigned int   extent_order,
-    unsigned int   flags,
-    int           *preempted)
+    GUEST_HANDLE(xen_ulong) extent_list,
+    unsigned int  nr_extents,
+    unsigned int  extent_order,
+    unsigned int  flags,
+    int          *preempted)
 {
     struct page_info *page;
     unsigned long    i, j, gpfn, mfn;
 
-    if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
+    if ( !guest_handle_okay(extent_list, nr_extents) )
         return 0;
 
     if ( (extent_order != 0) &&
@@ -103,7 +104,7 @@
             goto out;
         }
 
-        if ( unlikely(__copy_from_user(&gpfn, &extent_list[i], sizeof(gpfn))) )
+        if ( unlikely(__copy_from_guest_offset(&gpfn, extent_list, i, 1)) )
             goto out;
 
         if ( unlikely((page = alloc_domheap_pages(
@@ -128,7 +129,7 @@
                 set_gpfn_from_mfn(mfn + j, gpfn + j);
 
             /* Inform the domain of the new page's machine address. */ 
-            if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) )
+            if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) )
                 goto out;
         }
     }
@@ -139,8 +140,8 @@
     
 static long
 decrease_reservation(
-    struct domain *d, 
-    unsigned long *extent_list, 
+    struct domain *d,
+    GUEST_HANDLE(xen_ulong) extent_list,
     unsigned int   nr_extents,
     unsigned int   extent_order,
     unsigned int   flags,
@@ -149,7 +150,7 @@
     struct page_info *page;
     unsigned long    i, j, gmfn, mfn;
 
-    if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) )
+    if ( !guest_handle_okay(extent_list, nr_extents) )
         return 0;
 
     for ( i = 0; i < nr_extents; i++ )
@@ -160,7 +161,7 @@
             return i;
         }
 
-        if ( unlikely(__copy_from_user(&gmfn, &extent_list[i], sizeof(gmfn))) )
+        if ( unlikely(__copy_from_guest_offset(&gmfn, extent_list, i, 1)) )
             return i;
 
         for ( j = 0; j < (1 << extent_order); j++ )
@@ -197,21 +198,21 @@
 
 static long
 translate_gpfn_list(
-    struct xen_translate_gpfn_list *uop, unsigned long *progress)
+    GUEST_HANDLE(xen_translate_gpfn_list_t) uop, unsigned long *progress)
 {
     struct xen_translate_gpfn_list op;
     unsigned long i, gpfn, mfn;
     struct domain *d;
 
-    if ( copy_from_user(&op, uop, sizeof(op)) )
+    if ( copy_from_guest(&op, uop, 1) )
         return -EFAULT;
 
     /* Is size too large for us to encode a continuation? */
     if ( op.nr_gpfns > (ULONG_MAX >> START_EXTENT_SHIFT) )
         return -EINVAL;
 
-    if ( !array_access_ok(op.gpfn_list, op.nr_gpfns, sizeof(*op.gpfn_list)) ||
-         !array_access_ok(op.mfn_list, op.nr_gpfns, sizeof(*op.mfn_list)) )
+    if ( !guest_handle_okay(op.gpfn_list, op.nr_gpfns) ||
+         !guest_handle_okay(op.mfn_list,  op.nr_gpfns) )
         return -EFAULT;
 
     if ( op.domid == DOMID_SELF )
@@ -237,8 +238,7 @@
             return -EAGAIN;
         }
 
-        if ( unlikely(__copy_from_user(&gpfn, &op.gpfn_list[i],
-                                       sizeof(gpfn))) )
+        if ( unlikely(__copy_from_guest_offset(&gpfn, op.gpfn_list, i, 1)) )
         {
             put_domain(d);
             return -EFAULT;
@@ -246,8 +246,7 @@
 
         mfn = gmfn_to_mfn(d, gpfn);
 
-        if ( unlikely(__copy_to_user(&op.mfn_list[i], &mfn,
-                                     sizeof(mfn))) )
+        if ( unlikely(__copy_to_guest_offset(op.mfn_list, i, &mfn, 1)) )
         {
             put_domain(d);
             return -EFAULT;
@@ -258,7 +257,7 @@
     return 0;
 }
 
-long do_memory_op(unsigned long cmd, void *arg)
+long do_memory_op(unsigned long cmd, GUEST_HANDLE(void) arg)
 {
     struct domain *d;
     int rc, op, flags = 0, preempted = 0;
@@ -273,7 +272,7 @@
     case XENMEM_increase_reservation:
     case XENMEM_decrease_reservation:
     case XENMEM_populate_physmap:
-        if ( copy_from_user(&reservation, arg, sizeof(reservation)) )
+        if ( copy_from_guest(&reservation, arg, 1) )
             return -EFAULT;
 
         /* Is size too large for us to encode a continuation? */
@@ -283,9 +282,9 @@
         start_extent = cmd >> START_EXTENT_SHIFT;
         if ( unlikely(start_extent > reservation.nr_extents) )
             return -EINVAL;
-        
-        if ( reservation.extent_start != NULL )
-            reservation.extent_start += start_extent;
+
+        if ( !guest_handle_is_null(reservation.extent_start) )
+            guest_handle_add_offset(reservation.extent_start, start_extent);
         reservation.nr_extents -= start_extent;
 
         if ( (reservation.address_bits != 0) &&
@@ -342,8 +341,9 @@
         rc += start_extent;
 
         if ( preempted )
-            return hypercall2_create_continuation(
-                __HYPERVISOR_memory_op, op | (rc << START_EXTENT_SHIFT), arg);
+            return hypercall_create_continuation(
+                __HYPERVISOR_memory_op, "lh",
+                op | (rc << START_EXTENT_SHIFT), arg);
 
         break;
 
@@ -353,10 +353,10 @@
 
     case XENMEM_current_reservation:
     case XENMEM_maximum_reservation:
-        if ( copy_from_user(&domid, (domid_t *)arg, sizeof(domid)) )
+        if ( copy_from_guest(&domid, arg, 1) )
             return -EFAULT;
 
-        if ( likely((domid = (unsigned long)arg) == DOMID_SELF) )
+        if ( likely(domid == DOMID_SELF) )
             d = current->domain;
         else if ( !IS_PRIV(current->domain) )
             return -EPERM;
@@ -372,12 +372,13 @@
 
     case XENMEM_translate_gpfn_list:
         progress = cmd >> START_EXTENT_SHIFT;
-        rc = translate_gpfn_list(arg, &progress);
+        rc = translate_gpfn_list(
+            guest_handle_cast(arg, xen_translate_gpfn_list_t),
+            &progress);
         if ( rc == -EAGAIN )
-            return hypercall2_create_continuation(
-                __HYPERVISOR_memory_op,
-                op | (progress << START_EXTENT_SHIFT),
-                arg);
+            return hypercall_create_continuation(
+                __HYPERVISOR_memory_op, "lh",
+                op | (progress << START_EXTENT_SHIFT), arg);
         break;
 
     default:
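
The conversions in this file are the heart of the changeset's interface
cleanup: raw guest pointers become opaque GUEST_HANDLE() types, so every
guest-memory access funnels through copy_{to,from}_guest*() where it can
be validated (or, on other architectures, translated) in one place. In
outline the handle is just a typed wrapper around the pointer -- a sketch
of the era's public-header definitions, not part of this diff:

    #define __DEFINE_GUEST_HANDLE(name, type) \
        typedef struct { type *p; } __guest_handle_ ## name
    #define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
    #define GUEST_HANDLE(name)        __guest_handle_ ## name

    /* copy_to_guest_offset(h, i, &v, 1) then acts as a checked
     * "h.p[i] = v", and guest_handle_is_null(h) tests h.p == NULL. */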
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/multicall.c
--- a/xen/common/multicall.c    Wed Mar  1 17:01:54 2006
+++ b/xen/common/multicall.c    Wed Mar  1 19:47:25 2006
@@ -81,8 +81,8 @@
             if ( i < nr_calls )
             {
                 mcs->flags = 0;
-                return hypercall2_create_continuation(
-                    __HYPERVISOR_multicall, &call_list[i], nr_calls-i);
+                return hypercall_create_continuation(
+                    __HYPERVISOR_multicall, "pi", &call_list[i], nr_calls-i);
             }
         }
     }
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Wed Mar  1 17:01:54 2006
+++ b/xen/common/page_alloc.c   Wed Mar  1 19:47:25 2006
@@ -32,6 +32,7 @@
 #include <xen/softirq.h>
 #include <xen/shadow.h>
 #include <xen/domain_page.h>
+#include <xen/keyhandler.h>
 #include <asm/page.h>
 
 /*
@@ -662,6 +663,26 @@
 }
 
 
+static void pagealloc_keyhandler(unsigned char key)
+{
+    printk("Physical memory information:\n");
+    printk("    Xen heap: %lukB free\n"
+           "    DMA heap: %lukB free\n"
+           "    Dom heap: %lukB free\n",
+           avail[MEMZONE_XEN]<<(PAGE_SHIFT-10),
+           avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10),
+           avail[MEMZONE_DOM]<<(PAGE_SHIFT-10));
+}
+
+
+static __init int pagealloc_keyhandler_init(void)
+{
+    register_keyhandler('m', pagealloc_keyhandler, "memory info");
+    return 0;
+}
+__initcall(pagealloc_keyhandler_init);
+
+
 
 /*************************
  * PAGE SCRUBBING
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/sched_bvt.c
--- a/xen/common/sched_bvt.c    Wed Mar  1 17:01:54 2006
+++ b/xen/common/sched_bvt.c    Wed Mar  1 19:47:25 2006
@@ -132,13 +132,13 @@
     vcpu_schedule_unlock_irq(v);
 }
 
-static inline u32 calc_avt(struct vcpu *d, s_time_t now)
+static inline u32 calc_avt(struct vcpu *v, s_time_t now)
 {
     u32 ranfor, mcus;
-    struct bvt_dom_info *inf = BVT_INFO(d->domain);
-    struct bvt_vcpu_info *einf = EBVT_INFO(d);
-    
-    ranfor = (u32)(now - d->lastschd);
+    struct bvt_dom_info *inf = BVT_INFO(v->domain);
+    struct bvt_vcpu_info *einf = EBVT_INFO(v);
+    
+    ranfor = (u32)(now - v->runstate.state_entry_time);
     mcus = (ranfor + MCU - 1)/MCU;
 
     return einf->avt + mcus * inf->mcu_advance;
@@ -262,7 +262,7 @@
     curr_evt = calc_evt(curr, calc_avt(curr, now));
     /* Calculate the time the current domain would run assuming
        the second smallest evt is of the newly woken domain */
-    r_time = curr->lastschd +
+    r_time = curr->runstate.state_entry_time +
         ((einf->evt - curr_evt) / BVT_INFO(curr->domain)->mcu_advance) +
         ctx_allow;
 
@@ -558,7 +558,6 @@
         printk("%3d: %u has=%c ", loop++, v->domain->domain_id,
                test_bit(_VCPUF_running, &v->vcpu_flags) ? 'T':'F');
         bvt_dump_runq_el(v);
-        printk("c=0x%X%08X\n", (u32)(v->cpu_time>>32), (u32)v->cpu_time);
         printk("         l: %p n: %p  p: %p\n",
                &vcpu_inf->run_list, vcpu_inf->run_list.next,
                vcpu_inf->run_list.prev);
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/sched_sedf.c
--- a/xen/common/sched_sedf.c   Wed Mar  1 17:01:54 2006
+++ b/xen/common/sched_sedf.c   Wed Mar  1 19:47:25 2006
@@ -1408,18 +1408,14 @@
 {
     printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id,
            test_bit(_VCPUF_running, &d->vcpu_flags) ? 'T':'F');
-    printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64
+    printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu"
            " sc=%i xtr(%s)=%"PRIu64" ew=%hu",
            EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs,
-           EDOM_INFO(d)->weight, d->cpu_time,
+           EDOM_INFO(d)->weight,
            EDOM_INFO(d)->score[EXTRA_UTIL_Q],
            (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no",
            EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight);
     
-    if ( d->cpu_time != 0 )
-        printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100)
-               / d->cpu_time);
-
 #ifdef SEDF_STATS
     if ( EDOM_INFO(d)->block_time_tot != 0 )
         printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) /
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/schedule.c
--- a/xen/common/schedule.c     Wed Mar  1 17:01:54 2006
+++ b/xen/common/schedule.c     Wed Mar  1 19:47:25 2006
@@ -36,14 +36,6 @@
 static char opt_sched[10] = "sedf";
 string_param("sched", opt_sched);
 
-/*#define WAKE_HISTO*/
-/*#define BLOCKTIME_HISTO*/
-#if defined(WAKE_HISTO)
-#define BUCKETS 31
-#elif defined(BLOCKTIME_HISTO)
-#define BUCKETS 200
-#endif
-
 #define TIME_SLOP      (s32)MICROSECS(50)     /* allow time to slip a bit */
 
 /* Various timer handlers. */
@@ -73,6 +65,36 @@
 /* Per-CPU periodic timer sends an event to the currently-executing domain. */
 static struct timer t_timer[NR_CPUS]; 
 
+static inline void vcpu_runstate_change(
+    struct vcpu *v, int new_state, s_time_t new_entry_time)
+{
+    ASSERT(v->runstate.state != new_state);
+    ASSERT(spin_is_locked(&schedule_data[v->processor].schedule_lock));
+
+    v->runstate.time[v->runstate.state] +=
+        new_entry_time - v->runstate.state_entry_time;
+    v->runstate.state_entry_time = new_entry_time;
+    v->runstate.state = new_state;
+}
+
+void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
+{
+    if ( likely(v == current) )
+    {
+        /* Fast lock-free path. */
+        memcpy(runstate, &v->runstate, sizeof(*runstate));
+        ASSERT(runstate->state == RUNSTATE_running);
+        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
+    }
+    else
+    {
+        vcpu_schedule_lock_irq(v);
+        memcpy(runstate, &v->runstate, sizeof(*runstate));
+        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
+        vcpu_schedule_unlock_irq(v);
+    }
+}
+
 struct domain *alloc_domain(void)
 {
     struct domain *d;
@@ -119,6 +141,9 @@
     v->cpu_affinity = is_idle_domain(d) ?
         cpumask_of_cpu(cpu_id) : CPU_MASK_ALL;
 
+    v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
+    v->runstate.state_entry_time = NOW();
+
     if ( (vcpu_id != 0) && !is_idle_domain(d) )
         set_bit(_VCPUF_down, &v->vcpu_flags);
 
@@ -165,8 +190,15 @@
     unsigned long flags;
 
     vcpu_schedule_lock_irqsave(v, flags);
+
     if ( likely(!vcpu_runnable(v)) )
+    {
+        if ( v->runstate.state == RUNSTATE_runnable )
+            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+
         SCHED_OP(sleep, v);
+    }
+
     vcpu_schedule_unlock_irqrestore(v, flags);
 
     TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
@@ -187,11 +219,19 @@
     unsigned long flags;
 
     vcpu_schedule_lock_irqsave(v, flags);
+
     if ( likely(vcpu_runnable(v)) )
     {
+        if ( v->runstate.state >= RUNSTATE_blocked )
+            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
         SCHED_OP(wake, v);
-        v->wokenup = NOW();
-    }
+    }
+    else if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) )
+    {
+        if ( v->runstate.state == RUNSTATE_blocked )
+            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+    }
+
     vcpu_schedule_unlock_irqrestore(v, flags);
 
     TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
@@ -376,8 +416,6 @@
 
     stop_timer(&schedule_data[cpu].s_timer);
     
-    prev->cpu_time += now - prev->lastschd;
-
     /* get policy-specific decision on scheduling... */
     next_slice = ops.do_schedule(now);
 
@@ -386,8 +424,6 @@
 
     schedule_data[cpu].curr = next;
     
-    next->lastschd = now;
-
     set_timer(&schedule_data[cpu].s_timer, now + r_time);
 
     if ( unlikely(prev == next) )
@@ -397,38 +433,23 @@
     }
 
     TRACE_2D(TRC_SCHED_SWITCH_INFPREV,
-             prev->domain->domain_id, now - prev->lastschd);
+             prev->domain->domain_id,
+             now - prev->runstate.state_entry_time);
     TRACE_3D(TRC_SCHED_SWITCH_INFNEXT,
-             next->domain->domain_id, now - next->wokenup, r_time);
-
-    /*
-     * Logic of wokenup field in domain struct:
-     * Used to calculate "waiting time", which is the time that a domain
-     * spends being "runnable", but not actually running. wokenup is set
-     * set whenever a domain wakes from sleeping. However, if wokenup is not
-     * also set here then a preempted runnable domain will get a screwed up
-     * "waiting time" value next time it is scheduled.
-     */
-    prev->wokenup = now;
-
-#if defined(WAKE_HISTO)
-    if ( !is_idle_vcpu(next) && next->wokenup )
-    {
-        ulong diff = (ulong)(now - next->wokenup);
-        diff /= (ulong)MILLISECS(1);
-        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
-        else                    schedule_data[cpu].hist[BUCKETS-1]++;
-    }
-    next->wokenup = (s_time_t)0;
-#elif defined(BLOCKTIME_HISTO)
-    prev->lastdeschd = now;
-    if ( !is_idle_vcpu(next) )
-    {
-        ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10));
-        if (diff <= BUCKETS-2)  schedule_data[cpu].hist[diff]++;
-        else                    schedule_data[cpu].hist[BUCKETS-1]++;
-    }
-#endif
+             next->domain->domain_id,
+             (next->runstate.state == RUNSTATE_runnable) ?
+             (now - next->runstate.state_entry_time) : 0,
+             r_time);
+
+    ASSERT(prev->runstate.state == RUNSTATE_running);
+    vcpu_runstate_change(
+        prev,
+        (test_bit(_VCPUF_blocked, &prev->vcpu_flags) ? RUNSTATE_blocked :
+         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
+        now);
+
+    ASSERT(next->runstate.state != RUNSTATE_running);
+    vcpu_runstate_change(next, RUNSTATE_running, now);
 
     ASSERT(!test_bit(_VCPUF_running, &next->vcpu_flags));
     set_bit(_VCPUF_running, &next->vcpu_flags);
@@ -567,47 +588,6 @@
 
     local_irq_restore(flags);
 }
-
-#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)
-
-void print_sched_histo(unsigned char key)
-{
-    int i, j, k;
-    for_each_online_cpu ( k )
-    {
-        j = 0;
-        printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
-        for ( i = 0; i < BUCKETS; i++ )
-        {
-            if ( schedule_data[k].hist[i] != 0 )
-            {
-                if ( i < BUCKETS-1 )
-                    printk("%2d:[%7u]    ", i, schedule_data[k].hist[i]);
-                else
-                    printk(" >:[%7u]    ", schedule_data[k].hist[i]);
-                if ( !(++j % 5) )
-                    printk("\n");
-            }
-        }
-        printk("\n");
-    }
-      
-}
-
-void reset_sched_histo(unsigned char key)
-{
-    int i, j;
-    for ( j = 0; j < NR_CPUS; j++ )
-        for ( i=0; i < BUCKETS; i++ ) 
-            schedule_data[j].hist[i] = 0;
-}
-
-#else
-
-void print_sched_histo(unsigned char key) { }
-void reset_sched_histo(unsigned char key) { }
-
-#endif
 
 /*
  * Local variables:
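
With the histogram code gone, vcpu_runstate_change() is now the single
bookkeeping point: time spent in the outgoing state is accumulated into
time[old_state] and the entry stamp reset, so the four counters always
sum to elapsed system time. A worked example:

    /* A VCPU enters RUNSTATE_running at t = 100us and is preempted
     * (still runnable) at t = 130us:
     *
     *   vcpu_runstate_change(v, RUNSTATE_runnable, t_130);
     *     time[RUNSTATE_running] += t_130 - t_100;   // +30us of CPU
     *     state_entry_time        = t_130;
     *     state                   = RUNSTATE_runnable;
     *
     * If it runs again at t = 145us, the 15us gap lands in
     * time[RUNSTATE_runnable] -- the "stolen time" a guest can now
     * observe via the VCPUOP_* interface added in this changeset. */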
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/drivers/char/console.c
--- a/xen/drivers/char/console.c        Wed Mar  1 17:01:54 2006
+++ b/xen/drivers/char/console.c        Wed Mar  1 19:47:25 2006
@@ -335,8 +335,9 @@
         }
 
         if ( hypercall_preempt_check() )
-            return hypercall3_create_continuation(
-                __HYPERVISOR_console_io, CONSOLEIO_write, count, buffer);
+            return hypercall_create_continuation(
+                __HYPERVISOR_console_io, "iip",
+                CONSOLEIO_write, count, buffer);
 
         kcount = min_t(int, count, sizeof(kbuf)-1);
         if ( copy_from_user(kbuf, buffer, kcount) )
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/config.h
--- a/xen/include/asm-ia64/config.h     Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-ia64/config.h     Wed Mar  1 19:47:25 2006
@@ -36,6 +36,8 @@
 //#define CONFIG_NR_CPUS 16
 //leave SMP for a later time
 //#undef CONFIG_SMP
+
+#define supervisor_mode_kernel (0)
 
 #define MAX_DMADOM_PFN (0x7FFFFFFFUL >> PAGE_SHIFT) /* 31 addressable bits */
 
@@ -190,11 +192,6 @@
 
 #define find_first_set_bit(x)  (ffs(x)-1)      // FIXME: Is this right???
 
-// from include/asm-x86/*/uaccess.h
-#define array_access_ok(addr,count,size)                       \
-    (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ &&  \
-     access_ok(type,addr,count*size))
-
 // see drivers/char/console.c
 #ifndef VALIDATE_VT
 #define        OPT_CONSOLE_STR "com1"
@@ -299,7 +296,6 @@
 //#define raw_smp_processor_id()       0
 //#endif
 
-
 #ifndef __ASSEMBLY__
 #include <linux/linkage.h>
 #define FORCE_CRASH()  asm("break.m 0;;");
diff -r 88f97bb8f3ae -r 673f62edbfbe 
xen/include/asm-ia64/linux-xen/asm/README.origin
--- a/xen/include/asm-ia64/linux-xen/asm/README.origin  Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-ia64/linux-xen/asm/README.origin  Wed Mar  1 19:47:25 2006
@@ -22,4 +22,3 @@
 system.h               -> linux/include/asm-ia64/system.h
 tlbflush.h             -> linux/include/asm-ia64/tlbflush.h
 types.h                        -> linux/include/asm-ia64/types.h
-uaccess.h              -> linux/include/asm-ia64/uaccess.h
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/config.h      Wed Mar  1 19:47:25 2006
@@ -36,6 +36,12 @@
 #define OPT_CONSOLE_STR "com1,vga"
 
 #define NR_CPUS 32
+
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+# define supervisor_mode_kernel (1)
+#else
+# define supervisor_mode_kernel (0)
+#endif
 
 /* Linkage for x86 */
 #define __ALIGN .align 16,0x90
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/desc.h
--- a/xen/include/asm-x86/desc.h        Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/desc.h        Wed Mar  1 19:47:25 2006
@@ -27,9 +27,22 @@
 #endif
 
 /* Fix up the RPL of a guest segment selector. */
-#define fixup_guest_selector(sel)                               \
+#define __fixup_guest_selector(sel)                             \
     ((sel) = (((sel) & 3) >= GUEST_KERNEL_RPL) ? (sel) :        \
      (((sel) & ~3) | GUEST_KERNEL_RPL))
+
+/* Stack selectors don't need fixing up if the kernel runs in ring 0. */
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+#define fixup_guest_stack_selector(ss) ((void)0)
+#else
+#define fixup_guest_stack_selector(ss) __fixup_guest_selector(ss)
+#endif
+
+/*
+ * Code selectors are always fixed up. It allows the Xen exit stub to detect
+ * return to guest context, even when the guest kernel runs in ring 0.
+ */
+#define fixup_guest_code_selector(cs)  __fixup_guest_selector(cs)
 
 /*
  * We need this function because enforcing the correct guest kernel RPL is
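
A worked example of the split: with GUEST_KERNEL_RPL == 1 (a ring-1
guest kernel), __fixup_guest_selector leaves selectors of RPL >= 1
alone and bumps RPL-0 ones up:

    /* sel = 0xe008 (RPL 0): (0xe008 & 3) = 0 < 1
     *   => sel = (0xe008 & ~3) | 1 = 0xe009
     * sel = 0xe00b (RPL 3): (0xe00b & 3) = 3 >= 1 => unchanged.
     *
     * Code selectors are fixed up even for ring-0 guests so the exit
     * stub can tell guest frames (CS RPL != 0) from Xen frames; stack
     * selectors need no fixup there, hence the two macros. */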
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h     Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/hvm.h     Wed Mar  1 19:47:25 2006
@@ -67,6 +67,9 @@
     int (*paging_enabled)(struct vcpu *v);
     int (*instruction_length)(struct vcpu *v);
     unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num);
+
+    void (*init_ap_context)(struct vcpu_guest_context *ctxt,
+                            int vcpuid, int trampoline_vector);
 };
 
 extern struct hvm_function_table hvm_funcs;
@@ -173,4 +176,14 @@
         return hvm_funcs.get_guest_ctrl_reg(v, num);
     return 0;                   /* force to fail */
 }
+
+static inline void
+hvm_init_ap_context(struct vcpu_guest_context *ctxt,
+                    int vcpuid, int trampoline_vector)
+{
+    return hvm_funcs.init_ap_context(ctxt, vcpuid, trampoline_vector);
+}
+
+extern int hvm_bringup_ap(int vcpuid, int trampoline_vector);
+
 #endif /* __ASM_X86_HVM_HVM_H__ */
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/emulate.h
--- a/xen/include/asm-x86/hvm/svm/emulate.h     Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/svm/emulate.h     Wed Mar  1 19:47:25 2006
@@ -83,15 +83,15 @@
         struct cpu_user_regs *regs, const u8 prefix, const u8 *operand, 
         u8 *size);
 extern OPERATING_MODE get_operating_mode (struct vmcb_struct *vmcb);
-extern unsigned int decode_dest_reg(u8 modrm);
-extern unsigned int decode_src_reg(u8 modrm);
+extern unsigned int decode_dest_reg(u8 prefix, u8 modrm);
+extern unsigned int decode_src_reg(u8 prefix, u8 modrm);
 extern unsigned long svm_rip2pointer(struct vmcb_struct *vmcb);
-extern unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
+extern int __get_instruction_length_from_list(struct vmcb_struct *vmcb,
         enum instruction_index *list, unsigned int list_count, 
         u8 *guest_eip_buf, enum instruction_index *match);
 
 
-static inline unsigned int __get_instruction_length(struct vmcb_struct *vmcb, 
+static inline int __get_instruction_length(struct vmcb_struct *vmcb, 
         enum instruction_index instr, u8 *guest_eip_buf)
 {
     return __get_instruction_length_from_list(vmcb, &instr, 1, guest_eip_buf, 
@@ -138,9 +138,20 @@
 }
 
 
+static inline int skip_prefix_bytes(u8 *buf, size_t size)
+{
+    int index;
+    for (index = 0; index < size && is_prefix(buf[index]); index ++)  
+        /* do nothing */ ;
+    return index;
+}
+
+
+
 static void inline __update_guest_eip(struct vmcb_struct *vmcb, 
-        unsigned long inst_len) 
+        int inst_len) 
 {
+    ASSERT(inst_len > 0);
     vmcb->rip += inst_len;
 }
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/svm.h
--- a/xen/include/asm-x86/hvm/svm/svm.h Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/svm/svm.h Wed Mar  1 19:47:25 2006
@@ -54,6 +54,8 @@
 /* For debugging. Remove when no longer needed. */
 extern void svm_dump_host_regs(const char *from);
 
+extern void svm_migrate_timers(struct vcpu *v);
+
 /* ASID API */
 enum {
     ASID_AVAILABLE = 0,
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/vmcb.h
--- a/xen/include/asm-x86/hvm/svm/vmcb.h        Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h        Wed Mar  1 19:47:25 2006
@@ -269,21 +269,6 @@
 #define SVM_LONG_GUEST(ed)    \
   (test_bit(SVM_CPU_STATE_LMA_ENABLED, &ed->arch.hvm_svm.cpu_state))
 
-enum {
-    SVM_INDEX_MSR_LSTAR = 0,
-    SVM_INDEX_MSR_STAR,
-    SVM_INDEX_MSR_CSTAR,
-    SVM_INDEX_MSR_SYSCALL_MASK,
-    SVM_INDEX_MSR_EFER,
-
-    SVM_MSR_COUNT,
-};
-
-struct svm_msr_state {
-    unsigned long flags;
-    unsigned long msr_items[SVM_MSR_COUNT];
-    unsigned long shadow_gs;
-};
 
 /* 
  * Attribute for segment selector. This is a copy of bit 40:47 & 52:55 of the
@@ -449,7 +434,7 @@
 
 struct arch_svm_struct {
     struct vmcb_struct *vmcb;
-    void               *host_save_area;
+    void                       *host_save_area;
     u64                 host_save_pa;
     u64                 vmcb_pa;
     u32                 *iopm;
@@ -457,14 +442,15 @@
     u64                 vmexit_tsc; /* tsc read at #VMEXIT. for TSC_OFFSET */
     int                 injecting_event;
     int                 saved_irq_vector;
-    u32                 core;        /* cpu of last vmexit */
+    u32                 launch_core;
+    u32                 asid_core;
     
     unsigned long       flags;      /* VMCB flags */
-    unsigned long       cpu_shadow_cr0; /* copy of guest read shadow CR0 */
+    unsigned long       cpu_shadow_cr0; /* Guest value for CR0 */
+    unsigned long       cpu_shadow_cr4; /* Guest value for CR4 */
     unsigned long       cpu_cr2;
     unsigned long       cpu_cr3;
     unsigned long       cpu_state;
-    struct svm_msr_state msr_content;
     struct timer        hlt_timer;  /* hlt ins emulation wakeup timer */
 };
 
@@ -485,6 +471,14 @@
 
 #define VMCB_EFLAGS_RESERVED_0          0xffc08028 /* bitmap for 0 */
 #define VMCB_EFLAGS_RESERVED_1          0x00000002 /* bitmap for 1 */
+
+/* These bits in the CR4 are owned by the host */
+#ifdef __i386__
+#define SVM_CR4_HOST_MASK (0)
+#else
+#define SVM_CR4_HOST_MASK (X86_CR4_PAE)
+#endif
+
 
 #endif /* ASM_X86_HVM_SVM_VMCS_H__ */
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h    Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/vcpu.h    Wed Mar  1 19:47:25 2006
@@ -25,10 +25,15 @@
 #include <asm/hvm/vmx/vmcs.h>
 #include <asm/hvm/svm/vmcb.h>
 
+#define HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM          0
+#define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI     1
+
 struct hvm_vcpu {
-    unsigned long       ioflags;
-    struct mmio_op      mmio_op;
-    struct vlapic       *vlapic;
+    unsigned long   ioflags;
+    struct mmio_op  mmio_op;
+    struct vlapic   *vlapic;
+    /* For AP startup */
+    unsigned long   init_sipi_sipi_state;
 
     union {
         struct arch_vmx_struct vmx;
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/vlapic.h
--- a/xen/include/asm-x86/hvm/vlapic.h  Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/hvm/vlapic.h  Wed Mar  1 19:47:25 2006
@@ -158,9 +158,6 @@
     int deliver_mode;
     int source[6];
 } direct_intr_info_t;
-
-#define VLAPIC_INIT_SIPI_SIPI_STATE_NORM          0
-#define VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI     1
 
 struct vlapic
 {
@@ -197,7 +194,6 @@
     unsigned long      init_ticks;
     uint32_t           err_write_count;
     uint64_t           apic_base_msr;
-    uint32_t           init_sipi_sipi_state;
     struct vcpu        *vcpu;
     struct domain      *domain;
 };
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/mm.h  Wed Mar  1 19:47:25 2006
@@ -337,6 +337,10 @@
         UNLOCK_BIGLOCK(d);                                      \
     } while ( 0 )
 
+#define writable_pagetable_in_sync(d)           \
+    (!((d)->arch.ptwr[PTWR_PT_ACTIVE].l1va |    \
+       (d)->arch.ptwr[PTWR_PT_INACTIVE].l1va))
+
 int audit_adjust_pgtables(struct domain *d, int dir, int noisy);
 
 #ifndef NDEBUG
@@ -376,7 +380,7 @@
 int __sync_lazy_execstate(void);
 
 /* Arch-specific portion of memory_op hypercall. */
-long arch_memory_op(int op, void *arg);
-long subarch_memory_op(int op, void *arg);
+long arch_memory_op(int op, GUEST_HANDLE(void) arg);
+long subarch_memory_op(int op, GUEST_HANDLE(void) arg);
 
 #endif /* __ASM_X86_MM_H__ */
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/shadow_64.h
--- a/xen/include/asm-x86/shadow_64.h   Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/shadow_64.h   Wed Mar  1 19:47:25 2006
@@ -223,6 +223,7 @@
     int i;
     pgentry_64_t *le_e;
     pgentry_64_t *le_p = NULL;
+    pgentry_64_t *phys_vtable = NULL;
     unsigned long mfn;
     int index;
     u32 level = flag & L_MASK;
@@ -251,25 +252,35 @@
     {
         root_level = PAE_PAGING_LEVELS;
         index = table_offset_64(va, root_level);
-        le_e = (pgentry_64_t *)map_domain_page(
+        phys_vtable = (pgentry_64_t *)map_domain_page(
             pagetable_get_pfn(v->domain->arch.phys_table));
+        le_e = &phys_vtable[index];
     }
 
     /*
      * If it's not external mode, then mfn should be machine physical.
      */
-    for (i = root_level - level; i > 0; i--) {
-        if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) ) {
+    for ( i = root_level - level; i > 0; i-- )
+    {
+        if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) )
+        {
             if ( le_p )
                 unmap_domain_page(le_p);
+
+            if ( phys_vtable )
+                unmap_domain_page(phys_vtable);
+
             return 0;
         }
+
         mfn = entry_get_pfn(*le_e);
         if ( (flag & GUEST_ENTRY) && shadow_mode_translate(d) )
             mfn = get_mfn_from_gpfn(mfn);
+
         if ( le_p )
             unmap_domain_page(le_p);
         le_p = (pgentry_64_t *)map_domain_page(mfn);
+
         if ( flag & SHADOW_ENTRY )
             index = table_offset_64(va, (level + i - 1));
         else
@@ -285,8 +296,10 @@
     if ( le_p )
         unmap_domain_page(le_p);
 
+    if ( phys_vtable )
+        unmap_domain_page(phys_vtable);
+
     return 1;
-
 }
 
 static inline int __rw_entry(
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/shadow_public.h
--- a/xen/include/asm-x86/shadow_public.h       Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/shadow_public.h       Wed Mar  1 19:47:25 2006
@@ -21,8 +21,6 @@
 
 #ifndef _XEN_SHADOW_PUBLIC_H
 #define _XEN_SHADOW_PUBLIC_H
-
-extern int alloc_p2m_table(struct domain *d);
 
 #if CONFIG_PAGING_LEVELS >= 3
 #define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned)
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/x86_32/asm_defns.h
--- a/xen/include/asm-x86/x86_32/asm_defns.h    Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-x86/x86_32/asm_defns.h    Wed Mar  1 19:47:25 2006
@@ -48,9 +48,24 @@
 
 #ifdef PERF_COUNTERS
 #define PERFC_INCR(_name,_idx)                          \
-    lock incl perfcounters+_name(,_idx,4)
+        lock incl perfcounters+_name(,_idx,4)
 #else
 #define PERFC_INCR(_name,_idx)
+#endif
+
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+#define FIXUP_RING0_GUEST_STACK                         \
+        testl $2,8(%esp);                               \
+        jnz 1f; /* rings 2 & 3 permitted */             \
+        testl $1,8(%esp);                               \
+        jz 2f;                                          \
+        ud2; /* ring 1 should not be used */            \
+        2:cmpl $(__HYPERVISOR_VIRT_START),%esp;         \
+        jge 1f;                                         \
+        call fixup_ring0_guest_stack;                   \
+        1:
+#else
+#define FIXUP_RING0_GUEST_STACK
 #endif
 
 #define BUILD_SMP_INTERRUPT(x,v) XBUILD_SMP_INTERRUPT(x,v)
@@ -61,6 +76,7 @@
     ".globl " STR(x) "\n\t"                     \
     STR(x) ":\n\t"                              \
     "pushl $"#v"<<16\n\t"                       \
+    STR(FIXUP_RING0_GUEST_STACK)                \
     STR(SAVE_ALL(a))                            \
     "movl %esp,%eax\n\t"                        \
     "pushl %eax\n\t"                            \
@@ -72,6 +88,7 @@
 __asm__(                                        \
     "\n" __ALIGN_STR"\n"                        \
     "common_interrupt:\n\t"                     \
+    STR(FIXUP_RING0_GUEST_STACK)                \
     STR(SAVE_ALL(a))                            \
     "movl %esp,%eax\n\t"                        \
     "pushl %eax\n\t"                            \
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/memory.h
--- a/xen/include/public/memory.h       Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/memory.h       Wed Mar  1 19:47:25 2006
@@ -29,7 +29,7 @@
      *   OUT: GMFN bases of extents that were allocated
      *   (NB. This command also updates the mach_to_phys translation table)
      */
-    unsigned long *extent_start;
+    GUEST_HANDLE(xen_ulong) extent_start;
 
     /* Number of extents, and size/alignment of each (2^extent_order pages). */
     unsigned long  nr_extents;
@@ -50,6 +50,7 @@
     domid_t        domid;
 
 } xen_memory_reservation_t;
+DEFINE_GUEST_HANDLE(xen_memory_reservation_t);
 
 /*
  * Returns the maximum machine frame number of mapped RAM in this system.
@@ -85,7 +86,7 @@
      * any large discontiguities in the machine address space, 2MB gaps in
      * the machphys table will be represented by an MFN base of zero.
      */
-    unsigned long *extent_start;
+    GUEST_HANDLE(xen_ulong) extent_start;
 
     /*
      * Number of extents written to the above array. This will be smaller
@@ -93,6 +94,7 @@
      */
     unsigned int nr_extents;
 } xen_machphys_mfn_list_t;
+DEFINE_GUEST_HANDLE(xen_machphys_mfn_list_t);
 
 /*
  * Returns the base and size of the specified reserved 'RAM hole' in the
@@ -113,6 +115,7 @@
     /* Base and size of the specified reserved area. */
     unsigned long first_gpfn, nr_gpfns;
 } xen_reserved_phys_area_t;
+DEFINE_GUEST_HANDLE(xen_reserved_phys_area_t);
 
 /*
  * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error
@@ -127,14 +130,15 @@
     unsigned long nr_gpfns;
 
     /* List of GPFNs to translate. */
-    unsigned long *gpfn_list;
+    GUEST_HANDLE(xen_ulong) gpfn_list;
 
     /*
      * Output list to contain MFN translations. May be the same as the input
      * list (in which case each input GPFN is overwritten with the output MFN).
      */
-    unsigned long *mfn_list;
+    GUEST_HANDLE(xen_ulong) mfn_list;
 } xen_translate_gpfn_list_t;
+DEFINE_GUEST_HANDLE(xen_translate_gpfn_list_t);
 
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/vcpu.h
--- a/xen/include/public/vcpu.h Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/vcpu.h Wed Mar  1 19:47:25 2006
@@ -51,6 +51,61 @@
 /* Returns 1 if the given VCPU is up. */
 #define VCPUOP_is_up                3
 
+/*
+ * Return information about the state and running time of a VCPU.
+ * @extra_arg == pointer to vcpu_runstate_info structure.
+ */
+#define VCPUOP_get_runstate_info    4
+typedef struct vcpu_runstate_info {
+    /* VCPU's current state (RUNSTATE_*). */
+    int      state;
+    /* When was current state entered (system time, ns)? */
+    uint64_t state_entry_time;
+    /*
+     * Time spent in each RUNSTATE_* (ns). The sum of these times is
+     * guaranteed not to drift from system time.
+     */
+    uint64_t time[4];
+} vcpu_runstate_info_t;
+
+/* VCPU is currently running on a physical CPU. */
+#define RUNSTATE_running  0
+
+/* VCPU is runnable, but not currently scheduled on any physical CPU. */
+#define RUNSTATE_runnable 1
+
+/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
+#define RUNSTATE_blocked  2
+
+/*
+ * VCPU is not runnable, but it is not blocked.
+ * This is a 'catch all' state for things like hotplug and pauses by the
+ * system administrator (or for critical sections in the hypervisor).
+ * RUNSTATE_blocked dominates this state (it is the preferred state).
+ */
+#define RUNSTATE_offline  3
+
+/*
+ * Register a shared memory area from which the guest may obtain its own
+ * runstate information without needing to execute a hypercall.
+ * Notes:
+ *  1. The registered address may be virtual or physical, depending on the
+ *     platform. The virtual address should be registered on x86 systems.
+ *  2. Only one shared area may be registered per VCPU. The shared area is
+ *     updated by the hypervisor each time the VCPU is scheduled. Thus
+ *     runstate.state will always be RUNSTATE_running and
+ *     runstate.state_entry_time will indicate the system time at which the
+ *     VCPU was last scheduled to run.
+ * @extra_arg == pointer to vcpu_register_runstate_memory_area structure.
+ */
+#define VCPUOP_register_runstate_memory_area 5
+typedef struct vcpu_register_runstate_memory_area {
+    union {
+        struct vcpu_runstate_info *v;
+        uint64_t p;
+    } addr;
+} vcpu_register_runstate_memory_area_t;
+
 #endif /* __XEN_PUBLIC_VCPU_H__ */
 
 /*
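A brief sketch of a guest querying the new runstate interface, assuming the
guest-side HYPERVISOR_vcpu_op() wrapper (the cast is only for printk):

    static void report_runstate(int vcpu)
    {
        struct vcpu_runstate_info info;

        /* @extra_arg is the pointer to the runstate structure */
        if (HYPERVISOR_vcpu_op(VCPUOP_get_runstate_info, vcpu, &info) == 0)
            printk("vcpu%d: state %d, %llu ns spent running\n",
                   vcpu, info.state,
                   (unsigned long long)info.time[RUNSTATE_running]);
    }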
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/version.h
--- a/xen/include/public/version.h      Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/version.h      Wed Mar  1 19:47:25 2006
@@ -48,36 +48,8 @@
     uint32_t     submap;        /* OUT: 32-bit submap */
 } xen_feature_info_t;
 
-/*
- * If set, the guest does not need to write-protect its pagetables, and can
- * update them via direct writes.
- */
-#define XENFEAT_writable_page_tables       0
-
-/*
- * If set, the guest does not need to write-protect its segment descriptor
- * tables, and can update them via direct writes.
- */
-#define XENFEAT_writable_descriptor_tables 1
-
-/*
- * If set, translation between the guest's 'pseudo-physical' address space
- * and the host's machine address space are handled by the hypervisor. In this
- * mode the guest does not need to perform phys-to/from-machine translations
- * when performing page table operations.
- */
-#define XENFEAT_auto_translated_physmap    2
-
-/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
-#define XENFEAT_supervisor_mode_kernel     3
-
-/*
- * If set, the guest does not need to allocate x86 PAE page directories
- * below 4GB. This flag is usually implied by auto_translated_physmap.
- */
-#define XENFEAT_pae_pgdir_above_4gb        4
-
-#define XENFEAT_NR_SUBMAPS 1
+/* Declares the features reported by XENVER_get_features. */
+#include "features.h"
 
 #endif /* __XEN_PUBLIC_VERSION_H__ */
 
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/xen.h
--- a/xen/include/public/xen.h  Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/xen.h  Wed Mar  1 19:47:25 2006
@@ -8,6 +8,22 @@
 
 #ifndef __XEN_PUBLIC_XEN_H__
 #define __XEN_PUBLIC_XEN_H__
+
+#ifdef __XEN__
+#define DEFINE_GUEST_HANDLE(type) struct __guest_handle_ ## type { type *p; }
+#define GUEST_HANDLE(type)        struct __guest_handle_ ## type
+#else
+#define DEFINE_GUEST_HANDLE(type)
+#define GUEST_HANDLE(type)        type *
+#endif
+
+#ifndef __ASSEMBLY__
+/* Guest handle for unsigned long pointer. Define a name with no whitespace. */
+typedef unsigned long xen_ulong;
+DEFINE_GUEST_HANDLE(xen_ulong);
+/* Guest handle for arbitrary-type pointer (void *). */
+DEFINE_GUEST_HANDLE(void);
+#endif
 
 #if defined(__i386__)
 #include "arch-x86_32.h"
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/sched-if.h
--- a/xen/include/xen/sched-if.h        Wed Mar  1 17:01:54 2006
+++ b/xen/include/xen/sched-if.h        Wed Mar  1 19:47:25 2006
@@ -8,9 +8,6 @@
 #ifndef __XEN_SCHED_IF_H__
 #define __XEN_SCHED_IF_H__
 
-#define BUCKETS  10
-/*300*/
-
 struct schedule_data {
     spinlock_t          schedule_lock;  /* spinlock protecting curr        */
     struct vcpu        *curr;           /* current task                    */
@@ -18,9 +15,6 @@
     void               *sched_priv;
     struct timer        s_timer;        /* scheduling timer                */
     unsigned long       tick;           /* current periodic 'tick'         */
-#ifdef BUCKETS
-    u32                 hist[BUCKETS];  /* for scheduler latency histogram */
-#endif
 } __cacheline_aligned;
 
 extern struct schedule_data schedule_data[];
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/sched.h
--- a/xen/include/xen/sched.h   Wed Mar  1 17:01:54 2006
+++ b/xen/include/xen/sched.h   Wed Mar  1 19:47:25 2006
@@ -8,6 +8,7 @@
 #include <xen/smp.h>
 #include <public/xen.h>
 #include <public/dom0_ops.h>
+#include <public/vcpu.h>
 #include <xen/time.h>
 #include <xen/timer.h>
 #include <xen/grant_table.h>
@@ -63,14 +64,13 @@
 
     struct vcpu     *next_in_list;
 
-    struct timer  timer;         /* one-shot timer for timeout values */
+    struct timer     timer;         /* one-shot timer for timeout values */
     unsigned long    sleep_tick;    /* tick at which this vcpu started sleep */
 
-    s_time_t         lastschd;      /* time this domain was last scheduled */
-    s_time_t         lastdeschd;    /* time this domain was last descheduled */
-    s_time_t         cpu_time;      /* total CPU time received till now */
-    s_time_t         wokenup;       /* time domain got woken up */
     void            *sched_priv;    /* scheduler-specific data */
+
+    struct vcpu_runstate_info runstate;
+    struct vcpu_runstate_info *runstate_guest; /* guest address */
 
     unsigned long    vcpu_flags;
 
@@ -303,31 +303,18 @@
 
 void startup_cpu_idle_loop(void);
 
-unsigned long __hypercall_create_continuation(
-    unsigned int op, unsigned int nr_args, ...);
-#define hypercall0_create_continuation(_op)                               \
-    __hypercall_create_continuation((_op), 0)
-#define hypercall1_create_continuation(_op, _a1)                          \
-    __hypercall_create_continuation((_op), 1,                             \
-        (unsigned long)(_a1))
-#define hypercall2_create_continuation(_op, _a1, _a2)                     \
-    __hypercall_create_continuation((_op), 2,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2))
-#define hypercall3_create_continuation(_op, _a1, _a2, _a3)                \
-    __hypercall_create_continuation((_op), 3,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3))
-#define hypercall4_create_continuation(_op, _a1, _a2, _a3, _a4)           \
-    __hypercall_create_continuation((_op), 4,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
-        (unsigned long)(_a4))
-#define hypercall5_create_continuation(_op, _a1, _a2, _a3, _a4, _a5)      \
-    __hypercall_create_continuation((_op), 5,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
-        (unsigned long)(_a4), (unsigned long)(_a5))
-#define hypercall6_create_continuation(_op, _a1, _a2, _a3, _a4, _a5, _a6) \
-    __hypercall_create_continuation((_op), 6,                             \
-        (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \
-        (unsigned long)(_a4), (unsigned long)(_a5), (unsigned long)(_a6))
+/*
+ * Creates a continuation to resume the current hypercall. The caller should
+ * return immediately, propagating the value returned from this invocation.
+ * The format string specifies the types and number of hypercall arguments.
+ * It contains one character per argument as follows:
+ *  'i' [unsigned] {char, int}
+ *  'l' [unsigned] long
+ *  'p' pointer (foo *)
+ *  'h' guest handle (GUEST_HANDLE(foo))
+ */
+unsigned long hypercall_create_continuation(
+    unsigned int op, const char *format, ...);
 
 #define hypercall_preempt_check() (unlikely(    \
         softirq_pending(smp_processor_id()) |   \
@@ -397,7 +384,6 @@
 #define _DOMF_debugging        4
 #define DOMF_debugging         (1UL<<_DOMF_debugging)
 
-
 static inline int vcpu_runnable(struct vcpu *v)
 {
     return ( (atomic_read(&v->pausecnt) == 0) &&
@@ -415,6 +401,8 @@
 
 int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity);
 
+void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate);
+
 static inline void vcpu_unblock(struct vcpu *v)
 {
     if ( test_and_clear_bit(_VCPUF_blocked, &v->vcpu_flags) )
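A sketch of a preemptible handler using the variadic replacement for the
removed hypercallN_create_continuation macros; the op, work loop and helper
names are hypothetical, and the "ih" format describes one int argument
followed by one guest handle:

    static long do_example_op(int cmd, GUEST_HANDLE(void) arg)
    {
        while ( more_work_to_do() )            /* hypothetical */
        {
            if ( hypercall_preempt_check() )
                return hypercall_create_continuation(
                    __HYPERVISOR_example_op, "ih", cmd, arg);
            process_one_unit();                /* hypothetical */
        }
        return 0;
    }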
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/string.h
--- a/xen/include/xen/string.h  Wed Mar  1 17:01:54 2006
+++ b/xen/include/xen/string.h  Wed Mar  1 19:47:25 2006
@@ -24,6 +24,9 @@
 #endif
 #ifndef __HAVE_ARCH_STRNCPY
 extern char * strncpy(char *,const char *, __kernel_size_t);
+#endif
+#ifndef __HAVE_ARCH_STRLCPY
+extern size_t strlcpy(char *,const char *, __kernel_size_t);
 #endif
 #ifndef __HAVE_ARCH_STRCAT
 extern char * strcat(char *, const char *);
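The newly declared strlcpy() always NUL-terminates and returns the full
source length, so, unlike strncpy(), truncation is detectable. A small usage
sketch (names illustrative):

    static void copy_name(const char *src)
    {
        char name[16];

        if (strlcpy(name, src, sizeof(name)) >= sizeof(name))
            ;   /* src did not fit and was truncated */
    }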
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mm/pgtable.c
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable.c       Wed Mar  1 19:47:25 2006
@@ -0,0 +1,283 @@
+/*
+ *  linux/arch/i386/mm/pgtable.c
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+
+void show_mem(void)
+{
+       int total = 0, reserved = 0;
+       int shared = 0, cached = 0;
+       int highmem = 0;
+       struct page *page;
+       pg_data_t *pgdat;
+       unsigned long i;
+       struct page_state ps;
+       unsigned long flags;
+
+       printk(KERN_INFO "Mem-info:\n");
+       show_free_areas();
+       printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+       for_each_pgdat(pgdat) {
+               pgdat_resize_lock(pgdat, &flags);
+               for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+                       page = pgdat_page_nr(pgdat, i);
+                       total++;
+                       if (PageHighMem(page))
+                               highmem++;
+                       if (PageReserved(page))
+                               reserved++;
+                       else if (PageSwapCache(page))
+                               cached++;
+                       else if (page_count(page))
+                               shared += page_count(page) - 1;
+               }
+               pgdat_resize_unlock(pgdat, &flags);
+       }
+       printk(KERN_INFO "%d pages of RAM\n", total);
+       printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
+       printk(KERN_INFO "%d reserved pages\n", reserved);
+       printk(KERN_INFO "%d pages shared\n", shared);
+       printk(KERN_INFO "%d pages swap cached\n", cached);
+
+       get_page_state(&ps);
+       printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty);
+       printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback);
+       printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped);
+       printk(KERN_INFO "%lu pages slab\n", ps.nr_slab);
+       printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages);
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame 
+ * and protection flags for that frame.
+ */ 
+static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               BUG();
+               return;
+       }
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               BUG();
+               return;
+       }
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               BUG();
+               return;
+       }
+       pte = pte_offset_kernel(pmd, vaddr);
+       /* <pfn,flags> stored as-is, to permit clearing entries */
+       set_pte(pte, pfn_pte(pfn, flags));
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+/*
+ * Associate a large virtual page frame with a given physical page frame 
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned. 
+ * The pmd must already be instantiated. Assumes PAE mode.
+ */ 
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
+               printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
+               return; /* BUG(); */
+       }
+       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
+               printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
+               return; /* BUG(); */
+       }
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
+               return; /* BUG(); */
+       }
+       pud = pud_offset(pgd, vaddr);
+       pmd = pmd_offset(pud, vaddr);
+       set_pmd(pmd, pfn_pmd(pfn, flags));
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+static int nr_fixmaps = 0;
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
+void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+       unsigned long address = __fix_to_virt(idx);
+
+       if (idx >= __end_of_fixed_addresses) {
+               BUG();
+               return;
+       }
+       set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+       nr_fixmaps++;
+}
+
+void set_fixaddr_top(unsigned long top)
+{
+       BUG_ON(nr_fixmaps > 0);
+       __FIXADDR_TOP = top - PAGE_SIZE;
+}
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+       return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+}
+
+struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+       struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+       return pte;
+}
+
+void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
+{
+       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * The locking scheme was chosen on the basis of manfred's
+ * recommendations and having no core impact whatsoever.
+ * -- wli
+ */
+DEFINE_SPINLOCK(pgd_lock);
+struct page *pgd_list;
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+       struct page *page = virt_to_page(pgd);
+       page->index = (unsigned long)pgd_list;
+       if (pgd_list)
+               set_page_private(pgd_list, (unsigned long)&page->index);
+       pgd_list = page;
+       set_page_private(page, (unsigned long)&pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+       struct page *next, **pprev, *page = virt_to_page(pgd);
+       next = (struct page *)page->index;
+       pprev = (struct page **)page_private(page);
+       *pprev = next;
+       if (next)
+               set_page_private(next, (unsigned long)pprev);
+}
+
+void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+       unsigned long flags;
+
+       if (PTRS_PER_PMD == 1) {
+               memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+               spin_lock_irqsave(&pgd_lock, flags);
+       }
+
+       clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+                       swapper_pg_dir + USER_PTRS_PER_PGD,
+                       KERNEL_PGD_PTRS);
+       if (PTRS_PER_PMD > 1)
+               return;
+
+       pgd_list_add(pgd);
+       spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+/* never called when PTRS_PER_PMD > 1 */
+void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+       unsigned long flags; /* can be called from interrupt context */
+
+       spin_lock_irqsave(&pgd_lock, flags);
+       pgd_list_del(pgd);
+       spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+       int i;
+       pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
+
+       if (PTRS_PER_PMD == 1 || !pgd)
+               return pgd;
+
+       for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+               pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+               if (!pmd)
+                       goto out_oom;
+               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+       }
+       return pgd;
+
+out_oom:
+       for (i--; i >= 0; i--)
+               kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+       kmem_cache_free(pgd_cache, pgd);
+       return NULL;
+}
+
+void pgd_free(pgd_t *pgd)
+{
+       int i;
+
+       /* in the PAE case user pgd entries are overwritten before usage */
+       if (PTRS_PER_PMD > 1)
+               for (i = 0; i < USER_PTRS_PER_PGD; ++i)
+                       kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+       /* in the non-PAE case, free_pgtables() clears user pgd entries */
+       kmem_cache_free(pgd_cache, pgd);
+}
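One detail worth spelling out in pgd_alloc()/pgd_free() above: a PAE pgd
entry is stored as the pmd's physical address plus 1, the low bit being the
present flag, which is why the free path recovers the pmd pointer with
__va(pgd_val(pgd[i]) - 1). In sketch form:

    set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));    /* store: phys | present */
    pmd = (pmd_t *)__va(pgd_val(pgd[i]) - 1);  /* load: strip the bit   */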
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/fixmap.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/fixmap.h    Wed Mar  1 19:47:25 2006
@@ -0,0 +1,151 @@
+/*
+ * fixmap.h: compile-time virtual memory allocation
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1998 Ingo Molnar
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#ifndef _ASM_FIXMAP_H
+#define _ASM_FIXMAP_H
+
+#include <linux/config.h>
+
+/* used by vmalloc.c, vsyscall.lds.S.
+ *
+ * Leave one empty page between vmalloc'ed areas and
+ * the start of the fixmap.
+ */
+extern unsigned long __FIXADDR_TOP;
+
+#ifndef __ASSEMBLY__
+#include <linux/kernel.h>
+#include <asm/acpi.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#ifdef CONFIG_HIGHMEM
+#include <linux/threads.h>
+#include <asm/kmap_types.h>
+#endif
+
+/*
+ * Here we define all the compile-time 'special' virtual
+ * addresses. The point is to have a constant address at
+ * compile time, but to set the physical address only
+ * in the boot process. We allocate these special addresses
+ * from the end of virtual memory (0xfffff000) backwards.
+ * Also this lets us do fail-safe vmalloc(), we
+ * can guarantee that these special addresses and
+ * vmalloc()-ed addresses never overlap.
+ *
+ * these 'compile-time allocated' memory buffers are
+ * fixed-size 4k pages. (or larger if used with an increment
+ * higher than 1) use fixmap_set(idx,phys) to associate
+ * physical memory with fixmap indices.
+ *
+ * TLB entries of such buffers will not be flushed across
+ * task switches.
+ */
+enum fixed_addresses {
+       FIX_HOLE,
+#ifdef CONFIG_X86_LOCAL_APIC
+       FIX_APIC_BASE,  /* local (CPU) APIC -- required for SMP or not */
+#endif
+#ifdef CONFIG_X86_IO_APIC
+       FIX_IO_APIC_BASE_0,
+       FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
+#endif
+#ifdef CONFIG_X86_VISWS_APIC
+       FIX_CO_CPU,     /* Cobalt timer */
+       FIX_CO_APIC,    /* Cobalt APIC Redirection Table */ 
+       FIX_LI_PCIA,    /* Lithium PCI Bridge A */
+       FIX_LI_PCIB,    /* Lithium PCI Bridge B */
+#endif
+#ifdef CONFIG_X86_F00F_BUG
+       FIX_F00F_IDT,   /* Virtual mapping for IDT */
+#endif
+#ifdef CONFIG_X86_CYCLONE_TIMER
+       FIX_CYCLONE_TIMER, /*cyclone timer register*/
+#endif 
+#ifdef CONFIG_HIGHMEM
+       FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
+       FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+#endif
+#ifdef CONFIG_ACPI
+       FIX_ACPI_BEGIN,
+       FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
+#endif
+#ifdef CONFIG_PCI_MMCONFIG
+       FIX_PCIE_MCFG,
+#endif
+       __end_of_permanent_fixed_addresses,
+       /* temporary boot-time mappings, used before ioremap() is functional */
+#define NR_FIX_BTMAPS  16
+       FIX_BTMAP_END = __end_of_permanent_fixed_addresses,
+       FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1,
+       FIX_WP_TEST,
+       __end_of_fixed_addresses
+};
+
+extern void __set_fixmap (enum fixed_addresses idx,
+                                       unsigned long phys, pgprot_t flags);
+
+extern void set_fixaddr_top(unsigned long top);
+
+#define set_fixmap(idx, phys) \
+               __set_fixmap(idx, phys, PAGE_KERNEL)
+/*
+ * Some hardware wants to get fixmapped without caching.
+ */
+#define set_fixmap_nocache(idx, phys) \
+               __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
+
+#define clear_fixmap(idx) \
+               __set_fixmap(idx, 0, __pgprot(0))
+
+#define FIXADDR_TOP    ((unsigned long)__FIXADDR_TOP)
+
+#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
+#define __FIXADDR_BOOT_SIZE    (__end_of_fixed_addresses << PAGE_SHIFT)
+#define FIXADDR_START          (FIXADDR_TOP - __FIXADDR_SIZE)
+#define FIXADDR_BOOT_START     (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
+
+#define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
+#define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+extern void __this_fixmap_does_not_exist(void);
+
+/*
+ * 'index to address' translation. If anyone tries to use the idx
+ * directly without translation, we catch the bug with a NULL-dereference
+ * kernel oops. Illegal ranges of incoming indices are caught too.
+ */
+static __always_inline unsigned long fix_to_virt(const unsigned int idx)
+{
+       /*
+        * this branch gets completely eliminated after inlining,
+        * except when someone tries to use fixaddr indices in an
+        * illegal way. (such as mixing up address types or using
+        * out-of-range indices).
+        *
+        * If it doesn't get removed, the linker will complain
+        * loudly with a reasonably clear error message..
+        */
+       if (idx >= __end_of_fixed_addresses)
+               __this_fixmap_does_not_exist();
+
+        return __fix_to_virt(idx);
+}
+
+static inline unsigned long virt_to_fix(const unsigned long vaddr)
+{
+       BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
+       return __virt_to_fix(vaddr);
+}
+
+#endif /* !__ASSEMBLY__ */
+#endif
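Typical use of the interface above: set_fixmap() (or the nocache variant)
binds a compile-time slot to a physical page, and fix_to_virt() yields its
constant virtual address. A sketch against one of the slots from
enum fixed_addresses (variable names illustrative):

    /* map a device page at a fixed virtual address */
    set_fixmap_nocache(FIX_CO_CPU, device_phys & PAGE_MASK);
    regs = (void *)fix_to_virt(FIX_CO_CPU);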
diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/page.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/linux-2.6-xen-sparse/include/asm-i386/page.h      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,148 @@
+#ifndef _I386_PAGE_H
+#define _I386_PAGE_H
+
+/* PAGE_SHIFT determines the page size */
+#define PAGE_SHIFT     12
+#define PAGE_SIZE      (1UL << PAGE_SHIFT)
+#define PAGE_MASK      (~(PAGE_SIZE-1))
+
+#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
+#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+
+#include <linux/config.h>
+
+#ifdef CONFIG_X86_USE_3DNOW
+
+#include <asm/mmx.h>
+
+#define clear_page(page)       mmx_clear_page((void *)(page))
+#define copy_page(to,from)     mmx_copy_page(to,from)
+
+#else
+
+/*
+ *     On older X86 processors it's not a win to use MMX here it seems.
+ *     Maybe the K6-III ?
+ */
+ 
+#define clear_page(page)       memset((void *)(page), 0, PAGE_SIZE)
+#define copy_page(to,from)     memcpy((void *)(to), (void *)(from), PAGE_SIZE)
+
+#endif
+
+#define clear_user_page(page, vaddr, pg)       clear_page(page)
+#define copy_user_page(to, from, vaddr, pg)    copy_page(to, from)
+
+#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr)
+#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
+
+/*
+ * These are used to make use of C type-checking..
+ */
+extern int nx_enabled;
+#ifdef CONFIG_X86_PAE
+extern unsigned long long __supported_pte_mask;
+typedef struct { unsigned long pte_low, pte_high; } pte_t;
+typedef struct { unsigned long long pmd; } pmd_t;
+typedef struct { unsigned long long pgd; } pgd_t;
+typedef struct { unsigned long long pgprot; } pgprot_t;
+#define pmd_val(x)     ((x).pmd)
+#define pte_val(x)     ((x).pte_low | ((unsigned long long)(x).pte_high << 32))
+#define __pmd(x) ((pmd_t) { (x) } )
+#define HPAGE_SHIFT    21
+#else
+typedef struct { unsigned long pte_low; } pte_t;
+typedef struct { unsigned long pgd; } pgd_t;
+typedef struct { unsigned long pgprot; } pgprot_t;
+#define boot_pte_t pte_t /* or would you rather have a typedef */
+#define pte_val(x)     ((x).pte_low)
+#define HPAGE_SHIFT    22
+#endif
+#define PTE_MASK       PAGE_MASK
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define HPAGE_SIZE     ((1UL) << HPAGE_SHIFT)
+#define HPAGE_MASK     (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+#endif
+
+#define pgd_val(x)     ((x).pgd)
+#define pgprot_val(x)  ((x).pgprot)
+
+#define __pte(x) ((pte_t) { (x) } )
+#define __pgd(x) ((pgd_t) { (x) } )
+#define __pgprot(x)    ((pgprot_t) { (x) } )
+
+#endif /* !__ASSEMBLY__ */
+
+/* to align the pointer to the (next) page boundary */
+#define PAGE_ALIGN(addr)       (((addr)+PAGE_SIZE-1)&PAGE_MASK)
+
+/*
+ * This handles the memory map.. We could make this a config
+ * option, but too many people screw it up, and too few need
+ * it.
+ *
+ * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
+ * a virtual address space of one gigabyte, which limits the
+ * amount of physical memory you can use to about 950MB. 
+ *
+ * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
+ * and CONFIG_HIGHMEM64G options in the kernel configuration.
+ */
+
+#ifndef __ASSEMBLY__
+
+/*
+ * This much address space is reserved for vmalloc() and iomap()
+ * as well as fixmap mappings.
+ */
+extern unsigned int __VMALLOC_RESERVE;
+
+extern int sysctl_legacy_va_layout;
+
+extern int page_is_ram(unsigned long pagenr);
+
+#endif /* __ASSEMBLY__ */
+
+#ifdef __ASSEMBLY__
+#define __PAGE_OFFSET          CONFIG_PAGE_OFFSET
+#define __PHYSICAL_START       CONFIG_PHYSICAL_START
+#else
+#define __PAGE_OFFSET          ((unsigned long)CONFIG_PAGE_OFFSET)
+#define __PHYSICAL_START       ((unsigned long)CONFIG_PHYSICAL_START)
+#endif
+#define __KERNEL_START         (__PAGE_OFFSET + __PHYSICAL_START)
+
+
+#define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
+#define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
+#define MAXMEM                 (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
+#define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
+#define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
+#define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
+#ifdef CONFIG_FLATMEM
+#define pfn_to_page(pfn)       (mem_map + (pfn))
+#define page_to_pfn(page)      ((unsigned long)((page) - mem_map))
+#define pfn_valid(pfn)         ((pfn) < max_mapnr)
+#endif /* CONFIG_FLATMEM */
+#define virt_to_page(kaddr)    pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
+
+#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+
+#define VM_DATA_DEFAULT_FLAGS \
+       (VM_READ | VM_WRITE | \
+       ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
+                VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#define __HAVE_ARCH_GATE_AREA 1
+
+#endif /* __KERNEL__ */
+
+#include <asm-generic/page.h>
+
+#endif /* _I386_PAGE_H */
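The address macros above are constant offsets, so for any directly mapped
(lowmem) kernel address the round trips are identities. A sketch, taking the
default PAGE_OFFSET of 0xC0000000 (so __pa(0xC0100000) == 0x00100000):

    unsigned long kaddr = PAGE_OFFSET + 0x100000;   /* 1MB into lowmem */

    BUG_ON(__va(__pa(kaddr)) != (void *)kaddr);
    BUG_ON(virt_to_page(kaddr) != pfn_to_page(__pa(kaddr) >> PAGE_SHIFT));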
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/i386-mach-io-check-nmi.patch
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/i386-mach-io-check-nmi.patch     Wed Mar  1 19:47:25 2006
@@ -0,0 +1,45 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c      2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/kernel/traps.c 2006-02-27 15:55:23.000000000 +0000
+@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch
+ 
+ static void io_check_error(unsigned char reason, struct pt_regs * regs)
+ {
+-      unsigned long i;
+-
+       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+       show_registers(regs);
+ 
+       /* Re-enable the IOCK line, wait for a few seconds */
+-      reason = (reason & 0xf) | 8;
+-      outb(reason, 0x61);
+-      i = 2000;
+-      while (--i) udelay(1000);
+-      reason &= ~8;
+-      outb(reason, 0x61);
++      clear_io_check_error(reason);
+ }
+ 
+ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h    2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/mach-default/mach_traps.h       2006-02-27 15:55:23.000000000 +0000
+@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
+       outb(reason, 0x61);
+ }
+ 
++static inline void clear_io_check_error(unsigned char reason)
++{
++      unsigned long i;
++
++      reason = (reason & 0xf) | 8;
++      outb(reason, 0x61);
++      i = 2000;
++      while (--i) udelay(1000);
++      reason &= ~8;
++      outb(reason, 0x61);
++}
++
+ static inline unsigned char get_nmi_reason(void)
+ {
+       return inb(0x61);
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/net-csum.patch
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/net-csum.patch   Wed Mar  1 19:47:25 2006
@@ -0,0 +1,41 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c
+--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-27 15:47:38.000000000 +0000
++++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c    2006-02-27 15:55:25.000000000 +0000
+@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb,
+       if (hdrsize < sizeof(*hdr))
+               return 1;
+ 
+-      hdr->check = ip_nat_cheat_check(~oldip, newip,
++      if ((*pskb)->proto_csum_blank) {
++              hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
++      } else {
++              hdr->check = ip_nat_cheat_check(~oldip, newip,
+                                       ip_nat_cheat_check(oldport ^ 0xFFFF,
+                                                          newport,
+                                                          hdr->check));
++      }
+       return 1;
+ }
+ 
+diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c
+--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-27 15:47:38.000000000 +0000
++++ ./net/ipv4/netfilter/ip_nat_proto_udp.c    2006-02-27 15:55:25.000000000 +0000
+@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb,
+               newport = tuple->dst.u.udp.port;
+               portptr = &hdr->dest;
+       }
+-      if (hdr->check) /* 0 is a special case meaning no checksum */
+-              hdr->check = ip_nat_cheat_check(~oldip, newip,
++      if (hdr->check) { /* 0 is a special case meaning no checksum */
++              if ((*pskb)->proto_csum_blank) {
++                      hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
++              } else {
++                      hdr->check = ip_nat_cheat_check(~oldip, newip,
+                                       ip_nat_cheat_check(*portptr ^ 0xFFFF,
+                                                          newport,
+                                                          hdr->check));
++              }
++      }
+       *portptr = newport;
+       return 1;
+ }
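The ip_nat_cheat_check() calls in this patch do an RFC 1624 style incremental
checksum update; when the NIC will finish the checksum later
(proto_csum_blank) only the unfolded address delta is applied, hence the
swapped complements in the new branches. A standalone sketch of the
incremental update itself (helper name and types illustrative, not the
kernel's implementation):

    /* new = ~(~old + ~from + to), folded back to 16 bits */
    static unsigned short csum_update(unsigned short check,
                                      unsigned int from, unsigned int to)
    {
        unsigned int sum = (unsigned short)~check;

        sum += (~from & 0xffff) + ((~from >> 16) & 0xffff);
        sum += (to & 0xffff) + (to >> 16);
        sum  = (sum & 0xffff) + (sum >> 16);    /* fold carries */
        sum  = (sum & 0xffff) + (sum >> 16);
        return (unsigned short)~sum;
    }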
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/pmd-shared.patch
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/pmd-shared.patch Wed Mar  1 19:47:25 2006
@@ -0,0 +1,111 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c       2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/mm/pageattr.c  2006-02-27 15:55:31.000000000 +0000
+@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns
+       unsigned long flags;
+ 
+       set_pte_atomic(kpte, pte);      /* change init_mm */
+-      if (PTRS_PER_PMD > 1)
++      if (HAVE_SHARED_KERNEL_PMD)
+               return;
+ 
+       spin_lock_irqsave(&pgd_lock, flags);
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c        2006-01-03 03:21:10.000000000 +0000
++++ ./arch/i386/mm/pgtable.c   2006-02-27 15:55:31.000000000 +0000
+@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
+               spin_lock_irqsave(&pgd_lock, flags);
+       }
+ 
+-      clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
+-                      swapper_pg_dir + USER_PTRS_PER_PGD,
+-                      KERNEL_PGD_PTRS);
++      if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD)
++              clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
++                              swapper_pg_dir + USER_PTRS_PER_PGD,
++                              KERNEL_PGD_PTRS);
+       if (PTRS_PER_PMD > 1)
+               return;
+ 
+@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
+                       goto out_oom;
+               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+       }
++
++      if (!HAVE_SHARED_KERNEL_PMD) {
++              unsigned long flags;
++
++              for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
++                      pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
++                      if (!pmd)
++                              goto out_oom;
++                      set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
++              }
++
++              spin_lock_irqsave(&pgd_lock, flags);
++              for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
++                      unsigned long v = (unsigned long)i << PGDIR_SHIFT;
++                      pgd_t *kpgd = pgd_offset_k(v);
++                      pud_t *kpud = pud_offset(kpgd, v);
++                      pmd_t *kpmd = pmd_offset(kpud, v);
++                      pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
++                      memcpy(pmd, kpmd, PAGE_SIZE);
++              }
++              pgd_list_add(pgd);
++              spin_unlock_irqrestore(&pgd_lock, flags);
++      }
++
+       return pgd;
+ 
+ out_oom:
+@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd)
+       int i;
+ 
+       /* in the PAE case user pgd entries are overwritten before usage */
+-      if (PTRS_PER_PMD > 1)
+-              for (i = 0; i < USER_PTRS_PER_PGD; ++i)
+-                      kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
++      if (PTRS_PER_PMD > 1) {
++              for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
++                      pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
++                      kmem_cache_free(pmd_cache, pmd);
++              }
++              if (!HAVE_SHARED_KERNEL_PMD) {
++                      unsigned long flags;
++                      spin_lock_irqsave(&pgd_lock, flags);
++                      pgd_list_del(pgd);
++                      spin_unlock_irqrestore(&pgd_lock, flags);
++                      for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
++                              pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
++                              memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
++                              kmem_cache_free(pmd_cache, pmd);
++                      }
++              }
++      }
+       /* in the non-PAE case, free_pgtables() clears user pgd entries */
+       kmem_cache_free(pgd_cache, pgd);
+ }
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h        2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/pgtable-2level-defs.h   2006-02-27 15:55:31.000000000 +0000
+@@ -1,6 +1,8 @@
+ #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
+ #define _I386_PGTABLE_2LEVEL_DEFS_H
+ 
++#define HAVE_SHARED_KERNEL_PMD 0
++
+ /*
+  * traditional i386 two-level paging structure:
+  */
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h        2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/pgtable-3level-defs.h   2006-02-27 15:55:31.000000000 +0000
+@@ -1,6 +1,8 @@
+ #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
+ #define _I386_PGTABLE_3LEVEL_DEFS_H
+ 
++#define HAVE_SHARED_KERNEL_PMD 1
++
+ /*
+  * PGDIR_SHIFT determines what a top-level page table entry can map
+  */
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/smp-alts.patch
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/patches/linux-2.6.16-rc5/smp-alts.patch   Wed Mar  1 19:47:25 2006
@@ -0,0 +1,591 @@
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig ./arch/i386/Kconfig
+--- ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig     2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/Kconfig        2006-02-27 15:55:34.000000000 +0000
+@@ -202,6 +202,19 @@ config SMP
+ 
+         If you don't know what to do here, say N.
+ 
++config SMP_ALTERNATIVES
++      bool "SMP alternatives support (EXPERIMENTAL)"
++      depends on SMP && EXPERIMENTAL
++      help
++        Try to reduce the overhead of running an SMP kernel on a uniprocessor
++        host slightly by replacing certain key instruction sequences
++        according to whether we currently have more than one CPU available.
++        This should provide a noticeable boost to performance when
++        running SMP kernels on UP machines, and have negligible impact
++        when running on a true SMP host.
++
++          If unsure, say N.
++        
+ config NR_CPUS
+       int "Maximum number of CPUs (2-255)"
+       range 2 255
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile     2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/kernel/Makefile        2006-02-27 15:55:34.000000000 +0000
+@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI)            += efi.o efi_stub.o
+ obj-$(CONFIG_DOUBLEFAULT)     += doublefault.o
+ obj-$(CONFIG_VM86)            += vm86.o
+ obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
++obj-$(CONFIG_SMP_ALTERNATIVES)  += smpalts.o
+ 
+ EXTRA_AFLAGS   := -traditional
+ 
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c    1970-01-01 01:00:00.000000000 +0100
++++ ./arch/i386/kernel/smpalts.c       2006-02-27 15:55:34.000000000 +0000
+@@ -0,0 +1,85 @@
++#include <linux/kernel.h>
++#include <asm/system.h>
++#include <asm/smp_alt.h>
++#include <asm/processor.h>
++#include <asm/string.h>
++
++struct smp_replacement_record {
++      unsigned char targ_size;
++      unsigned char smp1_size;
++      unsigned char smp2_size;
++      unsigned char up_size;
++      unsigned char feature;
++      unsigned char data[0];
++};
++
++struct smp_alternative_record {
++      void *targ_start;
++      struct smp_replacement_record *repl;
++};
++
++extern struct smp_alternative_record __start_smp_alternatives_table,
++  __stop_smp_alternatives_table;
++extern unsigned long __init_begin, __init_end;
++
++void prepare_for_smp(void)
++{
++      struct smp_alternative_record *r;
++      printk(KERN_INFO "Enabling SMP...\n");
++      for (r = &__start_smp_alternatives_table;
++           r != &__stop_smp_alternatives_table;
++           r++) {
++              BUG_ON(r->repl->targ_size < r->repl->smp1_size);
++              BUG_ON(r->repl->targ_size < r->repl->smp2_size);
++              BUG_ON(r->repl->targ_size < r->repl->up_size);
++               if (system_state == SYSTEM_RUNNING &&
++                   r->targ_start >= (void *)&__init_begin &&
++                   r->targ_start < (void *)&__init_end)
++                       continue;
++              if (r->repl->feature != (unsigned char)-1 &&
++                  boot_cpu_has(r->repl->feature)) {
++                      memcpy(r->targ_start,
++                             r->repl->data + r->repl->smp1_size,
++                             r->repl->smp2_size);
++                      memset(r->targ_start + r->repl->smp2_size,
++                             0x90,
++                             r->repl->targ_size - r->repl->smp2_size);
++              } else {
++                      memcpy(r->targ_start,
++                             r->repl->data,
++                             r->repl->smp1_size);
++                      memset(r->targ_start + r->repl->smp1_size,
++                             0x90,
++                             r->repl->targ_size - r->repl->smp1_size);
++              }
++      }
++      /* Paranoia */
++      asm volatile ("jmp 1f\n1:");
++      mb();
++}
++
++void unprepare_for_smp(void)
++{
++      struct smp_alternative_record *r;
++      printk(KERN_INFO "Disabling SMP...\n");
++      for (r = &__start_smp_alternatives_table;
++           r != &__stop_smp_alternatives_table;
++           r++) {
++              BUG_ON(r->repl->targ_size < r->repl->smp1_size);
++              BUG_ON(r->repl->targ_size < r->repl->smp2_size);
++              BUG_ON(r->repl->targ_size < r->repl->up_size);
++               if (system_state == SYSTEM_RUNNING &&
++                   r->targ_start >= (void *)&__init_begin &&
++                   r->targ_start < (void *)&__init_end)
++                       continue;
++              memcpy(r->targ_start,
++                     r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
++                     r->repl->up_size);
++              memset(r->targ_start + r->repl->up_size,
++                     0x90,
++                     r->repl->targ_size - r->repl->up_size);
++      }
++      /* Paranoia */
++      asm volatile ("jmp 1f\n1:");
++      mb();
++}
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c    2006-02-27 15:46:58.000000000 +0000
++++ ./arch/i386/kernel/smpboot.c       2006-02-27 15:55:34.000000000 +0000
+@@ -1208,6 +1208,11 @@ static void __init smp_boot_cpus(unsigne
+               if (max_cpus <= cpucount+1)
+                       continue;
+ 
++#ifdef CONFIG_SMP_ALTERNATIVES
++              if (kicked == 1)
++                      prepare_for_smp();
++#endif
++
+               if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
+                       printk("CPU #%d not responding - cannot use it.\n",
+                                                               apicid);
+@@ -1386,6 +1391,11 @@ int __devinit __cpu_up(unsigned int cpu)
+               return -EIO;
+       }
+ 
++#ifdef CONFIG_SMP_ALTERNATIVES
++      if (num_online_cpus() == 1)
++              prepare_for_smp();
++#endif
++
+       local_irq_enable();
+       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+       /* Unleash the CPU! */
+diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S
+--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S        2006-01-03 03:21:10.000000000 +0000
++++ ./arch/i386/kernel/vmlinux.lds.S   2006-02-27 15:55:34.000000000 +0000
+@@ -34,6 +34,13 @@ SECTIONS
+   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
+   __stop___ex_table = .;
+ 
++  . = ALIGN(16);
++  __start_smp_alternatives_table = .;
++  __smp_alternatives : { *(__smp_alternatives) }
++  __stop_smp_alternatives_table = .;
++
++  __smp_replacements : { *(__smp_replacements) }
++
+   RODATA
+ 
+   /* writeable */
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h ./include/asm-i386/atomic.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h     2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/atomic.h        2006-02-27 15:55:34.000000000 +0000
+@@ -4,18 +4,13 @@
+ #include <linux/config.h>
+ #include <linux/compiler.h>
+ #include <asm/processor.h>
++#include <asm/smp_alt.h>
+ 
+ /*
+  * Atomic operations that C can't guarantee us.  Useful for
+  * resource counting etc..
+  */
+ 
+-#ifdef CONFIG_SMP
+-#define LOCK "lock ; "
+-#else
+-#define LOCK ""
+-#endif
+-
+ /*
+  * Make sure gcc doesn't try to be clever and move things around
+  * on us. We need to use _exactly_ the address the user gave us,
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h ./include/asm-i386/bitops.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h     2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/bitops.h        2006-02-27 15:55:34.000000000 +0000
+@@ -7,6 +7,7 @@
+ 
+ #include <linux/config.h>
+ #include <linux/compiler.h>
++#include <asm/smp_alt.h>
+ 
+ /*
+  * These have to be done with inline assembly: that way the bit-setting
+@@ -16,12 +17,6 @@
+  * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
+  */
+ 
+-#ifdef CONFIG_SMP
+-#define LOCK_PREFIX "lock ; "
+-#else
+-#define LOCK_PREFIX ""
+-#endif
+-
+ #define ADDR (*(volatile long *) addr)
+ 
+ /**
+@@ -41,7 +36,7 @@
+  */
+ static inline void set_bit(int nr, volatile unsigned long * addr)
+ {
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btsl %1,%0"
+               :"+m" (ADDR)
+               :"Ir" (nr));
+@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol
+  */
+ static inline void clear_bit(int nr, volatile unsigned long * addr)
+ {
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btrl %1,%0"
+               :"+m" (ADDR)
+               :"Ir" (nr));
+@@ -121,7 +116,7 @@ static inline void __change_bit(int nr, 
+  */
+ static inline void change_bit(int nr, volatile unsigned long * addr)
+ {
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btcl %1,%0"
+               :"+m" (ADDR)
+               :"Ir" (nr));
+@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n
+ {
+       int oldbit;
+ 
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btsl %2,%1\n\tsbbl %0,%0"
+               :"=r" (oldbit),"+m" (ADDR)
+               :"Ir" (nr) : "memory");
+@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int
+ {
+       int oldbit;
+ 
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btrl %2,%1\n\tsbbl %0,%0"
+               :"=r" (oldbit),"+m" (ADDR)
+               :"Ir" (nr) : "memory");
+@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in
+ {
+       int oldbit;
+ 
+-      __asm__ __volatile__( LOCK_PREFIX
++      __asm__ __volatile__( LOCK
+               "btcl %2,%1\n\tsbbl %0,%0"
+               :"=r" (oldbit),"+m" (ADDR)
+               :"Ir" (nr) : "memory");
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h ./include/asm-i386/futex.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h      2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/futex.h 2006-02-27 15:55:34.000000000 +0000
+@@ -28,7 +28,7 @@
+ "1:   movl    %2, %0\n\
+       movl    %0, %3\n"                                       \
+       insn "\n"                                               \
+-"2:   " LOCK_PREFIX "cmpxchgl %3, %2\n\
++"2:   " LOCK "cmpxchgl %3, %2\n\
+       jnz     1b\n\
+ 3:    .section .fixup,\"ax\"\n\
+ 4:    mov     %5, %1\n\
+@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op, 
+ #endif
+               switch (op) {
+               case FUTEX_OP_ADD:
+-                      __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
++                      __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
+                                          oldval, uaddr, oparg);
+                       break;
+               case FUTEX_OP_OR:
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h      2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/rwsem.h 2006-02-27 15:55:34.000000000 +0000
+@@ -40,6 +40,7 @@
+ 
+ #include <linux/list.h>
+ #include <linux/spinlock.h>
++#include <asm/smp_alt.h>
+ 
+ struct rwsem_waiter;
+ 
+@@ -99,7 +100,7 @@ static inline void __down_read(struct rw
+ {
+       __asm__ __volatile__(
+               "# beginning down_read\n\t"
+-LOCK_PREFIX   "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
++LOCK          "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
+               "  js        2f\n\t" /* jump if we weren't granted the lock */
+               "1:\n\t"
+               LOCK_SECTION_START("")
+@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st
+               "  movl      %1,%2\n\t"
+               "  addl      %3,%2\n\t"
+               "  jle       2f\n\t"
+-LOCK_PREFIX   "  cmpxchgl  %2,%0\n\t"
++LOCK          "  cmpxchgl  %2,%0\n\t"
+               "  jnz       1b\n\t"
+               "2:\n\t"
+               "# ending __down_read_trylock\n\t"
+@@ -150,7 +151,7 @@ static inline void __down_write(struct r
+       tmp = RWSEM_ACTIVE_WRITE_BIAS;
+       __asm__ __volatile__(
+               "# beginning down_write\n\t"
+-LOCK_PREFIX   "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
++LOCK          "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
+               "  testl     %%edx,%%edx\n\t" /* was the count 0 before? */
+               "  jnz       2f\n\t" /* jump if we weren't granted the lock */
+               "1:\n\t"
+@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
+       __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
+       __asm__ __volatile__(
+               "# beginning __up_read\n\t"
+-LOCK_PREFIX   "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
++LOCK          "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
+               "  js        2f\n\t" /* jump if the lock is being waited upon */
+               "1:\n\t"
+               LOCK_SECTION_START("")
+@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
+       __asm__ __volatile__(
+               "# beginning __up_write\n\t"
+               "  movl      %2,%%edx\n\t"
+-LOCK_PREFIX   "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
++LOCK          "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
+               "  jnz       2f\n\t" /* jump if the lock is being waited upon */
+               "1:\n\t"
+               LOCK_SECTION_START("")
+@@ -239,7 +240,7 @@ static inline void __downgrade_write(str
+ {
+       __asm__ __volatile__(
+               "# beginning __downgrade_write\n\t"
+-LOCK_PREFIX   "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
++LOCK          "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
+               "  js        2f\n\t" /* jump if the lock is being waited upon */
+               "1:\n\t"
+               LOCK_SECTION_START("")
+@@ -263,7 +264,7 @@ LOCK_PREFIX        "  addl      %2,(%%eax)\n\t"
+ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+ {
+       __asm__ __volatile__(
+-LOCK_PREFIX   "addl %1,%0"
++LOCK            "addl %1,%0"
+               : "=m"(sem->count)
+               : "ir"(delta), "m"(sem->count));
+ }
+@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
+       int tmp = delta;
+ 
+       __asm__ __volatile__(
+-LOCK_PREFIX   "xadd %0,(%2)"
++LOCK                    "xadd %0,(%2)"
+               : "+r"(tmp), "=m"(sem->count)
+               : "r"(sem), "m"(sem->count)
+               : "memory");
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h    1970-01-01 01:00:00.000000000 +0100
++++ ./include/asm-i386/smp_alt.h       2006-02-27 15:55:34.000000000 +0000
+@@ -0,0 +1,32 @@
++#ifndef __ASM_SMP_ALT_H__
++#define __ASM_SMP_ALT_H__
++
++#include <linux/config.h>
++
++#ifdef CONFIG_SMP
++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
++#define LOCK \
++        "6677: nop\n" \
++      ".section __smp_alternatives,\"a\"\n" \
++      ".long 6677b\n" \
++      ".long 6678f\n" \
++      ".previous\n" \
++      ".section __smp_replacements,\"a\"\n" \
++      "6678: .byte 1\n" \
++      ".byte 1\n" \
++      ".byte 0\n" \
++        ".byte 1\n" \
++      ".byte -1\n" \
++      "lock\n" \
++      "nop\n" \
++      ".previous\n"
++void prepare_for_smp(void);
++void unprepare_for_smp(void);
++#else
++#define LOCK "lock ; "
++#endif
++#else
++#define LOCK ""
++#endif
++
++#endif /* __ASM_SMP_ALT_H__ */
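Reading the LOCK macro above against struct smp_replacement_record from
smpalts.c may help; the five .byte directives are the record header, and the
instruction bytes that follow are the packed replacement sequences:

    /*   targ_size = 1    one patchable byte at label 6677 (the nop)
     *   smp1_size = 1    SMP replacement: the "lock" prefix
     *   smp2_size = 0    no feature-dependent alternative
     *   up_size   = 1    UP replacement: a plain nop
     *   feature   = -1   no CPUID feature required
     * data[] then carries the smp1, smp2 and up sequences back to back,
     * which is what prepare_for_smp()/unprepare_for_smp() copy in. */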
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h   2006-01-03 03:21:10.000000000 +0000
++++ ./include/asm-i386/spinlock.h      2006-02-27 15:55:34.000000000 +0000
+@@ -6,6 +6,7 @@
+ #include <asm/page.h>
+ #include <linux/config.h>
+ #include <linux/compiler.h>
++#include <asm/smp_alt.h>
+ 
+ /*
+  * Your basic SMP spinlocks, allowing only a single CPU anywhere
+@@ -23,7 +24,8 @@
+ 
+ #define __raw_spin_lock_string \
+       "\n1:\t" \
+-      "lock ; decb %0\n\t" \
++      LOCK \
++      "decb %0\n\t" \
+       "jns 3f\n" \
+       "2:\t" \
+       "rep;nop\n\t" \
+@@ -34,7 +36,8 @@
+ 
+ #define __raw_spin_lock_string_flags \
+       "\n1:\t" \
+-      "lock ; decb %0\n\t" \
++      LOCK \
++      "decb %0\n\t" \
+       "jns 4f\n\t" \
+       "2:\t" \
+       "testl $0x200, %1\n\t" \
+@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
+ static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+ {
+       char oldval;
++#ifdef CONFIG_SMP_ALTERNATIVES
+       __asm__ __volatile__(
+-              "xchgb %b0,%1"
++              "1:movb %1,%b0\n"
++              "movb $0,%1\n"
++              "2:"
++              ".section __smp_alternatives,\"a\"\n"
++              ".long 1b\n"
++              ".long 3f\n"
++              ".previous\n"
++              ".section __smp_replacements,\"a\"\n"
++              "3: .byte 2b - 1b\n"
++              ".byte 5f-4f\n"
++              ".byte 0\n"
++              ".byte 6f-5f\n"
++              ".byte -1\n"
++              "4: xchgb %b0,%1\n"
++              "5: movb %1,%b0\n"
++              "movb $0,%1\n"
++              "6:\n"
++              ".previous\n"
+               :"=q" (oldval), "=m" (lock->slock)
+               :"0" (0) : "memory");
++#else
++      __asm__ __volatile__(
++              "xchgb %b0,%1\n"
++              :"=q" (oldval), "=m" (lock->slock)
++              :"0" (0) : "memory");
++#endif
+       return oldval > 0;
+ }
+ 
+@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
+ 
+ static inline void __raw_read_unlock(raw_rwlock_t *rw)
+ {
+-      asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
++      asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
+ }
+ 
+ static inline void __raw_write_unlock(raw_rwlock_t *rw)
+ {
+-      asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
++      asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
+                                : "=m" (rw->lock) : : "memory");
+ }
+ 
+diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h ./include/asm-i386/system.h
+--- ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h     2006-02-27 15:47:25.000000000 +0000
++++ ./include/asm-i386/system.h        2006-02-27 15:55:34.000000000 +0000
+@@ -5,7 +5,7 @@
+ #include <linux/kernel.h>
+ #include <asm/segment.h>
+ #include <asm/cpufeature.h>
+-#include <linux/bitops.h> /* for LOCK_PREFIX */
++#include <asm/smp_alt.h>
+ 
+ #ifdef __KERNEL__
+ 
+@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
+       unsigned long prev;
+       switch (size) {
+       case 1:
+-              __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
++              __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
+                                    : "=a"(prev)
+                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "memory");
+               return prev;
+       case 2:
+-              __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
++              __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
+                                    : "=a"(prev)
+                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "memory");
+               return prev;
+       case 4:
+-              __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
++              __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
+                                    : "=a"(prev)
+                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
+                                    : "memory");
+@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
+                                     unsigned long long new)
+ {
+       unsigned long long prev;
+-      __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
++      __asm__ __volatile__(LOCK "cmpxchg8b %3"
+                            : "=A"(prev)
+                            : "b"((unsigned long)new),
+                              "c"((unsigned long)(new >> 32)),
+@@ -503,11 +503,55 @@ struct alt_instr { 
+ #endif
+ 
+ #ifdef CONFIG_SMP
++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
++#define smp_alt_mb(instr)                                           \
++__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
++                   ".section __smp_alternatives,\"a\"\n"          \
++                   ".long 6667b\n"                                \
++                     ".long 6673f\n"                                \
++                   ".previous\n"                                  \
++                   ".section __smp_replacements,\"a\"\n"          \
++                   "6673:.byte 6668b-6667b\n"                     \
++                   ".byte 6670f-6669f\n"                          \
++                   ".byte 6671f-6670f\n"                          \
++                     ".byte 0\n"                                    \
++                   ".byte %c0\n"                                  \
++                   "6669:lock;addl $0,0(%%esp)\n"                 \
++                   "6670:" instr "\n"                             \
++                   "6671:\n"                                      \
++                   ".previous\n"                                  \
++                   :                                              \
++                   : "i" (X86_FEATURE_XMM2)                       \
++                   : "memory")
++#define smp_rmb() smp_alt_mb("lfence")
++#define smp_mb()  smp_alt_mb("mfence")
++#define set_mb(var, value) do {                                     \
++unsigned long __set_mb_temp;                                        \
++__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
++                   ".section __smp_alternatives,\"a\"\n"          \
++                   ".long 6667b\n"                                \
++                   ".long 6673f\n"                                \
++                   ".previous\n"                                  \
++                   ".section __smp_replacements,\"a\"\n"          \
++                   "6673: .byte 6668b-6667b\n"                    \
++                   ".byte 6670f-6669f\n"                          \
++                   ".byte 0\n"                                    \
++                   ".byte 6671f-6670f\n"                          \
++                   ".byte -1\n"                                   \
++                   "6669: xchg %1, %0\n"                          \
++                   "6670:movl %1, %0\n"                           \
++                   "6671:\n"                                      \
++                   ".previous\n"                                  \
++                   : "=m" (var), "=r" (__set_mb_temp)             \
++                   : "1" (value)                                  \
++                   : "memory"); } while (0)
++#else
+ #define smp_mb()      mb()
+ #define smp_rmb()     rmb()
++#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
++#endif
+ #define smp_wmb()     wmb()
+ #define smp_read_barrier_depends()    read_barrier_depends()
+-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
+ #else
+ #define smp_mb()      barrier()
+ #define smp_rmb()     barrier()
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/locking.sh
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/locking.sh Wed Mar  1 19:47:25 2006
@@ -0,0 +1,98 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+#
+# Serialisation
+#
+
+LOCK_SLEEPTIME=1
+LOCK_SPINNING_RETRIES=5
+LOCK_RETRIES=10
+LOCK_BASEDIR=/var/run/xen-hotplug
+
+
+claim_lock()
+{
+  local lockdir="$LOCK_BASEDIR/$1"
+  mkdir -p "$LOCK_BASEDIR"
+  _claim_lock "$lockdir"
+}
+
+
+release_lock()
+{
+  _release_lock "$LOCK_BASEDIR/$1"
+}
+
+
+_claim_lock()
+{
+  local lockdir="$1"
+  local owner=$(_lock_owner "$lockdir")
+  local retries=0
+
+  while [ $retries -lt $LOCK_RETRIES ]
+  do
+    mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR &&
+      _update_lock_info "$lockdir" && return
+
+    local new_owner=$(_lock_owner "$lockdir")
+    if [ "$new_owner" != "$owner" ]
+    then
+      owner="$new_owner"
+      retries=0
+    fi
+
+    if [ $retries -gt $LOCK_SPINNING_RETRIES ]
+    then
+      sleep $LOCK_SLEEPTIME
+    else
+      sleep 0
+    fi
+    retries=$(($retries + 1))
+  done
+  _steal_lock "$lockdir"
+}
+
+
+_release_lock()
+{
+  trap sigerr ERR
+  rm -rf "$1" 2>/dev/null || true
+}
+
+
+_steal_lock()
+{
+  local lockdir="$1"
+  local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown")
+  log err "Forced to steal lock on $lockdir from $owner!"
+  _release_lock "$lockdir"
+  _claim_lock "$lockdir"
+}
+
+
+_lock_owner()
+{
+  cat "$1/owner" 2>/dev/null || echo "unknown"
+}
+
+
+_update_lock_info()
+{
+  echo "$$: $0" >"$1/owner"
+}
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/logging.sh
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/logging.sh Wed Mar  1 19:47:25 2006
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+log() {
+  local level="$1"
+  shift
+  logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
+}
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-delete
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vtpm-delete        Wed Mar  1 19:47:25 2006
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+# This script must be called as follows:
+# vtpm-delete <domain name>
+
+dir=$(dirname "$0")
+. "$dir/vtpm-common.sh"
+
+vtpm_delete_instance $1
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-hotplug-common.sh
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/vtpm-hotplug-common.sh     Wed Mar  1 19:47:25 2006
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2005 IBM Corporation
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+findCommand "$@"
+if [ "$command" != "online" ]  &&
+   [ "$command" != "offline" ] &&
+   [ "$command" != "add" ]     &&
+   [ "$command" != "remove" ]
+then
+       log err "Invalid command: $command"
+       exit 1
+fi
+
+
+XENBUS_PATH="${XENBUS_PATH:?}"
+
+. "$dir/vtpm-common.sh"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-hotplug-cleanup
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/examples/xen-hotplug-cleanup        Wed Mar  1 19:47:25 2006
@@ -0,0 +1,21 @@
+#! /bin/sh
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+# Claim the lock protecting /etc/xen/scripts/block.  This stops a race whereby
+# paths in the store would disappear underneath that script as it attempted to
+# read from the store checking for device sharing.
+# Any other scripts that do similar things will have to have their lock
+# claimed too.
+# This is pretty horrible, but there's not really a nicer way of solving this.
+claim_lock "block"
+
+# remove device frontend store entries
+xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true
+
+# remove device backend store entries
+xenstore-rm -t "$XENBUS_PATH"       || true
+xenstore-rm -t "error/$XENBUS_PATH" || true
+
+release_lock "block"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/01_vtpm-list_pos.py
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/tests/vtpm/01_vtpm-list_pos.py      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,45 @@
+#!/usr/bin/python
+
+# Copyright (C) International Business Machines Corp., 2006
+# Author: Stefan Berger <stefanb@xxxxxxxxxx>
+
+# Positive Test: create domain with virtual TPM attached at build time,
+#                verify list
+
+
+from XmTestLib import *
+
+def vtpm_cleanup(domName):
+       # Since this is only a temporary domain, clean up its state in the
+       # virtual TPM directory.
+       traceCommand("/etc/xen/scripts/vtpm-delete %s" % domName)
+
+if ENABLE_HVM_SUPPORT:
+    SKIP("vtpm-list not supported for HVM domains")
+
+config = {"vtpm":"instance=1,backend=0"}
+domain = XmTestDomain(extraConfig=config)
+
+try:
+    domain.start()
+except DomainError, e:
+    if verbose:
+        print e.extra
+    vtpm_cleanup(domain.getName())
+    FAIL("Unable to create domain")
+
+domName = domain.getName()
+
+status, output = traceCommand("xm vtpm-list %s" % domain.getId())
+eyecatcher = "/local/domain/0/backend/vtpm"
+where = output.find(eyecatcher)
+if status != 0:
+    vtpm_cleanup(domName)
+    FAIL("xm vtpm-list returned bad status, expected 0, status is %i" % status)
+elif where < 0:
+    vtpm_cleanup(domName)
+    FAIL("Fail to list virtual TPM device")
+
+domain.stop()
+
+vtpm_cleanup(domName)
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/02_vtpm-cat_pcrs.py
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/tests/vtpm/02_vtpm-cat_pcrs.py      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+
+# Copyright (C) International Business Machines Corp., 2006
+# Author: Stefan Berger <stefanb@xxxxxxxxxx>
+
+# Positive Test: create domain with virtual TPM attached at build time,
+#                check list of pcrs
+
+from XmTestLib import *
+
+def vtpm_cleanup(domName):
+       # Since this is only a temporary domain, clean up its state in the
+       # virtual TPM directory.
+       traceCommand("/etc/xen/scripts/vtpm-delete %s" % domName)
+
+if ENABLE_HVM_SUPPORT:
+    SKIP("vtpm-list not supported for HVM domains")
+
+status, output = traceCommand("ls /dev/tpm0")
+if re.search("No such file or directory",output):
+    SKIP("This machine has no hardware TPM; cannot run this test")
+
+status, output = traceCommand("ps aux | grep vtpm_manager | grep -v grep")
+if output == "":
+    FAIL("virtual TPM manager must be started to run this test")
+
+# vtpm manager has been detected
+config = {"vtpm":"instance=1,backend=0"}
+domain = XmTestDomain(extraConfig=config)
+
+try:
+    domain.start()
+except DomainError, e:
+    if verbose:
+        print e.extra
+    vtpm_cleanup(domain.getName())
+    FAIL("Unable to create domain")
+
+domName = domain.getName()
+
+try:
+    console = XmConsole(domain.getName())
+except ConsoleError, e:
+    vtpm_cleanup(domName)
+    FAIL(str(e))
+
+try:
+    console.sendInput("input")
+    run = console.runCmd("ls /sys")
+except ConsoleError, e:
+    saveLog(console.getHistory())
+    vtpm_cleanup(domName)
+    FAIL(str(e))
+
+if re.search("No such file",run["output"]):
+    try:
+        run = console.runCmd("mkdir /sys")
+        run = console.runCmd("mount -t sysfs /sys /sys")
+    except ConsoleError, e:
+        saveLog(console.getHistory())
+        vtpm_cleanup(domName)
+        FAIL(str(e))
+
+try:
+    run = console.runCmd("cat /sys/devices/platform/tpm_vtpm/pcrs")
+except ConsoleError, e:
+    saveLog(console.getHistory())
+    vtpm_cleanup(domName)
+    FAIL(str(e))
+
+if re.search("No such file",run["output"]):
+    FAIL("TPM frontend support not compiled into (domU?) kernel")
+
+console.closeConsole()
+
+domain.stop()
+
+vtpm_cleanup(domName)
+
+if not re.search("PCR-00:",run["output"]):
+       FAIL("Virtual TPM is not working correctly on /dev/vtpm on backend side")
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/Makefile.am
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/tools/xm-test/tests/vtpm/Makefile.am      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,22 @@
+
+SUBDIRS =
+
+TESTS = 01_vtpm-list_pos.test \
+        02_vtpm-cat_pcrs.test
+
+XFAIL_TESTS =
+
+EXTRA_DIST = $(TESTS) $(XFAIL_TESTS)
+
+TESTS_ENVIRONMENT=@TENV@
+
+%.test: %.py
+       cp $< $@
+       chmod +x $@
+
+clean-local: am_config_clean-local
+
+am_config_clean-local:
+       rm -f *test
+       rm -f *log
+       rm -f *~
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/supervisor_mode_kernel.S
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/xen/arch/x86/x86_32/supervisor_mode_kernel.S      Wed Mar  1 19:47:25 2006
@@ -0,0 +1,145 @@
+/*
+ * Handle stack fixup for guest running in RING 0.
+ *
+ * Copyright (c) 2006 Ian Campbell
+ *
+ * When a guest kernel is allowed to run in RING 0 a hypercall,
+ * interrupt or exception interrupting the guest kernel will not cause
+ * a privilege level change and therefore the stack will not be swapped
+ * to the Xen stack.
+ *
+ * To fix this we look for RING 0 activation frames with a stack
+ * pointer below HYPERVISOR_VIRT_START (indicating a guest kernel
+ * frame) and fix this up by locating the Xen stack via the TSS
+ * and moving the activation frame to the Xen stack. In the process we
+ * convert the frame into an inter-privilege frame returning to RING 1
+ * so that we can catch and reverse the process on exit.
+ */
+
+#include <xen/config.h>
+#include <asm/asm_defns.h>
+#include <public/xen.h>
+
+        # Upon entry the stack should be the Xen stack and contain:
+        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, SAVE_ALL, RETURN
+        # On exit the stack should be %ss:%esp (i.e. the guest stack)
+        # and contain:
+        #   EFLAGS, %cs, %eip, ERROR, SAVE_ALL, RETURN
+        ALIGN
+ENTRY(restore_ring0_guest)
+        # Point %gs:%esi to guest stack.
+RRG0:   movw UREGS_ss+4(%esp),%gs
+        movl UREGS_esp+4(%esp),%esi
+
+        # Copy EFLAGS...EBX, RETURN from Xen stack to guest stack.
+        movl $(UREGS_kernel_sizeof>>2)+1,%ecx
+
+1:      subl $4,%esi
+        movl -4(%esp,%ecx,4),%eax
+RRG1:   movl %eax,%gs:(%esi)
+        loop 1b
+
+RRG2:   andl $~3,%gs:UREGS_cs+4(%esi)
+
+        movl %gs,%eax
+
+        # We need to do this because these registers are not present
+        # on the guest stack so they cannot be restored by the code in
+        # restore_all_guest.
+RRG3:   mov  UREGS_ds+4(%esp),%ds
+RRG4:   mov  UREGS_es+4(%esp),%es
+RRG5:   mov  UREGS_fs+4(%esp),%fs
+RRG6:   mov  UREGS_gs+4(%esp),%gs
+
+RRG7:   movl %eax,%ss
+        movl %esi,%esp
+
+        ret
+.section __ex_table,"a"
+        .long RRG0,domain_crash_synchronous
+        .long RRG1,domain_crash_synchronous
+        .long RRG2,domain_crash_synchronous
+        .long RRG3,domain_crash_synchronous
+        .long RRG4,domain_crash_synchronous
+        .long RRG5,domain_crash_synchronous
+        .long RRG6,domain_crash_synchronous
+        .long RRG7,domain_crash_synchronous
+.previous
+
+        # Upon entry the stack should be a guest stack and contain:
+        #   EFLAGS, %cs, %eip, ERROR, RETURN
+        # On exit the stack should be the Xen stack and contain:
+        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, RETURN
+        ALIGN
+ENTRY(fixup_ring0_guest_stack)
+        pushl %eax
+        pushl %ecx
+        pushl %ds
+        pushl %gs
+        pushl %esi
+
+        movw  $__HYPERVISOR_DS,%ax
+        movw  %ax,%ds
+
+        # Point %gs:%esi to guest stack frame.
+        movw  %ss,%ax
+        movw  %ax,%gs
+        movl  %esp,%esi
+        # Account for entries on the guest stack:
+        # * Pushed by normal exception/interrupt/hypercall mechanisms
+        #   * EFLAGS, %cs, %eip, ERROR == 4 words.
+        # * Pushed by the fixup routine
+        #   * [RETURN], %eax, %ecx, %ds, %gs and %esi == 6 words.
+        addl $((6+4)*4),%esi
+
+        # %gs:%esi now points to the guest stack before the
+        # interrupt/exception occurred.
+
+        /*
+         * Reverse the __TSS macro, giving us the CPU number.
+         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
+         */
+        str   %ecx
+        shrl  $3,%ecx                                   # Calculate GDT index for TSS.
+        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx        # %ecx = 2*cpu.
+        shll  $6,%ecx                                   # Each TSS entry is 0x80 bytes
+        addl  $init_tss,%ecx                            # but we have 2*cpu from above.
+
+        # Load Xen stack from TSS.
+        movw  TSS_ss0(%ecx),%ax
+TRP1:   movw  %ax,%ss
+        movl  TSS_esp0(%ecx),%esp
+
+        pushl %gs
+        pushl %esi
+
+        # Move EFLAGS, %cs, %eip, ERROR, RETURN, %eax, %ecx, %ds, %gs, %esi
+        # from guest stack to Xen stack.
+        movl  $10,%ecx
+1:      subl  $4,%esp
+        subl  $4,%esi
+TRP2:   movl  %gs:(%esi),%eax
+        movl  %eax,(%esp)
+        loop  1b
+
+        # CS = CS|1 to simulate RING1 stack frame.
+        orl   $1,32(%esp)
+
+        popl  %esi
+        popl  %gs
+        popl  %ds
+        popl  %ecx
+        popl  %eax
+        ret
+.section __ex_table,"a"
+        .long TRP1,domain_crash_synchronous
+        .long TRP2,domain_crash_synchronous
+.previous
+
+domain_crash_synchronous_string:
+        .asciz "domain_crash_sync called from supervisor_mode_kernel.S (%lx)\n"
+
+domain_crash_synchronous:
+        pushl $domain_crash_synchronous_string
+        call  printf
+        jmp   __domain_crash_synchronous
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/uaccess.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/xen/include/asm-ia64/uaccess.h    Wed Mar  1 19:47:25 2006
@@ -0,0 +1,285 @@
+#ifndef _ASM_IA64_UACCESS_H
+#define _ASM_IA64_UACCESS_H
+
+/*
+ * This file defines various macros to transfer memory areas across
+ * the user/kernel boundary.  This needs to be done carefully because
+ * this code is executed in kernel mode and uses user-specified
+ * addresses.  Thus, we need to be careful not to let the user
+ * trick us into accessing kernel memory that would normally be
+ * inaccessible.  This code is also fairly performance sensitive,
+ * so we want to spend as little time doing safety checks as
+ * possible.
+ *
+ * To make matters a bit more interesting, these macros are sometimes
+ * also called from within the kernel itself, in which case the address
+ * validity check must be skipped.  The get_fs() macro tells us what
+ * to do: if get_fs()==USER_DS, checking is performed, if
+ * get_fs()==KERNEL_DS, checking is bypassed.
+ *
+ * Note that even if the memory area specified by the user is in a
+ * valid address range, it is still possible that we'll get a page
+ * fault while accessing it.  This is handled by filling out an
+ * exception handler fixup entry for each instruction that has the
+ * potential to fault.  When such a fault occurs, the page fault
+ * handler checks to see whether the faulting instruction has a fixup
+ * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
+ * then resumes execution at the continuation point.
+ *
+ * Based on <asm-alpha/uaccess.h>.
+ *
+ * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
+ *     David Mosberger-Tang <davidm@xxxxxxxxxx>
+ */
+
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h>
+
+#include <asm/intrinsics.h>
+#include <asm/pgtable.h>
+#include <asm/io.h>
+
+#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)
+#define __access_ok(addr) (!IS_VMM_ADDRESS((unsigned long)(addr)))
+#define access_ok(addr, size) (__access_ok(addr))
+#define array_access_ok(addr,count,size)( __access_ok(addr))
+
+/*
+ * These are the main single-value transfer routines.  They automatically
+ * use the right size if we just have the right pointer type.
+ *
+ * Careful to not
+ * (a) re-use the arguments for side effects (sizeof/typeof is ok)
+ * (b) require any knowledge of processes at this stage
+ */
+#define put_user(x, ptr)       __put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
+#define get_user(x, ptr)       __get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
+
+/*
+ * The "__xxx" versions do not do address space checking, useful when
+ * doing multiple accesses to the same area (the programmer has to do the
+ * checks by hand with "access_ok()")
+ */
+#define __put_user(x, ptr)     __put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
+#define __get_user(x, ptr)     __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
+
+extern long __put_user_unaligned_unknown (void);
+
+#define __put_user_unaligned(x, ptr)                                                            \
+({                                                                                              \
+       long __ret;                                                                              \
+       switch (sizeof(*(ptr))) {                                                                \
+               case 1: __ret = __put_user((x), (ptr)); break;                                   \
+               case 2: __ret = (__put_user((x), (u8 __user *)(ptr)))                            \
+                       | (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;               \
+               case 4: __ret = (__put_user((x), (u16 __user *)(ptr)))                           \
+                       | (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;             \
+               case 8: __ret = (__put_user((x), (u32 __user *)(ptr)))                           \
+                       | (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;             \
+               default: __ret = __put_user_unaligned_unknown();                                 \
+       }                                                                                        \
+       __ret;                                                                                   \
+})
+
+extern long __get_user_unaligned_unknown (void);
+
+#define __get_user_unaligned(x, ptr)                                                            \
+({                                                                                              \
+       long __ret;                                                                              \
+       switch (sizeof(*(ptr))) {                                                                \
+               case 1: __ret = __get_user((x), (ptr)); break;                                   \
+               case 2: __ret = (__get_user((x), (u8 __user *)(ptr)))                            \
+                       | (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;               \
+               case 4: __ret = (__get_user((x), (u16 __user *)(ptr)))                           \
+                       | (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;             \
+               case 8: __ret = (__get_user((x), (u32 __user *)(ptr)))                           \
+                       | (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;             \
+               default: __ret = __get_user_unaligned_unknown();                                 \
+       }                                                                                        \
+       __ret;                                                                                   \
+})
+
+#ifdef ASM_SUPPORTED
+  struct __large_struct { unsigned long buf[100]; };
+# define __m(x) (*(struct __large_struct __user *)(x))
+
+/* We need to declare the __ex_table section before we can use it in .xdata.  */
+asm (".section \"__ex_table\", \"a\"\n\t.previous");
+
+# define __get_user_size(val, addr, n, err)                                                    \
+do {                                                                                           \
+       register long __gu_r8 asm ("r8") = 0;                                                   \
+       register long __gu_r9 asm ("r9");                                                       \
+       asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n"     \
+            "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n"                                         \
+            "[1:]"                                                                             \
+            : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8));                    \
+       (err) = __gu_r8;                                                                        \
+       (val) = __gu_r9;                                                                        \
+} while (0)
+
+/*
+ * The "__put_user_size()" macro tells gcc it reads from memory instead of 
writing it.  This
+ * is because they do not write to any memory gcc knows about, so there are no 
aliasing
+ * issues.
+ */
+# define __put_user_size(val, addr, n, err)                                                    \
+do {                                                                                           \
+       register long __pu_r8 asm ("r8") = 0;                                                   \
+       asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n" \
+                     "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"                                  \
+                     "[1:]"                                                                    \
+                     : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8));               \
+       (err) = __pu_r8;                                                                        \
+} while (0)
+
+#else /* !ASM_SUPPORTED */
+# define RELOC_TYPE    2       /* ip-rel */
+# define __get_user_size(val, addr, n, err)                            \
+do {                                                                   \
+       __ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE);   \
+       (err) = ia64_getreg(_IA64_REG_R8);                              \
+       (val) = ia64_getreg(_IA64_REG_R9);                              \
+} while (0)
+# define __put_user_size(val, addr, n, err)                                                    \
+do {                                                                                           \
+       __st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val));    \
+       (err) = ia64_getreg(_IA64_REG_R8);                                                      \
+} while (0)
+#endif /* !ASM_SUPPORTED */
+
+extern void __get_user_unknown (void);
+
+/*
+ * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
+ * could clobber r8 and r9 (among others).  Thus, be careful not to evaluate it while
+ * using r8/r9.
+ */
+#define __do_get_user(check, x, ptr, size, segment)                                    \
+({                                                                                     \
+       const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);                              \
+       __typeof__ (size) __gu_size = (size);                                           \
+       long __gu_err = -EFAULT, __gu_val = 0;                                          \
+                                                                                       \
+       if (!check || __access_ok(__gu_ptr))                                            \
+               switch (__gu_size) {                                                    \
+                     case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break;  \
+                     case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break;  \
+                     case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break;  \
+                     case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break;  \
+                     default: __get_user_unknown(); break;                             \
+               }                                                                       \
+       (x) = (__typeof__(*(__gu_ptr))) __gu_val;                                       \
+       __gu_err;                                                                       \
+})
+
+#define __get_user_nocheck(x, ptr, size)        __do_get_user(0, x, ptr, size, KERNEL_DS)
+#define __get_user_check(x, ptr, size, segment) __do_get_user(1, x, ptr, size, segment)
+
+extern void __put_user_unknown (void);
+
+/*
+ * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
+ * could clobber r8 (among others).  Thus, be careful not to evaluate them while using r8.
+ */
+#define __do_put_user(check, x, ptr, size, segment)                                    \
+({                                                                                     \
+       __typeof__ (x) __pu_x = (x);                                                    \
+       __typeof__ (*(ptr)) __user *__pu_ptr = (ptr);                                   \
+       __typeof__ (size) __pu_size = (size);                                           \
+       long __pu_err = -EFAULT;                                                        \
+                                                                                       \
+       if (!check || __access_ok(__pu_ptr))                                            \
+               switch (__pu_size) {                                                    \
+                     case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break;    \
+                     case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break;    \
+                     case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break;    \
+                     case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break;    \
+                     default: __put_user_unknown(); break;                             \
+               }                                                                       \
+       __pu_err;                                                                       \
+})
+
+#define __put_user_nocheck(x, ptr, size)        __do_put_user(0, x, ptr, size, KERNEL_DS)
+#define __put_user_check(x, ptr, size, segment) __do_put_user(1, x, ptr, size, segment)
+
+/*
+ * Complex access routines
+ */
+extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
+                                              unsigned long count);
+
+static inline unsigned long
+__copy_to_user (void __user *to, const void *from, unsigned long count)
+{
+       return __copy_user(to, (void __user *) from, count);
+}
+
+static inline unsigned long
+__copy_from_user (void *to, const void __user *from, unsigned long count)
+{
+       return __copy_user((void __user *) to, from, count);
+}
+
+#define __copy_to_user_inatomic                __copy_to_user
+#define __copy_from_user_inatomic      __copy_from_user
+#define copy_to_user(to, from, n)                                                      \
+({                                                                                     \
+       void __user *__cu_to = (to);                                                    \
+       const void *__cu_from = (from);                                                 \
+       long __cu_len = (n);                                                            \
+                                                                                       \
+       if (__access_ok(__cu_to))                                                       \
+               __cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len);   \
+       __cu_len;                                                                       \
+})
+
+#define copy_from_user(to, from, n)                                                    \
+({                                                                                     \
+       void *__cu_to = (to);                                                           \
+       const void __user *__cu_from = (from);                                          \
+       long __cu_len = (n);                                                            \
+                                                                                       \
+       __chk_user_ptr(__cu_from);                                                      \
+       if (__access_ok(__cu_from))                                                     \
+               __cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len);   \
+       __cu_len;                                                                       \
+})
+
+#define __copy_in_user(to, from, size) __copy_user((to), (from), (size))
+
+static inline unsigned long
+copy_in_user (void __user *to, const void __user *from, unsigned long n)
+{
+       if (likely(access_ok(from, n) && access_ok(to, n)))
+               n = __copy_user(to, from, n);
+       return n;
+}
+
+#define ARCH_HAS_SORT_EXTABLE
+#define ARCH_HAS_SEARCH_EXTABLE
+
+struct exception_table_entry {
+       int addr;       /* location-relative address of insn this fixup is for */
+       int cont;       /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
+};
+
+extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
+extern const struct exception_table_entry *search_exception_tables (unsigned long addr);
+
+static inline int
+ia64_done_with_exception (struct pt_regs *regs)
+{
+       const struct exception_table_entry *e;
+       e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
+       if (e) {
+               ia64_handle_exception(regs, e);
+               return 1;
+       }
+       return 0;
+}
+
+#endif /* _ASM_IA64_UACCESS_H */
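[A usage sketch, not part of the changeset: a hypothetical caller of the accessors defined above. get_user()/put_user() evaluate to 0 on success or -EFAULT, via the r8/r9 fixup convention described in the header comment; the function name and 'uaddr' are illustrative only.]

    static int example_read_guest_long(unsigned long __user *uaddr,
                                       unsigned long *out)
    {
            unsigned long val;

            if (get_user(val, uaddr))       /* 0 on success, -EFAULT on fault */
                    return -EFAULT;
            *out = val;
            return 0;
    }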
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/features.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/xen/include/public/features.h     Wed Mar  1 19:47:25 2006
@@ -0,0 +1,53 @@
+/******************************************************************************
+ * features.h
+ * 
+ * Feature flags, reported by XENVER_get_features.
+ * 
+ * Copyright (c) 2006, Keir Fraser <keir@xxxxxxxxxxxxx>
+ */
+
+#ifndef __XEN_PUBLIC_FEATURES_H__
+#define __XEN_PUBLIC_FEATURES_H__
+
+/*
+ * If set, the guest does not need to write-protect its pagetables, and can
+ * update them via direct writes.
+ */
+#define XENFEAT_writable_page_tables       0
+
+/*
+ * If set, the guest does not need to write-protect its segment descriptor
+ * tables, and can update them via direct writes.
+ */
+#define XENFEAT_writable_descriptor_tables 1
+
+/*
+ * If set, translation between the guest's 'pseudo-physical' address space
+ * and the host's machine address space is handled by the hypervisor. In this
+ * mode the guest does not need to perform phys-to/from-machine translations
+ * when performing page table operations.
+ */
+#define XENFEAT_auto_translated_physmap    2
+
+/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
+#define XENFEAT_supervisor_mode_kernel     3
+
+/*
+ * If set, the guest does not need to allocate x86 PAE page directories
+ * below 4GB. This flag is usually implied by auto_translated_physmap.
+ */
+#define XENFEAT_pae_pgdir_above_4gb        4
+
+#define XENFEAT_NR_SUBMAPS 1
+
+#endif /* __XEN_PUBLIC_FEATURES_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
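[A sketch, not part of the changeset, of how a guest might test one of these flags. XENVER_get_features returns one 32-bit submap per call; the xen_feature_info structure and the HYPERVISOR_xen_version() hypercall wrapper are assumed from public/version.h and the guest's hypercall glue, and names may differ per guest port.]

    /* Sketch only: assumes xen_feature_info and XENVER_get_features from
     * public/version.h, plus a HYPERVISOR_xen_version() wrapper. */
    static int example_xen_feature(unsigned int feature)
    {
            struct xen_feature_info fi;

            fi.submap_idx = feature / 32;       /* which 32-bit submap */
            if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
                    return 0;                   /* treat errors as "not supported" */
            return (fi.submap >> (feature % 32)) & 1;
    }

[e.g. example_xen_feature(XENFEAT_writable_page_tables).]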
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/guest_access.h
--- /dev/null   Wed Mar  1 17:01:54 2006
+++ b/xen/include/xen/guest_access.h    Wed Mar  1 19:47:25 2006
@@ -0,0 +1,71 @@
+/******************************************************************************
+ * guest_access.h
+ * 
+ * Copyright (c) 2006, K A Fraser
+ */
+
+#ifndef __XEN_GUEST_ACCESS_H__
+#define __XEN_GUEST_ACCESS_H__
+
+#include <asm/uaccess.h>
+
+/* Is the guest handle a NULL reference? */
+#define guest_handle_is_null(hnd)        ((hnd).p == NULL)
+
+/* Offset the given guest handle into the array it refers to. */
+#define guest_handle_add_offset(hnd, nr) ((hnd).p += (nr))
+
+/* Cast a guest handle to the specified type of handle. */
+#define guest_handle_cast(hnd, type) ({         \
+    type *_x = (hnd).p;                         \
+    (GUEST_HANDLE(type)) { _x };                \
+})
+
+/*
+ * Copy an array of objects to guest context via a guest handle.
+ * Optionally specify an offset into the guest array.
+ */
+#define copy_to_guest_offset(hnd, off, ptr, nr) ({      \
+    const typeof(ptr) _x = (hnd).p;                     \
+    const typeof(ptr) _y = (ptr);                       \
+    copy_to_user(_x+(off), _y, sizeof(*_x)*(nr));       \
+})
+#define copy_to_guest(hnd, ptr, nr)                     \
+    copy_to_guest_offset(hnd, 0, ptr, nr)
+
+/*
+ * Copy an array of objects from guest context via a guest handle.
+ * Optionally specify an offset into the guest array.
+ */
+#define copy_from_guest_offset(ptr, hnd, off, nr) ({    \
+    const typeof(ptr) _x = (hnd).p;                     \
+    const typeof(ptr) _y = (ptr);                       \
+    copy_from_user(_y, _x+(off), sizeof(*_x)*(nr));     \
+})
+#define copy_from_guest(ptr, hnd, nr)                   \
+    copy_from_guest_offset(ptr, hnd, 0, nr)
+
+/*
+ * Pre-validate a guest handle.
+ * Allows use of faster __copy_* functions.
+ */
+#define guest_handle_okay(hnd, nr)                      \
+    array_access_ok((hnd).p, (nr), sizeof(*(hnd).p))
+
+#define __copy_to_guest_offset(hnd, off, ptr, nr) ({    \
+    const typeof(ptr) _x = (hnd).p;                     \
+    const typeof(ptr) _y = (ptr);                       \
+    __copy_to_user(_x+(off), _y, sizeof(*_x)*(nr));     \
+})
+#define __copy_to_guest(hnd, ptr, nr)                   \
+    __copy_to_guest_offset(hnd, 0, ptr, nr)
+
+#define __copy_from_guest_offset(ptr, hnd, off, nr) ({  \
+    const typeof(ptr) _x = (hnd).p;                     \
+    const typeof(ptr) _y = (ptr);                       \
+    __copy_from_user(_y, _x+(off), sizeof(*_x)*(nr));   \
+})
+#define __copy_from_guest(ptr, hnd, nr)                 \
+    __copy_from_guest_offset(ptr, hnd, 0, nr)
+
+#endif /* __XEN_GUEST_ACCESS_H__ */
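[A usage sketch, not part of the changeset, of the handle accessors above in a hypercall handler. xen_example_op_t and do_example_op() are hypothetical; GUEST_HANDLE() comes from the per-architecture public headers, and a DEFINE_GUEST_HANDLE(xen_example_op_t) declaration would be needed there first.]

    typedef struct xen_example_op {
        unsigned long cmd;                     /* hypothetical argument block */
        unsigned long result;
    } xen_example_op_t;

    static long do_example_op(GUEST_HANDLE(xen_example_op_t) uop)
    {
        xen_example_op_t op;

        if ( guest_handle_is_null(uop) )
            return -EINVAL;
        if ( copy_from_guest(&op, uop, 1) )    /* returns #bytes not copied */
            return -EFAULT;
        op.result = op.cmd;                    /* ... real work goes here ... */
        if ( copy_to_guest(uop, &op, 1) )      /* write results back to guest */
            return -EFAULT;
        return 0;
    }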
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch
--- a/patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch     Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,45 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c      2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/kernel/traps.c 2006-02-15 20:40:43.000000000 +0000
-@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch
- 
- static void io_check_error(unsigned char reason, struct pt_regs * regs)
- {
--      unsigned long i;
--
-       printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
-       show_registers(regs);
- 
-       /* Re-enable the IOCK line, wait for a few seconds */
--      reason = (reason & 0xf) | 8;
--      outb(reason, 0x61);
--      i = 2000;
--      while (--i) udelay(1000);
--      reason &= ~8;
--      outb(reason, 0x61);
-+      clear_io_check_error(reason);
- }
- 
- static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h    2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/mach-default/mach_traps.h       2006-02-15 20:40:43.000000000 +0000
-@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig
-       outb(reason, 0x61);
- }
- 
-+static inline void clear_io_check_error(unsigned char reason)
-+{
-+      unsigned long i;
-+
-+      reason = (reason & 0xf) | 8;
-+      outb(reason, 0x61);
-+      i = 2000;
-+      while (--i) udelay(1000);
-+      reason &= ~8;
-+      outb(reason, 0x61);
-+}
-+
- static inline unsigned char get_nmi_reason(void)
- {
-       return inb(0x61);
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/net-csum.patch
--- a/patches/linux-2.6.16-rc4/net-csum.patch   Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,41 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c
---- ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c    2006-02-02 17:39:51.000000000 +0000
-+++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c    2006-02-02 17:44:18.000000000 +0000
-@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb,
-       if (hdrsize < sizeof(*hdr))
-               return 1;
- 
--      hdr->check = ip_nat_cheat_check(~oldip, newip,
-+      if ((*pskb)->proto_csum_blank) {
-+              hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
-+      } else {
-+              hdr->check = ip_nat_cheat_check(~oldip, newip,
-                                       ip_nat_cheat_check(oldport ^ 0xFFFF,
-                                                          newport,
-                                                          hdr->check));
-+      }
-       return 1;
- }
-
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c
---- ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c    2006-02-02 17:39:51.000000000 +0000
-+++ ./net/ipv4/netfilter/ip_nat_proto_udp.c    2006-02-02 17:44:18.000000000 +0000
-@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb,
-               newport = tuple->dst.u.udp.port;
-               portptr = &hdr->dest;
-       }
--      if (hdr->check) /* 0 is a special case meaning no checksum */
--              hdr->check = ip_nat_cheat_check(~oldip, newip,
-+      if (hdr->check) { /* 0 is a special case meaning no checksum */
-+              if ((*pskb)->proto_csum_blank) {
-+                      hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check);
-+              } else {
-+                      hdr->check = ip_nat_cheat_check(~oldip, newip,
-                                       ip_nat_cheat_check(*portptr ^ 0xFFFF,
-                                                          newport,
-                                                          hdr->check));
-+              }
-+      }
-       *portptr = newport;
-       return 1;
- }
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/pmd-shared.patch
--- a/patches/linux-2.6.16-rc4/pmd-shared.patch Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,111 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c
---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c  2006-02-02 17:39:29.000000000 +0000
-+++ ./arch/i386/mm/pageattr.c  2006-02-02 17:45:14.000000000 +0000
-@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns
-       unsigned long flags;
- 
-       set_pte_atomic(kpte, pte);      /* change init_mm */
--      if (PTRS_PER_PMD > 1)
-+      if (HAVE_SHARED_KERNEL_PMD)
-               return;
- 
-       spin_lock_irqsave(&pgd_lock, flags);
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c
---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c   2006-01-03 03:21:10.000000000 +0000
-+++ ./arch/i386/mm/pgtable.c   2006-02-02 17:45:14.000000000 +0000
-@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
-               spin_lock_irqsave(&pgd_lock, flags);
-       }
- 
--      clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
--                      swapper_pg_dir + USER_PTRS_PER_PGD,
--                      KERNEL_PGD_PTRS);
-+      if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD)
-+              clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
-+                              swapper_pg_dir + USER_PTRS_PER_PGD,
-+                              KERNEL_PGD_PTRS);
-       if (PTRS_PER_PMD > 1)
-               return;
- 
-@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
-                       goto out_oom;
-               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
-       }
-+
-+      if (!HAVE_SHARED_KERNEL_PMD) {
-+              unsigned long flags;
-+
-+              for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-+                      pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-+                      if (!pmd)
-+                              goto out_oom;
-+                      set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
-+              }
-+
-+              spin_lock_irqsave(&pgd_lock, flags);
-+              for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-+                      unsigned long v = (unsigned long)i << PGDIR_SHIFT;
-+                      pgd_t *kpgd = pgd_offset_k(v);
-+                      pud_t *kpud = pud_offset(kpgd, v);
-+                      pmd_t *kpmd = pmd_offset(kpud, v);
-+                      pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-+                      memcpy(pmd, kpmd, PAGE_SIZE);
-+              }
-+              pgd_list_add(pgd);
-+              spin_unlock_irqrestore(&pgd_lock, flags);
-+      }
-+
-       return pgd;
- 
- out_oom:
-@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd)
-       int i;
- 
-       /* in the PAE case user pgd entries are overwritten before usage */
--      if (PTRS_PER_PMD > 1)
--              for (i = 0; i < USER_PTRS_PER_PGD; ++i)
--                      kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-+      if (PTRS_PER_PMD > 1) {
-+              for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-+                      pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-+                      kmem_cache_free(pmd_cache, pmd);
-+              }
-+              if (!HAVE_SHARED_KERNEL_PMD) {
-+                      unsigned long flags;
-+                      spin_lock_irqsave(&pgd_lock, flags);
-+                      pgd_list_del(pgd);
-+                      spin_unlock_irqrestore(&pgd_lock, flags);
-+                      for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-+                              pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-+                              memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-+                              kmem_cache_free(pmd_cache, pmd);
-+                      }
-+              }
-+      }
-       /* in the non-PAE case, free_pgtables() clears user pgd entries */
-       kmem_cache_free(pgd_cache, pgd);
- }
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h
---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h   2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/pgtable-2level-defs.h   2006-02-02 17:45:14.000000000 +0000
-@@ -1,6 +1,8 @@
- #ifndef _I386_PGTABLE_2LEVEL_DEFS_H
- #define _I386_PGTABLE_2LEVEL_DEFS_H
- 
-+#define HAVE_SHARED_KERNEL_PMD 0
-+
- /*
-  * traditional i386 two-level paging structure:
-  */
-diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h
---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h   2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/pgtable-3level-defs.h   2006-02-02 17:45:14.000000000 +0000
-@@ -1,6 +1,8 @@
- #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
- #define _I386_PGTABLE_3LEVEL_DEFS_H
- 
-+#define HAVE_SHARED_KERNEL_PMD 1
-+
- /*
-  * PGDIR_SHIFT determines what a top-level page table entry can map
-  */
diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/smp-alts.patch
--- a/patches/linux-2.6.16-rc4/smp-alts.patch   Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,591 +0,0 @@
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig ./arch/i386/Kconfig
---- ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig     2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/Kconfig        2006-02-15 20:45:57.000000000 +0000
-@@ -202,6 +202,19 @@ config SMP
- 
-         If you don't know what to do here, say N.
- 
-+config SMP_ALTERNATIVES
-+      bool "SMP alternatives support (EXPERIMENTAL)"
-+      depends on SMP && EXPERIMENTAL
-+      help
-+        Try to reduce the overhead of running an SMP kernel on a uniprocessor
-+        host slightly by replacing certain key instruction sequences
-+        according to whether we currently have more than one CPU available.
-+        This should provide a noticeable boost to performance when
-+        running SMP kernels on UP machines, and have negligible impact
-+        when running on an true SMP host.
-+
-+          If unsure, say N.
-+        
- config NR_CPUS
-       int "Maximum number of CPUs (2-255)"
-       range 2 255
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile     2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/kernel/Makefile        2006-02-15 20:45:57.000000000 +0000
-@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI)            += efi.o efi_stub.o
- obj-$(CONFIG_DOUBLEFAULT)     += doublefault.o
- obj-$(CONFIG_VM86)            += vm86.o
- obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
-+obj-$(CONFIG_SMP_ALTERNATIVES)  += smpalts.o
- 
- EXTRA_AFLAGS   := -traditional
- 
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c    1970-01-01 01:00:00.000000000 +0100
-+++ ./arch/i386/kernel/smpalts.c       2006-02-15 20:45:57.000000000 +0000
-@@ -0,0 +1,85 @@
-+#include <linux/kernel.h>
-+#include <asm/system.h>
-+#include <asm/smp_alt.h>
-+#include <asm/processor.h>
-+#include <asm/string.h>
-+
-+struct smp_replacement_record {
-+      unsigned char targ_size;
-+      unsigned char smp1_size;
-+      unsigned char smp2_size;
-+      unsigned char up_size;
-+      unsigned char feature;
-+      unsigned char data[0];
-+};
-+
-+struct smp_alternative_record {
-+      void *targ_start;
-+      struct smp_replacement_record *repl;
-+};
-+
-+extern struct smp_alternative_record __start_smp_alternatives_table,
-+  __stop_smp_alternatives_table;
-+extern unsigned long __init_begin, __init_end;
-+
-+void prepare_for_smp(void)
-+{
-+      struct smp_alternative_record *r;
-+      printk(KERN_INFO "Enabling SMP...\n");
-+      for (r = &__start_smp_alternatives_table;
-+           r != &__stop_smp_alternatives_table;
-+           r++) {
-+              BUG_ON(r->repl->targ_size < r->repl->smp1_size);
-+              BUG_ON(r->repl->targ_size < r->repl->smp2_size);
-+              BUG_ON(r->repl->targ_size < r->repl->up_size);
-+               if (system_state == SYSTEM_RUNNING &&
-+                   r->targ_start >= (void *)&__init_begin &&
-+                   r->targ_start < (void *)&__init_end)
-+                       continue;
-+              if (r->repl->feature != (unsigned char)-1 &&
-+                  boot_cpu_has(r->repl->feature)) {
-+                      memcpy(r->targ_start,
-+                             r->repl->data + r->repl->smp1_size,
-+                             r->repl->smp2_size);
-+                      memset(r->targ_start + r->repl->smp2_size,
-+                             0x90,
-+                             r->repl->targ_size - r->repl->smp2_size);
-+              } else {
-+                      memcpy(r->targ_start,
-+                             r->repl->data,
-+                             r->repl->smp1_size);
-+                      memset(r->targ_start + r->repl->smp1_size,
-+                             0x90,
-+                             r->repl->targ_size - r->repl->smp1_size);
-+              }
-+      }
-+      /* Paranoia */
-+      asm volatile ("jmp 1f\n1:");
-+      mb();
-+}
-+
-+void unprepare_for_smp(void)
-+{
-+      struct smp_alternative_record *r;
-+      printk(KERN_INFO "Disabling SMP...\n");
-+      for (r = &__start_smp_alternatives_table;
-+           r != &__stop_smp_alternatives_table;
-+           r++) {
-+              BUG_ON(r->repl->targ_size < r->repl->smp1_size);
-+              BUG_ON(r->repl->targ_size < r->repl->smp2_size);
-+              BUG_ON(r->repl->targ_size < r->repl->up_size);
-+               if (system_state == SYSTEM_RUNNING &&
-+                   r->targ_start >= (void *)&__init_begin &&
-+                   r->targ_start < (void *)&__init_end)
-+                       continue;
-+              memcpy(r->targ_start,
-+                     r->repl->data + r->repl->smp1_size + r->repl->smp2_size,
-+                     r->repl->up_size);
-+              memset(r->targ_start + r->repl->up_size,
-+                     0x90,
-+                     r->repl->targ_size - r->repl->up_size);
-+      }
-+      /* Paranoia */
-+      asm volatile ("jmp 1f\n1:");
-+      mb();
-+}
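
The two functions above are the whole runtime patching scheme: pick the recorded variant (SMP or UP), copy it over the target site, and pad the remainder with 0x90 (the single-byte x86 NOP) so the site stays fully decodable. A minimal user-space sketch of that copy-and-pad step follows; the names are invented for illustration, and a buffer stands in for the live kernel text that the real code rewrites:

    #include <stdio.h>
    #include <string.h>

    /* Copy one replacement over a patch site and NOP-fill the rest,
     * mirroring the memcpy/memset pairs in prepare_for_smp() above. */
    static void patch_site(unsigned char *targ, size_t targ_size,
                           const unsigned char *repl, size_t repl_size)
    {
        if (repl_size > targ_size)  /* the kernel enforces this with BUG_ON */
            return;
        memcpy(targ, repl, repl_size);
        memset(targ + repl_size, 0x90, targ_size - repl_size);
    }

    int main(void)
    {
        unsigned char site[4] = { 0x90, 0x90, 0x90, 0x90 }; /* 4-byte site */
        const unsigned char lock_nop[2] = { 0xf0, 0x90 };   /* "lock; nop" */
        size_t i;

        patch_site(site, sizeof(site), lock_nop, sizeof(lock_nop));
        for (i = 0; i < sizeof(site); i++)
            printf("%02x ", site[i]);
        printf("\n");  /* prints: f0 90 90 90 */
        return 0;
    }

The trailing "jmp 1f\n1:" plus mb() in the kernel functions then flushes any stale prefetched bytes after the rewrite, hence the /* Paranoia */ comment.
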
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c    2006-02-15 20:38:51.000000000 +0000
-+++ ./arch/i386/kernel/smpboot.c       2006-02-15 20:45:57.000000000 +0000
-@@ -1214,6 +1214,11 @@ static void __init smp_boot_cpus(unsigne
-               if (max_cpus <= cpucount+1)
-                       continue;
- 
-+#ifdef CONFIG_SMP_ALTERNATIVES
-+              if (kicked == 1)
-+                      prepare_for_smp();
-+#endif
-+
-               if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
-                       printk("CPU #%d not responding - cannot use it.\n",
-                                                               apicid);
-@@ -1392,6 +1397,11 @@ int __devinit __cpu_up(unsigned int cpu)
-               return -EIO;
-       }
- 
-+#ifdef CONFIG_SMP_ALTERNATIVES
-+      if (num_online_cpus() == 1)
-+              prepare_for_smp();
-+#endif
-+
-       local_irq_enable();
-       per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-       /* Unleash the CPU! */
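
Both hooks above arrange for the rewrite to happen exactly once, just before the second CPU is started: smp_boot_cpus() patches when the first secondary is kicked (kicked == 1), and __cpu_up() while num_online_cpus() is still 1. A toy model of that gating, with invented names:

    #include <stdio.h>

    static int online_cpus = 1;   /* the boot CPU */
    static int smp_mode;

    static void prepare_for_smp(void)
    {
        smp_mode = 1;
        puts("text patched for SMP");
    }

    /* Mirrors the __cpu_up() hook: patch only before the second CPU. */
    static void cpu_up(void)
    {
        if (online_cpus == 1)
            prepare_for_smp();
        online_cpus++;
    }

    int main(void)
    {
        cpu_up();  /* CPU #1 comes online: triggers the one-time rewrite */
        cpu_up();  /* CPU #2: already in SMP mode, nothing to do */
        printf("cpus=%d smp=%d\n", online_cpus, smp_mode);
        return 0;
    }
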
-diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S
---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S        2006-01-03 03:21:10.000000000 +0000
-+++ ./arch/i386/kernel/vmlinux.lds.S   2006-02-15 20:45:57.000000000 +0000
-@@ -34,6 +34,13 @@ SECTIONS
-   __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
-   __stop___ex_table = .;
- 
-+  . = ALIGN(16);
-+  __start_smp_alternatives_table = .;
-+  __smp_alternatives : { *(__smp_alternatives) }
-+  __stop_smp_alternatives_table = .;
-+
-+  __smp_replacements : { *(__smp_replacements) }
-+
-   RODATA
- 
-   /* writeable */
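
The linker-script fragment above brackets the new __smp_alternatives section with start/stop symbols that smpalts.c declares extern and walks as an array. The same bracketing pattern can be tried in user space, where GNU ld provides __start_SECNAME/__stop_SECNAME automatically for any section whose name is a valid C identifier (a sketch; ELF target and a GNU-compatible toolchain assumed, all names invented):

    #include <stdio.h>

    struct rec { int value; };

    /* Drop one record into a named section per use. */
    #define REGISTER(v) \
        static const struct rec rec_##v \
        __attribute__((used, section("demo_recs"))) = { v }

    REGISTER(1);
    REGISTER(2);
    REGISTER(3);

    extern const struct rec __start_demo_recs[], __stop_demo_recs[];

    int main(void)
    {
        const struct rec *r;
        /* Same iteration shape as prepare_for_smp()'s table walk. */
        for (r = __start_demo_recs; r != __stop_demo_recs; r++)
            printf("rec %d\n", r->value);
        return 0;
    }
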
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h ./include/asm-i386/atomic.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h     2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/atomic.h        2006-02-15 20:45:57.000000000 +0000
-@@ -4,18 +4,13 @@
- #include <linux/config.h>
- #include <linux/compiler.h>
- #include <asm/processor.h>
-+#include <asm/smp_alt.h>
- 
- /*
-  * Atomic operations that C can't guarantee us.  Useful for
-  * resource counting etc..
-  */
- 
--#ifdef CONFIG_SMP
--#define LOCK "lock ; "
--#else
--#define LOCK ""
--#endif
--
- /*
-  * Make sure gcc doesn't try to be clever and move things around
-  * on us. We need to use _exactly_ the address the user gave us,
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h ./include/asm-i386/bitops.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h     2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/bitops.h        2006-02-15 20:45:57.000000000 +0000
-@@ -7,6 +7,7 @@
- 
- #include <linux/config.h>
- #include <linux/compiler.h>
-+#include <asm/smp_alt.h>
- 
- /*
-  * These have to be done with inline assembly: that way the bit-setting
-@@ -16,12 +17,6 @@
-  * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
-  */
- 
--#ifdef CONFIG_SMP
--#define LOCK_PREFIX "lock ; "
--#else
--#define LOCK_PREFIX ""
--#endif
--
- #define ADDR (*(volatile long *) addr)
- 
- /**
-@@ -41,7 +36,7 @@
-  */
- static inline void set_bit(int nr, volatile unsigned long * addr)
- {
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btsl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol
-  */
- static inline void clear_bit(int nr, volatile unsigned long * addr)
- {
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btrl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-@@ -121,7 +116,7 @@ static inline void __change_bit(int nr, 
-  */
- static inline void change_bit(int nr, volatile unsigned long * addr)
- {
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btcl %1,%0"
-               :"+m" (ADDR)
-               :"Ir" (nr));
-@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n
- {
-       int oldbit;
- 
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btsl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int
- {
-       int oldbit;
- 
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btrl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in
- {
-       int oldbit;
- 
--      __asm__ __volatile__( LOCK_PREFIX
-+      __asm__ __volatile__( LOCK
-               "btcl %2,%1\n\tsbbl %0,%0"
-               :"=r" (oldbit),"+m" (ADDR)
-               :"Ir" (nr) : "memory");
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h ./include/asm-i386/futex.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h      2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/futex.h 2006-02-15 20:45:57.000000000 +0000
-@@ -28,7 +28,7 @@
- "1:   movl    %2, %0\n\
-       movl    %0, %3\n"                                       \
-       insn "\n"                                               \
--"2:   " LOCK_PREFIX "cmpxchgl %3, %2\n\
-+"2:   " LOCK "cmpxchgl %3, %2\n\
-       jnz     1b\n\
- 3:    .section .fixup,\"ax\"\n\
- 4:    mov     %5, %1\n\
-@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op, 
- #endif
-               switch (op) {
-               case FUTEX_OP_ADD:
--                      __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret,
-+                      __futex_atomic_op1(LOCK "xaddl %0, %2", ret,
-                                          oldval, uaddr, oparg);
-                       break;
-               case FUTEX_OP_OR:
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h      2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/rwsem.h 2006-02-15 20:45:57.000000000 +0000
-@@ -40,6 +40,7 @@
- 
- #include <linux/list.h>
- #include <linux/spinlock.h>
-+#include <asm/smp_alt.h>
- 
- struct rwsem_waiter;
- 
-@@ -99,7 +100,7 @@ static inline void __down_read(struct rw
- {
-       __asm__ __volatile__(
-               "# beginning down_read\n\t"
--LOCK_PREFIX   "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
-+LOCK          "  incl      (%%eax)\n\t" /* adds 0x00000001, returns the old value */
-               "  js        2f\n\t" /* jump if we weren't granted the lock */
-               "1:\n\t"
-               LOCK_SECTION_START("")
-@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st
-               "  movl      %1,%2\n\t"
-               "  addl      %3,%2\n\t"
-               "  jle       2f\n\t"
--LOCK_PREFIX   "  cmpxchgl  %2,%0\n\t"
-+LOCK          "  cmpxchgl  %2,%0\n\t"
-               "  jnz       1b\n\t"
-               "2:\n\t"
-               "# ending __down_read_trylock\n\t"
-@@ -150,7 +151,7 @@ static inline void __down_write(struct r
-       tmp = RWSEM_ACTIVE_WRITE_BIAS;
-       __asm__ __volatile__(
-               "# beginning down_write\n\t"
--LOCK_PREFIX   "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
-+LOCK          "  xadd      %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
-               "  testl     %%edx,%%edx\n\t" /* was the count 0 before? */
-               "  jnz       2f\n\t" /* jump if we weren't granted the lock */
-               "1:\n\t"
-@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
-       __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
-       __asm__ __volatile__(
-               "# beginning __up_read\n\t"
--LOCK_PREFIX   "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
-+LOCK          "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
-               "  js        2f\n\t" /* jump if the lock is being waited upon */
-               "1:\n\t"
-               LOCK_SECTION_START("")
-@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
-       __asm__ __volatile__(
-               "# beginning __up_write\n\t"
-               "  movl      %2,%%edx\n\t"
--LOCK_PREFIX   "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
-+LOCK          "  xaddl     %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
-               "  jnz       2f\n\t" /* jump if the lock is being waited upon */
-               "1:\n\t"
-               LOCK_SECTION_START("")
-@@ -239,7 +240,7 @@ static inline void __downgrade_write(str
- {
-       __asm__ __volatile__(
-               "# beginning __downgrade_write\n\t"
--LOCK_PREFIX   "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
-+LOCK          "  addl      %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
-               "  js        2f\n\t" /* jump if the lock is being waited upon */
-               "1:\n\t"
-               LOCK_SECTION_START("")
-@@ -263,7 +264,7 @@ LOCK_PREFIX        "  addl      %2,(%%eax)\n\t"
- static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
- {
-       __asm__ __volatile__(
--LOCK_PREFIX   "addl %1,%0"
-+LOCK            "addl %1,%0"
-               : "=m"(sem->count)
-               : "ir"(delta), "m"(sem->count));
- }
-@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
-       int tmp = delta;
- 
-       __asm__ __volatile__(
--LOCK_PREFIX   "xadd %0,(%2)"
-+LOCK                    "xadd %0,(%2)"
-               : "+r"(tmp), "=m"(sem->count)
-               : "r"(sem), "m"(sem->count)
-               : "memory");
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h    1970-01-01 01:00:00.000000000 +0100
-+++ ./include/asm-i386/smp_alt.h       2006-02-15 20:45:57.000000000 +0000
-@@ -0,0 +1,32 @@
-+#ifndef __ASM_SMP_ALT_H__
-+#define __ASM_SMP_ALT_H__
-+
-+#include <linux/config.h>
-+
-+#ifdef CONFIG_SMP
-+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
-+#define LOCK \
-+        "6677: nop\n" \
-+      ".section __smp_alternatives,\"a\"\n" \
-+      ".long 6677b\n" \
-+      ".long 6678f\n" \
-+      ".previous\n" \
-+      ".section __smp_replacements,\"a\"\n" \
-+      "6678: .byte 1\n" \
-+      ".byte 1\n" \
-+      ".byte 0\n" \
-+        ".byte 1\n" \
-+      ".byte -1\n" \
-+      "lock\n" \
-+      "nop\n" \
-+      ".previous\n"
-+void prepare_for_smp(void);
-+void unprepare_for_smp(void);
-+#else
-+#define LOCK "lock ; "
-+#endif
-+#else
-+#define LOCK ""
-+#endif
-+
-+#endif /* __ASM_SMP_ALT_H__ */
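
The five .byte directives in the LOCK macro above lay down exactly one struct smp_replacement_record as defined in smpalts.c earlier in this patch: targ_size, smp1_size, smp2_size, up_size, feature, followed by the variant bytes themselves. A small sketch of that correspondence, decoding a hand-assembled record (byte values taken from the macro: a one-byte target, "lock" (0xf0) as the SMP variant, no second variant, "nop" (0x90) as the UP variant, -1 for "no feature test"):

    #include <stdio.h>

    struct smp_replacement_record {
        unsigned char targ_size;  /* .byte 1  : bytes at the patch site     */
        unsigned char smp1_size;  /* .byte 1  : first SMP variant ("lock")  */
        unsigned char smp2_size;  /* .byte 0  : second SMP variant (unused) */
        unsigned char up_size;    /* .byte 1  : UP variant ("nop")          */
        unsigned char feature;    /* .byte -1 : no CPU-feature test         */
        unsigned char data[];     /* variants: smp1, then smp2, then up     */
    };

    int main(void)
    {
        /* The bytes the macro places in the __smp_replacements section. */
        static const unsigned char raw[] = { 1, 1, 0, 1, 0xff, 0xf0, 0x90 };
        const struct smp_replacement_record *r =
            (const struct smp_replacement_record *)raw;

        printf("targ=%d smp1=%d up=%d smp_byte=%02x up_byte=%02x\n",
               r->targ_size, r->smp1_size, r->up_size,
               r->data[0],                             /* SMP: lock */
               r->data[r->smp1_size + r->smp2_size]);  /* UP:  nop  */
        return 0;
    }

The !defined(MODULE) guard matters because modules are loaded after the replacement tables have been walked, so module code falls back to a plain unconditional "lock ; " prefix.
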
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h   2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/spinlock.h      2006-02-15 20:45:57.000000000 +0000
-@@ -6,6 +6,7 @@
- #include <asm/page.h>
- #include <linux/config.h>
- #include <linux/compiler.h>
-+#include <asm/smp_alt.h>
- 
- /*
-  * Your basic SMP spinlocks, allowing only a single CPU anywhere
-@@ -23,7 +24,8 @@
- 
- #define __raw_spin_lock_string \
-       "\n1:\t" \
--      "lock ; decb %0\n\t" \
-+      LOCK \
-+      "decb %0\n\t" \
-       "jns 3f\n" \
-       "2:\t" \
-       "rep;nop\n\t" \
-@@ -34,7 +36,8 @@
- 
- #define __raw_spin_lock_string_flags \
-       "\n1:\t" \
--      "lock ; decb %0\n\t" \
-+      LOCK \
-+      "decb %0\n\t" \
-       "jns 4f\n\t" \
-       "2:\t" \
-       "testl $0x200, %1\n\t" \
-@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
- static inline int __raw_spin_trylock(raw_spinlock_t *lock)
- {
-       char oldval;
-+#ifdef CONFIG_SMP_ALTERNATIVES
-       __asm__ __volatile__(
--              "xchgb %b0,%1"
-+              "1:movb %1,%b0\n"
-+              "movb $0,%1\n"
-+              "2:"
-+              ".section __smp_alternatives,\"a\"\n"
-+              ".long 1b\n"
-+              ".long 3f\n"
-+              ".previous\n"
-+              ".section __smp_replacements,\"a\"\n"
-+              "3: .byte 2b - 1b\n"
-+              ".byte 5f-4f\n"
-+              ".byte 0\n"
-+              ".byte 6f-5f\n"
-+              ".byte -1\n"
-+              "4: xchgb %b0,%1\n"
-+              "5: movb %1,%b0\n"
-+              "movb $0,%1\n"
-+              "6:\n"
-+              ".previous\n"
-               :"=q" (oldval), "=m" (lock->slock)
-               :"0" (0) : "memory");
-+#else
-+      __asm__ __volatile__(
-+              "xchgb %b0,%1\n"
-+              :"=q" (oldval), "=m" (lock->slock)
-+              :"0" (0) : "memory");
-+#endif
-       return oldval > 0;
- }
- 
-@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
- 
- static inline void __raw_read_unlock(raw_rwlock_t *rw)
- {
--      asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
-+      asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
- }
- 
- static inline void __raw_write_unlock(raw_rwlock_t *rw)
- {
--      asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
-+      asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
-                                : "=m" (rw->lock) : : "memory");
- }
- 
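
The __raw_spin_trylock change above needs a full alternative sequence rather than just a patchable lock prefix, because the uniprocessor fallback is a different instruction sequence: the atomic xchgb becomes a plain load followed by a store. Written out as ordinary functions, the two variants look like this (a sketch for gcc/clang on x86; the kernel of course switches between them by rewriting instruction bytes in place, not by calling different functions):

    #include <stdio.h>

    /* SMP variant: atomically swap 0 into the lock byte. */
    static int trylock_smp(volatile char *slock)
    {
        char old;
        __asm__ __volatile__("xchgb %b0,%1"
                             : "=q" (old), "+m" (*slock)
                             : "0" (0) : "memory");
        return old > 0;
    }

    /* UP variant: plain load then store, as patched in by the
     * __smp_replacements record emitted above. */
    static int trylock_up(volatile char *slock)
    {
        char old = *slock;  /* movb %1,%b0 */
        *slock = 0;         /* movb $0,%1  */
        return old > 0;
    }

    int main(void)
    {
        volatile char lock = 1;  /* 1 == unlocked in this encoding */
        printf("smp acquired: %d\n", trylock_smp(&lock));
        lock = 1;
        printf("up acquired: %d\n", trylock_up(&lock));
        return 0;
    }

On a single processor nothing can take the lock between the load and the store, which is what makes the non-atomic variant safe once the alternative sites have been switched to their UP forms.
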
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h ./include/asm-i386/system.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h     2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/system.h        2006-02-15 20:45:57.000000000 +0000
-@@ -5,7 +5,7 @@
- #include <linux/kernel.h>
- #include <asm/segment.h>
- #include <asm/cpufeature.h>
--#include <linux/bitops.h> /* for LOCK_PREFIX */
-+#include <asm/smp_alt.h>
- 
- #ifdef __KERNEL__
- 
-@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
-       unsigned long prev;
-       switch (size) {
-       case 1:
--              __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
-+              __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
-                                    : "=a"(prev)
-                                    : "q"(new), "m"(*__xg(ptr)), "0"(old)
-                                    : "memory");
-               return prev;
-       case 2:
--              __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
-+              __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
-                                    : "=a"(prev)
-                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
-                                    : "memory");
-               return prev;
-       case 4:
--              __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
-+              __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
-                                    : "=a"(prev)
-                                    : "r"(new), "m"(*__xg(ptr)), "0"(old)
-                                    : "memory");
-@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
-                                     unsigned long long new)
- {
-       unsigned long long prev;
--      __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
-+      __asm__ __volatile__(LOCK "cmpxchg8b %3"
-                            : "=A"(prev)
-                            : "b"((unsigned long)new),
-                              "c"((unsigned long)(new >> 32)),
-@@ -503,11 +503,55 @@ struct alt_instr { 
- #endif
- 
- #ifdef CONFIG_SMP
-+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
-+#define smp_alt_mb(instr)                                           \
-+__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
-+                   ".section __smp_alternatives,\"a\"\n"          \
-+                   ".long 6667b\n"                                \
-+                     ".long 6673f\n"                                \
-+                   ".previous\n"                                  \
-+                   ".section __smp_replacements,\"a\"\n"          \
-+                   "6673:.byte 6668b-6667b\n"                     \
-+                   ".byte 6670f-6669f\n"                          \
-+                   ".byte 6671f-6670f\n"                          \
-+                     ".byte 0\n"                                    \
-+                   ".byte %c0\n"                                  \
-+                   "6669:lock;addl $0,0(%%esp)\n"                 \
-+                   "6670:" instr "\n"                             \
-+                   "6671:\n"                                      \
-+                   ".previous\n"                                  \
-+                   :                                              \
-+                   : "i" (X86_FEATURE_XMM2)                       \
-+                   : "memory")
-+#define smp_rmb() smp_alt_mb("lfence")
-+#define smp_mb()  smp_alt_mb("mfence")
-+#define set_mb(var, value) do {                                     \
-+unsigned long __set_mb_temp;                                        \
-+__asm__ __volatile__("6667:movl %1, %0\n6668:\n"                    \
-+                   ".section __smp_alternatives,\"a\"\n"          \
-+                   ".long 6667b\n"                                \
-+                   ".long 6673f\n"                                \
-+                   ".previous\n"                                  \
-+                   ".section __smp_replacements,\"a\"\n"          \
-+                   "6673: .byte 6668b-6667b\n"                    \
-+                   ".byte 6670f-6669f\n"                          \
-+                   ".byte 0\n"                                    \
-+                   ".byte 6671f-6670f\n"                          \
-+                   ".byte -1\n"                                   \
-+                   "6669: xchg %1, %0\n"                          \
-+                   "6670:movl %1, %0\n"                           \
-+                   "6671:\n"                                      \
-+                   ".previous\n"                                  \
-+                   : "=m" (var), "=r" (__set_mb_temp)             \
-+                   : "1" (value)                                  \
-+                   : "memory"); } while (0)
-+#else
- #define smp_mb()      mb()
- #define smp_rmb()     rmb()
-+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
-+#endif
- #define smp_wmb()     wmb()
- #define smp_read_barrier_depends()    read_barrier_depends()
--#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
- #else
- #define smp_mb()      barrier()
- #define smp_rmb()     barrier()
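
The smp_alt_mb() records above give the boot-time patcher three choices per barrier site: leave the six NOPs (UP, up_size is 0), install "lock; addl $0,0(%%esp)" (SMP without SSE2), or install the mfence/lfence instruction (SMP with SSE2, gated on the X86_FEATURE_XMM2 byte in the record). The two SMP variants as standalone functions, for comparison (a sketch for gcc/clang on x86; a local dummy replaces the %esp-relative operand so it assembles on 32- and 64-bit alike):

    #include <stdio.h>

    /* Full barrier via a locked read-modify-write: the pre-SSE2 choice. */
    static void barrier_locked_add(void)
    {
        unsigned int dummy = 0;
        __asm__ __volatile__("lock; addl $0,%0" : "+m" (dummy) : : "memory");
    }

    /* Full barrier via mfence: what the patcher installs when
     * boot_cpu_has(X86_FEATURE_XMM2) is true. */
    static void barrier_mfence(void)
    {
        __asm__ __volatile__("mfence" : : : "memory");
    }

    int main(void)
    {
        barrier_locked_add();
        barrier_mfence();
        puts("both barrier variants executed");
        return 0;
    }

The set_mb() replacement follows the same pattern, swapping an atomic xchg for a plain movl when only one CPU is online.
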
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/linux-xen/asm/uaccess.h
--- a/xen/include/asm-ia64/linux-xen/asm/uaccess.h      Wed Mar  1 17:01:54 2006
+++ /dev/null   Wed Mar  1 19:47:25 2006
@@ -1,415 +0,0 @@
-#ifndef _ASM_IA64_UACCESS_H
-#define _ASM_IA64_UACCESS_H
-
-/*
- * This file defines various macros to transfer memory areas across
- * the user/kernel boundary.  This needs to be done carefully because
- * this code is executed in kernel mode and uses user-specified
- * addresses.  Thus, we need to be careful not to let the user
- * trick us into accessing kernel memory that would normally be
- * inaccessible.  This code is also fairly performance sensitive,
- * so we want to spend as little time doing safety checks as
- * possible.
- *
- * To make matters a bit more interesting, these macros are sometimes also
- * called from within the kernel itself, in which case the address
- * validity check must be skipped.  The get_fs() macro tells us what
- * to do: if get_fs()==USER_DS, checking is performed, if
- * get_fs()==KERNEL_DS, checking is bypassed.
- *
- * Note that even if the memory area specified by the user is in a
- * valid address range, it is still possible that we'll get a page
- * fault while accessing it.  This is handled by filling out an
- * exception handler fixup entry for each instruction that has the
- * potential to fault.  When such a fault occurs, the page fault
- * handler checks to see whether the faulting instruction has a fixup
- * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
- * then resumes execution at the continuation point.
- *
- * Based on <asm-alpha/uaccess.h>.
- *
- * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
- *     David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/page-flags.h>
-#include <linux/mm.h>
-
-#include <asm/intrinsics.h>
-#include <asm/pgtable.h>
-#include <asm/io.h>
-
-/*
- * For historical reasons, the following macros are grossly misnamed:
- */
-#define KERNEL_DS      ((mm_segment_t) { ~0UL })               /* cf. access_ok() */
-#define USER_DS                ((mm_segment_t) { TASK_SIZE-1 })        /* cf. access_ok() */
-
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
-#define get_ds()  (KERNEL_DS)
-#define get_fs()  (current_thread_info()->addr_limit)
-#define set_fs(x) (current_thread_info()->addr_limit = (x))
-
-#define segment_eq(a, b)       ((a).seg == (b).seg)
-
-/*
- * When accessing user memory, we need to make sure the entire area really is in
- * user-level space.  In order to do this efficiently, we make sure that the page at
- * address TASK_SIZE is never valid.  We also need to make sure that the address doesn't
- * point inside the virtually mapped linear page table.
- */
-#ifdef XEN
-#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)
-#define __access_ok(addr, size, segment) (!IS_VMM_ADDRESS((unsigned long)(addr)))
-#else
-#define __access_ok(addr, size, segment)                                               \
-({                                                                                     \
-       __chk_user_ptr(addr);                                                           \
-       (likely((unsigned long) (addr) <= (segment).seg)                                \
-        && ((segment).seg == KERNEL_DS.seg                                             \
-            || likely(REGION_OFFSET((unsigned long) (addr)) < RGN_MAP_LIMIT)));        \
-})
-#endif
-#define access_ok(type, addr, size)    __access_ok((addr), (size), get_fs())
-
-/* this function will go away soon - use access_ok() instead */
-static inline int __deprecated
-verify_area (int type, const void __user *addr, unsigned long size)
-{
-       return access_ok(type, addr, size) ? 0 : -EFAULT;
-}
-
-/*
- * These are the main single-value transfer routines.  They automatically
- * use the right size if we just have the right pointer type.
- *
- * Careful to not
- * (a) re-use the arguments for side effects (sizeof/typeof is ok)
- * (b) require any knowledge of processes at this stage
- */
-#define put_user(x, ptr)       __put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
-#define get_user(x, ptr)       __get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
-
-/*
- * The "__xxx" versions do not do address space checking, useful when
- * doing multiple accesses to the same area (the programmer has to do the
- * checks by hand with "access_ok()")
- */
-#define __put_user(x, ptr)     __put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
-#define __get_user(x, ptr)     __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-extern long __put_user_unaligned_unknown (void);
-
-#define __put_user_unaligned(x, ptr)                                                           \
-({                                                                                             \
-       long __ret;                                                                             \
-       switch (sizeof(*(ptr))) {                                                               \
-               case 1: __ret = __put_user((x), (ptr)); break;                                  \
-               case 2: __ret = (__put_user((x), (u8 __user *)(ptr)))                           \
-                       | (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;              \
-               case 4: __ret = (__put_user((x), (u16 __user *)(ptr)))                          \
-                       | (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;            \
-               case 8: __ret = (__put_user((x), (u32 __user *)(ptr)))                          \
-                       | (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;            \
-               default: __ret = __put_user_unaligned_unknown();                                \
-       }                                                                                       \
-       __ret;                                                                                  \
-})
-
-extern long __get_user_unaligned_unknown (void);
-
-#define __get_user_unaligned(x, ptr)                                                           \
-({                                                                                             \
-       long __ret;                                                                             \
-       switch (sizeof(*(ptr))) {                                                               \
-               case 1: __ret = __get_user((x), (ptr)); break;                                  \
-               case 2: __ret = (__get_user((x), (u8 __user *)(ptr)))                           \
-                       | (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break;              \
-               case 4: __ret = (__get_user((x), (u16 __user *)(ptr)))                          \
-                       | (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break;            \
-               case 8: __ret = (__get_user((x), (u32 __user *)(ptr)))                          \
-                       | (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break;            \
-               default: __ret = __get_user_unaligned_unknown();                                \
-       }                                                                                       \
-       __ret;                                                                                  \
-})
-
-#ifdef ASM_SUPPORTED
-  struct __large_struct { unsigned long buf[100]; };
-# define __m(x) (*(struct __large_struct __user *)(x))
-
-/* We need to declare the __ex_table section before we can use it in .xdata.  */
-asm (".section \"__ex_table\", \"a\"\n\t.previous");
-
-# define __get_user_size(val, addr, n, err)                                                    \
-do {                                                                                           \
-       register long __gu_r8 asm ("r8") = 0;                                                   \
-       register long __gu_r9 asm ("r9");                                                       \
-       asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n"     \
-            "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n"                                         \
-            "[1:]"                                                                             \
-            : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8));                    \
-       (err) = __gu_r8;                                                                        \
-       (val) = __gu_r9;                                                                        \
-} while (0)
-
-/*
- * The "__put_user_size()" macro tells gcc it reads from memory instead of writing it.  This
- * is because they do not write to any memory gcc knows about, so there are no aliasing
- * issues.
- */
-# define __put_user_size(val, addr, n, err)                                                    \
-do {                                                                                           \
-       register long __pu_r8 asm ("r8") = 0;                                                   \
-       asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n" \
-                     "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n"                                  \
-                     "[1:]"                                                                    \
-                     : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8));               \
-       (err) = __pu_r8;                                                                        \
-} while (0)
-
-#else /* !ASM_SUPPORTED */
-# define RELOC_TYPE    2       /* ip-rel */
-# define __get_user_size(val, addr, n, err)                            \
-do {                                                                   \
-       __ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE);   \
-       (err) = ia64_getreg(_IA64_REG_R8);                              \
-       (val) = ia64_getreg(_IA64_REG_R9);                              \
-} while (0)
-# define __put_user_size(val, addr, n, err)                                                    \
-do {                                                                                           \
-       __st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val));    \
-       (err) = ia64_getreg(_IA64_REG_R8);                                                      \
-} while (0)
-#endif /* !ASM_SUPPORTED */
-
-extern void __get_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 and r9 (among others).  Thus, be careful not to evaluate it while
- * using r8/r9.
- */
-#define __do_get_user(check, x, ptr, size, segment)                                    \
-({                                                                                     \
-       const __typeof__(*(ptr)) __user *__gu_ptr = (ptr);                              \
-       __typeof__ (size) __gu_size = (size);                                           \
-       long __gu_err = -EFAULT, __gu_val = 0;                                          \
-                                                                                       \
-       if (!check || __access_ok(__gu_ptr, size, segment))                             \
-               switch (__gu_size) {                                                    \
-                     case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break;  \
-                     case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break;  \
-                     case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break;  \
-                     case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break;  \
-                     default: __get_user_unknown(); break;                             \
-               }                                                                       \
-       (x) = (__typeof__(*(__gu_ptr))) __gu_val;                                       \
-       __gu_err;                                                                       \
-})
-
-#define __get_user_nocheck(x, ptr, size)       __do_get_user(0, x, ptr, size, KERNEL_DS)
-#define __get_user_check(x, ptr, size, segment)        __do_get_user(1, x, ptr, size, segment)
-
-extern void __put_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 (among others).  Thus, be careful not to evaluate them while using r8.
- */
-#define __do_put_user(check, x, ptr, size, segment)                                    \
-({                                                                                     \
-       __typeof__ (x) __pu_x = (x);                                                    \
-       __typeof__ (*(ptr)) __user *__pu_ptr = (ptr);                                   \
-       __typeof__ (size) __pu_size = (size);                                           \
-       long __pu_err = -EFAULT;                                                        \
-                                                                                       \
-       if (!check || __access_ok(__pu_ptr, __pu_size, segment))                        \
-               switch (__pu_size) {                                                    \
-                     case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break;    \
-                     case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break;    \
-                     case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break;    \
-                     case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break;    \
-                     default: __put_user_unknown(); break;                             \
-               }                                                                       \
-       __pu_err;                                                                       \
-})
-
-#define __put_user_nocheck(x, ptr, size)       __do_put_user(0, x, ptr, size, KERNEL_DS)
-#define __put_user_check(x, ptr, size, segment)        __do_put_user(1, x, ptr, size, segment)
-
-/*
- * Complex access routines
- */
-extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
-                                              unsigned long count);
-
-static inline unsigned long
-__copy_to_user (void __user *to, const void *from, unsigned long count)
-{
-       return __copy_user(to, (void __user *) from, count);
-}
-
-static inline unsigned long
-__copy_from_user (void *to, const void __user *from, unsigned long count)
-{
-       return __copy_user((void __user *) to, from, count);
-}
-
-#define __copy_to_user_inatomic                __copy_to_user
-#define __copy_from_user_inatomic      __copy_from_user
-#define copy_to_user(to, from, n)                                                      \
-({                                                                                     \
-       void __user *__cu_to = (to);                                                    \
-       const void *__cu_from = (from);                                                 \
-       long __cu_len = (n);                                                            \
-                                                                                       \
-       if (__access_ok(__cu_to, __cu_len, get_fs()))                                   \
-               __cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len);   \
-       __cu_len;                                                                       \
-})
-
-#define copy_from_user(to, from, n)                                                    \
-({                                                                                     \
-       void *__cu_to = (to);                                                           \
-       const void __user *__cu_from = (from);                                          \
-       long __cu_len = (n);                                                            \
-                                                                                       \
-       __chk_user_ptr(__cu_from);                                                      \
-       if (__access_ok(__cu_from, __cu_len, get_fs()))                                 \
-               __cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len);   \
-       __cu_len;                                                                       \
-})
-
-#define __copy_in_user(to, from, size) __copy_user((to), (from), (size))
-
-static inline unsigned long
-copy_in_user (void __user *to, const void __user *from, unsigned long n)
-{
-       if (likely(access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n)))
-               n = __copy_user(to, from, n);
-       return n;
-}
-
-extern unsigned long __do_clear_user (void __user *, unsigned long);
-
-#define __clear_user(to, n)            __do_clear_user(to, n)
-
-#define clear_user(to, n)                                      \
-({                                                             \
-       unsigned long __cu_len = (n);                           \
-       if (__access_ok(to, __cu_len, get_fs()))                \
-               __cu_len = __do_clear_user(to, __cu_len);       \
-       __cu_len;                                               \
-})
-
-
-/*
- * Returns: -EFAULT if exception before terminator, N if the entire buffer filled, else
- * strlen.
- */
-extern long __must_check __strncpy_from_user (char *to, const char __user *from, long to_len);
-
-#define strncpy_from_user(to, from, n)                                 \
-({                                                                     \
-       const char __user * __sfu_from = (from);                        \
-       long __sfu_ret = -EFAULT;                                       \
-       if (__access_ok(__sfu_from, 0, get_fs()))                       \
-               __sfu_ret = __strncpy_from_user((to), __sfu_from, (n)); \
-       __sfu_ret;                                                      \
-})
-
-/* Returns: 0 if bad, string length+1 (memory size) of string if ok */
-extern unsigned long __strlen_user (const char __user *);
-
-#define strlen_user(str)                               \
-({                                                     \
-       const char __user *__su_str = (str);            \
-       unsigned long __su_ret = 0;                     \
-       if (__access_ok(__su_str, 0, get_fs()))         \
-               __su_ret = __strlen_user(__su_str);     \
-       __su_ret;                                       \
-})
-
-/*
- * Returns: 0 if exception before NUL or reaching the supplied limit
- * (N), a value greater than N if the limit would be exceeded, else
- * strlen.
- */
-extern unsigned long __strnlen_user (const char __user *, long);
-
-#define strnlen_user(str, len)                                 \
-({                                                             \
-       const char __user *__su_str = (str);                    \
-       unsigned long __su_ret = 0;                             \
-       if (__access_ok(__su_str, 0, get_fs()))                 \
-               __su_ret = __strnlen_user(__su_str, len);       \
-       __su_ret;                                               \
-})
-
-/* Generic code can't deal with the location-relative format that we use for compactness.  */
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
-
-struct exception_table_entry {
-       int addr;       /* location-relative address of insn this fixup is for */
-       int cont;       /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
-};
-
-extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
-extern const struct exception_table_entry *search_exception_tables (unsigned long addr);
-
-static inline int
-ia64_done_with_exception (struct pt_regs *regs)
-{
-       const struct exception_table_entry *e;
-       e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
-       if (e) {
-               ia64_handle_exception(regs, e);
-               return 1;
-       }
-       return 0;
-}
-
-#ifndef XEN
-#define ARCH_HAS_TRANSLATE_MEM_PTR     1
-static __inline__ char *
-xlate_dev_mem_ptr (unsigned long p)
-{
-       struct page *page;
-       char * ptr;
-
-       page = mfn_to_page(p >> PAGE_SHIFT);
-       if (PageUncached(page))
-               ptr = (char *)p + __IA64_UNCACHED_OFFSET;
-       else
-               ptr = __va(p);
-
-       return ptr;
-}
-
-/*
- * Convert a virtual cached kernel memory pointer to an uncached pointer
- */
-static __inline__ char *
-xlate_dev_kmem_ptr (char * p)
-{
-       struct page *page;
-       char * ptr;
-
-       page = virt_to_page((unsigned long)p >> PAGE_SHIFT);
-       if (PageUncached(page))
-               ptr = (char *)__pa(p) + __IA64_UNCACHED_OFFSET;
-       else
-               ptr = p;
-
-       return ptr;
-}
-#endif
-
-#endif /* _ASM_IA64_UACCESS_H */
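
The exception_table_entry layout removed above stores both fields location-relative: each int holds the target address minus the address of the field itself, which keeps entries small (two ints instead of two longs) and position-independent, at the price of the custom sort/search code flagged by ARCH_HAS_SORT_EXTABLE. A sketch of the encode/decode arithmetic (plain C with invented addresses; the real ia64 code additionally uses a bit of cont to request that r9 be cleared):

    #include <stdio.h>

    struct exception_table_entry { int addr; int cont; };

    /* Decode: the field stores target - &field, so add &field back. */
    static unsigned long ex_abs(const int *field)
    {
        return (unsigned long)field + (long)*field;
    }

    int main(void)
    {
        static char text[64];  /* stand-in for a code region */
        static struct exception_table_entry e;

        /* Encode a fixup for a "faulting insn" at text[16] whose
         * continuation point is text[32]. */
        e.addr = (int)((long)&text[16] - (long)&e.addr);
        e.cont = (int)((long)&text[32] - (long)&e.cont);

        printf("insn resolves: %d, continuation resolves: %d\n",
               ex_abs(&e.addr) == (unsigned long)&text[16],
               ex_abs(&e.cont) == (unsigned long)&text[32]);
        return 0;
    }
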

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 

