[Xen-changelog] merge
# HG changeset patch # User awilliam@xxxxxxxxxxx # Node ID 673f62edbfbe4098ea1d5a34d8a77667da762090 # Parent 88f97bb8f3ae7e0fb85dbe8fb420d7f02f844a34 # Parent d8451bb6278cb5f3f477dd9392213be7c66730b4 merge diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen0_x86_32 --- a/buildconfigs/linux-defconfig_xen0_x86_32 Wed Mar 1 17:01:54 2006 +++ b/buildconfigs/linux-defconfig_xen0_x86_32 Wed Mar 1 19:47:25 2006 @@ -1320,6 +1320,7 @@ # CONFIG_XEN_BLKDEV_TAP_BE is not set CONFIG_XEN_NETDEV_BACKEND=y # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set +CONFIG_XEN_NETDEV_LOOPBACK=y # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen0_x86_64 --- a/buildconfigs/linux-defconfig_xen0_x86_64 Wed Mar 1 17:01:54 2006 +++ b/buildconfigs/linux-defconfig_xen0_x86_64 Wed Mar 1 19:47:25 2006 @@ -1244,6 +1244,7 @@ # CONFIG_XEN_BLKDEV_TAP_BE is not set CONFIG_XEN_NETDEV_BACKEND=y # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set +CONFIG_XEN_NETDEV_LOOPBACK=y # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen_x86_32 --- a/buildconfigs/linux-defconfig_xen_x86_32 Wed Mar 1 17:01:54 2006 +++ b/buildconfigs/linux-defconfig_xen_x86_32 Wed Mar 1 19:47:25 2006 @@ -2986,6 +2986,7 @@ # CONFIG_XEN_BLKDEV_TAP_BE is not set CONFIG_XEN_NETDEV_BACKEND=y # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set +CONFIG_XEN_NETDEV_LOOPBACK=y # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/linux-defconfig_xen_x86_64 --- a/buildconfigs/linux-defconfig_xen_x86_64 Wed Mar 1 17:01:54 2006 +++ b/buildconfigs/linux-defconfig_xen_x86_64 Wed Mar 1 19:47:25 2006 @@ -2656,6 +2656,7 @@ # CONFIG_XEN_BLKDEV_TAP_BE is not set CONFIG_XEN_NETDEV_BACKEND=y # CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER is not set +CONFIG_XEN_NETDEV_LOOPBACK=y # CONFIG_XEN_TPMDEV_BACKEND is not set CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y diff -r 88f97bb8f3ae -r 673f62edbfbe buildconfigs/mk.linux-2.6-xen --- a/buildconfigs/mk.linux-2.6-xen Wed Mar 1 17:01:54 2006 +++ b/buildconfigs/mk.linux-2.6-xen Wed Mar 1 19:47:25 2006 @@ -2,8 +2,8 @@ OS = linux LINUX_SERIES = 2.6 -LINUX_VER = 2.6.16-rc4 -LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc4.bz2 +LINUX_VER = 2.6.16-rc5 +LINUX_SRCS = linux-2.6.15.tar.bz2 patch-2.6.16-rc5.bz2 LINUX_PDIR = linux-$(LINUX_VER) EXTRAVERSION ?= xen @@ -34,7 +34,7 @@ touch $(@D)/.hgskip touch $@ -pristine-linux-%.16-rc4/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs +pristine-linux-%.16-rc5/.valid-pristine: pristine-$(LINUX_PDIR)/.valid-srcs touch $@ # update timestamp to avoid rebuild $(LINUX_DIR)/include/linux/autoconf.h: ref-$(OS)-$(LINUX_VER)/.valid-ref diff -r 88f97bb8f3ae -r 673f62edbfbe docs/src/user.tex --- a/docs/src/user.tex Wed Mar 1 17:01:54 2006 +++ b/docs/src/user.tex Wed Mar 1 19:47:25 2006 @@ -626,7 +626,7 @@ allow you to monitor and log the Xen boot process via serial console and can be very useful in debugging. 
-%% kernel /boot/xen-2.0.gz dom0_mem=131072 com1=115200,8n1 +%% kernel /boot/xen-2.0.gz dom0_mem=131072 console=com1,vga com1=115200,8n1 %% module /boot/vmlinuz-2.6-xen0 root=/dev/sda4 ro In order to configure Xen serial console output, it is necessary to @@ -637,8 +637,9 @@ \end{verbatim}} \end{quote} -This configures Xen to output on COM1 at 115,200 baud, 8 data bits, 1 -stop bit and no parity. Modify these parameters for your environment. +This configures Xen to output on COM1 at 115,200 baud, 8 data bits, no +parity and 1 stop bit. Modify these parameters for your environment. +See Section~\ref{s:xboot} for an explanation of all boot parameters. One can also configure XenLinux to share the serial console; to achieve this append ``\path{console=ttyS0}'' to your module line. diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/Kconfig --- a/linux-2.6-xen-sparse/arch/i386/Kconfig Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/Kconfig Wed Mar 1 19:47:25 2006 @@ -770,7 +770,7 @@ config HOTPLUG_CPU bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" - depends on SMP && HOTPLUG && EXPERIMENTAL + depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER ---help--- Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. @@ -1122,6 +1122,7 @@ config KPROBES bool "Kprobes (EXPERIMENTAL)" + depends on EXPERIMENTAL && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/Makefile --- a/linux-2.6-xen-sparse/arch/i386/kernel/Makefile Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/Makefile Wed Mar 1 19:47:25 2006 @@ -7,7 +7,7 @@ obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - quirks.o i8237.o + quirks.o i8237.o topology.o obj-y += cpu/ obj-y += timers/ diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c Wed Mar 1 19:47:25 2006 @@ -44,9 +44,6 @@ extern int gsi_irq_sharing(int gsi); #include <asm/proto.h> -static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; } - - #else /* X86 */ #ifdef CONFIG_X86_LOCAL_APIC diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c Wed Mar 1 19:47:25 2006 @@ -4,6 +4,7 @@ #include <linux/smp.h> #include <linux/module.h> #include <linux/percpu.h> +#include <linux/bootmem.h> #include <asm/semaphore.h> #include <asm/processor.h> #include <asm/i387.h> @@ -18,6 +19,9 @@ #include <asm/hypervisor.h> #include "cpu.h" + +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); #ifndef CONFIG_XEN DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); @@ -598,6 +602,8 @@ struct tss_struct * t = &per_cpu(init_tss, cpu); #endif struct thread_struct *thread = &current->thread; + struct desc_struct *gdt; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); if (cpu_test_and_set(cpu, cpu_initialized)) { printk(KERN_WARNING "CPU#%d already
initialized!\n", cpu); @@ -614,7 +620,54 @@ set_in_cr4(X86_CR4_TSD); } - cpu_gdt_init(&cpu_gdt_descr[cpu]); +#ifndef CONFIG_XEN + /* + * This is a horrible hack to allocate the GDT. The problem + * is that cpu_init() is called really early for the boot CPU + * (and hence needs bootmem) but much later for the secondary + * CPUs, when bootmem will have gone away + */ + if (NODE_DATA(0)->bdata->node_bootmem_map) { + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); + /* alloc_bootmem_pages panics on failure, so no check */ + memset(gdt, 0, PAGE_SIZE); + } else { + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); + if (unlikely(!gdt)) { + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); + for (;;) + local_irq_enable(); + } + } + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + memcpy(gdt, cpu_gdt_table, GDT_SIZE); + + /* Set up GDT entry for 16bit stack */ + *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | + (CPU_16BIT_STACK_SIZE - 1); + + cpu_gdt_descr->size = GDT_SIZE - 1; + cpu_gdt_descr->address = (unsigned long)gdt; +#else + if (cpu == 0 && cpu_gdt_descr->address == 0) { + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); + /* alloc_bootmem_pages panics on failure, so no check */ + memset(gdt, 0, PAGE_SIZE); + + memcpy(gdt, cpu_gdt_table, GDT_SIZE); + + cpu_gdt_descr->size = GDT_SIZE; + cpu_gdt_descr->address = (unsigned long)gdt; + } +#endif + + cpu_gdt_init(cpu_gdt_descr); /* * Set up and load the per-CPU TSS and LDT diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S --- a/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/head-xen.S Wed Mar 1 19:47:25 2006 @@ -87,19 +87,9 @@ */ .data - ALIGN - .word 0 # 32 bit align gdt_desc.address - .globl cpu_gdt_descr -cpu_gdt_descr: - .word GDT_SIZE - .long cpu_gdt_table - - .fill NR_CPUS-1,8,0 # space for the other GDT descriptors - /* * The Global Descriptor Table contains 28 quadwords, per-CPU. 
*/ - .align PAGE_SIZE_asm ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ @@ -148,10 +138,6 @@ .quad 0x0000000000000000 /* 0xf0 - unused */ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ - /* Be sure this is zeroed to avoid false validations in Xen */ - .fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0 - - /* * __xen_guest information */ @@ -176,6 +162,7 @@ .ascii ",FEATURES=writable_page_tables" .ascii "|writable_descriptor_tables" .ascii "|auto_translated_physmap" + .ascii "|pae_pgdir_above_4gb" .ascii "|supervisor_mode_kernel" #ifdef CONFIG_X86_PAE .ascii ",PAE=yes" diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c Wed Mar 1 19:47:25 2006 @@ -2634,8 +2634,10 @@ spin_unlock_irqrestore(&ioapic_lock, flags); /* Sanity check */ - if (reg_00.bits.ID != apic_id) - panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } } apic_printk(APIC_VERBOSE, KERN_INFO diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c Wed Mar 1 19:47:25 2006 @@ -935,6 +935,7 @@ u32 gsi_base) { int idx = 0; + int tmpid; if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " @@ -957,9 +958,14 @@ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); #endif if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15)) - mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + tmpid = io_apic_get_unique_id(idx, id); else - mp_ioapics[idx].mpc_apicid = id; + tmpid = id; + if (tmpid == -1) { + nr_ioapics--; + return; + } + mp_ioapics[idx].mpc_apicid = tmpid; mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); /* diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c Wed Mar 1 19:47:25 2006 @@ -898,12 +898,6 @@ unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; - if (!cpu_gdt_descr[cpu].address && - !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) { - printk("Failed to allocate GDT for CPU %d\n", cpu); - return 1; - } - ++cpucount; /* diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c Wed Mar 1 19:47:25 2006 @@ -48,6 +48,8 @@ #include <linux/mca.h> #include <linux/sysctl.h> #include <linux/percpu.h> +#include <linux/kernel_stat.h> +#include <linux/posix-timers.h> #include <asm/io.h> #include <asm/smp.h> @@ -70,6 +72,7 @@ #include <asm/arch_hooks.h> #include <xen/evtchn.h> +#include <xen/interface/vcpu.h> #if defined (__i386__) #include <asm/i8259.h> @@ -122,6 +125,13 @@ /* Keep track of last time we did processing/updating of jiffies and xtime. */ static u64 processed_system_time; /* System time (ns) at last processing. */ static DEFINE_PER_CPU(u64, processed_system_time); + +/* How much CPU time was spent blocked and how much was 'stolen'? 
*/ +static DEFINE_PER_CPU(u64, processed_stolen_time); +static DEFINE_PER_CPU(u64, processed_blocked_time); + +/* Current runstate of each CPU (updated automatically by the hypervisor). */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); /* Must be signed, as it's compared with s64 quantities which can be -ve. */ #define NS_PER_TICK (1000000000LL/HZ) @@ -477,14 +487,45 @@ EXPORT_SYMBOL(do_settimeofday); -#ifdef CONFIG_XEN_PRIVILEGED_GUEST +static void sync_xen_wallclock(unsigned long dummy); +static DEFINE_TIMER(sync_xen_wallclock_timer, sync_xen_wallclock, 0, 0); +static void sync_xen_wallclock(unsigned long dummy) +{ + time_t sec; + s64 nsec; + dom0_op_t op; + + if (!ntp_synced() || independent_wallclock || + !(xen_start_info->flags & SIF_INITDOMAIN)) + return; + + write_seqlock_irq(&xtime_lock); + + sec = xtime.tv_sec; + nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK); + __normalize_time(&sec, &nsec); + + op.cmd = DOM0_SETTIME; + op.u.settime.secs = sec; + op.u.settime.nsecs = nsec; + op.u.settime.system_time = processed_system_time; + HYPERVISOR_dom0_op(&op); + + update_wallclock(); + + write_sequnlock_irq(&xtime_lock); + + /* Once per minute. */ + mod_timer(&sync_xen_wallclock_timer, jiffies + 60*HZ); +} + static int set_rtc_mmss(unsigned long nowtime) { int retval; WARN_ON(irqs_disabled()); - if (!(xen_start_info->flags & SIF_INITDOMAIN)) + if (independent_wallclock || !(xen_start_info->flags & SIF_INITDOMAIN)) return 0; /* gets recalled with irq locally disabled */ @@ -497,12 +538,6 @@ return retval; } -#else -static int set_rtc_mmss(unsigned long nowtime) -{ - return 0; -} -#endif /* monotonic_clock(): returns # of nanoseconds passed since time_init() * Note: This function is required to return accurate @@ -567,19 +602,37 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - s64 delta, delta_cpu; + s64 delta, delta_cpu, stolen, blocked; + u64 sched_time; int i, cpu = smp_processor_id(); struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu); write_seqlock(&xtime_lock); do { get_time_values_from_xen(); + /* Obtain a consistent snapshot of elapsed wallclock cycles. */ delta = delta_cpu = shadow->system_timestamp + get_nsec_offset(shadow); delta -= processed_system_time; delta_cpu -= per_cpu(processed_system_time, cpu); + + /* + * Obtain a consistent snapshot of stolen/blocked cycles. We + * can use state_entry_time to detect if we get preempted here. + */ + do { + sched_time = runstate->state_entry_time; + barrier(); + stolen = runstate->time[RUNSTATE_runnable] + + runstate->time[RUNSTATE_offline] - + per_cpu(processed_stolen_time, cpu); + blocked = runstate->time[RUNSTATE_blocked] - + per_cpu(processed_blocked_time, cpu); + barrier(); + } while (sched_time != runstate->state_entry_time); } while (!time_values_up_to_date(cpu)); @@ -612,18 +665,67 @@ write_sequnlock(&xtime_lock); /* - * Local CPU jiffy work. No need to hold xtime_lock, and I'm not sure - * if there is risk of deadlock if we do (since update_process_times - * may do scheduler rebalancing work and thus acquire runqueue locks). - */ - while (delta_cpu >= NS_PER_TICK) { - delta_cpu -= NS_PER_TICK; - per_cpu(processed_system_time, cpu) += NS_PER_TICK; - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING, regs); - } + * Account stolen ticks. + * HACK: Passing NULL to account_steal_time() + * ensures that the ticks are accounted as stolen. 
+ */ + if (stolen > 0) { + delta_cpu -= stolen; + do_div(stolen, NS_PER_TICK); + per_cpu(processed_stolen_time, cpu) += stolen * NS_PER_TICK; + per_cpu(processed_system_time, cpu) += stolen * NS_PER_TICK; + account_steal_time(NULL, (cputime_t)stolen); + } + + /* + * Account blocked ticks. + * HACK: Passing idle_task to account_steal_time() + * ensures that the ticks are accounted as idle/wait. + */ + if (blocked > 0) { + delta_cpu -= blocked; + do_div(blocked, NS_PER_TICK); + per_cpu(processed_blocked_time, cpu) += blocked * NS_PER_TICK; + per_cpu(processed_system_time, cpu) += blocked * NS_PER_TICK; + account_steal_time(idle_task(cpu), (cputime_t)blocked); + } + + /* Account user/system ticks. */ + if (delta_cpu > 0) { + do_div(delta_cpu, NS_PER_TICK); + per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK; + if (user_mode(regs)) + account_user_time(current, (cputime_t)delta_cpu); + else + account_system_time(current, HARDIRQ_OFFSET, + (cputime_t)delta_cpu); + } + + /* Local timer processing (see update_process_times()). */ + run_local_timers(); + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_mode(regs)); + scheduler_tick(); + run_posix_cpu_timers(current); return IRQ_HANDLED; +} + +static void init_missing_ticks_accounting(int cpu) +{ + struct vcpu_register_runstate_memory_area area; + struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu); + + memset(runstate, 0, sizeof(*runstate)); + + area.addr.v = runstate; + HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area); + + per_cpu(processed_blocked_time, cpu) = + runstate->time[RUNSTATE_blocked]; + per_cpu(processed_stolen_time, cpu) = + runstate->time[RUNSTATE_runnable] + + runstate->time[RUNSTATE_offline]; } /* not static: needed by APM */ @@ -691,6 +793,7 @@ void notify_arch_cmos_timer(void) { mod_timer(&sync_cmos_timer, jiffies + 1); + mod_timer(&sync_xen_wallclock_timer, jiffies + 1); } static long clock_cmos_diff, sleep_start; @@ -814,6 +917,7 @@ processed_system_time = per_cpu(shadow_time, 0).system_timestamp; per_cpu(processed_system_time, 0) = processed_system_time; + init_missing_ticks_accounting(0); update_wallclock(); @@ -891,6 +995,7 @@ processed_system_time = per_cpu(shadow_time, 0).system_timestamp; per_cpu(processed_system_time, 0) = processed_system_time; + init_missing_ticks_accounting(0); update_wallclock(); } @@ -909,6 +1014,7 @@ /* Use cpu0 timestamp: cpu's shadow is not initialised yet. */ per_cpu(processed_system_time, cpu) = per_cpu(shadow_time, 0).system_timestamp; + init_missing_ticks_accounting(cpu); } while (read_seqretry(&xtime_lock, seq)); sprintf(timer_name[cpu], "timer%d", cpu); diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile --- a/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile Wed Mar 1 19:47:25 2006 @@ -2,6 +2,4 @@ # Makefile for the linux kernel. 
# -obj-y := setup.o topology.o - -topology-y := ../mach-default/topology.o +obj-y := setup.o diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mm/init-xen.c --- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c Wed Mar 1 19:47:25 2006 @@ -454,6 +454,7 @@ static int disable_nx __initdata = 0; u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL(__supported_pte_mask); /* * noexec = on|off diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/Kconfig --- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig Wed Mar 1 19:47:25 2006 @@ -381,21 +381,6 @@ as it is off-chip. You can find the HPET spec at <http://www.intel.com/hardwaredesign/hpetspec.htm>. -config X86_PM_TIMER - bool "PM timer" if EMBEDDED - depends on ACPI && !X86_64_XEN - default y - help - Support the ACPI PM timer for time keeping. This is slow, - but is useful on some chipsets without HPET on systems with more - than one CPU. On a single processor or single socket multi core - system it is normally not required. - When the PM timer is active 64bit vsyscalls are disabled - and should not be enabled (/proc/sys/kernel/vsyscall64 should - not be changed). - The kernel selects the PM timer only as a last resort, so it is - useful to enable just in case. - config HPET_EMULATE_RTC bool "Provide RTC interrupt" depends on HPET_TIMER && RTC=y @@ -640,6 +625,7 @@ config KPROBES bool "Kprobes (EXPERIMENTAL)" + depends on EXPERIMENTAL && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile Wed Mar 1 19:47:25 2006 @@ -45,7 +45,7 @@ bootflag-y += ../../i386/kernel/bootflag.o cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o -topology-y += ../../i386/mach-default/topology.o +topology-y += ../../i386/kernel/topology.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c Wed Mar 1 19:47:25 2006 @@ -114,6 +114,8 @@ irq_exit(); } +int __initdata unsync_tsc_on_multicluster; + /* * This interrupt should _never_ happen with our APIC/SMP architecture */ diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S Wed Mar 1 19:47:25 2006 @@ -51,6 +51,7 @@ #include <asm/page.h> #include <asm/errno.h> #include <xen/interface/arch-x86_64.h> +#include <xen/interface/features.h> #include "irq_vectors.h" @@ -146,16 +147,19 @@ */ .macro HYPERVISOR_IRET flag testb $3,1*8(%rsp) - jnz 1f + jnz 2f testl $NMI_MASK,2*8(%rsp) + jnz 2f + + testb $1,(xen_features+XENFEAT_supervisor_mode_kernel) jnz 1f /* Direct iret to kernel space. Correct CS and SS. */ orb $3,1*8(%rsp) orb $3,4*8(%rsp) - iretq - -1: /* Slow iret via hypervisor. 
*/ +1: iretq + +2: /* Slow iret via hypervisor. */ andl $~NMI_MASK, 16(%rsp) pushq $\flag jmp hypercall_page + (__HYPERVISOR_iret * 32) diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c Wed Mar 1 19:47:25 2006 @@ -51,6 +51,8 @@ int disable_timer_pin_1 __initdata; #ifndef CONFIG_XEN +int timer_over_8254 __initdata = 1; + /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; #endif @@ -300,6 +302,22 @@ __setup("noapic", disable_ioapic_setup); __setup("apic", enable_ioapic_setup); + +#ifndef CONFIG_XEN +static int __init setup_disable_8254_timer(char *s) +{ + timer_over_8254 = -1; + return 1; +} +static int __init setup_enable_8254_timer(char *s) +{ + timer_over_8254 = 2; + return 1; +} + +__setup("disable_8254_timer", setup_disable_8254_timer); +__setup("enable_8254_timer", setup_enable_8254_timer); +#endif /* !CONFIG_XEN */ #include <asm/pci-direct.h> #include <linux/pci_ids.h> @@ -360,27 +378,20 @@ /* RED-PEN skip them on mptables too? */ return; case PCI_VENDOR_ID_ATI: + + /* This should be actually default, but + for 2.6.16 let's do it for ATI only where + it's really needed. */ #ifndef CONFIG_XEN - if (apic_runs_main_timer != 0) - break; -#ifdef CONFIG_ACPI - /* Don't do this for laptops right - right now because their timer - doesn't necessarily tick in C2/3 */ - if (acpi_fadt.revision >= 3 && - (acpi_fadt.plvl2_lat + acpi_fadt.plvl3_lat) < 1100) { - printk(KERN_INFO -"ATI board detected, but seems to be a laptop. Timer might be shakey, sorry\n"); - break; - } -#endif + if (timer_over_8254 == 1) { + timer_over_8254 = 0; printk(KERN_INFO - "ATI board detected. Using APIC/PM timer.\n"); - apic_runs_main_timer = 1; - nohpet = 1; + "ATI board detected. Disabling timer routing over 8254.\n"); + } #endif return; } + /* No multi-function device? */ type = read_pci_config_byte(num,slot,func, @@ -1848,6 +1859,8 @@ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for modern platforms only. 
*/ static inline void check_timer(void) { @@ -1870,7 +1883,8 @@ */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - enable_8259A_irq(0); + if (timer_over_8254 > 0) + enable_8259A_irq(0); pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -1925,7 +1939,7 @@ } printk(" failed.\n"); - if (nmi_watchdog) { + if (nmi_watchdog == NMI_IO_APIC) { printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = 0; } diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c Wed Mar 1 19:47:25 2006 @@ -462,6 +462,12 @@ else if(!memcmp(from, "elfcorehdr=", 11)) elfcorehdr_addr = memparse(from+11, &from); #endif + +#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN) + else if (!memcmp(from, "additional_cpus=", 16)) + setup_additional_cpus(from+16); +#endif + next_char: c = *(from++); if (!c) diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/acpi/Kconfig --- a/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Mar 1 19:47:25 2006 @@ -247,7 +247,7 @@ Enter the full path name to the file wich includes the AmlCode declaration. config ACPI_BLACKLIST_YEAR - int "Disable ACPI for systems before Jan 1st this year" if X86 + int "Disable ACPI for systems before Jan 1st this year" if X86_32 default 0 help enter a 4-digit year, eg. 2001 to disable ACPI by default @@ -285,9 +285,9 @@ dump your ACPI DSDT table using /proc/acpi/dsdt. config X86_PM_TIMER - bool "Power Management Timer Support" - depends on X86 - depends on !X86_64 + bool "Power Management Timer Support" if EMBEDDED + depends on X86 + depends on !XEN default y help The Power Management Timer is available on all ACPI-capable, @@ -298,9 +298,8 @@ voltage scaling, unlike the commonly used Time Stamp Counter (TSC) timing source. - So, if you see messages like 'Losing too many ticks!' in the - kernel logs, and/or you are using this on a notebook which - does not yet have an HPET, you should say "Y" here. + You should nearly always say Y here because many modern + systems require this timer. config ACPI_CONTAINER tristate "ACPI0004,PNP0A05 and PNP0A06 Container Driver (EXPERIMENTAL)" diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/video/Kconfig --- a/linux-2.6-xen-sparse/drivers/video/Kconfig Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/video/Kconfig Wed Mar 1 19:47:25 2006 @@ -520,7 +520,7 @@ config FB_GBE_MEM int "Video memory size in MB" depends on FB_GBE - default 8 + default 4 help This is the amount of memory reserved for the framebuffer, which can be any value between 1MB and 8MB. diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/Kconfig --- a/linux-2.6-xen-sparse/drivers/xen/Kconfig Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/Kconfig Wed Mar 1 19:47:25 2006 @@ -68,7 +68,7 @@ default n config XEN_BLKDEV_BACKEND - bool "Block-device backend driver" + tristate "Block-device backend driver" default y help The block-device backend driver allows the kernel to export its @@ -76,7 +76,7 @@ interface. config XEN_BLKDEV_TAP_BE - bool "Block Tap support for backend driver (DANGEROUS)" + tristate "Block Tap support for backend driver (DANGEROUS)" depends on XEN_BLKDEV_BACKEND default n help @@ -89,7 +89,7 @@ modified to use grant tables. 
config XEN_NETDEV_BACKEND - bool "Network-device backend driver" + tristate "Network-device backend driver" default y help The network-device backend driver allows the kernel to export its @@ -109,8 +109,16 @@ are unsure; or if you experience network hangs when this option is enabled; then you must say N here. +config XEN_NETDEV_LOOPBACK + tristate "Network-device loopback driver" + depends on XEN_NETDEV_BACKEND + default y + help + A two-interface loopback device to emulate a local netfront-netback + connection. + config XEN_TPMDEV_BACKEND - bool "TPM-device backend driver" + tristate "TPM-device backend driver" default n help The TPM-device backend driver @@ -145,7 +153,7 @@ (domain 0), then you almost certainly want to say Y here. config XEN_BLKDEV_TAP - bool "Block device tap driver" + tristate "Block device tap driver" default n help This driver allows a VM to interact on block device channels @@ -154,7 +162,7 @@ space. Odds are that you want to say N here. config XEN_TPMDEV_FRONTEND - bool "TPM-device frontend driver" + tristate "TPM-device frontend driver" default n select TCG_TPM select TCG_XEN diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/blkback/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Wed Mar 1 19:47:25 2006 @@ -1,2 +1,3 @@ +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o -obj-y := blkback.o xenbus.o interface.o vbd.o +blkbk-y := blkback.o xenbus.o interface.o vbd.o diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Wed Mar 1 19:47:25 2006 @@ -29,14 +29,10 @@ * 64 should be enough to keep us competitive with Linux. */ static int blkif_reqs = 64; +module_param_named(reqs, blkif_reqs, int, 0); +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); + static int mmap_pages; - -static int __init set_blkif_reqs(char *str) -{ - get_option(&str, &blkif_reqs); - return 1; -} -__setup("blkif_reqs=", set_blkif_reqs); /* Run-time switchable: /sys/module/blkback/parameters/ */ static unsigned int log_stats = 0; @@ -574,10 +570,20 @@ list_add_tail(&pending_reqs[i].free_list, &pending_free); blkif_xenbus_init(); + __unsafe(THIS_MODULE); return 0; } -__initcall(blkif_init); +module_init(blkif_init); + +static void blkif_exit(void) +{ + BUG(); +} + +module_exit(blkif_exit); + +MODULE_LICENSE("Dual BSD/GPL"); /* * Local variables: diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/core/skbuff.c --- a/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/core/skbuff.c Wed Mar 1 19:47:25 2006 @@ -16,6 +16,7 @@ /* Referenced in netback.c. 
*/ /*static*/ kmem_cache_t *skbuff_cachep; +EXPORT_SYMBOL(skbuff_cachep); #define MAX_SKBUFF_ORDER 4 static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1]; diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/core/smpboot.c --- a/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c Wed Mar 1 19:47:25 2006 @@ -150,6 +150,11 @@ { vcpu_guest_context_t ctxt; struct task_struct *idle = idle_task(vcpu); +#ifdef __x86_64__ + struct desc_ptr *gdt_descr = &cpu_gdt_descr[vcpu]; +#else + struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, vcpu); +#endif if (vcpu == 0) return; @@ -171,8 +176,8 @@ ctxt.ldt_ents = 0; - ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[vcpu].address); - ctxt.gdt_ents = cpu_gdt_descr[vcpu].size / 8; + ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address); + ctxt.gdt_ents = gdt_descr->size / 8; #ifdef __i386__ ctxt.user_regs.cs = __KERNEL_CS; @@ -210,6 +215,11 @@ { int cpu; struct task_struct *idle; +#ifdef __x86_64__ + struct desc_ptr *gdt_descr; +#else + struct Xgt_desc_struct *gdt_descr; +#endif cpu_data[0] = boot_cpu_data; @@ -225,6 +235,22 @@ for_each_cpu_mask (cpu, cpu_possible_map) { if (cpu == 0) continue; + +#ifdef __x86_64__ + gdt_descr = &cpu_gdt_descr[cpu]; +#else + gdt_descr = &per_cpu(cpu_gdt_descr, cpu); +#endif + gdt_descr->address = get_zeroed_page(GFP_KERNEL); + if (unlikely(!gdt_descr->address)) { + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); + continue; + } + gdt_descr->size = GDT_SIZE; + memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE); + make_page_readonly( + (void *)gdt_descr->address, + XENFEAT_writable_descriptor_tables); cpu_data[cpu] = boot_cpu_data; cpu_2_logical_apicid[cpu] = cpu; @@ -241,17 +267,6 @@ #endif irq_ctx_init(cpu); - - cpu_gdt_descr[cpu].address = - __get_free_page(GFP_KERNEL|__GFP_ZERO); - BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE); - cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; - memcpy((void *)cpu_gdt_descr[cpu].address, - (void *)cpu_gdt_descr[0].address, - cpu_gdt_descr[0].size); - make_page_readonly( - (void *)cpu_gdt_descr[cpu].address, - XENFEAT_writable_descriptor_tables); #ifdef CONFIG_HOTPLUG_CPU if (xen_start_info->flags & SIF_INITDOMAIN) diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/net_driver_util.c --- a/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/net_driver_util.c Wed Mar 1 19:47:25 2006 @@ -30,6 +30,7 @@ #include <linux/if_ether.h> #include <linux/err.h> +#include <linux/module.h> #include <xen/net_driver_util.h> @@ -54,7 +55,7 @@ kfree(macstr); return 0; } - +EXPORT_SYMBOL(xen_net_read_mac); /* * Local variables: diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Wed Mar 1 19:47:25 2006 @@ -1,2 +1,5 @@ +obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o +obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o -obj-y := netback.o xenbus.o interface.o loopback.o +netbk-y := netback.o xenbus.o interface.o +netloop-y := loopback.o diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/loopback.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/loopback.c Wed Mar 1 19:47:25 2006 @@ -178,6 +178,23 @@ return err; } +static void __init 
clean_loopback(int i) +{ + struct net_device *dev1, *dev2; + char dev_name[IFNAMSIZ]; + + sprintf(dev_name, "vif0.%d", i); + dev1 = dev_get_by_name(dev_name); + sprintf(dev_name, "veth%d", i); + dev2 = dev_get_by_name(dev_name); + if (dev1 && dev2) { + unregister_netdev(dev2); + unregister_netdev(dev1); + free_netdev(dev2); + free_netdev(dev1); + } +} + static int __init loopback_init(void) { int i, err = 0; @@ -190,6 +207,18 @@ } module_init(loopback_init); + +static void __exit loopback_exit(void) +{ + int i; + + for (i = nloopbacks; i-- > 0; ) + clean_loopback(i); +} + +module_exit(loopback_exit); + +MODULE_LICENSE("Dual BSD/GPL"); /* * Local variables: diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netback/netback.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Wed Mar 1 19:47:25 2006 @@ -505,14 +505,12 @@ /* Still too big to send right now? Set a callback. */ if (txreq.size > netif->remaining_credit) { netif->remaining_credit = 0; - netif->credit_timeout.expires = - next_credit; netif->credit_timeout.data = (unsigned long)netif; netif->credit_timeout.function = tx_credit_callback; - add_timer_on(&netif->credit_timeout, - smp_processor_id()); + __mod_timer(&netif->credit_timeout, + next_credit); break; } } @@ -811,6 +809,8 @@ &netif_be_dbg); #endif + __unsafe(THIS_MODULE); + return 0; } @@ -821,6 +821,8 @@ module_init(netback_init); module_exit(netback_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); /* * Local variables: diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Wed Mar 1 19:47:25 2006 @@ -114,6 +114,7 @@ /* Receive-ring batched refills. 
*/ #define RX_MIN_TARGET 8 +#define RX_DFL_MIN_TARGET 64 #define RX_MAX_TARGET NET_RX_RING_SIZE int rx_min_target, rx_max_target, rx_target; struct sk_buff_head rx_batch; @@ -1102,8 +1103,8 @@ spin_lock_init(&np->rx_lock); skb_queue_head_init(&np->rx_batch); - np->rx_target = RX_MIN_TARGET; - np->rx_min_target = RX_MIN_TARGET; + np->rx_target = RX_DFL_MIN_TARGET; + np->rx_min_target = RX_DFL_MIN_TARGET; np->rx_max_target = RX_MAX_TARGET; init_timer(&np->rx_refill_timer); diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/common.h --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/common.h Wed Mar 1 19:47:25 2006 @@ -54,9 +54,11 @@ void tpmif_disconnect_complete(tpmif_t * tpmif); tpmif_t *tpmif_find(domid_t domid, long int instance); void tpmif_interface_init(void); +void tpmif_interface_exit(void); void tpmif_schedule_work(tpmif_t * tpmif); void tpmif_deschedule_work(tpmif_t * tpmif); void tpmif_xenbus_init(void); +void tpmif_xenbus_exit(void); int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn); irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs); int tpmif_vtpm_open(tpmif_t *tpmif, domid_t domain, u32 instance); diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/interface.c Wed Mar 1 19:47:25 2006 @@ -186,6 +186,12 @@ 0, 0, NULL, NULL); } +void __init +tpmif_interface_exit(void) +{ + kmem_cache_destroy(tpmif_cachep); +} + /* * Local variables: * c-file-style: "linux" diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/tpmback.c Wed Mar 1 19:47:25 2006 @@ -1092,7 +1092,20 @@ return 0; } -__initcall(tpmback_init); +module_init(tpmback_init); + +static void __exit +tpmback_exit(void) +{ + + tpmif_xenbus_exit(); + tpmif_interface_exit(); + misc_deregister(&ibmvtpms_miscdevice); +} + +module_exit(tpmback_exit); + +MODULE_LICENSE("Dual BSD/GPL"); /* * Local variables: diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c --- a/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmback/xenbus.c Wed Mar 1 19:47:25 2006 @@ -317,6 +317,11 @@ xenbus_register_backend(&tpmback); } +void tpmif_xenbus_exit(void) +{ + xenbus_unregister_driver(&tpmback); +} + /* * Local variables: * c-file-style: "linux" diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c --- a/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/tpmfront/tpmfront.c Wed Mar 1 19:47:25 2006 @@ -480,6 +480,11 @@ xenbus_register_frontend(&tpmfront); } +static void __exit exit_tpm_xenbus(void) +{ + xenbus_unregister_driver(&tpmfront); +} + static int tpm_allocate_buffers(struct tpm_private *tp) @@ -700,7 +705,18 @@ return 0; } -__initcall(tpmif_init); +module_init(tpmif_init); + +static void __exit +tpmif_exit(void) +{ + exit_tpm_xenbus(); + gnttab_free_grant_references(gref_head); +} + +module_exit(tpmif_exit); + +MODULE_LICENSE("Dual BSD/GPL"); /* * Local variables: diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h --- 
a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Mar 1 19:47:25 2006 @@ -23,11 +23,13 @@ unsigned short pad; } __attribute__ ((packed)); -extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; +extern struct Xgt_desc_struct idt_descr; +DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { - return ((struct desc_struct *)cpu_gdt_descr[cpu].address); + return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; } #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pci.h Wed Mar 1 19:47:25 2006 @@ -18,8 +18,6 @@ #define pcibios_assign_all_busses() 0 #endif #define pcibios_scan_all_fns(a, b) 0 - -extern int no_iommu, force_iommu; extern unsigned long pci_mem_start; #define PCIBIOS_MIN_IO 0x1000 diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h Wed Mar 1 19:47:25 2006 @@ -169,7 +169,7 @@ #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) #define FIRST_USER_ADDRESS 0 #ifndef __ASSEMBLY__ diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/linux/mm.h --- a/linux-2.6-xen-sparse/include/linux/mm.h Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/include/linux/mm.h Wed Mar 1 19:47:25 2006 @@ -1064,7 +1064,11 @@ void drop_pagecache(void); void drop_slab(void); +#ifndef CONFIG_MMU +#define randomize_va_space 0 +#else extern int randomize_va_space; +#endif #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/mm/page_alloc.c --- a/linux-2.6-xen-sparse/mm/page_alloc.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/mm/page_alloc.c Wed Mar 1 19:47:25 2006 @@ -1017,7 +1017,7 @@ if (page) goto got_pg; - out_of_memory(gfp_mask, order); + out_of_memory(zonelist, gfp_mask, order); goto restart; } diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/net/core/skbuff.c --- a/linux-2.6-xen-sparse/net/core/skbuff.c Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/net/core/skbuff.c Wed Mar 1 19:47:25 2006 @@ -434,6 +434,9 @@ C(pkt_type); C(ip_summed); C(priority); +#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) + C(ipvs_property); +#endif C(protocol); n->destructor = NULL; #ifdef CONFIG_NETFILTER @@ -441,13 +444,6 @@ C(nfct); nf_conntrack_get(skb->nfct); C(nfctinfo); -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) - C(nfct_reasm); - nf_conntrack_get_reasm(skb->nfct_reasm); -#endif -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) - C(ipvs_property); -#endif #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) C(nfct_reasm); nf_conntrack_get_reasm(skb->nfct_reasm); diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/Makefile --- a/tools/examples/Makefile Wed Mar 1 17:01:54 2006 +++ b/tools/examples/Makefile Wed Mar 1 19:47:25 2006 @@ -26,10 +26,11 @@ XEN_SCRIPTS += network-nat 
vif-nat XEN_SCRIPTS += block XEN_SCRIPTS += block-enbd block-nbd -XEN_SCRIPTS += vtpm -XEN_SCRIPT_DATA = xen-script-common.sh +XEN_SCRIPTS += vtpm vtpm-delete +XEN_SCRIPTS += xen-hotplug-cleanup +XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh -XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh +XEN_SCRIPT_DATA += block-common.sh vtpm-common.sh vtpm-hotplug-common.sh XEN_HOTPLUG_DIR = /etc/hotplug XEN_HOTPLUG_SCRIPTS = xen-backend.agent diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vif-common.sh --- a/tools/examples/vif-common.sh Wed Mar 1 17:01:54 2006 +++ b/tools/examples/vif-common.sh Wed Mar 1 19:47:25 2006 @@ -125,7 +125,7 @@ # function ip_of() { - ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed 's,/.*,,' | head -1 + ip addr show "$1" | awk "/^.*inet.*$1\$/{print \$2}" | sed -n '1 s,/.*,,p' } diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm --- a/tools/examples/vtpm Wed Mar 1 17:01:54 2006 +++ b/tools/examples/vtpm Wed Mar 1 19:47:25 2006 @@ -1,7 +1,7 @@ #!/bin/sh dir=$(dirname "$0") -. "$dir/vtpm-common.sh" +. "$dir/vtpm-hotplug-common.sh" vtpm_fatal_error=0 diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-common.sh --- a/tools/examples/vtpm-common.sh Wed Mar 1 17:01:54 2006 +++ b/tools/examples/vtpm-common.sh Wed Mar 1 19:47:25 2006 @@ -17,21 +17,8 @@ # dir=$(dirname "$0") -. "$dir/xen-hotplug-common.sh" - -findCommand "$@" -if [ "$command" != "online" ] && - [ "$command" != "offline" ] && - [ "$command" != "add" ] && - [ "$command" != "remove" ] -then - log err "Invalid command: $command" - exit 1 -fi - - -XENBUS_PATH="${XENBUS_PATH:?}" - +. "$dir/logging.sh" +. "$dir/locking.sh" VTPMDB="/etc/xen/vtpm.db" @@ -58,7 +45,11 @@ function vtpm_resume() { true } + function vtpm_delete() { + true + } fi + #Find the instance number for the vtpm given the name of the domain # Parameters @@ -66,7 +57,7 @@ # Return value # Returns '0' if instance number could not be found, otherwise # it returns the instance number in the variable 'instance' -function find_instance () { +function vtpmdb_find_instance () { local vmname=$1 local ret=0 instance=`cat $VTPMDB | \ @@ -80,18 +71,17 @@ } \ }'` if [ "$instance" != "" ]; then - ret=1 - fi - return $ret + ret=$instance + fi + echo "$ret" } # Check whether a particular instance number is still available -# returns '1' if it is available -function is_free_instancenum () { +# returns "0" if it is not available, "1" otherwise. +function vtpmdb_is_free_instancenum () { local instance=$1 local avail=1 - #Allowed instance number range: 1-255 if [ $instance -eq 0 -o $instance -gt 255 ]; then avail=0 @@ -110,13 +100,13 @@ fi done fi - return $avail + echo "$avail" } # Get an available instance number given the database # Returns an unused instance number -function get_free_instancenum () { +function vtpmdb_get_free_instancenum () { local ctr local instances local don @@ -145,12 +135,12 @@ fi let ctr=ctr+1 done - let instance=$ctr + echo "$ctr" } # Add a domain name and instance number to the DB file -function add_instance () { +function vtpmdb_add_instance () { local vmname=$1 local inst=$2 @@ -159,8 +149,8 @@ echo "#1st column: domain name" >> $VTPMDB echo "#2nd column: TPM instance number" >> $VTPMDB fi - validate_entry $vmname $inst - if [ $? 
-eq 0 ]; then + res=$(vtpmdb_validate_entry $vmname $inst) + if [ $res -eq 0 ]; then echo "$vmname $inst" >> $VTPMDB fi } @@ -168,11 +158,10 @@ #Validate whether an entry is the same as passed to this #function -function validate_entry () { +function vtpmdb_validate_entry () { local rc=0 local vmname=$1 local inst=$2 - local res res=`cat $VTPMDB | \ gawk -vvmname=$vmname \ @@ -197,13 +186,15 @@ elif [ "$res" == "2" ]; then let rc=2 fi - return $rc + echo "$rc" } #Remove an entry from the vTPM database given its domain name -function remove_entry () { +#and instance number +function vtpmdb_remove_entry () { local vmname=$1 + local instance=$2 local VTPMDB_TMP="$VTPMDB".tmp `cat $VTPMDB | \ gawk -vvmname=$vmname \ @@ -214,6 +205,7 @@ '} > $VTPMDB_TMP` if [ -e $VTPMDB_TMP ]; then mv -f $VTPMDB_TMP $VTPMDB + vtpm_delete $instance else log err "Error creating temporary file '$VTPMDB_TMP'." fi @@ -222,7 +214,7 @@ # Find the reason for the creation of this device: # Set global REASON variable to 'resume' or 'create' -function get_create_reason () { +function vtpm_get_create_reason () { local resume=$(xenstore-read $XENBUS_PATH/resume) if [ "$resume" == "True" ]; then REASON="resume" @@ -230,6 +222,7 @@ REASON="create" fi } + #Create a vTPM instance # If no entry in the TPM database is found, the instance is @@ -237,26 +230,23 @@ function vtpm_create_instance () { local domname=$(xenstore_read "$XENBUS_PATH"/domain) local res - set +e - get_create_reason + local instance + vtpm_get_create_reason claim_lock vtpmdb - - find_instance $domname - res=$? - if [ $res -eq 0 ]; then + instance=$(vtpmdb_find_instance $domname) + if [ "$instance" == "0" ]; then #Try to give the preferred instance to the domain instance=$(xenstore_read "$XENBUS_PATH"/pref_instance) if [ "$instance" != "" ]; then - is_free_instancenum $instance - res=$? + res=$(vtpmdb_is_free_instancenum $instance) if [ $res -eq 0 ]; then - get_free_instancenum + instance=$(vtpmdb_get_free_instancenum) fi else - get_free_instancenum + instance=$(vtpmdb_get_free_instancenum) fi - add_instance $domname $instance + vtpmdb_add_instance $domname $instance if [ "$REASON" == "create" ]; then vtpm_create $instance elif [ "$REASON" == "resume" ]; then @@ -279,25 +269,40 @@ true fi xenstore_write $XENBUS_PATH/instance $instance - set -e -} - - -#Remove an instance +} + + +#Remove an instance when a VM is terminating or suspending. +#Since it is assumed that the VM will appear again, the +#entry is kept in the VTPMDB file. function vtpm_remove_instance () { local domname=$(xenstore_read "$XENBUS_PATH"/domain) - set +e - find_instance $domname - res=$? 
- if [ $res -eq 0 ]; then - #Something is really wrong with the DB - log err "vTPM DB file $VTPMDB has no entry for '$domname'" - else + + claim_lock vtpmdb + + instance=$(vtpmdb_find_instance $domname) + + if [ "$instance" != "0" ]; then if [ "$REASON" == "suspend" ]; then vtpm_suspend $instance fi fi - set -e -} - - + + release_lock vtpmdb +} + + +#Remove an entry in the VTPMDB file given the domain's name +#1st parameter: The name of the domain +function vtpm_delete_instance () { + local rc + + claim_lock vtpmdb + + instance=$(vtpmdb_find_instance $1) + if [ "$instance" != "0" ]; then + vtpmdb_remove_entry $1 $instance + fi + + release_lock vtpmdb +} diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-backend.agent --- a/tools/examples/xen-backend.agent Wed Mar 1 17:01:54 2006 +++ b/tools/examples/xen-backend.agent Wed Mar 1 19:47:25 2006 @@ -18,12 +18,7 @@ add) ;; remove) - # remove device frontend store entries - xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true - - # remove device backend store entries - xenstore-rm -t "$XENBUS_PATH" || true - xenstore-rm -t "error/$XENBUS_PATH" || true + /etc/xen/scripts/xen-hotplug-cleanup ;; online) ;; diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-backend.rules --- a/tools/examples/xen-backend.rules Wed Mar 1 17:01:54 2006 +++ b/tools/examples/xen-backend.rules Wed Mar 1 19:47:25 2006 @@ -2,6 +2,4 @@ SUBSYSTEM=="xen-backend", KERNEL=="vtpm*", RUN+="/etc/xen/scripts/vtpm $env{ACTION}" SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="online", RUN+="$env{script} online" SUBSYSTEM=="xen-backend", KERNEL=="vif*", ACTION=="offline", RUN+="$env{script} offline" -SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/bin/bash -c '/usr/bin/xenstore-rm -t $$(/usr/bin/xenstore-read $env{XENBUS_PATH}/frontend)'" -SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t $env{XENBUS_PATH}" -SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/usr/bin/xenstore-rm -t error/$env{XENBUS_PATH}" +SUBSYSTEM=="xen-backend", ACTION=="remove", RUN+="/etc/xen/scripts/xen-hotplug-cleanup" diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-hotplug-common.sh --- a/tools/examples/xen-hotplug-common.sh Wed Mar 1 17:01:54 2006 +++ b/tools/examples/xen-hotplug-common.sh Wed Mar 1 19:47:25 2006 @@ -17,19 +17,15 @@ dir=$(dirname "$0") +. "$dir/logging.sh" . "$dir/xen-script-common.sh" +. 
"$dir/locking.sh" exec 2>>/var/log/xen-hotplug.log export PATH="/sbin:/bin:/usr/bin:/usr/sbin:$PATH" export LANG="POSIX" unset $(set | grep ^LC_ | cut -d= -f1) - -log() { - local level="$1" - shift - logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2 -} fatal() { xenstore_write "$XENBUS_PATH"/hotplug-status error @@ -93,87 +89,4 @@ } -# -# Serialisation -# - -LOCK_SLEEPTIME=1 -LOCK_SPINNING_RETRIES=5 -LOCK_RETRIES=10 -LOCK_BASEDIR=/var/run/xen-hotplug - - -claim_lock() -{ - local lockdir="$LOCK_BASEDIR/$1" - mkdir -p "$LOCK_BASEDIR" - _claim_lock "$lockdir" -} - - -release_lock() -{ - _release_lock "$LOCK_BASEDIR/$1" -} - - -_claim_lock() -{ - local lockdir="$1" - local owner=$(_lock_owner "$lockdir") - local retries=0 - - while [ $retries -lt $LOCK_RETRIES ] - do - mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR && - _update_lock_info "$lockdir" && return - - local new_owner=$(_lock_owner "$lockdir") - if [ "$new_owner" != "$owner" ] - then - owner="$new_owner" - retries=0 - fi - - if [ $retries -gt $LOCK_SPINNING_RETRIES ] - then - sleep $LOCK_SLEEPTIME - else - sleep 0 - fi - retries=$(($retries + 1)) - done - _steal_lock "$lockdir" -} - - -_release_lock() -{ - trap sigerr ERR - rm -rf "$1" 2>/dev/null || true -} - - -_steal_lock() -{ - local lockdir="$1" - local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown") - log err "Forced to steal lock on $lockdir from $owner!" - _release_lock "$lockdir" - _claim_lock "$lockdir" -} - - -_lock_owner() -{ - cat "$1/owner" 2>/dev/null || echo "unknown" -} - - -_update_lock_info() -{ - echo "$$: $0" >"$1/owner" -} - - log debug "$@" "XENBUS_PATH=$XENBUS_PATH" diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/hvmloader/Makefile --- a/tools/firmware/hvmloader/Makefile Wed Mar 1 17:01:54 2006 +++ b/tools/firmware/hvmloader/Makefile Wed Mar 1 19:47:25 2006 @@ -19,7 +19,7 @@ # XEN_ROOT = ../../.. -include $(XEN_ROOT)/tools/Rules.mk +include $(XEN_ROOT)/Config.mk # The HVM loader is started in 32-bit mode at the address below: LOADADDR = 0x100000 @@ -29,9 +29,13 @@ OBJECTS = hvmloader.o acpi_madt.o -CC = gcc +# Disable PIE/SSP if GCC supports them. They can break us. +CFLAGS += $(call test-gcc-flag,$(CC),-nopie) +CFLAGS += $(call test-gcc-flag,$(CC),-fno-stack-protector) +CFLAGS += $(call test-gcc-flag,$(CC),-fno-stack-protector-all) + OBJCOPY = objcopy -CFLAGS = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float +CFLAGS += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float CFLAGS += -m32 -march=i686 LDFLAGS = -m32 -nostdlib -Wl,-N -Wl,-Ttext -Wl,$(LOADADDR) diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/vgabios/Makefile --- a/tools/firmware/vgabios/Makefile Wed Mar 1 17:01:54 2006 +++ b/tools/firmware/vgabios/Makefile Wed Mar 1 19:47:25 2006 @@ -1,6 +1,4 @@ CC = gcc -CFLAGS = -g -O2 -Wall -Wstrict-prototypes -LDFLAGS = GCC = gcc BCC = bcc diff -r 88f97bb8f3ae -r 673f62edbfbe tools/firmware/vmxassist/Makefile --- a/tools/firmware/vmxassist/Makefile Wed Mar 1 17:01:54 2006 +++ b/tools/firmware/vmxassist/Makefile Wed Mar 1 19:47:25 2006 @@ -19,7 +19,7 @@ # XEN_ROOT = ../../.. -include $(XEN_ROOT)/tools/Rules.mk +include $(XEN_ROOT)/Config.mk # The emulator code lives in ROM space TEXTADDR=0x000D0000 @@ -27,11 +27,14 @@ DEFINES=-DDEBUG -DTEXTADDR=$(TEXTADDR) XENINC=-I$(XEN_ROOT)/tools/libxc -LD = ld -CC = gcc +# Disable PIE/SSP if GCC supports them. They can break us. 
+CFLAGS += $(call test-gcc-flag,$(CC),-nopie) +CFLAGS += $(call test-gcc-flag,$(CC),-fno-stack-protector) +CFLAGS += $(call test-gcc-flag,$(CC),-fno-stack-protector-all) + CPP = cpp -P OBJCOPY = objcopy -p -O binary -R .note -R .comment -R .bss -S --gap-fill=0 -CFLAGS = $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float +CFLAGS += $(DEFINES) -I. $(XENINC) -Wall -fno-builtin -O2 -msoft-float CFLAGS += -m32 -march=i686 LDFLAGS = -m elf_i386 diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/Makefile --- a/tools/ioemu/Makefile Wed Mar 1 17:01:54 2006 +++ b/tools/ioemu/Makefile Wed Mar 1 19:47:25 2006 @@ -1,6 +1,9 @@ +XEN_ROOT=../.. +include $(XEN_ROOT)/tools/Rules.mk + -include config-host.mak -CFLAGS=-Wall -O2 -g -fno-strict-aliasing +CFLAGS+=-Wall -O2 -g -fno-strict-aliasing ifdef CONFIG_DARWIN CFLAGS+= -mdynamic-no-pic endif diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/hw/ide.c --- a/tools/ioemu/hw/ide.c Wed Mar 1 17:01:54 2006 +++ b/tools/ioemu/hw/ide.c Wed Mar 1 19:47:25 2006 @@ -669,9 +669,6 @@ } if (s->io_buffer_index >= s->io_buffer_size && s->nsector == 0) { s->status = READY_STAT | SEEK_STAT; - s->bmdma->status &= ~BM_STATUS_DMAING; - s->bmdma->status |= BM_STATUS_INT; - ide_set_irq(s); #ifdef DEBUG_IDE_ATAPI printf("dma status=0x%x\n", s->status); #endif @@ -738,9 +735,6 @@ if (n == 0) { /* end of transfer */ s->status = READY_STAT | SEEK_STAT; - s->bmdma->status &= ~BM_STATUS_DMAING; - s->bmdma->status |= BM_STATUS_INT; - ide_set_irq(s); return 0; } if (n > MAX_MULT_SECTORS) @@ -987,9 +981,6 @@ if (s->packet_transfer_size <= 0) { s->status = READY_STAT; s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD; - s->bmdma->status &= ~BM_STATUS_DMAING; - s->bmdma->status |= BM_STATUS_INT; - ide_set_irq(s); #ifdef DEBUG_IDE_ATAPI printf("dma status=0x%x\n", s->status); #endif @@ -2025,6 +2016,17 @@ } } +static void ide_dma_finish(BMDMAState *bm) +{ + IDEState *s = bm->ide_if; + + bm->status &= ~BM_STATUS_DMAING; + bm->status |= BM_STATUS_INT; + bm->dma_cb = NULL; + bm->ide_if = NULL; + ide_set_irq(s); +} + /* XXX: full callback usage to prepare non blocking I/Os support - error handling */ #ifdef DMA_MULTI_THREAD @@ -2070,9 +2072,8 @@ cur_addr += 8; } /* end of transfer */ - the_end: - bm->dma_cb = NULL; - bm->ide_if = NULL; +the_end: + ide_dma_finish(bm); } static void ide_dma_start(IDEState *s, IDEDMAFunc *dma_cb) diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/hw/pcnet.c --- a/tools/ioemu/hw/pcnet.c Wed Mar 1 17:01:54 2006 +++ b/tools/ioemu/hw/pcnet.c Wed Mar 1 19:47:25 2006 @@ -376,6 +376,10 @@ if (s->recv_pos > 0) return 0; + pcnet_rdte_poll(s); + if (!(CSR_CRST(s) & 0x8000)) { + return 0; + } return sizeof(s->buffer)-16; } diff -r 88f97bb8f3ae -r 673f62edbfbe tools/ioemu/target-i386-dm/Makefile --- a/tools/ioemu/target-i386-dm/Makefile Wed Mar 1 17:01:54 2006 +++ b/tools/ioemu/target-i386-dm/Makefile Wed Mar 1 19:47:25 2006 @@ -1,7 +1,8 @@ +include config.mak +override TARGET_ARCH=i386 + XEN_ROOT=../../.. 
include $(XEN_ROOT)/tools/Rules.mk -include config.mak -override TARGET_ARCH=i386 INSTALL_DIR := $(DESTDIR)/usr/$(LIBDIR)/xen/bin TARGET_PATH=$(SRC_PATH)/target-$(TARGET_ARCH) @@ -12,7 +13,7 @@ VPATH+=:$(SRC_PATH)/linux-user DEFINES+=-I$(SRC_PATH)/linux-user -I$(SRC_PATH)/linux-user/$(TARGET_ARCH) endif -CFLAGS=-Wall -O2 -g -fno-strict-aliasing +CFLAGS+=-Wall -O2 -g -fno-strict-aliasing LDFLAGS=-g LIBS= HELPER_CFLAGS=$(CFLAGS) diff -r 88f97bb8f3ae -r 673f62edbfbe tools/libxc/xc_linux_build.c --- a/tools/libxc/xc_linux_build.c Wed Mar 1 17:01:54 2006 +++ b/tools/libxc/xc_linux_build.c Wed Mar 1 19:47:25 2006 @@ -45,6 +45,77 @@ #ifdef __ia64__ #define probe_aout9(image,image_size,load_funcs) 1 #endif + +static const char *feature_names[XENFEAT_NR_SUBMAPS*32] = { + [XENFEAT_writable_page_tables] = "writable_page_tables", + [XENFEAT_writable_descriptor_tables] = "writable_descriptor_tables", + [XENFEAT_auto_translated_physmap] = "auto_translated_physmap", + [XENFEAT_supervisor_mode_kernel] = "supervisor_mode_kernel", + [XENFEAT_pae_pgdir_above_4gb] = "pae_pgdir_above_4gb" +}; + +static inline void set_feature_bit (int nr, uint32_t *addr) +{ + addr[nr>>5] |= (1<<(nr&31)); +} + +static inline int test_feature_bit(int nr, uint32_t *addr) +{ + return !!(addr[nr>>5] & (1<<(nr&31))); +} + +static int parse_features( + const char *feats, + uint32_t supported[XENFEAT_NR_SUBMAPS], + uint32_t required[XENFEAT_NR_SUBMAPS]) +{ + const char *end, *p; + int i, req; + + if ( (end = strchr(feats, ',')) == NULL ) + end = feats + strlen(feats); + + while ( feats < end ) + { + p = strchr(feats, '|'); + if ( (p == NULL) || (p > end) ) + p = end; + + req = (*feats == '!'); + if ( req ) + feats++; + + for ( i = 0; i < XENFEAT_NR_SUBMAPS*32; i++ ) + { + if ( feature_names[i] == NULL ) + continue; + + if ( strncmp(feature_names[i], feats, p-feats) == 0 ) + { + set_feature_bit(i, supported); + if ( required && req ) + set_feature_bit(i, required); + break; + } + } + + if ( i == XENFEAT_NR_SUBMAPS*32 ) + { + ERROR("Unknown feature \"%.*s\".\n", (int)(p-feats), feats); + if ( req ) + { + ERROR("Kernel requires an unknown hypervisor feature.\n"); + return -EINVAL; + } + } + + feats = p; + if ( *feats == '|' ) + feats++; + } + + return -EINVAL; +} static int probeimageformat(char *image, unsigned long image_size, @@ -344,7 +415,8 @@ unsigned long shared_info_frame, unsigned long flags, unsigned int store_evtchn, unsigned long *store_mfn, - unsigned int console_evtchn, unsigned long *console_mfn) + unsigned int console_evtchn, unsigned long *console_mfn, + uint32_t required_features[XENFEAT_NR_SUBMAPS]) { unsigned long *page_array = NULL; struct load_funcs load_funcs; @@ -483,7 +555,8 @@ unsigned long shared_info_frame, unsigned long flags, unsigned int store_evtchn, unsigned long *store_mfn, - unsigned int console_evtchn, unsigned long *console_mfn) + unsigned int console_evtchn, unsigned long *console_mfn, + uint32_t required_features[XENFEAT_NR_SUBMAPS]) { unsigned long *page_array = NULL; unsigned long count, i, hypercall_pfn; @@ -515,8 +588,9 @@ unsigned long vpt_start; unsigned long vpt_end; unsigned long v_end; - unsigned shadow_mode_enabled; unsigned long guest_store_mfn, guest_console_mfn, guest_shared_info_mfn; + unsigned long shadow_mode_enabled; + uint32_t supported_features[XENFEAT_NR_SUBMAPS] = { 0, }; rc = probeimageformat(image, image_size, &load_funcs); if ( rc != 0 ) @@ -534,8 +608,6 @@ goto error_out; } - shadow_mode_enabled = !!strstr(dsi.xen_guest_string, - "SHADOW=translate"); /* * Why do we need 
this? The number of page-table frames depends on the * size of the bootstrap address space. But the size of the address space @@ -637,6 +709,35 @@ (load_funcs.loadimage)(image, image_size, xc_handle, dom, page_array, &dsi); + /* Parse and validate kernel features. */ + p = strstr(dsi.xen_guest_string, "FEATURES="); + if ( p != NULL ) + { + if ( !parse_features(p + strlen("FEATURES="), + supported_features, + required_features) ) + { + ERROR("Failed to parse guest kernel features.\n"); + goto error_out; + } + + fprintf(stderr, "Supported features = { %08x }.\n", + supported_features[0]); + fprintf(stderr, "Required features = { %08x }.\n", + required_features[0]); + } + + for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ ) + { + if ( (supported_features[i]&required_features[i]) != required_features[i] ) + { + ERROR("Guest kernel does not support a required feature.\n"); + goto error_out; + } + } + + shadow_mode_enabled = test_feature_bit(XENFEAT_auto_translated_physmap, required_features); + /* Load the initial ramdisk image. */ if ( initrd_len != 0 ) { @@ -870,6 +971,7 @@ const char *image_name, const char *ramdisk_name, const char *cmdline, + const char *features, unsigned long flags, unsigned int store_evtchn, unsigned long *store_mfn, @@ -886,6 +988,16 @@ char *image = NULL; unsigned long image_size, initrd_size=0; unsigned long vstartinfo_start, vkern_entry, vstack_start; + uint32_t features_bitmap[XENFEAT_NR_SUBMAPS] = { 0, }; + + if ( features != NULL ) + { + if ( !parse_features(features, features_bitmap, NULL) ) + { + PERROR("Failed to parse configured features\n"); + goto error_out; + } + } if ( (nr_pages = get_tot_pages(xc_handle, domid)) < 0 ) { @@ -940,7 +1052,8 @@ &vstack_start, ctxt, cmdline, op.u.getdomaininfo.shared_info_frame, flags, store_evtchn, store_mfn, - console_evtchn, console_mfn) < 0 ) + console_evtchn, console_mfn, + features_bitmap) < 0 ) { ERROR("Error constructing guest OS"); goto error_out; diff -r 88f97bb8f3ae -r 673f62edbfbe tools/libxc/xenguest.h --- a/tools/libxc/xenguest.h Wed Mar 1 17:01:54 2006 +++ b/tools/libxc/xenguest.h Wed Mar 1 19:47:25 2006 @@ -47,6 +47,7 @@ const char *image_name, const char *ramdisk_name, const char *cmdline, + const char *features, unsigned long flags, unsigned int store_evtchn, unsigned long *store_mfn, diff -r 88f97bb8f3ae -r 673f62edbfbe tools/pygrub/src/pygrub --- a/tools/pygrub/src/pygrub Wed Mar 1 17:01:54 2006 +++ b/tools/pygrub/src/pygrub Wed Mar 1 19:47:25 2006 @@ -94,11 +94,17 @@ return struct.unpack("<L", buf[poff+8:poff+12])[0] * SECTOR_SIZE return -1 -def get_config(fn): +def get_config(fn, isconfig = False): if not os.access(fn, os.R_OK): raise RuntimeError, "Unable to access %s" %(fn,) cf = grub.GrubConf.GrubConfigFile() + + if isconfig: + # set the config file and parse it + cf.filename = fn + cf.parse() + return cf offset = 0 if is_disk_image(fn): @@ -130,9 +136,7 @@ # then parse the grub config cf.parse(buf) else: - # set the config file and parse it - cf.filename = fn - cf.parse() + raise RuntimeError, "Unable to read filesystem" return cf @@ -214,7 +218,8 @@ try: opts, args = getopt.gnu_getopt(sys.argv[1:], 'qh::', - ["quiet", "help", "output=", "entry="]) + ["quiet", "help", "output=", "entry=", + "isconfig"]) except getopt.GetoptError: usage() sys.exit(1) @@ -227,6 +232,7 @@ output = None entry = None interactive = True + isconfig = False for o, a in opts: if o in ("-q", "--quiet"): interactive = False @@ -239,13 +245,15 @@ entry = a # specifying the entry to boot implies non-interactive interactive = False + elif 
o in ("--isconfig",): + isconfig = True if output is None or output == "-": fd = sys.stdout.fileno() else: fd = os.open(output, os.O_WRONLY) - cf = get_config(file) + cf = get_config(file, isconfig) if interactive: curses.wrapper(run_main) else: diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Wed Mar 1 17:01:54 2006 +++ b/tools/python/xen/lowlevel/xc/xc.c Wed Mar 1 19:47:25 2006 @@ -326,27 +326,29 @@ PyObject *kwds) { uint32_t dom; - char *image, *ramdisk = NULL, *cmdline = ""; + char *image, *ramdisk = NULL, *cmdline = "", *features = NULL; int flags = 0; int store_evtchn, console_evtchn; unsigned long store_mfn = 0; unsigned long console_mfn = 0; - static char *kwd_list[] = { "dom", "store_evtchn", - "console_evtchn", "image", + static char *kwd_list[] = { "dom", "store_evtchn", + "console_evtchn", "image", /* optional */ - "ramdisk", "cmdline", "flags", NULL }; - - if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssi", kwd_list, + "ramdisk", "cmdline", "flags", + "features", NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiis|ssis", kwd_list, &dom, &store_evtchn, - &console_evtchn, &image, + &console_evtchn, &image, /* optional */ - &ramdisk, &cmdline, &flags) ) + &ramdisk, &cmdline, &flags, + &features) ) return NULL; if ( xc_linux_build(self->xc_handle, dom, image, - ramdisk, cmdline, flags, - store_evtchn, &store_mfn, + ramdisk, cmdline, features, flags, + store_evtchn, &store_mfn, console_evtchn, &console_mfn) != 0 ) { if (!errno) errno = EINVAL; diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/XendBootloader.py --- a/tools/python/xen/xend/XendBootloader.py Wed Mar 1 17:01:54 2006 +++ b/tools/python/xen/xend/XendBootloader.py Wed Mar 1 19:47:25 2006 @@ -1,7 +1,7 @@ # # XendBootloader.py - Framework to run a boot loader for picking the kernel # -# Copyright 2005 Red Hat, Inc. +# Copyright 2005-2006 Red Hat, Inc. # Jeremy Katz <katzj@xxxxxxxxxx> # # This software may be freely redistributed under the terms of the GNU @@ -13,12 +13,11 @@ # import os, select, errno +import random import sxp from XendLogging import log from XendError import VmError - -BL_FIFO = "/var/lib/xen/xenbl" def bootloader(blexec, disk, quiet = 0, vcpus = None, entry = None): """Run the boot loader executable on the given disk and return a @@ -38,14 +37,18 @@ log.error(msg) raise VmError(msg) - os.mkfifo(BL_FIFO, 0600) + while True: + fifo = "/var/lib/xen/xenbl.%s" %(random.randint(0, 32000),) + if not os.path.exists(fifo): + break + os.mkfifo(fifo, 0600) child = os.fork() if (not child): args = [ blexec ] if quiet: args.append("-q") - args.append("--output=%s" %(BL_FIFO,)) + args.append("--output=%s" %(fifo,)) if entry is not None: args.append("--entry=%s" %(entry,)) args.append(disk) @@ -59,7 +62,7 @@ while 1: try: - r = os.open(BL_FIFO, os.O_RDONLY) + r = os.open(fifo, os.O_RDONLY) except OSError, e: if e.errno == errno.EINTR: continue @@ -74,7 +77,7 @@ os.waitpid(child, 0) os.close(r) - os.unlink(BL_FIFO) + os.unlink(fifo) if len(ret) == 0: msg = "Boot loader didn't return any data!" 
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Wed Mar 1 17:01:54 2006 +++ b/tools/python/xen/xend/XendDomainInfo.py Wed Mar 1 19:47:25 2006 @@ -1502,15 +1502,14 @@ if not self.info['bootloader']: return # if we're restarting with a bootloader, we need to run it - # FIXME: this assumes the disk is the first device and - # that we're booting from the first disk blcfg = None config = self.sxpr() # FIXME: this assumes that we want to use the first disk - dev = sxp.child_value(config, "device") - if dev: - disk = sxp.child_value(dev, "uname") - fn = blkdev_uname_to_file(disk) + for dev in sxp.children(config, "device"): + disk = sxp.child(dev, "vbd") + if disk is None: + continue + fn = blkdev_uname_to_file(sxp.child_value(disk, "uname")) blcfg = bootloader(self.info['bootloader'], fn, 1, self.info['vcpus']) if blcfg is None: diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Wed Mar 1 17:01:54 2006 +++ b/tools/python/xen/xend/image.py Wed Mar 1 19:47:25 2006 @@ -68,6 +68,7 @@ self.kernel = None self.ramdisk = None self.cmdline = None + self.features = None self.configure(imageConfig, deviceConfig) @@ -89,6 +90,7 @@ if args: self.cmdline += " " + args self.ramdisk = get_cfg("ramdisk", '') + self.features = get_cfg("features", '') self.vm.storeVm(("image/ostype", self.ostype), ("image/kernel", self.kernel), @@ -175,13 +177,15 @@ log.debug("cmdline = %s", self.cmdline) log.debug("ramdisk = %s", self.ramdisk) log.debug("vcpus = %d", self.vm.getVCpuCount()) + log.debug("features = %s", self.features) return xc.linux_build(dom = self.vm.getDomid(), image = self.kernel, store_evtchn = store_evtchn, console_evtchn = console_evtchn, cmdline = self.cmdline, - ramdisk = self.ramdisk) + ramdisk = self.ramdisk, + features = self.features) class HVMImageHandler(ImageHandler): diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xend/server/netif.py --- a/tools/python/xen/xend/server/netif.py Wed Mar 1 17:01:54 2006 +++ b/tools/python/xen/xend/server/netif.py Wed Mar 1 19:47:25 2006 @@ -113,7 +113,8 @@ script.replace(xroot.network_script_dir + os.sep, "")]) if ip: - result.append(['ip', ip.split(" ")]) + for i in ip.split(" "): + result.append(['ip', i]) if bridge: result.append(['bridge', bridge]) if mac: diff -r 88f97bb8f3ae -r 673f62edbfbe tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Wed Mar 1 17:01:54 2006 +++ b/tools/python/xen/xm/create.py Wed Mar 1 19:47:25 2006 @@ -137,6 +137,10 @@ fn=set_value, default='', use="Path to ramdisk.") +gopts.var('features', val='FEATURES', + fn=set_value, default='', + use="Features to enable in guest kernel") + gopts.var('builder', val='FUNCTION', fn=set_value, default='linux', use="Function to use to build the domain.") @@ -445,6 +449,8 @@ config_image.append(['root', cmdline_root]) if vals.extra: config_image.append(['args', vals.extra]) + if vals.features: + config_image.append(['features', vals.features]) if vals.builder == 'hvm': configure_hvm(config_image, vals) diff -r 88f97bb8f3ae -r 673f62edbfbe tools/tests/Makefile --- a/tools/tests/Makefile Wed Mar 1 17:01:54 2006 +++ b/tools/tests/Makefile Wed Mar 1 19:47:25 2006 @@ -4,13 +4,12 @@ TARGET := test_x86_emulator -CC := gcc -CFLAGS := -O2 -Wall -Werror -D__TEST_HARNESS__ +HOSTCFLAGS += -D__TEST_HARNESS__ all: $(TARGET) $(TARGET): x86_emulate.o test_x86_emulator.o - $(CC) -o $@ $^ + $(HOSTCC) -o $@ $^ clean: rm -rf $(TARGET) *.o *~ core @@ -18,7 +17,7 @@ install: 
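The new "features" option threads a string such as "writable_page_tables|!supervisor_mode_kernel" from xm create through image.py and xc.linux_build down to parse_features() above, where a leading '!' marks a feature as required rather than merely supported. A worked example of the bitmap helpers, with XENFEAT_* indices as in xen/include/public/features.h:

    #include <stdint.h>
    #include <stdio.h>

    #define XENFEAT_writable_page_tables    0
    #define XENFEAT_supervisor_mode_kernel  3

    static void set_feature_bit(int nr, uint32_t *addr)
    {
        addr[nr >> 5] |= 1u << (nr & 31);
    }

    int main(void)
    {
        uint32_t supported[1] = { 0 }, required[1] = { 0 };

        /* "writable_page_tables|!supervisor_mode_kernel":
         * '!' sets the bit in both maps, plain names only in supported. */
        set_feature_bit(XENFEAT_writable_page_tables, supported);
        set_feature_bit(XENFEAT_supervisor_mode_kernel, supported);
        set_feature_bit(XENFEAT_supervisor_mode_kernel, required);

        /* prints supported=00000009 required=00000008; the builder then
         * insists (supported & required) == required on each submap */
        printf("supported=%08x required=%08x\n", supported[0], required[0]);
        return 0;
    }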
x86_emulate.o: $(XEN_ROOT)/xen/arch/x86/x86_emulate.c - $(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $< + $(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $< %.o: %.c - $(CC) $(CFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $< + $(HOSTCC) $(HOSTCFLAGS) -I$(XEN_ROOT)/xen/include -c -o $@ $< diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xenstore/xs.c --- a/tools/xenstore/xs.c Wed Mar 1 17:01:54 2006 +++ b/tools/xenstore/xs.c Wed Mar 1 19:47:25 2006 @@ -31,7 +31,6 @@ #include <signal.h> #include <stdint.h> #include <errno.h> -#include <sys/ioctl.h> #include <pthread.h> #include "xs.h" #include "list.h" @@ -343,7 +342,6 @@ free(ret); saved_errno = EBADF; goto close_fd; - } return ret; diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/configure.ac --- a/tools/xm-test/configure.ac Wed Mar 1 17:01:54 2006 +++ b/tools/xm-test/configure.ac Wed Mar 1 19:47:25 2006 @@ -93,6 +93,7 @@ tests/unpause/Makefile tests/vcpu-pin/Makefile tests/vcpu-disable/Makefile + tests/vtpm/Makefile tests/enforce_dom0_cpus/Makefile lib/XmTestReport/xmtest.py lib/XmTestLib/config.py diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/lib/XmTestLib/Network.py --- a/tools/xm-test/lib/XmTestLib/Network.py Wed Mar 1 17:01:54 2006 +++ b/tools/xm-test/lib/XmTestLib/Network.py Wed Mar 1 19:47:25 2006 @@ -22,6 +22,7 @@ import sys; import os; import atexit; +import random; from Test import * from Xm import * @@ -53,12 +54,22 @@ if rc == 0: SKIP("Zeroconf address found: " + out) + # Randomize one octet of the IP addresses we choose, so that + # multiple machines running network tests don't interfere + # with each other. + self.subnet = random.randint(1,254) + def calc_ip_address(self, dom, interface): # Generate an IP address from the dom# and eth#: - # 169.254.(eth#+153).(dom#+10) + # 169.254.(self.subnet).(eth#)*16 + (dom# + 1) ethnum = int(interface[len("eth"):]) + if (ethnum > 15): + raise NetworkError("ethnum > 15 : " + interface) domnum = int(dom[len("dom"):]) - return "169.254."+ str(ethnum+153) + "." + str(domnum+10) + if (domnum > 14): + raise NetworkError("domnum > 14 : " + dom) + + return "169.254."+ str(self.subnet) + "." 
+ str(ethnum*16+domnum+1) def ip(self, dom, interface, todomname=None, toeth=None, bridge=None): newip = self.calc_ip_address(dom, interface) @@ -96,4 +107,4 @@ return newip def mask(self, dom, interface): - return "255.255.255.0" + return "255.255.255.240" diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/lib/XmTestLib/XenDomain.py --- a/tools/xm-test/lib/XmTestLib/XenDomain.py Wed Mar 1 17:01:54 2006 +++ b/tools/xm-test/lib/XmTestLib/XenDomain.py Wed Mar 1 19:47:25 2006 @@ -99,6 +99,7 @@ # These options need to be lists self.defaultOpts["disk"] = [] self.defaultOpts["vif"] = [] + self.defaultOpts["vtpm"] = [] self.opts = self.defaultOpts diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/Makefile.am --- a/tools/xm-test/tests/Makefile.am Wed Mar 1 17:01:54 2006 +++ b/tools/xm-test/tests/Makefile.am Wed Mar 1 19:47:25 2006 @@ -23,6 +23,7 @@ unpause \ vcpu-disable \ vcpu-pin \ + vtpm \ enforce_dom0_cpus \ save restore migrate diff -r 88f97bb8f3ae -r 673f62edbfbe xen/Rules.mk --- a/xen/Rules.mk Wed Mar 1 17:01:54 2006 +++ b/xen/Rules.mk Wed Mar 1 19:47:25 2006 @@ -45,7 +45,7 @@ include $(BASEDIR)/arch/$(TARGET_ARCH)/Rules.mk -CFLAGS += -g +CFLAGS += -g -D__XEN__ ifneq ($(debug),y) CFLAGS += -DNDEBUG diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/ia64/vmx/vmx_hypercall.c --- a/xen/arch/ia64/vmx/vmx_hypercall.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/ia64/vmx/vmx_hypercall.c Wed Mar 1 19:47:25 2006 @@ -57,45 +57,7 @@ vcpu_set_gr(vcpu, 8, ret, 0); vmx_vcpu_increment_iip(vcpu); } -/* turn off temporarily, we will merge hypercall parameter convention with xeno, when - VTI domain need to call hypercall */ -#if 0 -unsigned long __hypercall_create_continuation( - unsigned int op, unsigned int nr_args, ...) -{ - struct mc_state *mcs = &mc_state[smp_processor_id()]; - VCPU *vcpu = current; - struct cpu_user_regs *regs = vcpu_regs(vcpu); - unsigned int i; - va_list args; - - va_start(args, nr_args); - if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) { - panic("PREEMPT happen in multicall\n"); // Not support yet - } else { - vcpu_set_gr(vcpu, 15, op, 0); - for ( i = 0; i < nr_args; i++) { - switch (i) { - case 0: vcpu_set_gr(vcpu, 16, va_arg(args, unsigned long), 0); - break; - case 1: vcpu_set_gr(vcpu, 17, va_arg(args, unsigned long), 0); - break; - case 2: vcpu_set_gr(vcpu, 18, va_arg(args, unsigned long), 0); - break; - case 3: vcpu_set_gr(vcpu, 19, va_arg(args, unsigned long), 0); - break; - case 4: vcpu_set_gr(vcpu, 20, va_arg(args, unsigned long), 0); - break; - default: panic("Too many args for hypercall continuation\n"); - break; - } - } - } - vcpu->arch.hypercall_continuation = 1; - va_end(args); - return op; -} -#endif + void hyper_dom_mem_op(void) { VCPU *vcpu=current; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/ia64/xen/process.c --- a/xen/arch/ia64/xen/process.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/ia64/xen/process.c Wed Mar 1 19:47:25 2006 @@ -801,30 +801,48 @@ reflect_interruption(isr,regs,vector); } -unsigned long __hypercall_create_continuation( - unsigned int op, unsigned int nr_args, ...) +unsigned long hypercall_create_continuation( + unsigned int op, const char *format, ...) 
{ struct mc_state *mcs = &mc_state[smp_processor_id()]; struct vcpu *v = current; + const char *p = format; + unsigned long arg; unsigned int i; va_list args; - va_start(args, nr_args); + va_start(args, format); if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) { panic("PREEMPT happen in multicall\n"); // Not support yet } else { vcpu_set_gr(v, 2, op, 0); - for ( i = 0; i < nr_args; i++) { + for ( i = 0; *p != '\0'; i++) { + switch ( *p++ ) + { + case 'i': + arg = (unsigned long)va_arg(args, unsigned int); + break; + case 'l': + arg = (unsigned long)va_arg(args, unsigned long); + break; + case 'p': + case 'h': + arg = (unsigned long)va_arg(args, void *); + break; + default: + arg = 0; + BUG(); + } switch (i) { - case 0: vcpu_set_gr(v, 14, va_arg(args, unsigned long), 0); + case 0: vcpu_set_gr(v, 14, arg, 0); break; - case 1: vcpu_set_gr(v, 15, va_arg(args, unsigned long), 0); + case 1: vcpu_set_gr(v, 15, arg, 0); break; - case 2: vcpu_set_gr(v, 16, va_arg(args, unsigned long), 0); + case 2: vcpu_set_gr(v, 16, arg, 0); break; - case 3: vcpu_set_gr(v, 17, va_arg(args, unsigned long), 0); + case 3: vcpu_set_gr(v, 17, arg, 0); break; - case 4: vcpu_set_gr(v, 18, va_arg(args, unsigned long), 0); + case 4: vcpu_set_gr(v, 18, arg, 0); break; default: panic("Too many args for hypercall continuation\n"); break; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/Makefile --- a/xen/arch/x86/Makefile Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/Makefile Wed Mar 1 19:47:25 2006 @@ -33,6 +33,10 @@ endif endif +ifneq ($(supervisor_mode_kernel),y) +OBJS := $(subst x86_32/supervisor_mode_kernel.o,,$(OBJS)) +endif + OBJS := $(subst $(TARGET_SUBARCH)/asm-offsets.o,,$(OBJS)) OBJS := $(subst $(TARGET_SUBARCH)/xen.lds.o,,$(OBJS)) @@ -44,7 +48,7 @@ $(TARGET): $(TARGET)-syms boot/mkelf32 ./boot/mkelf32 $(TARGET)-syms $(TARGET) 0x100000 \ - `nm $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'` + `$(NM) $(TARGET)-syms | sort | tail -n 1 | sed -e 's/^\([^ ]*\).*/0x\1/'` $(CURDIR)/arch.o: $(OBJS) $(LD) $(LDFLAGS) -r -o $@ $(OBJS) diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/Rules.mk --- a/xen/arch/x86/Rules.mk Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/Rules.mk Wed Mar 1 19:47:25 2006 @@ -6,6 +6,7 @@ # 'make clean' before rebuilding. 
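Across both architectures, hypercall_create_continuation() drops its nr_args count in favour of a printf-style format string: 'i' is an int, 'l' a long, and 'p'/'h' a pointer or guest handle, decoded by the per-arch switch above. A hedged sketch of how a preemptible hypercall would use the new signature; the op, loop bound and arguments are illustrative, not taken from this patch:

    /* Sketch: re-queue ourselves with whatever is needed to resume. */
    static long do_long_running_op(unsigned long start, unsigned long end,
                                   void *guest_buf)
    {
        for ( ; start < end; start++ )
        {
            if ( hypercall_preempt_check() )
                /* "llh": two longs and a handle, matching the params */
                return hypercall_create_continuation(
                    __HYPERVISOR_memory_op, "llh", start, end, guest_buf);
            /* ... process one unit of work ... */
        }
        return 0;
    }

Because the format string now describes each argument's width, the continuation code can re-load 32-bit ints and pointer-sized handles correctly on x86_64 instead of treating everything as unsigned long.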
# pae ?= n +supervisor_mode_kernel ?= n CFLAGS += -nostdinc -fno-builtin -fno-common -fno-strict-aliasing CFLAGS += -iwithprefix include -Wall -Werror -Wno-pointer-arith -pipe @@ -32,6 +33,9 @@ CFLAGS += -DCONFIG_X86_PAE=1 endif endif +ifeq ($(supervisor_mode_kernel),y) +CFLAGS += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1 +endif ifeq ($(TARGET_SUBARCH),x86_64) CFLAGS += -m64 -mno-red-zone -fpic -fno-reorder-blocks diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/boot/mkelf32.c --- a/xen/arch/x86/boot/mkelf32.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/boot/mkelf32.c Wed Mar 1 19:47:25 2006 @@ -244,7 +244,7 @@ inimage = argv[1]; outimage = argv[2]; - loadbase = strtoul(argv[3], NULL, 16); + loadbase = strtoull(argv[3], NULL, 16); final_exec_addr = strtoul(argv[4], NULL, 16); infd = open(inimage, O_RDONLY); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/dom0_ops.c --- a/xen/arch/x86/dom0_ops.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/dom0_ops.c Wed Mar 1 19:47:25 2006 @@ -181,10 +181,13 @@ { dom0_physinfo_t *pi = &op->u.physinfo; - pi->threads_per_core = smp_num_siblings; - pi->cores_per_socket = boot_cpu_data.x86_max_cores; + pi->threads_per_core = + cpus_weight(cpu_sibling_map[0]); + pi->cores_per_socket = + cpus_weight(cpu_core_map[0]) / pi->threads_per_core; pi->sockets_per_node = - num_online_cpus() / (pi->threads_per_core * pi->cores_per_socket); + num_online_cpus() / cpus_weight(cpu_core_map[0]); + pi->nr_nodes = 1; pi->total_pages = total_pages; pi->free_pages = avail_domheap_pages(); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/domain.c Wed Mar 1 19:47:25 2006 @@ -351,17 +351,17 @@ if ( !(c->flags & VGCF_HVM_GUEST) ) { - fixup_guest_selector(c->user_regs.ss); - fixup_guest_selector(c->kernel_ss); - fixup_guest_selector(c->user_regs.cs); + fixup_guest_stack_selector(c->user_regs.ss); + fixup_guest_stack_selector(c->kernel_ss); + fixup_guest_code_selector(c->user_regs.cs); #ifdef __i386__ - fixup_guest_selector(c->event_callback_cs); - fixup_guest_selector(c->failsafe_callback_cs); + fixup_guest_code_selector(c->event_callback_cs); + fixup_guest_code_selector(c->failsafe_callback_cs); #endif for ( i = 0; i < 256; i++ ) - fixup_guest_selector(c->trap_ctxt[i].cs); + fixup_guest_code_selector(c->trap_ctxt[i].cs); } else if ( !hvm_enabled ) return -EINVAL; @@ -784,6 +784,11 @@ context_saved(prev); + /* Update per-VCPU guest runstate shared memory area (if registered). */ + if ( next->runstate_guest != NULL ) + __copy_to_user(next->runstate_guest, &next->runstate, + sizeof(next->runstate)); + schedule_tail(next); BUG(); } @@ -820,56 +825,77 @@ flush_tlb_mask(v->vcpu_dirty_cpumask); } -unsigned long __hypercall_create_continuation( - unsigned int op, unsigned int nr_args, ...) +#define next_arg(fmt, args) ({ \ + unsigned long __arg; \ + switch ( *(fmt)++ ) \ + { \ + case 'i': __arg = (unsigned long)va_arg(args, unsigned int); break; \ + case 'l': __arg = (unsigned long)va_arg(args, unsigned long); break; \ + case 'p': __arg = (unsigned long)va_arg(args, void *); break; \ + case 'h': __arg = (unsigned long)va_arg(args, void *); break; \ + default: __arg = 0; BUG(); \ + } \ + __arg; \ +}) + +unsigned long hypercall_create_continuation( + unsigned int op, const char *format, ...) 
{ struct mc_state *mcs = &mc_state[smp_processor_id()]; struct cpu_user_regs *regs; + const char *p = format; + unsigned long arg; unsigned int i; va_list args; - va_start(args, nr_args); + va_start(args, format); if ( test_bit(_MCSF_in_multicall, &mcs->flags) ) { __set_bit(_MCSF_call_preempted, &mcs->flags); - for ( i = 0; i < nr_args; i++ ) - mcs->call.args[i] = va_arg(args, unsigned long); + for ( i = 0; *p != '\0'; i++ ) + mcs->call.args[i] = next_arg(p, args); } else { regs = guest_cpu_user_regs(); #if defined(__i386__) regs->eax = op; - regs->eip -= 2; /* re-execute 'int 0x82' */ - - for ( i = 0; i < nr_args; i++ ) - { + + if ( supervisor_mode_kernel ) + regs->eip &= ~31; /* re-execute entire hypercall entry stub */ + else + regs->eip -= 2; /* re-execute 'int 0x82' */ + + for ( i = 0; *p != '\0'; i++ ) + { + arg = next_arg(p, args); switch ( i ) { - case 0: regs->ebx = va_arg(args, unsigned long); break; - case 1: regs->ecx = va_arg(args, unsigned long); break; - case 2: regs->edx = va_arg(args, unsigned long); break; - case 3: regs->esi = va_arg(args, unsigned long); break; - case 4: regs->edi = va_arg(args, unsigned long); break; - case 5: regs->ebp = va_arg(args, unsigned long); break; + case 0: regs->ebx = arg; break; + case 1: regs->ecx = arg; break; + case 2: regs->edx = arg; break; + case 3: regs->esi = arg; break; + case 4: regs->edi = arg; break; + case 5: regs->ebp = arg; break; } } #elif defined(__x86_64__) regs->rax = op; regs->rip -= 2; /* re-execute 'syscall' */ - for ( i = 0; i < nr_args; i++ ) - { + for ( i = 0; *p != '\0'; i++ ) + { + arg = next_arg(p, args); switch ( i ) { - case 0: regs->rdi = va_arg(args, unsigned long); break; - case 1: regs->rsi = va_arg(args, unsigned long); break; - case 2: regs->rdx = va_arg(args, unsigned long); break; - case 3: regs->r10 = va_arg(args, unsigned long); break; - case 4: regs->r8 = va_arg(args, unsigned long); break; - case 5: regs->r9 = va_arg(args, unsigned long); break; + case 0: regs->rdi = arg; break; + case 1: regs->rsi = arg; break; + case 2: regs->rdx = arg; break; + case 3: regs->r10 = arg; break; + case 4: regs->r8 = arg; break; + case 5: regs->r9 = arg; break; } } #endif diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/domain_build.c Wed Mar 1 19:47:25 2006 @@ -27,6 +27,9 @@ #include <asm/shadow.h> #include <public/version.h> + +extern unsigned long initial_images_nrpages(void); +extern void discard_initial_images(void); static long dom0_nrpages; @@ -181,7 +184,8 @@ { printk("Unknown kernel feature \"%.*s\".\n", (int)(p-feats), feats); - panic("Domain 0 requires an unknown hypervisor feature.\n"); + if ( req ) + panic("Domain 0 requires an unknown hypervisor feature.\n"); } feats = p; @@ -248,9 +252,6 @@ uint32_t dom0_features_supported[XENFEAT_NR_SUBMAPS] = { 0 }; uint32_t dom0_features_required[XENFEAT_NR_SUBMAPS] = { 0 }; - extern void translate_l2pgtable( - struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn); - /* Sanity! */ BUG_ON(d->domain_id != 0); BUG_ON(d->vcpu[0] == NULL); @@ -271,18 +272,14 @@ */ if ( dom0_nrpages == 0 ) { - dom0_nrpages = avail_domheap_pages() + - ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) + - ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT); + dom0_nrpages = avail_domheap_pages() + initial_images_nrpages(); dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT)); dom0_nrpages = -dom0_nrpages; } /* Negative memory specification means "all memory - specified amount". 
*/ if ( dom0_nrpages < 0 ) - nr_pages = avail_domheap_pages() + - ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) + - ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT) + + nr_pages = avail_domheap_pages() + initial_images_nrpages() + dom0_nrpages; else nr_pages = dom0_nrpages; @@ -704,16 +701,12 @@ hypercall_page_initialise((void *)hypercall_page); } - init_domheap_pages( - _image_start, (_image_start+image_len+PAGE_SIZE-1) & PAGE_MASK); - - /* Copy the initial ramdisk and free temporary buffer. */ + /* Copy the initial ramdisk. */ if ( initrd_len != 0 ) - { memcpy((void *)vinitrd_start, initrd_start, initrd_len); - init_domheap_pages( - _initrd_start, (_initrd_start+initrd_len+PAGE_SIZE-1) & PAGE_MASK); - } + + /* Free temporary buffers. */ + discard_initial_images(); /* Set up start info area. */ si = (start_info_t *)vstartinfo_start; @@ -790,6 +783,25 @@ { shadow_mode_enable(d, SHM_enable); update_pagetables(v); + } + + if ( supervisor_mode_kernel ) + { + v->arch.guest_context.kernel_ss &= ~3; + v->arch.guest_context.user_regs.ss &= ~3; + v->arch.guest_context.user_regs.es &= ~3; + v->arch.guest_context.user_regs.ds &= ~3; + v->arch.guest_context.user_regs.fs &= ~3; + v->arch.guest_context.user_regs.gs &= ~3; + printk("Dom0 runs in ring 0 (supervisor mode)\n"); + if ( !test_bit(XENFEAT_supervisor_mode_kernel, + dom0_features_supported) ) + panic("Dom0 does not support supervisor-mode execution\n"); + } + else + { + if ( test_bit(XENFEAT_supervisor_mode_kernel, dom0_features_required) ) + panic("Dom0 requires supervisor-mode execution\n"); } rc = 0; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/hvm/hvm.c Wed Mar 1 19:47:25 2006 @@ -25,6 +25,7 @@ #include <xen/sched.h> #include <xen/irq.h> #include <xen/softirq.h> +#include <xen/domain.h> #include <xen/domain_page.h> #include <asm/current.h> #include <asm/io.h> @@ -59,9 +60,9 @@ for ( i = 0; i < nr_pfn; i++ ) { - if ( pfn + i >= 0xfffff ) + if ( pfn + i >= 0xfffff ) break; - + __copy_to_user(&phys_to_machine_mapping[pfn + i], &val, sizeof (val)); } } @@ -217,7 +218,7 @@ global_iodata_t *spg; u16 *virq_line, irqs; struct hvm_virpic *pic = &v->domain->arch.hvm_domain.vpic; - + spg = &get_sp(v->domain)->sp_global; virq_line = &spg->pic_clear_irr; if ( *virq_line ) { @@ -312,6 +313,52 @@ } /* + * only called in HVM domain BSP context + * when booting, vcpuid is always equal to apic_id + */ +int hvm_bringup_ap(int vcpuid, int trampoline_vector) +{ + struct vcpu *bsp = current, *v; + struct domain *d = bsp->domain; + struct vcpu_guest_context *ctxt; + int rc = 0; + + /* current must be HVM domain BSP */ + if ( !(HVM_DOMAIN(bsp) && bsp->vcpu_id == 0) ) { + printk("Not calling hvm_bringup_ap from BSP context.\n"); + domain_crash_synchronous(); + } + + if ( (v = d->vcpu[vcpuid]) == NULL ) + return -ENOENT; + + if ( (ctxt = xmalloc(struct vcpu_guest_context)) == NULL ) { + printk("Failed to allocate memory in hvm_bringup_ap.\n"); + return -ENOMEM; + } + + hvm_init_ap_context(ctxt, vcpuid, trampoline_vector); + + LOCK_BIGLOCK(d); + rc = -EEXIST; + if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + rc = boot_vcpu(d, vcpuid, ctxt); + UNLOCK_BIGLOCK(d); + + if ( rc != 0 ) + printk("AP %d bringup failed in boot_vcpu %x.\n", vcpuid, rc); + else { + if ( test_and_clear_bit(_VCPUF_down, &d->vcpu[vcpuid]->vcpu_flags) ) + vcpu_wake(d->vcpu[vcpuid]); + printk("AP %d bringup suceeded.\n", vcpuid); + } + + xfree(ctxt); + + return rc; +} + +/* * Local variables: * mode: C * 
c-set-style: "BSD" diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/emulate.c --- a/xen/arch/x86/hvm/svm/emulate.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/hvm/svm/emulate.c Wed Mar 1 19:47:25 2006 @@ -86,7 +86,7 @@ case 0x7: value = regs->edi; break; -#if X86_64 +#if __x86_64__ case 0x8: value = regs->r8; break; @@ -318,20 +318,14 @@ /* Get the register/mode number of src register in ModRM register. */ -unsigned int decode_dest_reg(u8 m) -{ -#if __x86_64__ - ASSERT(0); /* Need to adjust for REX prefix if applicable */ -#endif - return (m >> 3) & 7; -} - -unsigned int decode_src_reg(u8 m) -{ -#if __x86_64__ - ASSERT(0); /* Need to adjust for REX prefix if applicable */ -#endif - return m & 7; +unsigned int decode_dest_reg(u8 prefix, u8 m) +{ + return DECODE_MODRM_REG(prefix, m); +} + +unsigned int decode_src_reg(u8 prefix, u8 m) +{ + return DECODE_MODRM_RM(prefix, m); } @@ -431,7 +425,7 @@ * The caller can either pass a NULL pointer to the guest_eip_buf, or a pointer * to enough bytes to satisfy the instruction including prefix bytes. */ -unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb, +int __get_instruction_length_from_list(struct vmcb_struct *vmcb, enum instruction_index *list, unsigned int list_count, u8 *guest_eip_buf, enum instruction_index *match) { diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/intr.c --- a/xen/arch/x86/hvm/svm/intr.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/hvm/svm/intr.c Wed Mar 1 19:47:25 2006 @@ -80,12 +80,7 @@ { struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit); - switch(type) - { - case VLAPIC_DELIV_MODE_EXT: - case VLAPIC_DELIV_MODE_FIXED: - case VLAPIC_DELIV_MODE_LPRI: - if ( is_pit_irq(v, vector, type) ) { + if ( is_pit_irq(v, vector, type) ) { if ( !vpit->first_injected ) { vpit->first_injected = 1; vpit->pending_intr_nr = 0; @@ -95,12 +90,15 @@ } vpit->inject_point = NOW(); svm_set_tsc_shift (v, vpit); - } + } + + switch(type) + { + case VLAPIC_DELIV_MODE_EXT: break; default: - printk("Not support interrupt type: %d\n", type); - break; + vlapic_post_injection(v, vector, type); } } diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/svm.c --- a/xen/arch/x86/hvm/svm/svm.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/hvm/svm/svm.c Wed Mar 1 19:47:25 2006 @@ -164,7 +164,7 @@ } static inline void svm_inject_exception(struct vmcb_struct *vmcb, - int trap, int error_code) + int trap, int ev, int error_code) { eventinj_t event; @@ -172,7 +172,7 @@ event.fields.v = 1; event.fields.type = EVENTTYPE_EXCEPTION; event.fields.vector = trap; - event.fields.ev = 1; + event.fields.ev = ev; event.fields.errorcode = error_code; ASSERT(vmcb->eventinj.fields.v == 0); @@ -237,61 +237,16 @@ } #ifdef __x86_64__ -static struct svm_msr_state percpu_msr[NR_CPUS]; - -static u32 msr_data_index[VMX_MSR_COUNT] = -{ - MSR_LSTAR, MSR_STAR, MSR_CSTAR, - MSR_SYSCALL_MASK, MSR_EFER, -}; void svm_save_segments(struct vcpu *v) { - rdmsrl(MSR_SHADOW_GS_BASE, v->arch.hvm_svm.msr_content.shadow_gs); -} - -/* - * To avoid MSR save/restore at every VM exit/entry time, we restore - * the x86_64 specific MSRs at domain switch time. Since those MSRs are - * are not modified once set for generic domains, we don't save them, - * but simply reset them to the values set at percpu_traps_init(). 
- */ +} void svm_load_msrs(void) { - struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()]; - int i; - - while ( host_state->flags ) - { - i = find_first_set_bit(host_state->flags); - wrmsrl(msr_data_index[i], host_state->msr_items[i]); - clear_bit(i, &host_state->flags); - } -} - -static void svm_save_init_msrs(void) -{ - struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()]; - int i; - - for ( i = 0; i < SVM_MSR_COUNT; i++ ) - rdmsrl(msr_data_index[i], host_state->msr_items[i]); -} - -#define CASE_READ_MSR(address) \ - case MSR_ ## address: \ - msr_content = msr->msr_items[SVM_INDEX_MSR_ ## address]; \ - break - -#define CASE_WRITE_MSR(address) \ - case MSR_ ## address: \ - msr->msr_items[SVM_INDEX_MSR_ ## address] = msr_content; \ - if (!test_bit(SVM_INDEX_MSR_ ## address, &msr->flags)) \ - { \ - set_bit(SVM_INDEX_MSR_ ## address, &msr->flags); \ - } \ - break - +} +void svm_restore_msrs(struct vcpu *v) +{ +} #define IS_CANO_ADDRESS(add) 1 @@ -299,47 +254,45 @@ { u64 msr_content = 0; struct vcpu *vc = current; - struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content; + // struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content; struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb; switch (regs->ecx) { case MSR_EFER: - msr_content = msr->msr_items[SVM_INDEX_MSR_EFER]; - HVM_DBG_LOG(DBG_LEVEL_2, "EFER msr_content %llx\n", - (unsigned long long)msr_content); - - if (test_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state)) - msr_content |= 1 << _EFER_LME; - - if (SVM_LONG_GUEST(vc)) - msr_content |= 1 << _EFER_LMA; - + // msr_content = msr->msr_items[SVM_INDEX_MSR_EFER]; + msr_content = vmcb->efer; + msr_content &= ~EFER_SVME; break; case MSR_FS_BASE: - if (!(SVM_LONG_GUEST(vc))) - /* XXX should it be GP fault */ - domain_crash_synchronous(); - msr_content = vmcb->fs.base; break; case MSR_GS_BASE: - if (!(SVM_LONG_GUEST(vc))) - domain_crash_synchronous(); - msr_content = vmcb->gs.base; break; case MSR_SHADOW_GS_BASE: - msr_content = msr->shadow_gs; - break; - - CASE_READ_MSR(STAR); - CASE_READ_MSR(LSTAR); - CASE_READ_MSR(CSTAR); - CASE_READ_MSR(SYSCALL_MASK); + msr_content = vmcb->kerngsbase; + break; + + case MSR_STAR: + msr_content = vmcb->star; + break; + + case MSR_LSTAR: + msr_content = vmcb->lstar; + break; + + case MSR_CSTAR: + msr_content = vmcb->cstar; + break; + + case MSR_SYSCALL_MASK: + msr_content = vmcb->sfmask; + break; + default: return 0; } @@ -356,8 +309,6 @@ { u64 msr_content = regs->eax | ((u64)regs->edx << 32); struct vcpu *vc = current; - struct svm_msr_state *msr = &vc->arch.hvm_svm.msr_content; - struct svm_msr_state *host_state = &percpu_msr[smp_processor_id()]; struct vmcb_struct *vmcb = vc->arch.hvm_svm.vmcb; HVM_DBG_LOG(DBG_LEVEL_1, "mode_do_msr_write msr %lx msr_content %lx\n", @@ -373,26 +324,20 @@ || !test_bit(SVM_CPU_STATE_PAE_ENABLED, &vc->arch.hvm_svm.cpu_state)) { - svm_inject_exception(vmcb, TRAP_gp_fault, 0); + svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0); } } if (msr_content & EFER_LME) set_bit(SVM_CPU_STATE_LME_ENABLED, &vc->arch.hvm_svm.cpu_state); + /* We have already recorded that we want LME, so it will be set + * next time CR0 gets updated. So we clear that bit and continue. 
+ */ + if ((msr_content ^ vmcb->efer) & EFER_LME) + msr_content &= ~EFER_LME; /* No update for LME/LMA since it have no effect */ - msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content; - if (msr_content & ~(EFER_LME | EFER_LMA)) - { - msr->msr_items[SVM_INDEX_MSR_EFER] = msr_content; - if (!test_bit(SVM_INDEX_MSR_EFER, &msr->flags)) - { - rdmsrl(MSR_EFER, host_state->msr_items[SVM_INDEX_MSR_EFER]); - set_bit(SVM_INDEX_MSR_EFER, &host_state->flags); - set_bit(SVM_INDEX_MSR_EFER, &msr->flags); - wrmsrl(MSR_EFER, msr_content); - } - } + vmcb->efer = msr_content | EFER_SVME; break; case MSR_FS_BASE: @@ -403,63 +348,42 @@ if (!IS_CANO_ADDRESS(msr_content)) { HVM_DBG_LOG(DBG_LEVEL_1, "Not cano address of msr write\n"); - svm_inject_exception(vmcb, TRAP_gp_fault, 0); + svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0); } if (regs->ecx == MSR_FS_BASE) - vmcb->fs.base = msr_content; + vmcb->fs.base = msr_content; else - vmcb->gs.base = msr_content; + vmcb->gs.base = msr_content; break; case MSR_SHADOW_GS_BASE: - if (!(SVM_LONG_GUEST(vc))) - domain_crash_synchronous(); - - vc->arch.hvm_svm.msr_content.shadow_gs = msr_content; - wrmsrl(MSR_SHADOW_GS_BASE, msr_content); - break; - - CASE_WRITE_MSR(STAR); - CASE_WRITE_MSR(LSTAR); - CASE_WRITE_MSR(CSTAR); - CASE_WRITE_MSR(SYSCALL_MASK); + vmcb->kerngsbase = msr_content; + break; + + case MSR_STAR: + vmcb->star = msr_content; + break; + + case MSR_LSTAR: + vmcb->lstar = msr_content; + break; + + case MSR_CSTAR: + vmcb->cstar = msr_content; + break; + + case MSR_SYSCALL_MASK: + vmcb->sfmask = msr_content; + break; + default: return 0; } return 1; } -void -svm_restore_msrs(struct vcpu *v) -{ - int i = 0; - struct svm_msr_state *guest_state; - struct svm_msr_state *host_state; - unsigned long guest_flags; - - guest_state = &v->arch.hvm_svm.msr_content;; - host_state = &percpu_msr[smp_processor_id()]; - - wrmsrl(MSR_SHADOW_GS_BASE, guest_state->shadow_gs); - guest_flags = guest_state->flags; - if (!guest_flags) - return; - - while (guest_flags){ - i = find_first_set_bit(guest_flags); - - HVM_DBG_LOG(DBG_LEVEL_2, - "restore guest's index %d msr %lx with %lx\n", - i, (unsigned long) msr_data_index[i], (unsigned long) guest_state->msr_items[i]); - set_bit(i, &host_state->flags); - wrmsrl(msr_data_index[i], guest_state->msr_items[i]); - clear_bit(i, &guest_flags); - } -} #else -#define svm_save_init_msrs() ((void)0) - static inline int long_mode_do_msr_read(struct cpu_user_regs *regs) { return 0; @@ -497,9 +421,28 @@ { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; unsigned long cr0 = vmcb->cr0, eflags = vmcb->rflags, mode; - - mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 2 : 4; + /* check which operating mode the guest is running */ + if( vmcb->efer & EFER_LMA ) + mode = vmcb->cs.attributes.fields.l ? 8 : 4; + else + mode = (eflags & X86_EFLAGS_VM) || !(cr0 & X86_CR0_PE) ? 
2 : 4; return svm_instrlen(guest_cpu_user_regs(), mode); +} + +unsigned long svm_get_ctrl_reg(struct vcpu *v, unsigned int num) +{ + switch ( num ) + { + case 0: + return v->arch.hvm_svm.cpu_shadow_cr0; + case 2: + return v->arch.hvm_svm.cpu_cr2; + case 3: + return v->arch.hvm_svm.cpu_cr3; + default: + BUG(); + } + return 0; /* dummy */ } int start_svm(void) @@ -519,8 +462,6 @@ asidpool_init(smp_processor_id()); printk("AMD SVM Extension is enabled for cpu %d.\n", smp_processor_id()); - svm_save_init_msrs(); - /* Setup HVM interfaces */ hvm_funcs.disable = stop_svm; @@ -542,6 +483,7 @@ hvm_funcs.realmode = svm_realmode; hvm_funcs.paging_enabled = svm_paging_enabled; hvm_funcs.instruction_length = svm_instruction_length; + hvm_funcs.get_guest_ctrl_reg = svm_get_ctrl_reg; hvm_enabled = 1; @@ -631,8 +573,17 @@ } #if defined (__x86_64__) -void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *c ) -{ +void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v ) +{ + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + + regs->rip = vmcb->rip; + regs->rsp = vmcb->rsp; + regs->rflags = vmcb->rflags; + regs->cs = vmcb->cs.sel; + regs->ds = vmcb->ds.sel; + regs->es = vmcb->es.sel; + regs->ss = vmcb->ss.sel; } #elif defined (__i386__) void svm_store_cpu_user_regs(struct cpu_user_regs *regs, struct vcpu *v) @@ -810,7 +761,8 @@ vpit = &v->domain->arch.hvm_domain.vpit; kill_timer(&vpit->pit_timer); kill_timer(&v->arch.hvm_svm.hlt_timer); - if ( hvm_apic_support(v->domain) ) { + if ( hvm_apic_support(v->domain) && (VLAPIC(v) != NULL) ) + { kill_timer( &(VLAPIC(v)->vlapic_timer) ); xfree( VLAPIC(v) ); } @@ -819,8 +771,29 @@ void arch_svm_do_resume(struct vcpu *v) { - svm_do_resume(v); - reset_stack_and_jump(svm_asm_do_resume); + /* pinning VCPU to a different core? */ + if ( v->arch.hvm_svm.launch_core == smp_processor_id()) { + svm_do_resume( v ); + reset_stack_and_jump( svm_asm_do_resume ); + } + else { + printk("VCPU core pinned: %d to %d\n", v->arch.hvm_svm.launch_core, smp_processor_id() ); + v->arch.hvm_svm.launch_core = smp_processor_id(); + svm_migrate_timers( v ); + svm_do_resume( v ); + reset_stack_and_jump( svm_asm_do_resume ); + } +} + + +void svm_migrate_timers(struct vcpu *v) +{ + struct hvm_virpit *vpit = &(v->domain->arch.hvm_domain.vpit); + + migrate_timer( &vpit->pit_timer, v->processor ); + migrate_timer( &v->arch.hvm_svm.hlt_timer, v->processor ); + if ( hvm_apic_support(v->domain) && VLAPIC( v )) + migrate_timer( &(VLAPIC(v)->vlapic_timer ), v->processor ); } @@ -860,9 +833,9 @@ /* No support for APIC */ if (!hvm_apic_support(v->domain) && gpa >= 0xFEC00000) { - unsigned long inst_len; - inst_len = svm_instruction_length(v); - if (inst_len == (unsigned long)-1) + int inst_len; + inst_len = svm_instruction_length(v); + if (inst_len == -1) { printf("%s: INST_LEN - Unable to decode properly.\n", __func__); domain_crash_synchronous(); @@ -914,6 +887,14 @@ eip = vmcb->rip; error_code = vmcb->exitinfo1; + + if (vmcb->idtr.limit == 0) { + printf("Huh? 
We got a GP Fault with an invalid IDTR!\n"); + svm_dump_vmcb(__func__, vmcb); + svm_dump_regs(__func__, regs); + svm_dump_inst(vmcb->rip); + __hvm_bug(regs); + } HVM_DBG_LOG(DBG_LEVEL_1, "svm_general_protection_fault: eip = %lx, erro_code = %lx", @@ -927,7 +908,7 @@ /* Reflect it back into the guest */ - svm_inject_exception(vmcb, TRAP_gp_fault, error_code); + svm_inject_exception(vmcb, TRAP_gp_fault, 1, error_code); } /* Reserved bits: [31:14], [12:1] */ @@ -939,7 +920,7 @@ unsigned int eax, ebx, ecx, edx; unsigned long eip; struct vcpu *v = current; - unsigned int inst_len; + int inst_len; ASSERT(vmcb); @@ -956,21 +937,29 @@ if (input == 1) { +#ifndef __x86_64__ if ( hvm_apic_support(v->domain) && !vlapic_global_enabled((VLAPIC(v))) ) +#endif clear_bit(X86_FEATURE_APIC, &edx); -#ifdef __x86_64__ +#if CONFIG_PAGING_LEVELS < 3 + clear_bit(X86_FEATURE_PAE, &edx); + clear_bit(X86_FEATURE_PSE, &edx); + clear_bit(X86_FEATURE_PSE36, &edx); +#else if ( v->domain->arch.ops->guest_paging_levels == PAGING_L2 ) -#endif { + if ( !v->domain->arch.hvm_domain.pae_enabled ) + clear_bit(X86_FEATURE_PAE, &edx); clear_bit(X86_FEATURE_PSE, &edx); - clear_bit(X86_FEATURE_PAE, &edx); clear_bit(X86_FEATURE_PSE36, &edx); } +#endif /* Clear out reserved bits. */ ecx &= ~SVM_VCPU_CPUID_L1_RESERVED; /* mask off reserved bits */ + clear_bit(X86_FEATURE_MWAIT & 31, &ecx); } #ifdef __i386__ else if ( input == 0x80000001 ) @@ -991,6 +980,7 @@ eip, input, eax, ebx, ecx, edx); inst_len = __get_instruction_length(vmcb, INSTR_CPUID, NULL); + ASSERT(inst_len > 0); __update_guest_eip(vmcb, inst_len); } @@ -1083,9 +1073,11 @@ unsigned long *reg_p = 0; unsigned int gpreg = 0; unsigned long eip; - unsigned int inst_len; + int inst_len; + int index; struct vmcb_struct *vmcb; u8 buffer[MAX_INST_LEN]; + u8 prefix = 0; vmcb = v->arch.hvm_svm.vmcb; @@ -1093,13 +1085,15 @@ eip = vmcb->rip; inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer)); - - ASSERT(buffer[0] == 0x0f && (buffer[1] & 0xFD) == 0x21); - - gpreg = decode_src_reg(buffer[2]); -#if DEBUG - ASSERT(reg == decode_dest_reg(buffer[2])); -#endif + index = skip_prefix_bytes(buffer, sizeof(buffer)); + + ASSERT(buffer[index+0] == 0x0f && (buffer[index+1] & 0xFD) == 0x21); + + if (index > 0 && (buffer[index-1] & 0xF0) == 0x40) + prefix = buffer[index-1]; + + gpreg = decode_src_reg(prefix, buffer[index + 2]); + ASSERT(reg == decode_dest_reg(prefix, buffer[index + 2])); HVM_DBG_LOG(DBG_LEVEL_1, "svm_dr_access : eip=%lx, reg=%d, gpreg = %x", eip, reg, gpreg); @@ -1120,6 +1114,7 @@ __hvm_bug(regs); break; } + ASSERT(inst_len > 0); __update_guest_eip(vmcb, inst_len); } @@ -1335,13 +1330,13 @@ } } - static int svm_set_cr0(unsigned long value) { struct vcpu *v = current; unsigned long mfn; int paging_enabled; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + unsigned long crn; ASSERT(vmcb); @@ -1377,7 +1372,7 @@ &v->arch.hvm_svm.cpu_state)) { HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable\n"); - svm_inject_exception(vmcb, TRAP_gp_fault, 0); + svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0); } if (test_bit(SVM_CPU_STATE_LME_ENABLED, &v->arch.hvm_svm.cpu_state)) @@ -1386,14 +1381,7 @@ HVM_DBG_LOG(DBG_LEVEL_1, "Enable the Long mode\n"); set_bit(SVM_CPU_STATE_LMA_ENABLED, &v->arch.hvm_svm.cpu_state); -#if 0 - __vmread(VM_ENTRY_CONTROLS, &vm_entry_value); - vm_entry_value |= VM_ENTRY_CONTROLS_IA32E_MODE; - __vmwrite(VM_ENTRY_CONTROLS, vm_entry_value); -#else - printk("Cannot yet set SVM_CPU_STATE_LMA_ENABLED\n"); - domain_crash_synchronous(); -#endif + vmcb->efer 
|= (EFER_LMA | EFER_LME); #if CONFIG_PAGING_LEVELS >= 4 if (!shadow_set_guest_paging_levels(v->domain, 4)) @@ -1404,8 +1392,9 @@ #endif } else +#endif /* __x86_64__ */ { -#if CONFIG_PAGING_LEVELS >= 4 +#if CONFIG_PAGING_LEVELS >= 3 if (!shadow_set_guest_paging_levels(v->domain, 2)) { printk("Unsupported guest paging levels\n"); @@ -1414,33 +1403,18 @@ #endif } -#if 0 - unsigned long crn; - /* update CR4's PAE if needed */ - __vmread(GUEST_CR4, &crn); + crn = vmcb->cr4; if ((!(crn & X86_CR4_PAE)) && test_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state)) { HVM_DBG_LOG(DBG_LEVEL_1, "enable PAE on cr4\n"); - __vmwrite(GUEST_CR4, crn | X86_CR4_PAE); - } -#else - printk("Cannot yet set SVM_CPU_STATE_PAE_ENABLED\n"); - domain_crash_synchronous(); -#endif -#elif defined(__i386__) - { - unsigned long old_base_mfn; - old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - if (old_base_mfn) - put_page(mfn_to_page(old_base_mfn)); - } -#endif + vmcb->cr4 |= X86_CR4_PAE; + } /* Now arch.guest_table points to machine physical. */ - v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT); + v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT); update_pagetables(v); HVM_DBG_LOG(DBG_LEVEL_VMMU, "New arch.guest_table = %lx", @@ -1461,7 +1435,7 @@ */ if ((value & X86_CR0_PE) == 0) { if (value & X86_CR0_PG) { - svm_inject_exception(vmcb, TRAP_gp_fault, 0); + svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0); return 0; } @@ -1471,7 +1445,6 @@ return 1; } - /* * Read from control registers. CR0 and CR4 are read from the shadow. @@ -1497,7 +1470,7 @@ value = (unsigned long) v->arch.hvm_svm.cpu_cr3; break; case 4: - value = vmcb->cr4; + value = (unsigned long) v->arch.hvm_svm.cpu_shadow_cr4; break; case 8: #if 0 @@ -1579,7 +1552,7 @@ } old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT); + v->arch.guest_table = mk_pagetable((u64)mfn << PAGE_SHIFT); if (old_base_mfn) put_page(mfn_to_page(old_base_mfn)); @@ -1596,12 +1569,19 @@ case 4: /* CR4 */ - if (value & X86_CR4_PAE) - __hvm_bug(regs); /* not implemented */ - - old_cr = vmcb->cr4; - - vmcb->cr4 = value; + if (value & X86_CR4_PAE) { + set_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state); + } else { + if (test_bit(SVM_CPU_STATE_LMA_ENABLED, + &v->arch.hvm_svm.cpu_state)) { + svm_inject_exception(vmcb, TRAP_gp_fault, 1, 0); + } + clear_bit(SVM_CPU_STATE_PAE_ENABLED, &v->arch.hvm_svm.cpu_state); + } + + old_cr = v->arch.hvm_svm.cpu_shadow_cr4; + v->arch.hvm_svm.cpu_shadow_cr4 = value; + vmcb->cr4 = value | SVM_CR4_HOST_MASK; /* * Writing to CR4 to modify the PSE, PGE, or PAE flag invalidates @@ -1630,10 +1610,12 @@ struct cpu_user_regs *regs) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - unsigned int inst_len = 0; + int inst_len = 0; + int index; unsigned int gpreg; unsigned long value; - u8 buffer[6]; + u8 buffer[MAX_INST_LEN]; + u8 prefix = 0; int result = 1; enum instruction_index list_a[] = {INSTR_MOV2CR, INSTR_CLTS, INSTR_LMSW}; enum instruction_index list_b[] = {INSTR_MOVCR2, INSTR_SMSW}; @@ -1642,29 +1624,41 @@ ASSERT(vmcb); inst_copy_from_guest(buffer, svm_rip2pointer(vmcb), sizeof(buffer)); + /* get index to first actual instruction byte - as we will need to know where the + * prefix lives later on + */ + index = skip_prefix_bytes(buffer, sizeof(buffer)); if (type == TYPE_MOV_TO_CR) { inst_len = __get_instruction_length_from_list(vmcb, list_a, - ARR_SIZE(list_a), buffer, &match); + ARR_SIZE(list_a), &buffer[index], &match); } else { inst_len = 
__get_instruction_length_from_list(vmcb, list_b, - ARR_SIZE(list_b), buffer, &match); - } + ARR_SIZE(list_b), &buffer[index], &match); + } + + ASSERT(inst_len > 0); + + inst_len += index; + + /* Check for REX prefix - it's ALWAYS the last byte of any prefix bytes */ + if (index > 0 && (buffer[index-1] & 0xF0) == 0x40) + prefix = buffer[index-1]; HVM_DBG_LOG(DBG_LEVEL_1, "eip = %lx", (unsigned long) vmcb->rip); switch (match) { case INSTR_MOV2CR: - gpreg = decode_src_reg(buffer[2]); + gpreg = decode_src_reg(prefix, buffer[index+2]); result = mov_to_cr(gpreg, cr, regs); break; case INSTR_MOVCR2: - gpreg = decode_src_reg(buffer[2]); + gpreg = decode_src_reg(prefix, buffer[index+2]); mov_from_cr(cr, gpreg, regs); break; @@ -1680,7 +1674,7 @@ if (svm_dbg_on) svm_dump_inst(svm_rip2pointer(vmcb)); - gpreg = decode_src_reg(buffer[2]); + gpreg = decode_src_reg(prefix, buffer[index+2]); value = get_reg(gpreg, regs, vmcb) & 0xF; if (svm_dbg_on) @@ -1698,7 +1692,7 @@ case INSTR_SMSW: svm_dump_inst(svm_rip2pointer(vmcb)); value = v->arch.hvm_svm.cpu_shadow_cr0; - gpreg = decode_src_reg(buffer[2]); + gpreg = decode_src_reg(prefix, buffer[index+2]); set_reg(gpreg, value, regs, vmcb); if (svm_dbg_on) @@ -1721,7 +1715,7 @@ static inline void svm_do_msr_access(struct vcpu *v, struct cpu_user_regs *regs) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - unsigned int inst_len; + int inst_len; int64_t tsc_sum; ASSERT(vmcb); @@ -1813,7 +1807,9 @@ next_wakeup = next_pit; if ( next_wakeup != - 1 ) set_timer(¤t->arch.hvm_svm.hlt_timer, next_wakeup); +/* temporary workaround for 8828/8822 evtchn patches causing SVM failure. hvm_safe_block(); +*/ } @@ -1860,7 +1856,7 @@ struct vcpu *v = current; u8 opcode[MAX_INST_SIZE], prefix, length = MAX_INST_SIZE; unsigned long g_vaddr; - unsigned int inst_len; + int inst_len; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; ASSERT(vmcb); @@ -1877,6 +1873,7 @@ if (invlpga) { inst_len = __get_instruction_length(vmcb, INSTR_INVLPGA, opcode); + ASSERT(inst_len > 0); __update_guest_eip(vmcb, inst_len); /* @@ -1890,6 +1887,7 @@ /* What about multiple prefix codes? */ prefix = (is_prefix(opcode[0])?opcode[0]:0); inst_len = __get_instruction_length(vmcb, INSTR_INVLPG, opcode); + ASSERT(inst_len > 0); inst_len--; length -= inst_len; @@ -1941,7 +1939,10 @@ v->arch.hvm_svm.cpu_shadow_cr0 = X86_CR0_ET; vmcb->cr2 = 0; - vmcb->cr4 = 0; + vmcb->efer = EFER_SVME; + + vmcb->cr4 = SVM_CR4_HOST_MASK; + v->arch.hvm_svm.cpu_shadow_cr4 = 0; /* This will jump to ROMBIOS */ vmcb->rip = 0xFFF0; @@ -2011,12 +2012,13 @@ static int svm_do_vmmcall(struct vcpu *v, struct cpu_user_regs *regs) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - unsigned int inst_len; + int inst_len; ASSERT(vmcb); ASSERT(regs); inst_len = __get_instruction_length(vmcb, INSTR_VMCALL, NULL); + ASSERT(inst_len > 0); /* VMMCALL sanity check */ if (vmcb->cpl > get_vmmcall_cpl(regs->edi)) @@ -2470,7 +2472,7 @@ { v->arch.hvm_svm.injecting_event = 1; /* Inject #PG using Interruption-Information Fields */ - svm_inject_exception(vmcb, TRAP_page_fault, regs.error_code); + svm_inject_exception(vmcb, TRAP_page_fault, 1, regs.error_code); v->arch.hvm_svm.cpu_cr2 = va; vmcb->cr2 = va; @@ -2665,26 +2667,23 @@ { struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - int core = smp_processor_id(); - int oldcore = v->arch.hvm_svm.core; - /* - * if need to assign new asid or if switching cores, - * then retire asid for old core, and assign new for new core. 
- */ - if( v->arch.hvm_svm.core != core ) { - if (svm_dbg_on) - printk("old core %d new core %d\n",(int)v->arch.hvm_svm.core,(int)core); - v->arch.hvm_svm.core = core; - } - if( test_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags) || - (oldcore != core)) { - if(!asidpool_assign_next(vmcb, 1, - oldcore, core)) { + + /* + * if need to assign new asid, or if switching cores, + * retire asid for the old core, and assign a new asid to the current core. + */ + if ( test_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags ) || + ( v->arch.hvm_svm.asid_core != v->arch.hvm_svm.launch_core )) { + /* recycle asid */ + if ( !asidpool_assign_next( vmcb, 1, + v->arch.hvm_svm.asid_core, v->arch.hvm_svm.launch_core )) { /* If we get here, we have a major problem */ domain_crash_synchronous(); } - } - clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); + + v->arch.hvm_svm.asid_core = v->arch.hvm_svm.launch_core; + clear_bit( ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags ); + } } /* diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/vmcb.c --- a/xen/arch/x86/hvm/svm/vmcb.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/hvm/svm/vmcb.c Wed Mar 1 19:47:25 2006 @@ -190,7 +190,6 @@ unsigned long eflags; unsigned long shadow_cr; struct vmcb_struct *vmcb = arch_svm->vmcb; - struct Xgt_desc_struct desc; /* Allows IRQs to be shares */ vmcb->vintr.fields.intr_masking = 1; @@ -224,9 +223,9 @@ vmcb->fs.base = 0; vmcb->gs.base = 0; - __asm__ __volatile__ ("sidt (%0) \n" :: "a"(&desc) : "memory"); - vmcb->idtr.base = desc.address; - vmcb->idtr.limit = desc.size; + /* Guest Interrupt descriptor table */ + vmcb->idtr.base = 0; + vmcb->idtr.limit = 0; /* Set up segment attributes */ attrib.bytes = 0; @@ -248,15 +247,11 @@ attrib.fields.type = 0xb; /* type=0xb -> executable/readable, accessed */ vmcb->cs.attributes = attrib; - /* Global descriptor table */ - //NMERGE7500 - can probably remove access to gdtr - vmcb->gdtr.base = regs->edx; - regs->edx = 0; - ASSERT(regs->eax <= 0xFFFF); /* Make sure we're in the limit */ - vmcb->gdtr.limit = regs->eax; - regs->eax = 0; - - /* Local Descriptor Table */ + /* Guest Global descriptor table */ + vmcb->gdtr.base = 0; + vmcb->gdtr.limit = 0; + + /* Guest Local Descriptor Table */ attrib.fields.s = 0; /* not code or data segement */ attrib.fields.type = 0x2; /* LDT */ attrib.fields.db = 0; /* 16-bit */ @@ -279,11 +274,10 @@ /* CR3 is set in svm_final_setup_guest */ __asm__ __volatile__ ("mov %%cr4,%0" : "=r" (crn) :); - shadow_cr = crn; - vmcb->cr4 = shadow_cr; - -//MERGE7500 - should write a 0 instead to rsp? 
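Several emulate.c and svm.c hunks above change decode_src_reg()/decode_dest_reg() to take the REX prefix that skip_prefix_bytes() digs out, instead of ASSERTing on x86_64. Assuming DECODE_MODRM_REG/DECODE_MODRM_RM implement the architectural encoding, this is what they must compute: a REX byte (0x40-0x4F) carries bit R (0x04) extending the ModRM reg field and bit B (0x01) extending the rm field:

    #include <stdint.h>

    /* Sketch of the architectural decode; not the macros themselves. */
    static unsigned int modrm_reg(uint8_t rex, uint8_t modrm)
    {
        unsigned int reg = (modrm >> 3) & 7;
        if ((rex & 0xF0) == 0x40 && (rex & 0x04))   /* REX.R */
            reg |= 8;
        return reg;
    }

    static unsigned int modrm_rm(uint8_t rex, uint8_t modrm)
    {
        unsigned int rm = modrm & 7;
        if ((rex & 0xF0) == 0x40 && (rex & 0x01))   /* REX.B */
            rm |= 8;
        return rm;
    }

With a zero prefix byte both helpers reduce to the old 32-bit behaviour, which is why passing 0 when no REX prefix was found is safe.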
- vmcb->rsp = regs->esp; + arch_svm->cpu_shadow_cr4 = crn & ~(X86_CR4_PGE | X86_CR4_PSE); + vmcb->cr4 = crn | SVM_CR4_HOST_MASK; + + vmcb->rsp = 0; vmcb->rip = regs->eip; eflags = regs->eflags & ~HVM_EFLAGS_RESERVED_0; /* clear 0s */ @@ -306,7 +300,7 @@ { if(arch_svm->vmcb != NULL) { - asidpool_retire(arch_svm->vmcb, arch_svm->core); + asidpool_retire(arch_svm->vmcb, arch_svm->asid_core); free_vmcb(arch_svm->vmcb); } if(arch_svm->iopm != NULL) { @@ -404,18 +398,17 @@ void svm_do_launch(struct vcpu *v) { + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + int core = smp_processor_id(); + ASSERT(vmcb); + /* Update CR3, GDT, LDT, TR */ - struct vmcb_struct *vmcb; - int core = smp_processor_id(); - vmcb = v->arch.hvm_svm.vmcb; - ASSERT(vmcb); - svm_stts(v); - /* current core is the one we will perform the vmrun on */ - v->arch.hvm_svm.core = core; + /* current core is the one we intend to perform the VMRUN on */ + v->arch.hvm_svm.launch_core = v->arch.hvm_svm.asid_core = core; clear_bit(ARCH_SVM_VMCB_ASSIGN_ASID, &v->arch.hvm_svm.flags); - if ( !asidpool_assign_next(vmcb, 0, core, core) ) + if ( !asidpool_assign_next( vmcb, 0, core, core )) BUG(); if (v->vcpu_id == 0) diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/svm/x86_64/exits.S --- a/xen/arch/x86/hvm/svm/x86_64/exits.S Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/hvm/svm/x86_64/exits.S Wed Mar 1 19:47:25 2006 @@ -107,8 +107,6 @@ movq %rax, VMCB_rax(%rcx) movq VCPU_svm_hsa_pa(%rbx), %rax VMSAVE - /* XXX FPU SAVE */ - /* XXX DO TSC OFFSET */ movq VCPU_svm_vmcb_pa(%rbx), %rax popq %r15 @@ -137,9 +135,7 @@ VMSAVE /* rax is the only register we're allowed to touch here... */ - /* XXX FPU SAVE */ GET_CURRENT(%rax) - /* XXX DO TSC OFFSET */ movq VCPU_svm_hsa_pa(%rax), %rax VMLOAD diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vlapic.c --- a/xen/arch/x86/hvm/vlapic.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/hvm/vlapic.c Wed Mar 1 19:47:25 2006 @@ -225,27 +225,35 @@ break; case VLAPIC_DELIV_MODE_INIT: - if (!level && trig_mode == 1) { //Deassert + if ( !level && trig_mode == 1 ) { //Deassert printk("This hvm_vlapic is for P4, no work for De-assert init\n"); } else { /* FIXME How to check the situation after vcpu reset? 
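[Note on the CR4 handling in the vmcb.c hunk above: CR4 is split into the value the guest may observe (cpu_shadow_cr4, with the paging bits the shadow code owns masked out) and the value the hardware actually runs with (host bits forced on). A compile-and-run illustration of the split; HOST_CR4_MASK is an invented stand-in for SVM_CR4_HOST_MASK.]

#include <stdio.h>

#define X86_CR4_PSE 0x0010
#define X86_CR4_PAE 0x0020
#define X86_CR4_PGE 0x0080

/* Stand-in for SVM_CR4_HOST_MASK: bits the host insists on. */
#define HOST_CR4_MASK X86_CR4_PAE

int main(void)
{
    unsigned long host_cr4 = X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE;

    /* What the guest is allowed to see: the host value minus the
     * paging bits the shadow code manages behind its back. */
    unsigned long shadow_cr4 = host_cr4 & ~(X86_CR4_PGE | X86_CR4_PSE);

    /* What the hardware actually runs with. */
    unsigned long real_cr4 = host_cr4 | HOST_CR4_MASK;

    printf("shadow cr4 = %#lx, real cr4 = %#lx\n", shadow_cr4, real_cr4);
    return 0;
}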
*/ - vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI; - if (vlapic->vcpu) { - vcpu_pause(vlapic->vcpu); + if ( test_and_clear_bit(_VCPUF_initialised, &v->vcpu_flags) ) { + printk("Reset hvm vcpu not supported yet\n"); + domain_crash_synchronous(); } + v->arch.hvm_vcpu.init_sipi_sipi_state = + HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI; + result = 1; } break; case VLAPIC_DELIV_MODE_STARTUP: - if (vlapic->init_sipi_sipi_state != VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI) + if ( v->arch.hvm_vcpu.init_sipi_sipi_state == + HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM ) break; - vlapic->init_sipi_sipi_state = VLAPIC_INIT_SIPI_SIPI_STATE_NORM; - if (!vlapic->vcpu) { - /* XXX Call hvm_bringup_ap here */ - result = 0; - }else{ - //hvm_vcpu_reset(vlapic->vcpu); - } + + v->arch.hvm_vcpu.init_sipi_sipi_state = + HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM; + + if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) ) { + printk("SIPI for initialized vcpu vcpuid %x\n", v->vcpu_id); + domain_crash_synchronous(); + } + + if ( hvm_bringup_ap(v->vcpu_id, vector) != 0 ) + result = 0; break; default: diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vmx/io.c --- a/xen/arch/x86/hvm/vmx/io.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/hvm/vmx/io.c Wed Mar 1 19:47:25 2006 @@ -113,13 +113,15 @@ struct hvm_virpit *vpit = &plat->vpit; struct hvm_virpic *pic= &plat->vpic; - hvm_pic_assist(v); - __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control); - if ( vpit->pending_intr_nr ) { + if ( v->vcpu_id == 0 ) + hvm_pic_assist(v); + + if ( (v->vcpu_id == 0) && vpit->pending_intr_nr ) { pic_set_irq(pic, 0, 0); pic_set_irq(pic, 0, 1); } + __vmread_vcpu(v, CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control); __vmread(VM_ENTRY_INTR_INFO_FIELD, &intr_fields); if (intr_fields & INTR_INFO_VALID_MASK) { diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/hvm/vmx/vmx.c --- a/xen/arch/x86/hvm/vmx/vmx.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Mar 1 19:47:25 2006 @@ -448,6 +448,37 @@ return 0; /* dummy */ } +/* SMP VMX guest support */ +void vmx_init_ap_context(struct vcpu_guest_context *ctxt, + int vcpuid, int trampoline_vector) +{ + int i; + + memset(ctxt, 0, sizeof(*ctxt)); + + /* + * Initial register values: + */ + ctxt->user_regs.eip = VMXASSIST_BASE; + ctxt->user_regs.edx = vcpuid; + ctxt->user_regs.ebx = trampoline_vector; + + ctxt->flags = VGCF_HVM_GUEST; + + /* Virtual IDT is empty at start-of-day. */ + for ( i = 0; i < 256; i++ ) + { + ctxt->trap_ctxt[i].vector = i; + ctxt->trap_ctxt[i].cs = FLAT_KERNEL_CS; + } + + /* No callback handlers. 
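[Note on the vlapic rework above: the INIT-SIPI-SIPI tracking moves into the vcpu, and STARTUP delivery is finally wired to hvm_bringup_ap(). A stand-alone sketch of just the state machine; deassert handling, locking and the vector-to-entry-point mapping are omitted, and struct toy_ap is invented.]

#include <stdio.h>

enum sipi_state { STATE_NORM, STATE_WAIT_SIPI };

struct toy_ap {
    enum sipi_state state;
    int initialised;
};

/* Returns 1 if the IPI was consumed, 0 otherwise. */
static int deliver(struct toy_ap *ap, int is_init, int vector)
{
    if (is_init) {
        if (ap->initialised)
            return 0;        /* resetting a live vcpu: unsupported */
        ap->state = STATE_WAIT_SIPI;
        return 1;
    }
    /* STARTUP IPI */
    if (ap->state != STATE_WAIT_SIPI)
        return 0;            /* SIPI without a prior INIT is ignored */
    ap->state = STATE_NORM;
    ap->initialised = 1;
    printf("bring up AP at vector %#x\n", vector);  /* hvm_bringup_ap() */
    return 1;
}

int main(void)
{
    struct toy_ap ap = { STATE_NORM, 0 };
    deliver(&ap, 0, 0x10);   /* stray SIPI: dropped */
    deliver(&ap, 1, 0);      /* INIT: arm the state machine */
    deliver(&ap, 0, 0x10);   /* SIPI: AP is started */
    return 0;
}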
*/ +#if defined(__i386__) + ctxt->event_callback_cs = FLAT_KERNEL_CS; + ctxt->failsafe_callback_cs = FLAT_KERNEL_CS; +#endif +} + void do_nmi(struct cpu_user_regs *); static int check_vmx_controls(ctrls, msr) @@ -544,6 +575,8 @@ hvm_funcs.paging_enabled = vmx_paging_enabled; hvm_funcs.instruction_length = vmx_instruction_length; hvm_funcs.get_guest_ctrl_reg = vmx_get_ctrl_reg; + + hvm_funcs.init_ap_context = vmx_init_ap_context; hvm_enabled = 1; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/mm.c Wed Mar 1 19:47:25 2006 @@ -97,11 +97,11 @@ #include <xen/domain_page.h> #include <xen/event.h> #include <xen/iocap.h> +#include <xen/guest_access.h> #include <asm/shadow.h> #include <asm/page.h> #include <asm/flushtlb.h> #include <asm/io.h> -#include <asm/uaccess.h> #include <asm/ldt.h> #include <asm/x86_emulate.h> #include <public/memory.h> @@ -475,7 +475,8 @@ { MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte " for dom%d", - mfn, get_gpfn_from_mfn(mfn), l1e_get_intpte(l1e), d->domain_id); + mfn, get_gpfn_from_mfn(mfn), + l1e_get_intpte(l1e), d->domain_id); } return okay; @@ -515,7 +516,6 @@ #if CONFIG_PAGING_LEVELS >= 3 - static int get_page_from_l3e( l3_pgentry_t l3e, unsigned long pfn, @@ -545,11 +545,9 @@ #endif return rc; } - #endif /* 3 level */ #if CONFIG_PAGING_LEVELS >= 4 - static int get_page_from_l4e( l4_pgentry_t l4e, unsigned long pfn, @@ -579,7 +577,6 @@ return rc; } - #endif /* 4 level */ @@ -649,27 +646,22 @@ #if CONFIG_PAGING_LEVELS >= 3 - static void put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn) { if ( (l3e_get_flags(l3e) & _PAGE_PRESENT) && (l3e_get_pfn(l3e) != pfn) ) put_page_and_type(mfn_to_page(l3e_get_pfn(l3e))); } - #endif #if CONFIG_PAGING_LEVELS >= 4 - static void put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn) { if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) && (l4e_get_pfn(l4e) != pfn) ) put_page_and_type(mfn_to_page(l4e_get_pfn(l4e))); } - #endif - static int alloc_l1_table(struct page_info *page) { @@ -1569,43 +1561,71 @@ int okay; unsigned long old_base_mfn; + ASSERT(writable_pagetable_in_sync(d)); + if ( shadow_mode_refcounts(d) ) + { okay = get_page_from_pagenr(mfn, d); + if ( unlikely(!okay) ) + { + MEM_LOG("Error while installing new baseptr %lx", mfn); + return 0; + } + } else + { okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d); - - if ( likely(okay) ) - { - invalidate_shadow_ldt(v); - - old_base_mfn = pagetable_get_pfn(v->arch.guest_table); - v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT); - update_pagetables(v); /* update shadow_table and monitor_table */ - - write_ptbase(v); - + if ( unlikely(!okay) ) + { + /* Switch to idle pagetable: this VCPU has no active p.t. now. */ + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); + v->arch.guest_table = mk_pagetable(0); + update_pagetables(v); + write_cr3(__pa(idle_pg_table)); + if ( old_base_mfn != 0 ) + put_page_and_type(mfn_to_page(old_base_mfn)); + + /* Retry the validation with no active p.t. for this VCPU. */ + okay = get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d); + if ( !okay ) + { + /* Failure here is unrecoverable: the VCPU has no pagetable! 
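[Note on the restructured new_guest_cr3() above: it gains a second chance. If pinning the new base fails, the vcpu is parked on the idle page table, the old base's references are dropped, and validation is retried before giving up for good. A sketch of that control flow; try_pin, drop_ref and switch_to_idle_pt are all stub stand-ins for Xen's refcounting and context switching, and try_pin is rigged to fail once so the retry path runs.]

#include <stdio.h>

static int try_pin(unsigned long mfn)
{
    static int calls;
    (void)mfn;
    return ++calls > 1;               /* fail the first attempt */
}
static void drop_ref(unsigned long mfn) { printf("drop ref %lu\n", mfn); }
static void switch_to_idle_pt(void)    { printf("running on idle PT\n"); }

static int new_guest_cr3_sketch(unsigned long *cur_base, unsigned long mfn)
{
    if (!try_pin(mfn)) {
        /* Validation can fail because the old page table still holds
         * references; detach it and retry once with no active PT. */
        switch_to_idle_pt();
        if (*cur_base)
            drop_ref(*cur_base);
        *cur_base = 0;
        if (!try_pin(mfn))
            return 0;    /* unrecoverable: caller crashes the domain */
    }
    if (*cur_base)
        drop_ref(*cur_base);
    *cur_base = mfn;
    return 1;
}

int main(void)
{
    unsigned long base = 42;
    printf("ok = %d, base = %lu\n", new_guest_cr3_sketch(&base, 7), base);
    return 0;
}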
*/ + MEM_LOG("Fatal error while installing new baseptr %lx", mfn); + domain_crash(d); + percpu_info[v->processor].deferred_ops = 0; + return 0; + } + } + } + + invalidate_shadow_ldt(v); + + old_base_mfn = pagetable_get_pfn(v->arch.guest_table); + v->arch.guest_table = mk_pagetable(mfn << PAGE_SHIFT); + update_pagetables(v); /* update shadow_table and monitor_table */ + + write_ptbase(v); + + if ( likely(old_base_mfn != 0) ) + { if ( shadow_mode_refcounts(d) ) put_page(mfn_to_page(old_base_mfn)); else put_page_and_type(mfn_to_page(old_base_mfn)); - - /* CR3 also holds a ref to its shadow... */ - if ( shadow_mode_enabled(d) ) - { - if ( v->arch.monitor_shadow_ref ) - put_shadow_ref(v->arch.monitor_shadow_ref); - v->arch.monitor_shadow_ref = - pagetable_get_pfn(v->arch.monitor_table); - ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref))); - get_shadow_ref(v->arch.monitor_shadow_ref); - } - } - else - { - MEM_LOG("Error while installing new baseptr %lx", mfn); - } - - return okay; + } + + /* CR3 also holds a ref to its shadow... */ + if ( shadow_mode_enabled(d) ) + { + if ( v->arch.monitor_shadow_ref ) + put_shadow_ref(v->arch.monitor_shadow_ref); + v->arch.monitor_shadow_ref = + pagetable_get_pfn(v->arch.monitor_table); + ASSERT(!page_get_owner(mfn_to_page(v->arch.monitor_shadow_ref))); + get_shadow_ref(v->arch.monitor_shadow_ref); + } + + return 1; } static void process_deferred_ops(unsigned int cpu) @@ -1625,7 +1645,7 @@ else local_flush_tlb(); } - + if ( deferred_ops & DOP_RELOAD_LDT ) (void)map_ldt_shadow_page(0); @@ -1752,9 +1772,9 @@ { if ( hypercall_preempt_check() ) { - rc = hypercall4_create_continuation( - __HYPERVISOR_mmuext_op, uops, - (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + rc = hypercall_create_continuation( + __HYPERVISOR_mmuext_op, "pipi", + uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); break; } @@ -2018,9 +2038,9 @@ { if ( hypercall_preempt_check() ) { - rc = hypercall4_create_continuation( - __HYPERVISOR_mmu_update, ureqs, - (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); + rc = hypercall_create_continuation( + __HYPERVISOR_mmu_update, "pipi", + ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom); break; } @@ -2769,7 +2789,7 @@ } -long arch_memory_op(int op, void *arg) +long arch_memory_op(int op, GUEST_HANDLE(void) arg) { struct xen_reserved_phys_area xrpa; unsigned long pfn; @@ -2779,7 +2799,7 @@ switch ( op ) { case XENMEM_reserved_phys_area: - if ( copy_from_user(&xrpa, arg, sizeof(xrpa)) ) + if ( copy_from_guest(&xrpa, arg, 1) ) return -EFAULT; /* No guest has more than one reserved area. 
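[Note on the interface change above: raw void*/copy_from_user arguments become typed guest handles (GUEST_HANDLE(void), copy_from_guest). The point of the wrapper is shown by this user-space sketch: the handle is a one-member struct, so element types are compiler-checked and every access funnels through one audited macro. Here memcpy stands in for the real validated copy routines, which is emphatically not how Xen crosses the guest boundary.]

#include <stdio.h>
#include <string.h>

/* A typed guest handle: a wrapped pointer. */
#define DEFINE_GUEST_HANDLE(type) \
    typedef struct { type *p; } guest_handle_##type
#define copy_from_guest(dst, hnd, nr) \
    (memcpy((dst), (hnd).p, (nr) * sizeof(*(dst))), 0)
#define copy_to_guest(hnd, src, nr) \
    (memcpy((hnd).p, (src), (nr) * sizeof(*(src))), 0)

typedef unsigned long xen_ulong;
DEFINE_GUEST_HANDLE(xen_ulong);

int main(void)
{
    xen_ulong buf[4] = { 1, 2, 3, 4 }, out[4];
    guest_handle_xen_ulong h = { buf };
    copy_from_guest(out, h, 4);        /* "guest" -> hypervisor */
    out[0] = 99;
    copy_to_guest(h, out, 4);          /* hypervisor -> "guest" */
    printf("%lu %lu\n", buf[0], buf[3]);
    return 0;
}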
*/ @@ -2813,7 +2833,7 @@ put_domain(d); - if ( copy_to_user(arg, &xrpa, sizeof(xrpa)) ) + if ( copy_to_guest(arg, &xrpa, 1) ) return -EFAULT; break; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/setup.c Wed Mar 1 19:47:25 2006 @@ -144,6 +144,20 @@ static struct e820entry e820_raw[E820MAX]; +static unsigned long initial_images_start, initial_images_end; + +unsigned long initial_images_nrpages(void) +{ + unsigned long s = initial_images_start + PAGE_SIZE - 1; + unsigned long e = initial_images_end; + return ((e >> PAGE_SHIFT) - (s >> PAGE_SHIFT)); +} + +void discard_initial_images(void) +{ + init_domheap_pages(initial_images_start, initial_images_end); +} + void __init __start_xen(multiboot_info_t *mbi) { char *cmdline; @@ -152,7 +166,6 @@ unsigned int initrdidx = 1; module_t *mod = (module_t *)__va(mbi->mods_addr); unsigned long nr_pages, modules_length; - unsigned long initial_images_start, initial_images_end; paddr_t s, e; int i, e820_warn = 0, e820_raw_nr = 0, bytes = 0; struct ns16550_defaults ns16550 = { @@ -437,11 +450,7 @@ set_in_cr4(X86_CR4_OSXMMEXCPT); if ( opt_nosmp ) - { max_cpus = 0; - smp_num_siblings = 1; - boot_cpu_data.x86_max_cores = 1; - } smp_prepare_cpus(max_cpus); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/shadow32.c --- a/xen/arch/x86/shadow32.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/shadow32.c Wed Mar 1 19:47:25 2006 @@ -43,7 +43,8 @@ static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn); #endif -static void free_p2m_table(struct vcpu *v); +static int alloc_p2m_table(struct domain *d); +static void free_p2m_table(struct domain *d); /******** @@ -739,7 +740,7 @@ mpl2e = (l2_pgentry_t *)map_domain_page_global(mmfn); memset(mpl2e, 0, PAGE_SIZE); - memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); @@ -760,6 +761,23 @@ if ( v->vcpu_id == 0 ) alloc_p2m_table(d); + else + { + unsigned long mfn; + + mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); + if ( mfn ) + { + l2_pgentry_t *l2tab; + + l2tab = map_domain_page(mfn); + + mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = + l2tab[l2_table_offset(RO_MPT_VIRT_START)]; + + unmap_domain_page(l2tab); + } + } } /* @@ -771,7 +789,7 @@ unsigned long mfn; ASSERT( pagetable_get_paddr(v->arch.monitor_table) ); - + mpl2e = v->arch.monitor_vtable; /* @@ -794,7 +812,7 @@ } if ( v->vcpu_id == 0 ) - free_p2m_table(v); + free_p2m_table(v->domain); /* * Then free monitor_table. 
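[Note on initial_images_nrpages() in the setup.c hunk above: it counts only pages wholly inside [start, end), by rounding start up and end down with the shifts. The same arithmetic worked in isolation:]

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Whole pages covered by [start, end): round start up, end down. */
static unsigned long nrpages(unsigned long start, unsigned long end)
{
    unsigned long s = start + PAGE_SIZE - 1;
    return (end >> PAGE_SHIFT) - (s >> PAGE_SHIFT);
}

int main(void)
{
    /* 0x100800..0x104000: pages 0x101-0x103 are fully covered = 3. */
    printf("%lu\n", nrpages(0x100800, 0x104000));
    return 0;
}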
@@ -808,8 +826,8 @@ } static int -map_p2m_entry( - l1_pgentry_t *l1tab, unsigned long va, unsigned long gpa, unsigned long mfn) +map_p2m_entry(l1_pgentry_t *l1tab, unsigned long va, + unsigned long gpa, unsigned long mfn) { unsigned long *l0tab = NULL; l1_pgentry_t l1e = { 0 }; @@ -820,27 +838,22 @@ { page = alloc_domheap_page(NULL); if ( !page ) - goto fail; - - if ( l0tab ) - unmap_domain_page(l0tab); + return 0; + l0tab = map_domain_page(page_to_mfn(page)); - memset(l0tab, 0, PAGE_SIZE ); + memset(l0tab, 0, PAGE_SIZE); + l1e = l1tab[l1_table_offset(va)] = l1e_from_page(page, __PAGE_HYPERVISOR); } - else if ( l0tab == NULL) + else l0tab = map_domain_page(l1e_get_pfn(l1e)); - l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn; - - if ( l0tab ) - unmap_domain_page(l0tab); + l0tab[gpa & ((PAGE_SIZE / sizeof(mfn)) - 1)] = mfn; + + unmap_domain_page(l0tab); return 1; - -fail: - return 0; } int @@ -853,7 +866,6 @@ l1_pgentry_t *l1; struct page_info *l1page; unsigned long va = pfn << PAGE_SHIFT; - int error; if ( shadow_mode_external(d) ) { @@ -877,6 +889,7 @@ if ( shadow_mode_external(d) ) { + int error; l1_pgentry_t *l1tab = NULL; l2_pgentry_t l2e; @@ -885,14 +898,13 @@ ASSERT( l2e_get_flags(l2e) & _PAGE_PRESENT ); l1tab = map_domain_page(l2e_get_pfn(l2e)); - error = map_p2m_entry(l1tab, va, pfn, mfn); - if ( !error ) - domain_crash_synchronous(); + if ( !(error = map_p2m_entry(l1tab, va, pfn, mfn)) ) + domain_crash(d); unmap_domain_page(l1tab); unmap_domain_page_with_cache(l2, l2cache); - return 1; + return error; } /* @@ -926,7 +938,7 @@ return 1; } -int +static int alloc_p2m_table(struct domain *d) { struct list_head *list_ent; @@ -937,7 +949,7 @@ l2_pgentry_t l2e = { 0 }; struct page_info *page; unsigned long gpfn, mfn; - int error; + int error = 0; if ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) ) { @@ -955,6 +967,9 @@ } else l1tab = map_domain_page(l2e_get_pfn(l2e)); + + if ( l2tab ) + unmap_domain_page(l2tab); } else { @@ -972,23 +987,23 @@ page = list_entry(list_ent, struct page_info, list); mfn = page_to_mfn(page); - error = map_p2m_entry(l1tab, va, gpfn, mfn); - if ( !error ) - domain_crash_synchronous(); + if ( !(error = map_p2m_entry(l1tab, va, gpfn, mfn)) ) + { + domain_crash(d); + break; + } list_ent = frame_table[mfn].list.next; va += sizeof(mfn); } - if (l2tab) - unmap_domain_page(l2tab); unmap_domain_page(l1tab); - return 1; -} - -static void -free_p2m_table(struct vcpu *v) + return error; +} + +static void +free_p2m_table(struct domain *d) { unsigned long va; l2_pgentry_t *l2tab; @@ -996,10 +1011,10 @@ l2_pgentry_t l2e; l1_pgentry_t l1e; - ASSERT ( pagetable_get_pfn(v->arch.monitor_table) ); + ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) ); l2tab = map_domain_page( - pagetable_get_pfn(v->arch.monitor_table)); + pagetable_get_pfn(d->vcpu[0]->arch.monitor_table)); for ( va = RO_MPT_VIRT_START; va < RO_MPT_VIRT_END; ) { @@ -1015,11 +1030,13 @@ if ( l1e_get_flags(l1e) & _PAGE_PRESENT ) free_domheap_page(mfn_to_page(l1e_get_pfn(l1e))); - va += PAGE_SIZE; + va += PAGE_SIZE; } unmap_domain_page(l1tab); free_domheap_page(mfn_to_page(l2e_get_pfn(l2e))); } + else + va += PAGE_SIZE * L1_PAGETABLE_ENTRIES; } unmap_domain_page(l2tab); } @@ -1246,7 +1263,7 @@ if ( shadow_mode_refcounts(d) ) { - struct list_head *list_ent; + struct list_head *list_ent; struct page_info *page; /* diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/shadow_public.c --- a/xen/arch/x86/shadow_public.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/shadow_public.c Wed Mar 1 19:47:25 2006 @@ -31,7 
+31,8 @@ #include <xen/trace.h> #include <asm/shadow_64.h> -static void free_p2m_table(struct vcpu *v); +static int alloc_p2m_table(struct domain *d); +static void free_p2m_table(struct domain *d); #define SHADOW_MAX_GUEST32(_encoded) ((L1_PAGETABLE_ENTRIES_32 - 1) - ((_encoded) >> 16)) @@ -328,6 +329,23 @@ if ( v->vcpu_id == 0 ) alloc_p2m_table(d); + else + { + unsigned long mfn; + + mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); + if ( mfn ) + { + l4_pgentry_t *l4tab; + + l4tab = map_domain_page(mfn); + + mpl4e[l4_table_offset(RO_MPT_VIRT_START)] = + l4tab[l4_table_offset(RO_MPT_VIRT_START)]; + + unmap_domain_page(l4tab); + } + } } void free_monitor_pagetable(struct vcpu *v) @@ -338,7 +356,7 @@ * free monitor_table. */ if ( v->vcpu_id == 0 ) - free_p2m_table(v); + free_p2m_table(v->domain); /* * Then free monitor_table. @@ -397,13 +415,49 @@ l2e_empty(); mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = l2e_empty(); - unmap_domain_page(mpl2e); - v->arch.monitor_table = mk_pagetable(m3mfn << PAGE_SHIFT); /* < 4GB */ v->arch.monitor_vtable = (l2_pgentry_t *) mpl3e; if ( v->vcpu_id == 0 ) alloc_p2m_table(d); + else + { + unsigned long mfn; + + mfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); + if ( mfn ) + { + l3_pgentry_t *l3tab, l3e; + l2_pgentry_t *l2tab; + + l3tab = map_domain_page(mfn); + l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)]; + + /* + * NB: when CONFIG_PAGING_LEVELS == 3, + * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here. + * alloc_monitor_pagetable should guarantee this. + */ + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) + BUG(); + + l2tab = map_domain_page(l3e_get_pfn(l3e)); + + /* + * Just one l2 slot is used here, so at most 2M for p2m table: + * ((4K * 512)/sizeof(unsigned long)) * 4K = 2G + * should be OK on PAE xen, since Qemu DM can only map 1.5G VMX + * guest memory. + */ + mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = + l2tab[l2_table_offset(RO_MPT_VIRT_START)]; + + unmap_domain_page(l2tab); + unmap_domain_page(l3tab); + } + } + + unmap_domain_page(mpl2e); } void free_monitor_pagetable(struct vcpu *v) @@ -413,7 +467,7 @@ * free monitor_table. 
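[Note on the p2m code in these shadow-mode hunks: the phys-to-machine table is built lazily. A leaf page is allocated the first time a gpfn in its range is mapped, and a failed allocation is now reported to the caller, which calls domain_crash(), rather than crashing synchronously inside the helper. A two-level toy version, using calloc where Xen allocates and maps domheap pages:]

#include <stdio.h>
#include <stdlib.h>

#define ENTRIES 512

/* Two-level toy p2m: an l1 of pointers to l0 pages of mfns. */
static unsigned long *l1[ENTRIES];

static int set_p2m(unsigned long gpfn, unsigned long mfn)
{
    unsigned long *l0 = l1[gpfn / ENTRIES];
    if (!l0) {
        /* Allocate the leaf page only when first touched. */
        l0 = calloc(ENTRIES, sizeof(*l0));
        if (!l0)
            return 0;        /* propagated: caller crashes the domain */
        l1[gpfn / ENTRIES] = l0;
    }
    l0[gpfn % ENTRIES] = mfn;
    return 1;
}

int main(void)
{
    if (!set_p2m(1000, 0xabcd))
        return 1;
    printf("p2m[1000] = %#lx\n", l1[1000 / ENTRIES][1000 % ENTRIES]);
    return 0;
}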
*/ if ( v->vcpu_id == 0 ) - free_p2m_table(v); + free_p2m_table(v->domain); m3mfn = pagetable_get_pfn(v->arch.monitor_table); m2mfn = l2e_get_pfn(v->arch.monitor_vtable[L3_PAGETABLE_ENTRIES - 1]); @@ -1348,14 +1402,14 @@ } static int -map_p2m_entry( - pgentry_64_t *top_tab, unsigned long va, unsigned long gpa, unsigned long mfn) +map_p2m_entry(pgentry_64_t *top_tab, unsigned long va, + unsigned long gpfn, unsigned long mfn) { #if CONFIG_PAGING_LEVELS >= 4 pgentry_64_t l4e = { 0 }; + pgentry_64_t *l3tab = NULL; #endif #if CONFIG_PAGING_LEVELS >= 3 - pgentry_64_t *l3tab = NULL; pgentry_64_t l3e = { 0 }; #endif l2_pgentry_t *l2tab = NULL; @@ -1367,7 +1421,7 @@ #if CONFIG_PAGING_LEVELS >= 4 l4e = top_tab[l4_table_offset(va)]; - if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) ) + if ( !(entry_get_flags(l4e) & _PAGE_PRESENT) ) { page = alloc_domheap_page(NULL); if ( !page ) @@ -1375,17 +1429,14 @@ l3tab = map_domain_page(page_to_mfn(page)); memset(l3tab, 0, PAGE_SIZE); - l4e = top_tab[l4_table_offset(va)] = + l4e = top_tab[l4_table_offset(va)] = entry_from_page(page, __PAGE_HYPERVISOR); - } - else if ( l3tab == NULL) + } + else l3tab = map_domain_page(entry_get_pfn(l4e)); l3e = l3tab[l3_table_offset(va)]; -#else - l3e = top_tab[l3_table_offset(va)]; -#endif - if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) ) + if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) ) { page = alloc_domheap_page(NULL); if ( !page ) @@ -1393,14 +1444,29 @@ l2tab = map_domain_page(page_to_mfn(page)); memset(l2tab, 0, PAGE_SIZE); - l3e = l3tab[l3_table_offset(va)] = + l3e = l3tab[l3_table_offset(va)] = entry_from_page(page, __PAGE_HYPERVISOR); - } - else if ( l2tab == NULL) + } + else l2tab = map_domain_page(entry_get_pfn(l3e)); + unmap_domain_page(l3tab); +#else + l3e = top_tab[l3_table_offset(va)]; + + /* + * NB: when CONFIG_PAGING_LEVELS == 3, + * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here. + * alloc_monitor_pagetable should guarantee this. 
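[Note on a subtler point of the reworked map_p2m_entry() above: each page-table level is unmapped as soon as the walk steps past it, instead of keeping every level mapped and cleaning up at the end. The sketch below models that with a counter asserting that no more than two levels are ever mapped at once; map, unmap and walk are invented stand-ins for map_domain_page and friends.]

#include <assert.h>
#include <stdio.h>

static int live_mappings;

static void *map(void *page)   { live_mappings++; return page; }
static void unmap(void *page)  { (void)page; live_mappings--; }

struct level { struct level *next; };

/* Walk a chain, keeping at most one parent mapped at a time, the way
 * the reworked map_p2m_entry releases each table before descending. */
static struct level *walk(struct level *top, int depth)
{
    struct level *cur = map(top);
    while (depth-- > 0) {
        struct level *next = map(cur->next);
        unmap(cur);          /* drop the parent before moving on */
        cur = next;
        assert(live_mappings <= 2);
    }
    return cur;              /* caller unmaps the leaf */
}

int main(void)
{
    struct level l0 = { 0 }, l1 = { &l0 }, l2 = { &l1 }, l3 = { &l2 };
    struct level *leaf = walk(&l3, 3);
    unmap(leaf);
    printf("leaked mappings: %d\n", live_mappings);  /* 0 */
    return 0;
}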
+ */ + if ( !(entry_get_flags(l3e) & _PAGE_PRESENT) ) + BUG(); + + l2tab = map_domain_page(entry_get_pfn(l3e)); +#endif + l2e = l2tab[l2_table_offset(va)]; - if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) { page = alloc_domheap_page(NULL); if ( !page ) @@ -1408,14 +1474,16 @@ l1tab = map_domain_page(page_to_mfn(page)); memset(l1tab, 0, PAGE_SIZE); - l2e = l2tab[l2_table_offset(va)] = + l2e = l2tab[l2_table_offset(va)] = l2e_from_page(page, __PAGE_HYPERVISOR); - } - else if ( l1tab == NULL) + } + else l1tab = map_domain_page(l2e_get_pfn(l2e)); + unmap_domain_page(l2tab); + l1e = l1tab[l1_table_offset(va)]; - if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) + if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) { page = alloc_domheap_page(NULL); if ( !page ) @@ -1423,96 +1491,88 @@ l0tab = map_domain_page(page_to_mfn(page)); memset(l0tab, 0, PAGE_SIZE); - l1e = l1tab[l1_table_offset(va)] = + l1e = l1tab[l1_table_offset(va)] = l1e_from_page(page, __PAGE_HYPERVISOR); } - else if ( l0tab == NULL) + else l0tab = map_domain_page(l1e_get_pfn(l1e)); - l0tab[gpa & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn; - - if ( l2tab ) - { - unmap_domain_page(l2tab); - l2tab = NULL; - } - if ( l1tab ) - { - unmap_domain_page(l1tab); - l1tab = NULL; - } - if ( l0tab ) - { - unmap_domain_page(l0tab); - l0tab = NULL; - } + unmap_domain_page(l1tab); + + l0tab[gpfn & ((PAGE_SIZE / sizeof (mfn)) - 1) ] = mfn; + + unmap_domain_page(l0tab); return 1; nomem: - return 0; } int -set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn, +set_p2m_entry(struct domain *d, unsigned long gpfn, unsigned long mfn, struct domain_mmap_cache *l2cache, struct domain_mmap_cache *l1cache) { - unsigned long tabpfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); - pgentry_64_t *top; - unsigned long va = RO_MPT_VIRT_START + (pfn * sizeof (unsigned long)); + unsigned long tabmfn = pagetable_get_pfn(d->vcpu[0]->arch.monitor_table); + unsigned long va = RO_MPT_VIRT_START + (gpfn * sizeof(unsigned long)); + pgentry_64_t *top_tab; int error; - ASSERT(tabpfn != 0); + ASSERT(tabmfn != 0); ASSERT(shadow_lock_is_acquired(d)); - top = map_domain_page_with_cache(tabpfn, l2cache); - error = map_p2m_entry(top, va, pfn, mfn); - unmap_domain_page_with_cache(top, l2cache); - - if ( !error ) - domain_crash_synchronous(); - - return 1; -} - -int + top_tab = map_domain_page_with_cache(tabmfn, l2cache); + + if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) ) + domain_crash(d); + + unmap_domain_page_with_cache(top_tab, l2cache); + + return error; +} + +static int alloc_p2m_table(struct domain *d) { struct list_head *list_ent; unsigned long va = RO_MPT_VIRT_START; /* phys_to_machine_mapping */ pgentry_64_t *top_tab = NULL; unsigned long mfn; - int gpa; - - ASSERT ( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) ); + int gpfn, error = 0; + + ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) ); top_tab = map_domain_page( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table)); - list_ent = d->page_list.next; - for ( gpa = 0; list_ent != &d->page_list; gpa++ ) + for ( gpfn = 0; list_ent != &d->page_list; gpfn++ ) { struct page_info *page; + page = list_entry(list_ent, struct page_info, list); mfn = page_to_mfn(page); - map_p2m_entry(top_tab, va, gpa, mfn); + if ( !(error = map_p2m_entry(top_tab, va, gpfn, mfn)) ) + { + domain_crash(d); + break; + } + list_ent = frame_table[mfn].list.next; va += sizeof(mfn); } unmap_domain_page(top_tab); - return 1; + return error; } #if CONFIG_PAGING_LEVELS >= 3 static 
void -free_p2m_table(struct vcpu *v) +free_p2m_table(struct domain *d) { unsigned long va; l1_pgentry_t *l1tab; @@ -1520,27 +1580,35 @@ l2_pgentry_t *l2tab; l2_pgentry_t l2e; #if CONFIG_PAGING_LEVELS >= 3 - l3_pgentry_t *l3tab; + l3_pgentry_t *l3tab; l3_pgentry_t l3e; #endif #if CONFIG_PAGING_LEVELS == 4 int i3; - l4_pgentry_t *l4tab; + l4_pgentry_t *l4tab; l4_pgentry_t l4e; #endif - ASSERT ( pagetable_get_pfn(v->arch.monitor_table) ); + ASSERT( pagetable_get_pfn(d->vcpu[0]->arch.monitor_table) ); #if CONFIG_PAGING_LEVELS == 4 l4tab = map_domain_page( - pagetable_get_pfn(v->arch.monitor_table)); + pagetable_get_pfn(d->vcpu[0]->arch.monitor_table)); #endif #if CONFIG_PAGING_LEVELS == 3 l3tab = map_domain_page( - pagetable_get_pfn(v->arch.monitor_table)); - - va = RO_MPT_VIRT_START; - l3e = l3tab[l3_table_offset(va)]; + pagetable_get_pfn(d->vcpu[0]->arch.monitor_table)); + + l3e = l3tab[l3_table_offset(RO_MPT_VIRT_START)]; + + /* + * NB: when CONFIG_PAGING_LEVELS == 3, + * (entry_get_flags(l3e) & _PAGE_PRESENT) is always true here. + * alloc_monitor_pagetable should guarantee this. + */ + if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) + BUG(); + l2tab = map_domain_page(l3e_get_pfn(l3e)); #endif @@ -1555,8 +1623,8 @@ for ( i3 = 0; i3 < L3_PAGETABLE_ENTRIES; i3++ ) { - l3e = l3tab[l3_table_offset(va)]; + if ( l3e_get_flags(l3e) & _PAGE_PRESENT ) { int i2; @@ -1567,12 +1635,13 @@ { #endif l2e = l2tab[l2_table_offset(va)]; + if ( l2e_get_flags(l2e) & _PAGE_PRESENT ) { int i1; l1tab = map_domain_page(l2e_get_pfn(l2e)); - + /* * unsigned long phys_to_machine_mapping[] */ @@ -1591,7 +1660,7 @@ else va += PAGE_SIZE * L1_PAGETABLE_ENTRIES; -#if CONFIG_PAGING_LEVELS == 4 +#if CONFIG_PAGING_LEVELS == 4 } unmap_domain_page(l2tab); free_domheap_page(mfn_to_page(l3e_get_pfn(l3e))); @@ -1603,7 +1672,7 @@ free_domheap_page(mfn_to_page(l4e_get_pfn(l4e))); } else - va += PAGE_SIZE * + va += PAGE_SIZE * L1_PAGETABLE_ENTRIES * L2_PAGETABLE_ENTRIES * L3_PAGETABLE_ENTRIES; #endif } @@ -1622,7 +1691,7 @@ paddr_t pa, l1_pgentry_t gpte, struct domain_mmap_cache *cache) { - unsigned long sl1mfn; + unsigned long sl1mfn; l1_pgentry_t *spl1e, spte; shadow_lock(d); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/traps.c Wed Mar 1 19:47:25 2006 @@ -951,6 +951,7 @@ case 3: /* Write CR3 */ LOCK_BIGLOCK(v->domain); + cleanup_writable_pagetable(v->domain); (void)new_guest_cr3(gmfn_to_mfn(v->domain, paddr_to_pfn(*reg))); UNLOCK_BIGLOCK(v->domain); break; @@ -1002,7 +1003,6 @@ #endif default: if ( (rdmsr_safe(regs->ecx, l, h) != 0) || - (regs->ecx != MSR_EFER) || (regs->eax != l) || (regs->edx != h) ) DPRINTK("Domain attempted WRMSR %p from " "%08x:%08x to %08lx:%08lx.\n", @@ -1033,8 +1033,8 @@ goto fail; break; default: - DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx)); /* Everyone can read the MSR space. 
*/ + /*DPRINTK("Domain attempted RDMSR %p.\n", _p(regs->ecx));*/ if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) ) goto fail; break; @@ -1416,8 +1416,8 @@ { if ( hypercall_preempt_check() ) { - rc = hypercall1_create_continuation( - __HYPERVISOR_set_trap_table, traps); + rc = hypercall_create_continuation( + __HYPERVISOR_set_trap_table, "p", traps); break; } @@ -1430,7 +1430,7 @@ if ( cur.address == 0 ) break; - fixup_guest_selector(cur.cs); + fixup_guest_code_selector(cur.cs); memcpy(&dst[cur.vector], &cur, sizeof(cur)); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/asm-offsets.c --- a/xen/arch/x86/x86_32/asm-offsets.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/x86_32/asm-offsets.c Wed Mar 1 19:47:25 2006 @@ -72,6 +72,13 @@ DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked); BLANK(); + OFFSET(TSS_ss0, struct tss_struct, ss0); + OFFSET(TSS_esp0, struct tss_struct, esp0); + OFFSET(TSS_ss1, struct tss_struct, ss1); + OFFSET(TSS_esp1, struct tss_struct, esp1); + DEFINE(TSS_sizeof, sizeof(struct tss_struct)); + BLANK(); + OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa); OFFSET(VCPU_svm_hsa_pa, struct vcpu, arch.hvm_svm.host_save_pa); OFFSET(VCPU_svm_vmcb, struct vcpu, arch.hvm_svm.vmcb); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/entry.S --- a/xen/arch/x86/x86_32/entry.S Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/x86_32/entry.S Wed Mar 1 19:47:25 2006 @@ -77,6 +77,13 @@ restore_all_guest: testl $X86_EFLAGS_VM,UREGS_eflags(%esp) jnz restore_all_vm86 +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL + testl $2,UREGS_cs(%esp) + jnz 1f + call restore_ring0_guest + jmp restore_all_vm86 +1: +#endif FLT1: mov UREGS_ds(%esp),%ds FLT2: mov UREGS_es(%esp),%es FLT3: mov UREGS_fs(%esp),%fs @@ -157,6 +164,7 @@ ALIGN ENTRY(hypercall) subl $4,%esp + FIXUP_RING0_GUEST_STACK SAVE_ALL(b) sti GET_CURRENT(%ebx) @@ -294,6 +302,11 @@ popl %eax shll $16,%eax # Bits 16-23: saved_upcall_mask movw UREGS_cs+4(%esp),%ax # Bits 0-15: CS +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL + testw $2,%ax + jnz FLT15 + and $~3,%ax # RPL 1 -> RPL 0 +#endif FLT15: movl %eax,%gs:4(%esi) test $0x00FF0000,%eax # Bits 16-23: saved_upcall_mask setz %ch # %ch == !saved_upcall_mask @@ -388,6 +401,7 @@ pushl $TRAP_divide_error<<16 ALIGN error_code: + FIXUP_RING0_GUEST_STACK SAVE_ALL_NOSEGREGS(a) SET_XEN_SEGMENTS(a) testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%esp) @@ -505,6 +519,10 @@ jmp error_code ENTRY(nmi) +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL + # NMI entry protocol is incompatible with guest kernel in ring 0. + iret +#else # Save state but do not trash the segment registers! # We may otherwise be unable to reload them or copy them to ring 1. pushl %eax @@ -546,6 +564,7 @@ movl $(APIC_DM_FIXED | APIC_DEST_SELF | APIC_DEST_LOGICAL | \ TRAP_deferred_nmi),%ss:APIC_ICR(%eax) jmp restore_all_xen +#endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */ ENTRY(setup_vm86_frame) # Copies the entire stack frame forwards by 16 bytes. diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/mm.c --- a/xen/arch/x86/x86_32/mm.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/x86_32/mm.c Wed Mar 1 19:47:25 2006 @@ -23,6 +23,7 @@ #include <xen/init.h> #include <xen/mm.h> #include <xen/sched.h> +#include <xen/guest_access.h> #include <asm/current.h> #include <asm/page.h> #include <asm/flushtlb.h> @@ -180,9 +181,18 @@ page_set_owner(page, dom_xen); } } -} - -long subarch_memory_op(int op, void *arg) + + if ( supervisor_mode_kernel ) + { + /* Guest kernel runs in ring 0, not ring 1. 
*/ + struct desc_struct *d; + d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY]; + d[0].b &= ~_SEGMENT_DPL; + d[1].b &= ~_SEGMENT_DPL; + } +} + +long subarch_memory_op(int op, GUEST_HANDLE(void) arg) { struct xen_machphys_mfn_list xmml; unsigned long mfn; @@ -192,7 +202,7 @@ switch ( op ) { case XENMEM_machphys_mfn_list: - if ( copy_from_user(&xmml, arg, sizeof(xmml)) ) + if ( copy_from_guest(&xmml, arg, 1) ) return -EFAULT; max = min_t(unsigned int, xmml.max_extents, mpt_size >> 21); @@ -201,11 +211,12 @@ { mfn = l2e_get_pfn(idle_pg_table_l2[l2_linear_offset( RDWR_MPT_VIRT_START + (i << 21))]) + l1_table_offset(i << 21); - if ( put_user(mfn, &xmml.extent_start[i]) ) + if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) ) return -EFAULT; } - if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) ) + xmml.nr_extents = i; + if ( copy_to_guest(arg, &xmml, 1) ) return -EFAULT; break; @@ -223,7 +234,7 @@ int nr = smp_processor_id(); struct tss_struct *t = &init_tss[nr]; - fixup_guest_selector(ss); + fixup_guest_stack_selector(ss); current->arch.guest_context.kernel_ss = ss; current->arch.guest_context.kernel_sp = esp; @@ -239,6 +250,10 @@ unsigned long base, limit; u32 a = d->a, b = d->b; u16 cs; + + /* Let a ring0 guest kernel set any descriptor it wants to. */ + if ( supervisor_mode_kernel ) + return 1; /* A not-present descriptor will always fault, so is safe. */ if ( !(b & _SEGMENT_P) ) @@ -273,7 +288,7 @@ /* Validate and fix up the target code selector. */ cs = a >> 16; - fixup_guest_selector(cs); + fixup_guest_code_selector(cs); if ( !guest_gate_selector_okay(cs) ) goto bad; a = d->a = (d->a & 0xffffU) | (cs << 16); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/traps.c --- a/xen/arch/x86/x86_32/traps.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/x86_32/traps.c Wed Mar 1 19:47:25 2006 @@ -256,8 +256,14 @@ * We can't virtualise interrupt gates, as there's no way to get * the CPU to automatically clear the events_mask variable. Also we * must ensure that the CS is safe to poke into an interrupt gate. - */ - if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) ) + * + * When running with supervisor_mode_kernel enabled a direct trap + * to the guest OS cannot be used because the INT instruction will + * switch to the Xen stack and we need to swap back to the guest + * kernel stack before passing control to the system call entry point. + */ + if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) || + supervisor_mode_kernel ) { v->arch.int80_desc.a = v->arch.int80_desc.b = 0; return; @@ -278,8 +284,8 @@ { struct vcpu *d = current; - fixup_guest_selector(event_selector); - fixup_guest_selector(failsafe_selector); + fixup_guest_code_selector(event_selector); + fixup_guest_code_selector(failsafe_selector); d->arch.guest_context.event_callback_cs = event_selector; d->arch.guest_context.event_callback_eip = event_address; @@ -289,12 +295,51 @@ return 0; } -void hypercall_page_initialise(void *hypercall_page) -{ +static void hypercall_page_initialise_ring0_kernel(void *hypercall_page) +{ + extern asmlinkage int hypercall(void); char *p; int i; /* Fill in all the transfer points with template machine code. 
*/ + + for ( i = 0; i < NR_hypercalls; i++ ) + { + p = (char *)(hypercall_page + (i * 32)); + + *(u8 *)(p+ 0) = 0x9c; /* pushf */ + *(u8 *)(p+ 1) = 0xfa; /* cli */ + *(u8 *)(p+ 2) = 0xb8; /* mov $<i>,%eax */ + *(u32 *)(p+ 3) = i; + *(u8 *)(p+ 7) = 0x9a; /* lcall $__HYPERVISOR_CS,&hypercall */ + *(u32 *)(p+ 8) = (u32)&hypercall; + *(u16 *)(p+12) = (u16)__HYPERVISOR_CS; + *(u8 *)(p+14) = 0xc3; /* ret */ + } + + /* + * HYPERVISOR_iret is special because it doesn't return and expects a + * special stack frame. Guests jump at this transfer point instead of + * calling it. + */ + p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32)); + *(u8 *)(p+ 0) = 0x50; /* push %eax */ + *(u8 *)(p+ 1) = 0x9c; /* pushf */ + *(u8 *)(p+ 2) = 0xfa; /* cli */ + *(u8 *)(p+ 3) = 0xb8; /* mov $<i>,%eax */ + *(u32 *)(p+ 4) = __HYPERVISOR_iret; + *(u8 *)(p+ 8) = 0x9a; /* lcall $__HYPERVISOR_CS,&hypercall */ + *(u32 *)(p+ 9) = (u32)&hypercall; + *(u16 *)(p+13) = (u16)__HYPERVISOR_CS; +} + +static void hypercall_page_initialise_ring1_kernel(void *hypercall_page) +{ + char *p; + int i; + + /* Fill in all the transfer points with template machine code. */ + for ( i = 0; i < (PAGE_SIZE / 32); i++ ) { p = (char *)(hypercall_page + (i * 32)); @@ -314,6 +359,14 @@ *(u8 *)(p+ 1) = 0xb8; /* mov $__HYPERVISOR_iret,%eax */ *(u32 *)(p+ 2) = __HYPERVISOR_iret; *(u16 *)(p+ 6) = 0x82cd; /* int $0x82 */ +} + +void hypercall_page_initialise(void *hypercall_page) +{ + if ( supervisor_mode_kernel ) + hypercall_page_initialise_ring0_kernel(hypercall_page); + else + hypercall_page_initialise_ring1_kernel(hypercall_page); } /* diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_64/mm.c --- a/xen/arch/x86/x86_64/mm.c Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/x86_64/mm.c Wed Mar 1 19:47:25 2006 @@ -22,6 +22,7 @@ #include <xen/init.h> #include <xen/mm.h> #include <xen/sched.h> +#include <xen/guest_access.h> #include <asm/current.h> #include <asm/asm_defns.h> #include <asm/page.h> @@ -182,7 +183,7 @@ } } -long subarch_memory_op(int op, void *arg) +long subarch_memory_op(int op, GUEST_HANDLE(void) arg) { struct xen_machphys_mfn_list xmml; l3_pgentry_t l3e; @@ -194,7 +195,7 @@ switch ( op ) { case XENMEM_machphys_mfn_list: - if ( copy_from_user(&xmml, arg, sizeof(xmml)) ) + if ( copy_from_guest(&xmml, arg, 1) ) return -EFAULT; for ( i = 0, v = RDWR_MPT_VIRT_START; @@ -209,11 +210,12 @@ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) break; mfn = l2e_get_pfn(l2e) + l1_table_offset(v); - if ( put_user(mfn, &xmml.extent_start[i]) ) + if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) ) return -EFAULT; } - if ( put_user(i, &((struct xen_machphys_mfn_list *)arg)->nr_extents) ) + xmml.nr_extents = i; + if ( copy_to_guest(arg, &xmml, 1) ) return -EFAULT; break; @@ -228,7 +230,7 @@ long do_stack_switch(unsigned long ss, unsigned long esp) { - fixup_guest_selector(ss); + fixup_guest_stack_selector(ss); current->arch.guest_context.kernel_ss = ss; current->arch.guest_context.kernel_sp = esp; return 0; @@ -315,7 +317,7 @@ /* Validate and fix up the target code selector. 
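[Note on hypercall_page_initialise() above: it writes real machine code into 32-byte slots. A user-space sketch of the ring-1 flavour of the stub (mov $nr,%eax ; int $0x82 ; ret); the byte values are the genuine x86 encodings, and the little-endian memcpy of the immediate assumes an x86 host. NR_HYPERCALLS is an illustrative size. The ring-0 variant added by this patch instead emits pushf/cli and a far call to the hypervisor entry point.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NR_HYPERCALLS 64

/* Emit the ring-1 style stub: mov $nr,%eax ; int $0x82 ; ret */
static void emit_stub(uint8_t *p, uint32_t nr)
{
    p[0] = 0xb8;                     /* mov imm32, %eax */
    memcpy(p + 1, &nr, 4);           /* hypercall number */
    p[5] = 0xcd; p[6] = 0x82;        /* int $0x82 */
    p[7] = 0xc3;                     /* ret */
}

int main(void)
{
    static uint8_t page[4096];
    for (uint32_t i = 0; i < NR_HYPERCALLS; i++)
        emit_stub(page + i * 32, i); /* one 32-byte slot per call */

    /* Dump stub #3 so the encoding is visible. */
    for (int j = 0; j < 8; j++)
        printf("%02x ", page[3 * 32 + j]);
    printf("\n");
    return 0;
}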
*/ cs = a >> 16; - fixup_guest_selector(cs); + fixup_guest_code_selector(cs); if ( !guest_gate_selector_okay(cs) ) goto bad; a = d->a = (d->a & 0xffffU) | (cs << 16); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/dom0_ops.c --- a/xen/common/dom0_ops.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/dom0_ops.c Wed Mar 1 19:47:25 2006 @@ -46,6 +46,7 @@ struct vcpu *v; u64 cpu_time = 0; int flags = DOMFLAGS_BLOCKED; + struct vcpu_runstate_info runstate; info->domain = d->domain_id; info->nr_online_vcpus = 0; @@ -55,7 +56,8 @@ * - domain is marked as running if any of its vcpus is running */ for_each_vcpu ( d, v ) { - cpu_time += v->cpu_time; + vcpu_runstate_get(v, &runstate); + cpu_time += runstate.time[RUNSTATE_running]; info->max_vcpu_id = v->vcpu_id; if ( !test_bit(_VCPUF_down, &v->vcpu_flags) ) { @@ -165,7 +167,15 @@ domid_t dom; struct vcpu *v; unsigned int i, cnt[NR_CPUS] = { 0 }; + cpumask_t cpu_exclude_map; static domid_t rover = 0; + + /* + * Running the domain 0 kernel in ring 0 is not compatible + * with multiple guests. + */ + if ( supervisor_mode_kernel ) + return -EINVAL; dom = op->u.createdomain.domain; if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) ) @@ -195,18 +205,29 @@ read_lock(&domlist_lock); for_each_domain ( d ) for_each_vcpu ( d, v ) - cnt[v->processor]++; + if ( !test_bit(_VCPUF_down, &v->vcpu_flags) ) + cnt[v->processor]++; read_unlock(&domlist_lock); /* - * If we're on a HT system, we only use the first HT for dom0, other - * domains will all share the second HT of each CPU. Since dom0 is on - * CPU 0, we favour high numbered CPUs in the event of a tie. + * If we're on a HT system, we only auto-allocate to a non-primary HT. + * We favour high numbered CPUs in the event of a tie. */ - pro = smp_num_siblings - 1; - for ( i = pro; i < num_online_cpus(); i += smp_num_siblings ) + pro = first_cpu(cpu_sibling_map[0]); + if ( cpus_weight(cpu_sibling_map[0]) > 1 ) + pro = next_cpu(pro, cpu_sibling_map[0]); + cpu_exclude_map = cpu_sibling_map[0]; + for_each_online_cpu ( i ) + { + if ( cpu_isset(i, cpu_exclude_map) ) + continue; + if ( (i == first_cpu(cpu_sibling_map[i])) && + (cpus_weight(cpu_sibling_map[i]) > 1) ) + continue; + cpus_or(cpu_exclude_map, cpu_exclude_map, cpu_sibling_map[i]); if ( cnt[i] <= cnt[pro] ) pro = i; + } ret = -ENOMEM; if ( (d = domain_create(dom, pro)) == NULL ) @@ -485,6 +506,7 @@ { struct domain *d; struct vcpu *v; + struct vcpu_runstate_info runstate; ret = -ESRCH; if ( (d = find_domain_by_id(op->u.getvcpuinfo.domain)) == NULL ) @@ -498,10 +520,12 @@ if ( (v = d->vcpu[op->u.getvcpuinfo.vcpu]) == NULL ) goto getvcpuinfo_out; + vcpu_runstate_get(v, &runstate); + op->u.getvcpuinfo.online = !test_bit(_VCPUF_down, &v->vcpu_flags); op->u.getvcpuinfo.blocked = test_bit(_VCPUF_blocked, &v->vcpu_flags); op->u.getvcpuinfo.running = test_bit(_VCPUF_running, &v->vcpu_flags); - op->u.getvcpuinfo.cpu_time = v->cpu_time; + op->u.getvcpuinfo.cpu_time = runstate.time[RUNSTATE_running]; op->u.getvcpuinfo.cpu = v->processor; op->u.getvcpuinfo.cpumap = 0; memcpy(&op->u.getvcpuinfo.cpumap, diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/domain.c --- a/xen/common/domain.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/domain.c Wed Mar 1 19:47:25 2006 @@ -451,6 +451,41 @@ case VCPUOP_is_up: rc = !test_bit(_VCPUF_down, &v->vcpu_flags); break; + + case VCPUOP_get_runstate_info: + { + struct vcpu_runstate_info runstate; + vcpu_runstate_get(v, &runstate); + if ( copy_to_user(arg, &runstate, sizeof(runstate)) ) + rc = -EFAULT; + break; + } + + case 
VCPUOP_register_runstate_memory_area: + { + struct vcpu_register_runstate_memory_area area; + + rc = -EINVAL; + if ( v != current ) + break; + + rc = -EFAULT; + if ( copy_from_user(&area, arg, sizeof(area)) ) + break; + + if ( !access_ok(area.addr.v, sizeof(*area.addr.v)) ) + break; + + rc = 0; + v->runstate_guest = area.addr.v; + __copy_to_user(v->runstate_guest, &v->runstate, sizeof(v->runstate)); + + break; + } + + default: + rc = -ENOSYS; + break; } return rc; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/kernel.c --- a/xen/common/kernel.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/kernel.c Wed Mar 1 19:47:25 2006 @@ -195,6 +195,8 @@ (1U << XENFEAT_writable_page_tables) | (1U << XENFEAT_auto_translated_physmap) | (1U << XENFEAT_pae_pgdir_above_4gb); + if ( supervisor_mode_kernel ) + fi.submap |= 1U << XENFEAT_supervisor_mode_kernel; break; default: return -EINVAL; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/keyhandler.c --- a/xen/common/keyhandler.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/keyhandler.c Wed Mar 1 19:47:25 2006 @@ -169,8 +169,6 @@ } extern void dump_runq(unsigned char key); -extern void print_sched_histo(unsigned char key); -extern void reset_sched_histo(unsigned char key); #ifndef NDEBUG extern void audit_domains_key(unsigned char key); #endif @@ -206,10 +204,6 @@ 'd', dump_registers, "dump registers"); register_keyhandler( 'h', show_handlers, "show this message"); - register_keyhandler( - 'l', print_sched_histo, "print sched latency histogram"); - register_keyhandler( - 'L', reset_sched_histo, "reset sched latency histogram"); register_keyhandler( 'q', dump_domains, "dump domain (and guest debug) info"); register_keyhandler( diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/memory.c --- a/xen/common/memory.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/memory.c Wed Mar 1 19:47:25 2006 @@ -16,6 +16,7 @@ #include <xen/event.h> #include <xen/shadow.h> #include <xen/iocap.h> +#include <xen/guest_access.h> #include <asm/current.h> #include <asm/hardirq.h> #include <public/memory.h> @@ -30,7 +31,7 @@ static long increase_reservation( struct domain *d, - unsigned long *extent_list, + GUEST_HANDLE(xen_ulong) extent_list, unsigned int nr_extents, unsigned int extent_order, unsigned int flags, @@ -39,8 +40,8 @@ struct page_info *page; unsigned long i, mfn; - if ( (extent_list != NULL) && - !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) ) + if ( !guest_handle_is_null(extent_list) && + !guest_handle_okay(extent_list, nr_extents) ) return 0; if ( (extent_order != 0) && @@ -65,10 +66,10 @@ } /* Inform the domain of the new page's machine address. 
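[Note on VCPUOP_register_runstate_memory_area above: a guest nominates a buffer that Xen then refreshes with the vcpu's runstate, once immediately and again on every reschedule. A toy version, with memcpy standing in for __copy_to_user and the access_ok() validation omitted; struct toy_vcpu is invented.]

#include <stdio.h>
#include <string.h>

struct runstate_info {
    int state;
    unsigned long long state_entry_time;
    unsigned long long time[4];
};

struct toy_vcpu {
    struct runstate_info runstate;
    struct runstate_info *runstate_guest;   /* guest-registered area */
};

/* Remember the guest buffer and publish the current values once. */
static int register_area(struct toy_vcpu *v, struct runstate_info *area)
{
    v->runstate_guest = area;
    memcpy(v->runstate_guest, &v->runstate, sizeof(v->runstate));
    return 0;
}

/* Called on every context switch thereafter. */
static void publish(struct toy_vcpu *v)
{
    if (v->runstate_guest)
        memcpy(v->runstate_guest, &v->runstate, sizeof(v->runstate));
}

int main(void)
{
    static struct runstate_info guest_area;
    struct toy_vcpu v = { .runstate = { .state = 1 } };
    register_area(&v, &guest_area);
    v.runstate.time[1] += 123456;
    publish(&v);
    printf("guest sees state %d, time %llu\n",
           guest_area.state, guest_area.time[1]);
    return 0;
}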
*/ - if ( extent_list != NULL ) + if ( !guest_handle_is_null(extent_list) ) { mfn = page_to_mfn(page); - if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) ) + if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) ) return i; } } @@ -79,16 +80,16 @@ static long populate_physmap( struct domain *d, - unsigned long *extent_list, - unsigned int nr_extents, - unsigned int extent_order, - unsigned int flags, - int *preempted) + GUEST_HANDLE(xen_ulong) extent_list, + unsigned int nr_extents, + unsigned int extent_order, + unsigned int flags, + int *preempted) { struct page_info *page; unsigned long i, j, gpfn, mfn; - if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) ) + if ( !guest_handle_okay(extent_list, nr_extents) ) return 0; if ( (extent_order != 0) && @@ -103,7 +104,7 @@ goto out; } - if ( unlikely(__copy_from_user(&gpfn, &extent_list[i], sizeof(gpfn))) ) + if ( unlikely(__copy_from_guest_offset(&gpfn, extent_list, i, 1)) ) goto out; if ( unlikely((page = alloc_domheap_pages( @@ -128,7 +129,7 @@ set_gpfn_from_mfn(mfn + j, gpfn + j); /* Inform the domain of the new page's machine address. */ - if ( unlikely(__copy_to_user(&extent_list[i], &mfn, sizeof(mfn))) ) + if ( unlikely(__copy_to_guest_offset(extent_list, i, &mfn, 1)) ) goto out; } } @@ -139,8 +140,8 @@ static long decrease_reservation( - struct domain *d, - unsigned long *extent_list, + struct domain *d, + GUEST_HANDLE(xen_ulong) extent_list, unsigned int nr_extents, unsigned int extent_order, unsigned int flags, @@ -149,7 +150,7 @@ struct page_info *page; unsigned long i, j, gmfn, mfn; - if ( !array_access_ok(extent_list, nr_extents, sizeof(*extent_list)) ) + if ( !guest_handle_okay(extent_list, nr_extents) ) return 0; for ( i = 0; i < nr_extents; i++ ) @@ -160,7 +161,7 @@ return i; } - if ( unlikely(__copy_from_user(&gmfn, &extent_list[i], sizeof(gmfn))) ) + if ( unlikely(__copy_from_guest_offset(&gmfn, extent_list, i, 1)) ) return i; for ( j = 0; j < (1 << extent_order); j++ ) @@ -197,21 +198,21 @@ static long translate_gpfn_list( - struct xen_translate_gpfn_list *uop, unsigned long *progress) + GUEST_HANDLE(xen_translate_gpfn_list_t) uop, unsigned long *progress) { struct xen_translate_gpfn_list op; unsigned long i, gpfn, mfn; struct domain *d; - if ( copy_from_user(&op, uop, sizeof(op)) ) + if ( copy_from_guest(&op, uop, 1) ) return -EFAULT; /* Is size too large for us to encode a continuation? 
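[Note on the pattern shared by all the reservation ops above: process extents until hypercall_preempt_check() fires, then fold the resume index into the spare high bits of the command word and return a continuation; on re-entry the index is peeled back off. A self-contained model; the shift value and the preempt trigger here are illustrative, not Xen's.]

#include <stdio.h>

#define START_EXTENT_SHIFT 4   /* low bits carry the sub-op */

static int preempt_check(unsigned long i) { return i == 3; }

/* Process extents [start, nr); stop early if preempted and report how
 * far we got so the caller can encode a continuation. */
static unsigned long populate(unsigned long start, unsigned long nr)
{
    unsigned long i;
    for (i = start; i < nr; i++) {
        if (i != start && preempt_check(i))
            break;             /* resume from i on the next pass */
        printf("allocating extent %lu\n", i);
    }
    return i;
}

int main(void)
{
    unsigned long op = 5, nr = 6;
    unsigned long done = populate(0, nr);      /* stops at 3 */
    if (done < nr) {
        /* Fold the resume point into the command word, as the
         * hypercall_create_continuation() callers above do. */
        unsigned long cmd = op | (done << START_EXTENT_SHIFT);
        printf("re-enter with cmd %#lx\n", cmd);
        populate(cmd >> START_EXTENT_SHIFT, nr);   /* finishes 3..5 */
    }
    return 0;
}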
*/ if ( op.nr_gpfns > (ULONG_MAX >> START_EXTENT_SHIFT) ) return -EINVAL; - if ( !array_access_ok(op.gpfn_list, op.nr_gpfns, sizeof(*op.gpfn_list)) || - !array_access_ok(op.mfn_list, op.nr_gpfns, sizeof(*op.mfn_list)) ) + if ( !guest_handle_okay(op.gpfn_list, op.nr_gpfns) || + !guest_handle_okay(op.mfn_list, op.nr_gpfns) ) return -EFAULT; if ( op.domid == DOMID_SELF ) @@ -237,8 +238,7 @@ return -EAGAIN; } - if ( unlikely(__copy_from_user(&gpfn, &op.gpfn_list[i], - sizeof(gpfn))) ) + if ( unlikely(__copy_from_guest_offset(&gpfn, op.gpfn_list, i, 1)) ) { put_domain(d); return -EFAULT; @@ -246,8 +246,7 @@ mfn = gmfn_to_mfn(d, gpfn); - if ( unlikely(__copy_to_user(&op.mfn_list[i], &mfn, - sizeof(mfn))) ) + if ( unlikely(__copy_to_guest_offset(op.mfn_list, i, &mfn, 1)) ) { put_domain(d); return -EFAULT; @@ -258,7 +257,7 @@ return 0; } -long do_memory_op(unsigned long cmd, void *arg) +long do_memory_op(unsigned long cmd, GUEST_HANDLE(void) arg) { struct domain *d; int rc, op, flags = 0, preempted = 0; @@ -273,7 +272,7 @@ case XENMEM_increase_reservation: case XENMEM_decrease_reservation: case XENMEM_populate_physmap: - if ( copy_from_user(&reservation, arg, sizeof(reservation)) ) + if ( copy_from_guest(&reservation, arg, 1) ) return -EFAULT; /* Is size too large for us to encode a continuation? */ @@ -283,9 +282,9 @@ start_extent = cmd >> START_EXTENT_SHIFT; if ( unlikely(start_extent > reservation.nr_extents) ) return -EINVAL; - - if ( reservation.extent_start != NULL ) - reservation.extent_start += start_extent; + + if ( !guest_handle_is_null(reservation.extent_start) ) + guest_handle_add_offset(reservation.extent_start, start_extent); reservation.nr_extents -= start_extent; if ( (reservation.address_bits != 0) && @@ -342,8 +341,9 @@ rc += start_extent; if ( preempted ) - return hypercall2_create_continuation( - __HYPERVISOR_memory_op, op | (rc << START_EXTENT_SHIFT), arg); + return hypercall_create_continuation( + __HYPERVISOR_memory_op, "lh", + op | (rc << START_EXTENT_SHIFT), arg); break; @@ -353,10 +353,10 @@ case XENMEM_current_reservation: case XENMEM_maximum_reservation: - if ( copy_from_user(&domid, (domid_t *)arg, sizeof(domid)) ) + if ( copy_from_guest(&domid, arg, 1) ) return -EFAULT; - if ( likely((domid = (unsigned long)arg) == DOMID_SELF) ) + if ( likely(domid == DOMID_SELF) ) d = current->domain; else if ( !IS_PRIV(current->domain) ) return -EPERM; @@ -372,12 +372,13 @@ case XENMEM_translate_gpfn_list: progress = cmd >> START_EXTENT_SHIFT; - rc = translate_gpfn_list(arg, &progress); + rc = translate_gpfn_list( + guest_handle_cast(arg, xen_translate_gpfn_list_t), + &progress); if ( rc == -EAGAIN ) - return hypercall2_create_continuation( - __HYPERVISOR_memory_op, - op | (progress << START_EXTENT_SHIFT), - arg); + return hypercall_create_continuation( + __HYPERVISOR_memory_op, "lh", + op | (progress << START_EXTENT_SHIFT), arg); break; default: diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/multicall.c --- a/xen/common/multicall.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/multicall.c Wed Mar 1 19:47:25 2006 @@ -81,8 +81,8 @@ if ( i < nr_calls ) { mcs->flags = 0; - return hypercall2_create_continuation( - __HYPERVISOR_multicall, &call_list[i], nr_calls-i); + return hypercall_create_continuation( + __HYPERVISOR_multicall, "pi", &call_list[i], nr_calls-i); } } } diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/page_alloc.c --- a/xen/common/page_alloc.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/page_alloc.c Wed Mar 1 19:47:25 2006 @@ -32,6 +32,7 @@ #include <xen/softirq.h> #include 
<xen/shadow.h> #include <xen/domain_page.h> +#include <xen/keyhandler.h> #include <asm/page.h> /* @@ -662,6 +663,26 @@ } +static void pagealloc_keyhandler(unsigned char key) +{ + printk("Physical memory information:\n"); + printk(" Xen heap: %lukB free\n" + " DMA heap: %lukB free\n" + " Dom heap: %lukB free\n", + avail[MEMZONE_XEN]<<(PAGE_SHIFT-10), + avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10), + avail[MEMZONE_DOM]<<(PAGE_SHIFT-10)); +} + + +static __init int pagealloc_keyhandler_init(void) +{ + register_keyhandler('m', pagealloc_keyhandler, "memory info"); + return 0; +} +__initcall(pagealloc_keyhandler_init); + + /************************* * PAGE SCRUBBING diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/sched_bvt.c --- a/xen/common/sched_bvt.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/sched_bvt.c Wed Mar 1 19:47:25 2006 @@ -132,13 +132,13 @@ vcpu_schedule_unlock_irq(v); } -static inline u32 calc_avt(struct vcpu *d, s_time_t now) +static inline u32 calc_avt(struct vcpu *v, s_time_t now) { u32 ranfor, mcus; - struct bvt_dom_info *inf = BVT_INFO(d->domain); - struct bvt_vcpu_info *einf = EBVT_INFO(d); - - ranfor = (u32)(now - d->lastschd); + struct bvt_dom_info *inf = BVT_INFO(v->domain); + struct bvt_vcpu_info *einf = EBVT_INFO(v); + + ranfor = (u32)(now - v->runstate.state_entry_time); mcus = (ranfor + MCU - 1)/MCU; return einf->avt + mcus * inf->mcu_advance; @@ -262,7 +262,7 @@ curr_evt = calc_evt(curr, calc_avt(curr, now)); /* Calculate the time the current domain would run assuming the second smallest evt is of the newly woken domain */ - r_time = curr->lastschd + + r_time = curr->runstate.state_entry_time + ((einf->evt - curr_evt) / BVT_INFO(curr->domain)->mcu_advance) + ctx_allow; @@ -558,7 +558,6 @@ printk("%3d: %u has=%c ", loop++, v->domain->domain_id, test_bit(_VCPUF_running, &v->vcpu_flags) ? 'T':'F'); bvt_dump_runq_el(v); - printk("c=0x%X%08X\n", (u32)(v->cpu_time>>32), (u32)v->cpu_time); printk(" l: %p n: %p p: %p\n", &vcpu_inf->run_list, vcpu_inf->run_list.next, vcpu_inf->run_list.prev); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/sched_sedf.c --- a/xen/common/sched_sedf.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/sched_sedf.c Wed Mar 1 19:47:25 2006 @@ -1408,18 +1408,14 @@ { printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id, test_bit(_VCPUF_running, &d->vcpu_flags) ? 'T':'F'); - printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64 + printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu" " sc=%i xtr(%s)=%"PRIu64" ew=%hu", EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs, - EDOM_INFO(d)->weight, d->cpu_time, + EDOM_INFO(d)->weight, EDOM_INFO(d)->score[EXTRA_UTIL_Q], (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no", EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight); - if ( d->cpu_time != 0 ) - printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100) - / d->cpu_time); - #ifdef SEDF_STATS if ( EDOM_INFO(d)->block_time_tot != 0 ) printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) / diff -r 88f97bb8f3ae -r 673f62edbfbe xen/common/schedule.c --- a/xen/common/schedule.c Wed Mar 1 17:01:54 2006 +++ b/xen/common/schedule.c Wed Mar 1 19:47:25 2006 @@ -36,14 +36,6 @@ static char opt_sched[10] = "sedf"; string_param("sched", opt_sched); -/*#define WAKE_HISTO*/ -/*#define BLOCKTIME_HISTO*/ -#if defined(WAKE_HISTO) -#define BUCKETS 31 -#elif defined(BLOCKTIME_HISTO) -#define BUCKETS 200 -#endif - #define TIME_SLOP (s32)MICROSECS(50) /* allow time to slip a bit */ /* Various timer handlers. 
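[Note on the sched_bvt change above: with the lastschd field gone, calc_avt() derives the elapsed run time from runstate.state_entry_time, and still charges in whole MCUs, rounded up. The core arithmetic, runnable in isolation; the MCU size here is illustrative.]

#include <stdio.h>

#define MCU 1000ULL   /* one minimum charging unit, in ns (illustrative) */

/* BVT: charge the vcpu for the time it ran, rounded up to whole MCUs,
 * scaled by its mcu_advance (inverse weight). */
static unsigned long long calc_avt(unsigned long long avt,
                                   unsigned long long ran_for_ns,
                                   unsigned int mcu_advance)
{
    unsigned long long mcus = (ran_for_ns + MCU - 1) / MCU;  /* ceil */
    return avt + mcus * mcu_advance;
}

int main(void)
{
    /* 2.5 MCUs of runtime charges as 3 at mcu_advance 10. */
    printf("%llu\n", calc_avt(0, 2500, 10));   /* prints 30 */
    return 0;
}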
*/ @@ -73,6 +65,36 @@ /* Per-CPU periodic timer sends an event to the currently-executing domain. */ static struct timer t_timer[NR_CPUS]; +static inline void vcpu_runstate_change( + struct vcpu *v, int new_state, s_time_t new_entry_time) +{ + ASSERT(v->runstate.state != new_state); + ASSERT(spin_is_locked(&schedule_data[v->processor].schedule_lock)); + + v->runstate.time[v->runstate.state] += + new_entry_time - v->runstate.state_entry_time; + v->runstate.state_entry_time = new_entry_time; + v->runstate.state = new_state; +} + +void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate) +{ + if ( likely(v == current) ) + { + /* Fast lock-free path. */ + memcpy(runstate, &v->runstate, sizeof(*runstate)); + ASSERT(runstate->state == RUNSTATE_running); + runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time; + } + else + { + vcpu_schedule_lock_irq(v); + memcpy(runstate, &v->runstate, sizeof(*runstate)); + runstate->time[runstate->state] += NOW() - runstate->state_entry_time; + vcpu_schedule_unlock_irq(v); + } +} + struct domain *alloc_domain(void) { struct domain *d; @@ -119,6 +141,9 @@ v->cpu_affinity = is_idle_domain(d) ? cpumask_of_cpu(cpu_id) : CPU_MASK_ALL; + v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline; + v->runstate.state_entry_time = NOW(); + if ( (vcpu_id != 0) && !is_idle_domain(d) ) set_bit(_VCPUF_down, &v->vcpu_flags); @@ -165,8 +190,15 @@ unsigned long flags; vcpu_schedule_lock_irqsave(v, flags); + if ( likely(!vcpu_runnable(v)) ) + { + if ( v->runstate.state == RUNSTATE_runnable ) + vcpu_runstate_change(v, RUNSTATE_offline, NOW()); + SCHED_OP(sleep, v); + } + vcpu_schedule_unlock_irqrestore(v, flags); TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); @@ -187,11 +219,19 @@ unsigned long flags; vcpu_schedule_lock_irqsave(v, flags); + if ( likely(vcpu_runnable(v)) ) { + if ( v->runstate.state >= RUNSTATE_blocked ) + vcpu_runstate_change(v, RUNSTATE_runnable, NOW()); SCHED_OP(wake, v); - v->wokenup = NOW(); - } + } + else if ( !test_bit(_VCPUF_blocked, &v->vcpu_flags) ) + { + if ( v->runstate.state == RUNSTATE_blocked ) + vcpu_runstate_change(v, RUNSTATE_offline, NOW()); + } + vcpu_schedule_unlock_irqrestore(v, flags); TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id); @@ -376,8 +416,6 @@ stop_timer(&schedule_data[cpu].s_timer); - prev->cpu_time += now - prev->lastschd; - /* get policy-specific decision on scheduling... */ next_slice = ops.do_schedule(now); @@ -386,8 +424,6 @@ schedule_data[cpu].curr = next; - next->lastschd = now; - set_timer(&schedule_data[cpu].s_timer, now + r_time); if ( unlikely(prev == next) ) @@ -397,38 +433,23 @@ } TRACE_2D(TRC_SCHED_SWITCH_INFPREV, - prev->domain->domain_id, now - prev->lastschd); + prev->domain->domain_id, + now - prev->runstate.state_entry_time); TRACE_3D(TRC_SCHED_SWITCH_INFNEXT, - next->domain->domain_id, now - next->wokenup, r_time); - - /* - * Logic of wokenup field in domain struct: - * Used to calculate "waiting time", which is the time that a domain - * spends being "runnable", but not actually running. wokenup is set - * set whenever a domain wakes from sleeping. However, if wokenup is not - * also set here then a preempted runnable domain will get a screwed up - * "waiting time" value next time it is scheduled. 
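[Note on vcpu_runstate_change() above, the heart of the new accounting that replaces cpu_time, wokenup and the histogram machinery: on every transition it banks the time spent in the outgoing state and stamps the entry time of the new one. A runnable miniature with invented types and hand-fed timestamps:]

#include <stdio.h>

enum { RS_RUNNING, RS_RUNNABLE, RS_BLOCKED, RS_OFFLINE, RS_NR };

struct toy_vcpu {
    int state;
    unsigned long long state_entry_time;
    unsigned long long time[RS_NR];
};

/* Close out the old state's interval, then open the new one. */
static void runstate_change(struct toy_vcpu *v, int new_state,
                            unsigned long long now)
{
    v->time[v->state] += now - v->state_entry_time;
    v->state_entry_time = now;
    v->state = new_state;
}

int main(void)
{
    struct toy_vcpu v = { RS_OFFLINE, 0, { 0 } };
    runstate_change(&v, RS_RUNNABLE, 100);   /* offline for 100 */
    runstate_change(&v, RS_RUNNING,  150);   /* runnable for 50 */
    runstate_change(&v, RS_BLOCKED,  400);   /* ran for 250 */
    printf("ran %llu, runnable %llu, offline %llu\n",
           v.time[RS_RUNNING], v.time[RS_RUNNABLE], v.time[RS_OFFLINE]);
    return 0;
}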
- */ - prev->wokenup = now; - -#if defined(WAKE_HISTO) - if ( !is_idle_vcpu(next) && next->wokenup ) - { - ulong diff = (ulong)(now - next->wokenup); - diff /= (ulong)MILLISECS(1); - if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++; - else schedule_data[cpu].hist[BUCKETS-1]++; - } - next->wokenup = (s_time_t)0; -#elif defined(BLOCKTIME_HISTO) - prev->lastdeschd = now; - if ( !is_idle_vcpu(next) ) - { - ulong diff = (ulong)((now - next->lastdeschd) / MILLISECS(10)); - if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++; - else schedule_data[cpu].hist[BUCKETS-1]++; - } -#endif + next->domain->domain_id, + (next->runstate.state == RUNSTATE_runnable) ? + (now - next->runstate.state_entry_time) : 0, + r_time); + + ASSERT(prev->runstate.state == RUNSTATE_running); + vcpu_runstate_change( + prev, + (test_bit(_VCPUF_blocked, &prev->vcpu_flags) ? RUNSTATE_blocked : + (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)), + now); + + ASSERT(next->runstate.state != RUNSTATE_running); + vcpu_runstate_change(next, RUNSTATE_running, now); ASSERT(!test_bit(_VCPUF_running, &next->vcpu_flags)); set_bit(_VCPUF_running, &next->vcpu_flags); @@ -567,47 +588,6 @@ local_irq_restore(flags); } - -#if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO) - -void print_sched_histo(unsigned char key) -{ - int i, j, k; - for_each_online_cpu ( k ) - { - j = 0; - printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k); - for ( i = 0; i < BUCKETS; i++ ) - { - if ( schedule_data[k].hist[i] != 0 ) - { - if ( i < BUCKETS-1 ) - printk("%2d:[%7u] ", i, schedule_data[k].hist[i]); - else - printk(" >:[%7u] ", schedule_data[k].hist[i]); - if ( !(++j % 5) ) - printk("\n"); - } - } - printk("\n"); - } - -} - -void reset_sched_histo(unsigned char key) -{ - int i, j; - for ( j = 0; j < NR_CPUS; j++ ) - for ( i=0; i < BUCKETS; i++ ) - schedule_data[j].hist[i] = 0; -} - -#else - -void print_sched_histo(unsigned char key) { } -void reset_sched_histo(unsigned char key) { } - -#endif /* * Local variables: diff -r 88f97bb8f3ae -r 673f62edbfbe xen/drivers/char/console.c --- a/xen/drivers/char/console.c Wed Mar 1 17:01:54 2006 +++ b/xen/drivers/char/console.c Wed Mar 1 19:47:25 2006 @@ -335,8 +335,9 @@ } if ( hypercall_preempt_check() ) - return hypercall3_create_continuation( - __HYPERVISOR_console_io, CONSOLEIO_write, count, buffer); + return hypercall_create_continuation( + __HYPERVISOR_console_io, "iip", + CONSOLEIO_write, count, buffer); kcount = min_t(int, count, sizeof(kbuf)-1); if ( copy_from_user(kbuf, buffer, kcount) ) diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/config.h --- a/xen/include/asm-ia64/config.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-ia64/config.h Wed Mar 1 19:47:25 2006 @@ -36,6 +36,8 @@ //#define CONFIG_NR_CPUS 16 //leave SMP for a later time //#undef CONFIG_SMP + +#define supervisor_mode_kernel (0) #define MAX_DMADOM_PFN (0x7FFFFFFFUL >> PAGE_SHIFT) /* 31 addressable bits */ @@ -190,11 +192,6 @@ #define find_first_set_bit(x) (ffs(x)-1) // FIXME: Is this right??? 
-// from include/asm-x86/*/uaccess.h -#define array_access_ok(addr,count,size) \ - (likely(sizeof(count) <= 4) /* disallow 64-bit counts */ && \ - access_ok(type,addr,count*size)) - // see drivers/char/console.c #ifndef VALIDATE_VT #define OPT_CONSOLE_STR "com1" @@ -299,7 +296,6 @@ //#define raw_smp_processor_id() 0 //#endif - #ifndef __ASSEMBLY__ #include <linux/linkage.h> #define FORCE_CRASH() asm("break.m 0;;"); diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/linux-xen/asm/README.origin --- a/xen/include/asm-ia64/linux-xen/asm/README.origin Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-ia64/linux-xen/asm/README.origin Wed Mar 1 19:47:25 2006 @@ -22,4 +22,3 @@ system.h -> linux/include/asm-ia64/system.h tlbflush.h -> linux/include/asm-ia64/tlbflush.h types.h -> linux/include/asm-ia64/types.h -uaccess.h -> linux/include/asm-ia64/uaccess.h diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/config.h --- a/xen/include/asm-x86/config.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/config.h Wed Mar 1 19:47:25 2006 @@ -36,6 +36,12 @@ #define OPT_CONSOLE_STR "com1,vga" #define NR_CPUS 32 + +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL +# define supervisor_mode_kernel (1) +#else +# define supervisor_mode_kernel (0) +#endif /* Linkage for x86 */ #define __ALIGN .align 16,0x90 diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/desc.h --- a/xen/include/asm-x86/desc.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/desc.h Wed Mar 1 19:47:25 2006 @@ -27,9 +27,22 @@ #endif /* Fix up the RPL of a guest segment selector. */ -#define fixup_guest_selector(sel) \ +#define __fixup_guest_selector(sel) \ ((sel) = (((sel) & 3) >= GUEST_KERNEL_RPL) ? (sel) : \ (((sel) & ~3) | GUEST_KERNEL_RPL)) + +/* Stack selectors don't need fixing up if the kernel runs in ring 0. */ +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL +#define fixup_guest_stack_selector(ss) ((void)0) +#else +#define fixup_guest_stack_selector(ss) __fixup_guest_selector(ss) +#endif + +/* + * Code selectors are always fixed up. It allows the Xen exit stub to detect + * return to guest context, even when the guest kernel runs in ring 0. 
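On the selector-fixup split here: when the guest kernel genuinely runs in ring 0, its stack selector needs no RPL adjustment, but code selectors are still fixed up so the exit stub can distinguish a return to guest context from a return within Xen (the fixup_guest_code_selector() macro is defined immediately below). A sketch of the intended call sites, with regs standing in for a hypothetical struct cpu_user_regs:

    fixup_guest_stack_selector(regs->ss); /* no-op under CONFIG_X86_SUPERVISOR_MODE_KERNEL */
    fixup_guest_code_selector(regs->cs);  /* forces RPL to at least GUEST_KERNEL_RPL */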
+ */ +#define fixup_guest_code_selector(cs) __fixup_guest_selector(cs) /* * We need this function because enforcing the correct guest kernel RPL is diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/hvm.h --- a/xen/include/asm-x86/hvm/hvm.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/hvm/hvm.h Wed Mar 1 19:47:25 2006 @@ -67,6 +67,9 @@ int (*paging_enabled)(struct vcpu *v); int (*instruction_length)(struct vcpu *v); unsigned long (*get_guest_ctrl_reg)(struct vcpu *v, unsigned int num); + + void (*init_ap_context)(struct vcpu_guest_context *ctxt, + int vcpuid, int trampoline_vector); }; extern struct hvm_function_table hvm_funcs; @@ -173,4 +176,14 @@ return hvm_funcs.get_guest_ctrl_reg(v, num); return 0; /* force to fail */ } + +static inline void +hvm_init_ap_context(struct vcpu_guest_context *ctxt, + int vcpuid, int trampoline_vector) +{ + return hvm_funcs.init_ap_context(ctxt, vcpuid, trampoline_vector); +} + +extern int hvm_bringup_ap(int vcpuid, int trampoline_vector); + #endif /* __ASM_X86_HVM_HVM_H__ */ diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/emulate.h --- a/xen/include/asm-x86/hvm/svm/emulate.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/hvm/svm/emulate.h Wed Mar 1 19:47:25 2006 @@ -83,15 +83,15 @@ struct cpu_user_regs *regs, const u8 prefix, const u8 *operand, u8 *size); extern OPERATING_MODE get_operating_mode (struct vmcb_struct *vmcb); -extern unsigned int decode_dest_reg(u8 modrm); -extern unsigned int decode_src_reg(u8 modrm); +extern unsigned int decode_dest_reg(u8 prefix, u8 modrm); +extern unsigned int decode_src_reg(u8 prefix, u8 modrm); extern unsigned long svm_rip2pointer(struct vmcb_struct *vmcb); -extern unsigned int __get_instruction_length_from_list(struct vmcb_struct *vmcb, +extern int __get_instruction_length_from_list(struct vmcb_struct *vmcb, enum instruction_index *list, unsigned int list_count, u8 *guest_eip_buf, enum instruction_index *match); -static inline unsigned int __get_instruction_length(struct vmcb_struct *vmcb, +static inline int __get_instruction_length(struct vmcb_struct *vmcb, enum instruction_index instr, u8 *guest_eip_buf) { return __get_instruction_length_from_list(vmcb, &instr, 1, guest_eip_buf, @@ -138,9 +138,20 @@ } +static inline int skip_prefix_bytes(u8 *buf, size_t size) +{ + int index; + for (index = 0; index < size && is_prefix(buf[index]); index ++) + /* do nothing */ ; + return index; +} + + + static void inline __update_guest_eip(struct vmcb_struct *vmcb, - unsigned long inst_len) + int inst_len) { + ASSERT(inst_len > 0); vmcb->rip += inst_len; } diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/svm.h --- a/xen/include/asm-x86/hvm/svm/svm.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/hvm/svm/svm.h Wed Mar 1 19:47:25 2006 @@ -54,6 +54,8 @@ /* For debugging. Remove when no longer needed. 
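Stepping back to the emulate.h hunk above: __get_instruction_length() and its _from_list() helper now return a signed int so decode failure can be reported, and __update_guest_eip() asserts a positive length. A sketch of the check callers are now expected to make (INSTR_HLT stands in for any enum instruction_index value; NULL for guest_eip_buf is assumed here to make the helper fetch the instruction bytes itself):

    int inst_len = __get_instruction_length(vmcb, INSTR_HLT, NULL);
    if ( inst_len <= 0 )
        return;                         /* decode failed: do not advance rip */
    __update_guest_eip(vmcb, inst_len); /* ASSERT(inst_len > 0) now holds */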
*/ extern void svm_dump_host_regs(const char *from); +extern void svm_migrate_timers(struct vcpu *v); + /* ASID API */ enum { ASID_AVAILABLE = 0, diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/svm/vmcb.h --- a/xen/include/asm-x86/hvm/svm/vmcb.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/hvm/svm/vmcb.h Wed Mar 1 19:47:25 2006 @@ -269,21 +269,6 @@ #define SVM_LONG_GUEST(ed) \ (test_bit(SVM_CPU_STATE_LMA_ENABLED, &ed->arch.hvm_svm.cpu_state)) -enum { - SVM_INDEX_MSR_LSTAR = 0, - SVM_INDEX_MSR_STAR, - SVM_INDEX_MSR_CSTAR, - SVM_INDEX_MSR_SYSCALL_MASK, - SVM_INDEX_MSR_EFER, - - SVM_MSR_COUNT, -}; - -struct svm_msr_state { - unsigned long flags; - unsigned long msr_items[SVM_MSR_COUNT]; - unsigned long shadow_gs; -}; /* * Attribute for segment selector. This is a copy of bit 40:47 & 52:55 of the @@ -449,7 +434,7 @@ struct arch_svm_struct { struct vmcb_struct *vmcb; - void *host_save_area; + void *host_save_area; u64 host_save_pa; u64 vmcb_pa; u32 *iopm; @@ -457,14 +442,15 @@ u64 vmexit_tsc; /* tsc read at #VMEXIT. for TSC_OFFSET */ int injecting_event; int saved_irq_vector; - u32 core; /* cpu of last vmexit */ + u32 launch_core; + u32 asid_core; unsigned long flags; /* VMCB flags */ - unsigned long cpu_shadow_cr0; /* copy of guest read shadow CR0 */ + unsigned long cpu_shadow_cr0; /* Guest value for CR0 */ + unsigned long cpu_shadow_cr4; /* Guest value for CR4 */ unsigned long cpu_cr2; unsigned long cpu_cr3; unsigned long cpu_state; - struct svm_msr_state msr_content; struct timer hlt_timer; /* hlt ins emulation wakeup timer */ }; @@ -485,6 +471,14 @@ #define VMCB_EFLAGS_RESERVED_0 0xffc08028 /* bitmap for 0 */ #define VMCB_EFLAGS_RESERVED_1 0x00000002 /* bitmap for 1 */ + +/* These bits in the CR4 are owned by the host */ +#ifdef __i386__ +#define SVM_CR4_HOST_MASK (0) +#else +#define SVM_CR4_HOST_MASK (X86_CR4_PAE) +#endif + #endif /* ASM_X86_HVM_SVM_VMCS_H__ */ diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/vcpu.h --- a/xen/include/asm-x86/hvm/vcpu.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/hvm/vcpu.h Wed Mar 1 19:47:25 2006 @@ -25,10 +25,15 @@ #include <asm/hvm/vmx/vmcs.h> #include <asm/hvm/svm/vmcb.h> +#define HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM 0 +#define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI 1 + struct hvm_vcpu { - unsigned long ioflags; - struct mmio_op mmio_op; - struct vlapic *vlapic; + unsigned long ioflags; + struct mmio_op mmio_op; + struct vlapic *vlapic; + /* For AP startup */ + unsigned long init_sipi_sipi_state; union { struct arch_vmx_struct vmx; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/hvm/vlapic.h --- a/xen/include/asm-x86/hvm/vlapic.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/hvm/vlapic.h Wed Mar 1 19:47:25 2006 @@ -158,9 +158,6 @@ int deliver_mode; int source[6]; } direct_intr_info_t; - -#define VLAPIC_INIT_SIPI_SIPI_STATE_NORM 0 -#define VLAPIC_INIT_SIPI_SIPI_STATE_WAIT_SIPI 1 struct vlapic { @@ -197,7 +194,6 @@ unsigned long init_ticks; uint32_t err_write_count; uint64_t apic_base_msr; - uint32_t init_sipi_sipi_state; struct vcpu *vcpu; struct domain *domain; }; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/mm.h Wed Mar 1 19:47:25 2006 @@ -337,6 +337,10 @@ UNLOCK_BIGLOCK(d); \ } while ( 0 ) +#define writable_pagetable_in_sync(d) \ + (!((d)->arch.ptwr[PTWR_PT_ACTIVE].l1va | \ + (d)->arch.ptwr[PTWR_PT_INACTIVE].l1va)) + int audit_adjust_pgtables(struct domain *d, int dir, int noisy); #ifndef NDEBUG @@ -376,7 
+380,7 @@ int __sync_lazy_execstate(void); /* Arch-specific portion of memory_op hypercall. */ -long arch_memory_op(int op, void *arg); -long subarch_memory_op(int op, void *arg); +long arch_memory_op(int op, GUEST_HANDLE(void) arg); +long subarch_memory_op(int op, GUEST_HANDLE(void) arg); #endif /* __ASM_X86_MM_H__ */ diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/shadow_64.h --- a/xen/include/asm-x86/shadow_64.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/shadow_64.h Wed Mar 1 19:47:25 2006 @@ -223,6 +223,7 @@ int i; pgentry_64_t *le_e; pgentry_64_t *le_p = NULL; + pgentry_64_t *phys_vtable = NULL; unsigned long mfn; int index; u32 level = flag & L_MASK; @@ -251,25 +252,35 @@ { root_level = PAE_PAGING_LEVELS; index = table_offset_64(va, root_level); - le_e = (pgentry_64_t *)map_domain_page( + phys_vtable = (pgentry_64_t *)map_domain_page( pagetable_get_pfn(v->domain->arch.phys_table)); + le_e = &phys_vtable[index]; } /* * If it's not external mode, then mfn should be machine physical. */ - for (i = root_level - level; i > 0; i--) { - if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) ) { + for ( i = root_level - level; i > 0; i-- ) + { + if ( unlikely(!(entry_get_flags(*le_e) & _PAGE_PRESENT)) ) + { if ( le_p ) unmap_domain_page(le_p); + + if ( phys_vtable ) + unmap_domain_page(phys_vtable); + return 0; } + mfn = entry_get_pfn(*le_e); if ( (flag & GUEST_ENTRY) && shadow_mode_translate(d) ) mfn = get_mfn_from_gpfn(mfn); + if ( le_p ) unmap_domain_page(le_p); le_p = (pgentry_64_t *)map_domain_page(mfn); + if ( flag & SHADOW_ENTRY ) index = table_offset_64(va, (level + i - 1)); else @@ -285,8 +296,10 @@ if ( le_p ) unmap_domain_page(le_p); + if ( phys_vtable ) + unmap_domain_page(phys_vtable); + return 1; - } static inline int __rw_entry( diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/shadow_public.h --- a/xen/include/asm-x86/shadow_public.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/shadow_public.h Wed Mar 1 19:47:25 2006 @@ -21,8 +21,6 @@ #ifndef _XEN_SHADOW_PUBLIC_H #define _XEN_SHADOW_PUBLIC_H - -extern int alloc_p2m_table(struct domain *d); #if CONFIG_PAGING_LEVELS >= 3 #define MFN_PINNED(_x) (mfn_to_page(_x)->u.inuse.type_info & PGT_pinned) diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-x86/x86_32/asm_defns.h --- a/xen/include/asm-x86/x86_32/asm_defns.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/asm-x86/x86_32/asm_defns.h Wed Mar 1 19:47:25 2006 @@ -48,9 +48,24 @@ #ifdef PERF_COUNTERS #define PERFC_INCR(_name,_idx) \ - lock incl perfcounters+_name(,_idx,4) + lock incl perfcounters+_name(,_idx,4) #else #define PERFC_INCR(_name,_idx) +#endif + +#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL +#define FIXUP_RING0_GUEST_STACK \ + testl $2,8(%esp); \ + jnz 1f; /* rings 2 & 3 permitted */ \ + testl $1,8(%esp); \ + jz 2f; \ + ud2; /* ring 1 should not be used */ \ + 2:cmpl $(__HYPERVISOR_VIRT_START),%esp; \ + jge 1f; \ + call fixup_ring0_guest_stack; \ + 1: +#else +#define FIXUP_RING0_GUEST_STACK #endif #define BUILD_SMP_INTERRUPT(x,v) XBUILD_SMP_INTERRUPT(x,v) @@ -61,6 +76,7 @@ ".globl " STR(x) "\n\t" \ STR(x) ":\n\t" \ "pushl $"#v"<<16\n\t" \ + STR(FIXUP_RING0_GUEST_STACK) \ STR(SAVE_ALL(a)) \ "movl %esp,%eax\n\t" \ "pushl %eax\n\t" \ @@ -72,6 +88,7 @@ __asm__( \ "\n" __ALIGN_STR"\n" \ "common_interrupt:\n\t" \ + STR(FIXUP_RING0_GUEST_STACK) \ STR(SAVE_ALL(a)) \ "movl %esp,%eax\n\t" \ "pushl %eax\n\t" \ diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/memory.h --- a/xen/include/public/memory.h Wed Mar 1 17:01:54 2006 +++ 
b/xen/include/public/memory.h Wed Mar 1 19:47:25 2006 @@ -29,7 +29,7 @@ * OUT: GMFN bases of extents that were allocated * (NB. This command also updates the mach_to_phys translation table) */ - unsigned long *extent_start; + GUEST_HANDLE(xen_ulong) extent_start; /* Number of extents, and size/alignment of each (2^extent_order pages). */ unsigned long nr_extents; @@ -50,6 +50,7 @@ domid_t domid; } xen_memory_reservation_t; +DEFINE_GUEST_HANDLE(xen_memory_reservation_t); /* * Returns the maximum machine frame number of mapped RAM in this system. @@ -85,7 +86,7 @@ * any large discontiguities in the machine address space, 2MB gaps in * the machphys table will be represented by an MFN base of zero. */ - unsigned long *extent_start; + GUEST_HANDLE(xen_ulong) extent_start; /* * Number of extents written to the above array. This will be smaller @@ -93,6 +94,7 @@ */ unsigned int nr_extents; } xen_machphys_mfn_list_t; +DEFINE_GUEST_HANDLE(xen_machphys_mfn_list_t); /* * Returns the base and size of the specified reserved 'RAM hole' in the @@ -113,6 +115,7 @@ /* Base and size of the specified reserved area. */ unsigned long first_gpfn, nr_gpfns; } xen_reserved_phys_area_t; +DEFINE_GUEST_HANDLE(xen_reserved_phys_area_t); /* * Translates a list of domain-specific GPFNs into MFNs. Returns a -ve error @@ -127,14 +130,15 @@ unsigned long nr_gpfns; /* List of GPFNs to translate. */ - unsigned long *gpfn_list; + GUEST_HANDLE(xen_ulong) gpfn_list; /* * Output list to contain MFN translations. May be the same as the input * list (in which case each input GPFN is overwritten with the output MFN). */ - unsigned long *mfn_list; + GUEST_HANDLE(xen_ulong) mfn_list; } xen_translate_gpfn_list_t; +DEFINE_GUEST_HANDLE(xen_translate_gpfn_list_t); #endif /* __XEN_PUBLIC_MEMORY_H__ */ diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/vcpu.h --- a/xen/include/public/vcpu.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/public/vcpu.h Wed Mar 1 19:47:25 2006 @@ -51,6 +51,61 @@ /* Returns 1 if the given VCPU is up. */ #define VCPUOP_is_up 3 +/* + * Return information about the state and running time of a VCPU. + * @extra_arg == pointer to vcpu_runstate_info structure. + */ +#define VCPUOP_get_runstate_info 4 +typedef struct vcpu_runstate_info { + /* VCPU's current state (RUNSTATE_*). */ + int state; + /* When was current state entered (system time, ns)? */ + uint64_t state_entry_time; + /* + * Time spent in each RUNSTATE_* (ns). The sum of these times is + * guaranteed not to drift from system time. + */ + uint64_t time[4]; +} vcpu_runstate_info_t; + +/* VCPU is currently running on a physical CPU. */ +#define RUNSTATE_running 0 + +/* VCPU is runnable, but not currently scheduled on any physical CPU. */ +#define RUNSTATE_runnable 1 + +/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */ +#define RUNSTATE_blocked 2 + +/* + * VCPU is not runnable, but it is not blocked. + * This is a 'catch all' state for things like hotplug and pauses by the + * system administrator (or for critical sections in the hypervisor). + * RUNSTATE_blocked dominates this state (it is the preferred state). + */ +#define RUNSTATE_offline 3 + +/* + * Register a shared memory area from which the guest may obtain its own + * runstate information without needing to execute a hypercall. + * Notes: + * 1. The registered address may be virtual or physical, depending on the + * platform. The virtual address should be registered on x86 systems. + * 2. Only one shared area may be registered per VCPU. 
The shared area is + * updated by the hypervisor each time the VCPU is scheduled. Thus + * runstate.state will always be RUNSTATE_running and + * runstate.state_entry_time will indicate the system time at which the + * VCPU was last scheduled to run. + * @extra_arg == pointer to vcpu_register_runstate_memory_area structure. + */ +#define VCPUOP_register_runstate_memory_area 5 +typedef struct vcpu_register_runstate_memory_area { + union { + struct vcpu_runstate_info *v; + uint64_t p; + } addr; +} vcpu_register_runstate_memory_area_t; + #endif /* __XEN_PUBLIC_VCPU_H__ */ /* diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/version.h --- a/xen/include/public/version.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/public/version.h Wed Mar 1 19:47:25 2006 @@ -48,36 +48,8 @@ uint32_t submap; /* OUT: 32-bit submap */ } xen_feature_info_t; -/* - * If set, the guest does not need to write-protect its pagetables, and can - * update them via direct writes. - */ -#define XENFEAT_writable_page_tables 0 - -/* - * If set, the guest does not need to write-protect its segment descriptor - * tables, and can update them via direct writes. - */ -#define XENFEAT_writable_descriptor_tables 1 - -/* - * If set, translation between the guest's 'pseudo-physical' address space - * and the host's machine address space are handled by the hypervisor. In this - * mode the guest does not need to perform phys-to/from-machine translations - * when performing page table operations. - */ -#define XENFEAT_auto_translated_physmap 2 - -/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */ -#define XENFEAT_supervisor_mode_kernel 3 - -/* - * If set, the guest does not need to allocate x86 PAE page directories - * below 4GB. This flag is usually implied by auto_translated_physmap. - */ -#define XENFEAT_pae_pgdir_above_4gb 4 - -#define XENFEAT_NR_SUBMAPS 1 +/* Declares the features reported by XENVER_get_features. */ +#include "features.h" #endif /* __XEN_PUBLIC_VERSION_H__ */ diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/xen.h --- a/xen/include/public/xen.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/public/xen.h Wed Mar 1 19:47:25 2006 @@ -8,6 +8,22 @@ #ifndef __XEN_PUBLIC_XEN_H__ #define __XEN_PUBLIC_XEN_H__ + +#ifdef __XEN__ +#define DEFINE_GUEST_HANDLE(type) struct __guest_handle_ ## type { type *p; } +#define GUEST_HANDLE(type) struct __guest_handle_ ## type +#else +#define DEFINE_GUEST_HANDLE(type) +#define GUEST_HANDLE(type) type * +#endif + +#ifndef __ASSEMBLY__ +/* Guest handle for unsigned long pointer. Define a name with no whitespace. */ +typedef unsigned long xen_ulong; +DEFINE_GUEST_HANDLE(xen_ulong); +/* Guest handle for arbitrary-type pointer (void *). 
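Tying the two VCPUOPs above together: a guest registers a per-VCPU area once, after which the hypervisor refreshes it on every schedule and the guest can read its runstate without a hypercall. A guest-side sketch (HYPERVISOR_vcpu_op and the per-CPU variable are assumed names; per note 1 above, x86 registers a virtual address):

    static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

    static void register_runstate_area(int cpu)
    {
        struct vcpu_register_runstate_memory_area area;
        area.addr.v = &per_cpu(runstate, cpu);
        HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
    }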
*/ +DEFINE_GUEST_HANDLE(void); +#endif #if defined(__i386__) #include "arch-x86_32.h" diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/sched-if.h --- a/xen/include/xen/sched-if.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/xen/sched-if.h Wed Mar 1 19:47:25 2006 @@ -8,9 +8,6 @@ #ifndef __XEN_SCHED_IF_H__ #define __XEN_SCHED_IF_H__ -#define BUCKETS 10 -/*300*/ - struct schedule_data { spinlock_t schedule_lock; /* spinlock protecting curr */ struct vcpu *curr; /* current task */ @@ -18,9 +15,6 @@ void *sched_priv; struct timer s_timer; /* scheduling timer */ unsigned long tick; /* current periodic 'tick' */ -#ifdef BUCKETS - u32 hist[BUCKETS]; /* for scheduler latency histogram */ -#endif } __cacheline_aligned; extern struct schedule_data schedule_data[]; diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/sched.h --- a/xen/include/xen/sched.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/xen/sched.h Wed Mar 1 19:47:25 2006 @@ -8,6 +8,7 @@ #include <xen/smp.h> #include <public/xen.h> #include <public/dom0_ops.h> +#include <public/vcpu.h> #include <xen/time.h> #include <xen/timer.h> #include <xen/grant_table.h> @@ -63,14 +64,13 @@ struct vcpu *next_in_list; - struct timer timer; /* one-shot timer for timeout values */ + struct timer timer; /* one-shot timer for timeout values */ unsigned long sleep_tick; /* tick at which this vcpu started sleep */ - s_time_t lastschd; /* time this domain was last scheduled */ - s_time_t lastdeschd; /* time this domain was last descheduled */ - s_time_t cpu_time; /* total CPU time received till now */ - s_time_t wokenup; /* time domain got woken up */ void *sched_priv; /* scheduler-specific data */ + + struct vcpu_runstate_info runstate; + struct vcpu_runstate_info *runstate_guest; /* guest address */ unsigned long vcpu_flags; @@ -303,31 +303,18 @@ void startup_cpu_idle_loop(void); -unsigned long __hypercall_create_continuation( - unsigned int op, unsigned int nr_args, ...); -#define hypercall0_create_continuation(_op) \ - __hypercall_create_continuation((_op), 0) -#define hypercall1_create_continuation(_op, _a1) \ - __hypercall_create_continuation((_op), 1, \ - (unsigned long)(_a1)) -#define hypercall2_create_continuation(_op, _a1, _a2) \ - __hypercall_create_continuation((_op), 2, \ - (unsigned long)(_a1), (unsigned long)(_a2)) -#define hypercall3_create_continuation(_op, _a1, _a2, _a3) \ - __hypercall_create_continuation((_op), 3, \ - (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3)) -#define hypercall4_create_continuation(_op, _a1, _a2, _a3, _a4) \ - __hypercall_create_continuation((_op), 4, \ - (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \ - (unsigned long)(_a4)) -#define hypercall5_create_continuation(_op, _a1, _a2, _a3, _a4, _a5) \ - __hypercall_create_continuation((_op), 5, \ - (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \ - (unsigned long)(_a4), (unsigned long)(_a5)) -#define hypercall6_create_continuation(_op, _a1, _a2, _a3, _a4, _a5, _a6) \ - __hypercall_create_continuation((_op), 6, \ - (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \ - (unsigned long)(_a4), (unsigned long)(_a5), (unsigned long)(_a6)) +/* + * Creates a continuation to resume the current hypercall. The caller should + * return immediately, propagating the value returned from this invocation. + * The format string specifies the types and number of hypercall arguments. 
+ * It contains one character per argument as follows: + * 'i' [unsigned] {char, int} + * 'l' [unsigned] long + * 'p' pointer (foo *) + * 'h' guest handle (GUEST_HANDLE(foo)) + */ +unsigned long hypercall_create_continuation( + unsigned int op, const char *format, ...); #define hypercall_preempt_check() (unlikely( \ softirq_pending(smp_processor_id()) | \ @@ -397,7 +384,6 @@ #define _DOMF_debugging 4 #define DOMF_debugging (1UL<<_DOMF_debugging) - static inline int vcpu_runnable(struct vcpu *v) { return ( (atomic_read(&v->pausecnt) == 0) && @@ -415,6 +401,8 @@ int vcpu_set_affinity(struct vcpu *v, cpumask_t *affinity); +void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate); + static inline void vcpu_unblock(struct vcpu *v) { if ( test_and_clear_bit(_VCPUF_blocked, &v->vcpu_flags) ) diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/string.h --- a/xen/include/xen/string.h Wed Mar 1 17:01:54 2006 +++ b/xen/include/xen/string.h Wed Mar 1 19:47:25 2006 @@ -24,6 +24,9 @@ #endif #ifndef __HAVE_ARCH_STRNCPY extern char * strncpy(char *,const char *, __kernel_size_t); +#endif +#ifndef __HAVE_ARCH_STRLCPY +extern size_t strlcpy(char *,const char *, __kernel_size_t); #endif #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/arch/i386/mm/pgtable.c --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable.c Wed Mar 1 19:47:25 2006 @@ -0,0 +1,283 @@ +/* + * linux/arch/i386/mm/pgtable.c + */ + +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/smp.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/module.h> + +#include <asm/system.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/e820.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> + +void show_mem(void) +{ + int total = 0, reserved = 0; + int shared = 0, cached = 0; + int highmem = 0; + struct page *page; + pg_data_t *pgdat; + unsigned long i; + struct page_state ps; + unsigned long flags; + + printk(KERN_INFO "Mem-info:\n"); + show_free_areas(); + printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + for_each_pgdat(pgdat) { + pgdat_resize_lock(pgdat, &flags); + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + page = pgdat_page_nr(pgdat, i); + total++; + if (PageHighMem(page)) + highmem++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } + pgdat_resize_unlock(pgdat, &flags); + } + printk(KERN_INFO "%d pages of RAM\n", total); + printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); + printk(KERN_INFO "%d reserved pages\n", reserved); + printk(KERN_INFO "%d pages shared\n", shared); + printk(KERN_INFO "%d pages swap cached\n", cached); + + get_page_state(&ps); + printk(KERN_INFO "%lu pages dirty\n", ps.nr_dirty); + printk(KERN_INFO "%lu pages writeback\n", ps.nr_writeback); + printk(KERN_INFO "%lu pages mapped\n", ps.nr_mapped); + printk(KERN_INFO "%lu pages slab\n", ps.nr_slab); + printk(KERN_INFO "%lu pages pagetables\n", ps.nr_page_table_pages); +} + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. 
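One note on the strlcpy() declaration added above: unlike strncpy(), strlcpy() always NUL-terminates the destination and returns the full length of the source, so truncation is testable from the return value. A small sketch (input is a hypothetical NUL-terminated string):

    char name[8];
    size_t want = strlcpy(name, input, sizeof(name));
    if ( want >= sizeof(name) )
        printk("truncated: source was %lu bytes\n", (unsigned long)want);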
+ */ +static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* <pfn,flags> stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte(pfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* + * Associate a large virtual page frame with a given physical page frame + * and protection flags for that frame. pfn is for the base of the page, + * vaddr is what the page gets mapped to - both must be properly aligned. + * The pmd must already be instantiated. Assumes PAE mode. + */ +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ + printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); + return; /* BUG(); */ + } + if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ + printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); + return; /* BUG(); */ + } + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); + return; /* BUG(); */ + } + pud = pud_offset(pgd, vaddr); + pmd = pmd_offset(pud, vaddr); + set_pmd(pmd, pfn_pmd(pfn, flags)); + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +static int nr_fixmaps = 0; +unsigned long __FIXADDR_TOP = 0xfffff000; +EXPORT_SYMBOL(__FIXADDR_TOP); + +void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + nr_fixmaps++; +} + +void set_fixaddr_top(unsigned long top) +{ + BUG_ON(nr_fixmaps > 0); + __FIXADDR_TOP = top - PAGE_SIZE; +} + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); +} + +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + return pte; +} + +void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) +{ + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * The locking scheme was chosen on the basis of manfred's + * recommendations and having no core impact whatsoever. 
+ * -- wli + */ +DEFINE_SPINLOCK(pgd_lock); +struct page *pgd_list; + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + page->index = (unsigned long)pgd_list; + if (pgd_list) + set_page_private(pgd_list, (unsigned long)&page->index); + pgd_list = page; + set_page_private(page, (unsigned long)&pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *next, **pprev, *page = virt_to_page(pgd); + next = (struct page *)page->index; + pprev = (struct page **)page_private(page); + *pprev = next; + if (next) + set_page_private(next, (unsigned long)pprev); +} + +void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; + + if (PTRS_PER_PMD == 1) { + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + spin_lock_irqsave(&pgd_lock, flags); + } + + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + if (PTRS_PER_PMD > 1) + return; + + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +/* never called when PTRS_PER_PMD > 1 */ +void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) +{ + unsigned long flags; /* can be called from interrupt context */ + + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int i; + pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (!pmd) + goto out_oom; + set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); + } + return pgd; + +out_oom: + for (i--; i >= 0; i--) + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + kmem_cache_free(pgd_cache, pgd); + return NULL; +} + +void pgd_free(pgd_t *pgd) +{ + int i; + + /* in the PAE case user pgd entries are overwritten before usage */ + if (PTRS_PER_PMD > 1) + for (i = 0; i < USER_PTRS_PER_PGD; ++i) + kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + /* in the non-PAE case, free_pgtables() clears user pgd entries */ + kmem_cache_free(pgd_cache, pgd); +} diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/fixmap.h --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/fixmap.h Wed Mar 1 19:47:25 2006 @@ -0,0 +1,151 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + */ + +#ifndef _ASM_FIXMAP_H +#define _ASM_FIXMAP_H + +#include <linux/config.h> + +/* used by vmalloc.c, vsyscall.lds.S. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap. + */ +extern unsigned long __FIXADDR_TOP; + +#ifndef __ASSEMBLY__ +#include <linux/kernel.h> +#include <asm/acpi.h> +#include <asm/apicdef.h> +#include <asm/page.h> +#ifdef CONFIG_HIGHMEM +#include <linux/threads.h> +#include <asm/kmap_types.h> +#endif + +/* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. We allocate these special addresses + * from the end of virtual memory (0xfffff000) backwards. 
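A subtlety worth calling out in pgd_alloc()/pgd_free() above: a PAE pgd entry stores the pmd's physical address with the low (present) bit set, so the pointer round-trips by undoing both steps. This sketch mirrors the code above rather than adding new API:

    pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
    set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));   /* encode: phys | present bit */
    pmd = (pmd_t *)__va(pgd_val(pgd[i]) - 1); /* decode: strip bit, map back */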
+ * Also this lets us do fail-safe vmalloc(), we + * can guarantee that these special addresses and + * vmalloc()-ed addresses never overlap. + * + * these 'compile-time allocated' memory buffers are + * fixed-size 4k pages. (or larger if used with an increment + * highger than 1) use fixmap_set(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. + */ +enum fixed_addresses { + FIX_HOLE, +#ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#endif +#ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, +#endif +#ifdef CONFIG_X86_VISWS_APIC + FIX_CO_CPU, /* Cobalt timer */ + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ + FIX_LI_PCIA, /* Lithium PCI Bridge A */ + FIX_LI_PCIB, /* Lithium PCI Bridge B */ +#endif +#ifdef CONFIG_X86_F00F_BUG + FIX_F00F_IDT, /* Virtual mapping for IDT */ +#endif +#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_CYCLONE_TIMER, /*cyclone timer register*/ +#endif +#ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +#endif +#ifdef CONFIG_ACPI + FIX_ACPI_BEGIN, + FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, +#endif +#ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, +#endif + __end_of_permanent_fixed_addresses, + /* temporary boot-time mappings, used before ioremap() is functional */ +#define NR_FIX_BTMAPS 16 + FIX_BTMAP_END = __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, + FIX_WP_TEST, + __end_of_fixed_addresses +}; + +extern void __set_fixmap (enum fixed_addresses idx, + unsigned long phys, pgprot_t flags); + +extern void set_fixaddr_top(unsigned long top); + +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) +/* + * Some hardware wants to get fixmapped without caching. + */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + +#define clear_fixmap(idx) \ + __set_fixmap(idx, 0, __pgprot(0)) + +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) + +#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) +#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without tranlation, we catch the bug with a NULL-deference + * kernel oops. Illegal ranges of incoming indices are caught too. + */ +static __always_inline unsigned long fix_to_virt(const unsigned int idx) +{ + /* + * this branch gets completely eliminated after inlining, + * except when someone tries to use fixaddr indices in an + * illegal way. (such as mixing up address types or using + * out-of-range indices). + * + * If it doesn't get removed, the linker will complain + * loudly with a reasonably clear error message.. 
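As a usage note for the fixmap interface documented here: an index is bound to a physical page once, then addressed ever after through its compile-time-constant virtual address. A sketch in the style of the existing users (phys is a hypothetical page-aligned physical address; the nocache variant suits device registers):

    set_fixmap_nocache(FIX_CYCLONE_TIMER, phys);
    volatile u32 *reg = (volatile u32 *)fix_to_virt(FIX_CYCLONE_TIMER);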
+ */ + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + + return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); +} + +#endif /* !__ASSEMBLY__ */ +#endif diff -r 88f97bb8f3ae -r 673f62edbfbe linux-2.6-xen-sparse/include/asm-i386/page.h --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/page.h Wed Mar 1 19:47:25 2006 @@ -0,0 +1,148 @@ +#ifndef _I386_PAGE_H +#define _I386_PAGE_H + +/* PAGE_SHIFT determines the page size */ +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) + +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ + +#include <linux/config.h> + +#ifdef CONFIG_X86_USE_3DNOW + +#include <asm/mmx.h> + +#define clear_page(page) mmx_clear_page((void *)(page)) +#define copy_page(to,from) mmx_copy_page(to,from) + +#else + +/* + * On older X86 processors it's not a win to use MMX here it seems. + * Maybe the K6-III ? + */ + +#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) +#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) + +#endif + +#define clear_user_page(page, vaddr, pg) clear_page(page) +#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) + +#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) +#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE + +/* + * These are used to make use of C type-checking.. + */ +extern int nx_enabled; +#ifdef CONFIG_X86_PAE +extern unsigned long long __supported_pte_mask; +typedef struct { unsigned long pte_low, pte_high; } pte_t; +typedef struct { unsigned long long pmd; } pmd_t; +typedef struct { unsigned long long pgd; } pgd_t; +typedef struct { unsigned long long pgprot; } pgprot_t; +#define pmd_val(x) ((x).pmd) +#define pte_val(x) ((x).pte_low | ((unsigned long long)(x).pte_high << 32)) +#define __pmd(x) ((pmd_t) { (x) } ) +#define HPAGE_SHIFT 21 +#else +typedef struct { unsigned long pte_low; } pte_t; +typedef struct { unsigned long pgd; } pgd_t; +typedef struct { unsigned long pgprot; } pgprot_t; +#define boot_pte_t pte_t /* or would you rather have a typedef */ +#define pte_val(x) ((x).pte_low) +#define HPAGE_SHIFT 22 +#endif +#define PTE_MASK PAGE_MASK + +#ifdef CONFIG_HUGETLB_PAGE +#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) +#define HPAGE_MASK (~(HPAGE_SIZE - 1)) +#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) +#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA +#endif + +#define pgd_val(x) ((x).pgd) +#define pgprot_val(x) ((x).pgprot) + +#define __pte(x) ((pte_t) { (x) } ) +#define __pgd(x) ((pgd_t) { (x) } ) +#define __pgprot(x) ((pgprot_t) { (x) } ) + +#endif /* !__ASSEMBLY__ */ + +/* to align the pointer to the (next) page boundary */ +#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) + +/* + * This handles the memory map.. We could make this a config + * option, but too many people screw it up, and too few need + * it. + * + * A __PAGE_OFFSET of 0xC0000000 means that the kernel has + * a virtual address space of one gigabyte, which limits the + * amount of physical memory you can use to about 950MB. + * + * If you want more physical memory than this then see the CONFIG_HIGHMEM4G + * and CONFIG_HIGHMEM64G options in the kernel configuration. 
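The memory-map comment above is the whole story behind the lowmem helpers defined just below: with the kernel linearly mapped at PAGE_OFFSET, phys/virt conversion for directly-mapped memory is plain arithmetic. A sketch (kaddr is any lowmem kernel virtual address):

    unsigned long pa = __pa(kaddr);         /* kaddr - PAGE_OFFSET */
    void *va         = __va(pa);            /* pa + PAGE_OFFSET, i.e. kaddr again */
    struct page *pg  = virt_to_page(kaddr); /* mem_map + (pa >> PAGE_SHIFT) on flatmem */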
+ */ + +#ifndef __ASSEMBLY__ + +/* + * This much address space is reserved for vmalloc() and iomap() + * as well as fixmap mappings. + */ +extern unsigned int __VMALLOC_RESERVE; + +extern int sysctl_legacy_va_layout; + +extern int page_is_ram(unsigned long pagenr); + +#endif /* __ASSEMBLY__ */ + +#ifdef __ASSEMBLY__ +#define __PAGE_OFFSET CONFIG_PAGE_OFFSET +#define __PHYSICAL_START CONFIG_PHYSICAL_START +#else +#define __PAGE_OFFSET ((unsigned long)CONFIG_PAGE_OFFSET) +#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) +#endif +#define __KERNEL_START (__PAGE_OFFSET + __PHYSICAL_START) + + +#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) +#define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) +#define MAXMEM (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) +#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) +#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) +#ifdef CONFIG_FLATMEM +#define pfn_to_page(pfn) (mem_map + (pfn)) +#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif /* CONFIG_FLATMEM */ +#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) + +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) + +#define VM_DATA_DEFAULT_FLAGS \ + (VM_READ | VM_WRITE | \ + ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#define __HAVE_ARCH_GATE_AREA 1 + +#endif /* __KERNEL__ */ + +#include <asm-generic/page.h> + +#endif /* _I386_PAGE_H */ diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/i386-mach-io-check-nmi.patch --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/patches/linux-2.6.16-rc5/i386-mach-io-check-nmi.patch Wed Mar 1 19:47:25 2006 @@ -0,0 +1,45 @@ +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/traps.c 2006-02-27 15:46:58.000000000 +0000 ++++ ./arch/i386/kernel/traps.c 2006-02-27 15:55:23.000000000 +0000 +@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch + + static void io_check_error(unsigned char reason, struct pt_regs * regs) + { +- unsigned long i; +- + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); + show_registers(regs); + + /* Re-enable the IOCK line, wait for a few seconds */ +- reason = (reason & 0xf) | 8; +- outb(reason, 0x61); +- i = 2000; +- while (--i) udelay(1000); +- reason &= ~8; +- outb(reason, 0x61); ++ clear_io_check_error(reason); + } + + static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/mach-default/mach_traps.h 2006-01-03 03:21:10.000000000 +0000 ++++ ./include/asm-i386/mach-default/mach_traps.h 2006-02-27 15:55:23.000000000 +0000 +@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig + outb(reason, 0x61); + } + ++static inline void clear_io_check_error(unsigned char reason) ++{ ++ unsigned long i; ++ ++ reason = (reason & 0xf) | 8; ++ outb(reason, 0x61); ++ i = 2000; ++ while (--i) udelay(1000); ++ reason &= ~8; ++ outb(reason, 0x61); ++} ++ + static inline unsigned char get_nmi_reason(void) + { + return inb(0x61); diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/net-csum.patch --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/patches/linux-2.6.16-rc5/net-csum.patch Wed Mar 1 19:47:25 2006 @@ -0,0 
+1,41 @@ +diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c +--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-27 15:47:38.000000000 +0000 ++++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-27 15:55:25.000000000 +0000 +@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb, + if (hdrsize < sizeof(*hdr)) + return 1; + +- hdr->check = ip_nat_cheat_check(~oldip, newip, ++ if ((*pskb)->proto_csum_blank) { ++ hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check); ++ } else { ++ hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(oldport ^ 0xFFFF, + newport, + hdr->check)); ++ } + return 1; + } + +diff -pruN ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c +--- ../pristine-linux-2.6.16-rc5/net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-27 15:47:38.000000000 +0000 ++++ ./net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-27 15:55:25.000000000 +0000 +@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb, + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } +- if (hdr->check) /* 0 is a special case meaning no checksum */ +- hdr->check = ip_nat_cheat_check(~oldip, newip, ++ if (hdr->check) { /* 0 is a special case meaning no checksum */ ++ if ((*pskb)->proto_csum_blank) { ++ hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check); ++ } else { ++ hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + newport, + hdr->check)); ++ } ++ } + *portptr = newport; + return 1; + } diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/pmd-shared.patch --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/patches/linux-2.6.16-rc5/pmd-shared.patch Wed Mar 1 19:47:25 2006 @@ -0,0 +1,111 @@ +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c +--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pageattr.c 2006-02-27 15:46:58.000000000 +0000 ++++ ./arch/i386/mm/pageattr.c 2006-02-27 15:55:31.000000000 +0000 +@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns + unsigned long flags; + + set_pte_atomic(kpte, pte); /* change init_mm */ +- if (PTRS_PER_PMD > 1) ++ if (HAVE_SHARED_KERNEL_PMD) + return; + + spin_lock_irqsave(&pgd_lock, flags); +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c +--- ../pristine-linux-2.6.16-rc5/arch/i386/mm/pgtable.c 2006-01-03 03:21:10.000000000 +0000 ++++ ./arch/i386/mm/pgtable.c 2006-02-27 15:55:31.000000000 +0000 +@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c + spin_lock_irqsave(&pgd_lock, flags); + } + +- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, +- swapper_pg_dir + USER_PTRS_PER_PGD, +- KERNEL_PGD_PTRS); ++ if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD) ++ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, ++ swapper_pg_dir + USER_PTRS_PER_PGD, ++ KERNEL_PGD_PTRS); + if (PTRS_PER_PMD > 1) + return; + +@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + goto out_oom; + set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); + } ++ ++ if (!HAVE_SHARED_KERNEL_PMD) { ++ unsigned long flags; ++ ++ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { ++ pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); ++ if (!pmd) ++ goto out_oom; ++ set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd))); ++ } ++ ++ spin_lock_irqsave(&pgd_lock, flags); ++ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { ++ unsigned long v = (unsigned long)i << PGDIR_SHIFT; ++ pgd_t *kpgd = 
pgd_offset_k(v); ++ pud_t *kpud = pud_offset(kpgd, v); ++ pmd_t *kpmd = pmd_offset(kpud, v); ++ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); ++ memcpy(pmd, kpmd, PAGE_SIZE); ++ } ++ pgd_list_add(pgd); ++ spin_unlock_irqrestore(&pgd_lock, flags); ++ } ++ + return pgd; + + out_oom: +@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd) + int i; + + /* in the PAE case user pgd entries are overwritten before usage */ +- if (PTRS_PER_PMD > 1) +- for (i = 0; i < USER_PTRS_PER_PGD; ++i) +- kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); ++ if (PTRS_PER_PMD > 1) { ++ for (i = 0; i < USER_PTRS_PER_PGD; ++i) { ++ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); ++ kmem_cache_free(pmd_cache, pmd); ++ } ++ if (!HAVE_SHARED_KERNEL_PMD) { ++ unsigned long flags; ++ spin_lock_irqsave(&pgd_lock, flags); ++ pgd_list_del(pgd); ++ spin_unlock_irqrestore(&pgd_lock, flags); ++ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { ++ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); ++ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); ++ kmem_cache_free(pmd_cache, pmd); ++ } ++ } ++ } + /* in the non-PAE case, free_pgtables() clears user pgd entries */ + kmem_cache_free(pgd_cache, pgd); + } +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-2level-defs.h 2006-01-03 03:21:10.000000000 +0000 ++++ ./include/asm-i386/pgtable-2level-defs.h 2006-02-27 15:55:31.000000000 +0000 +@@ -1,6 +1,8 @@ + #ifndef _I386_PGTABLE_2LEVEL_DEFS_H + #define _I386_PGTABLE_2LEVEL_DEFS_H + ++#define HAVE_SHARED_KERNEL_PMD 0 ++ + /* + * traditional i386 two-level paging structure: + */ +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/pgtable-3level-defs.h 2006-01-03 03:21:10.000000000 +0000 ++++ ./include/asm-i386/pgtable-3level-defs.h 2006-02-27 15:55:31.000000000 +0000 +@@ -1,6 +1,8 @@ + #ifndef _I386_PGTABLE_3LEVEL_DEFS_H + #define _I386_PGTABLE_3LEVEL_DEFS_H + ++#define HAVE_SHARED_KERNEL_PMD 1 ++ + /* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc5/smp-alts.patch --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/patches/linux-2.6.16-rc5/smp-alts.patch Wed Mar 1 19:47:25 2006 @@ -0,0 +1,591 @@ +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig ./arch/i386/Kconfig +--- ../pristine-linux-2.6.16-rc5/arch/i386/Kconfig 2006-02-27 15:46:58.000000000 +0000 ++++ ./arch/i386/Kconfig 2006-02-27 15:55:34.000000000 +0000 +@@ -202,6 +202,19 @@ config SMP + + If you don't know what to do here, say N. + ++config SMP_ALTERNATIVES ++ bool "SMP alternatives support (EXPERIMENTAL)" ++ depends on SMP && EXPERIMENTAL ++ help ++ Try to reduce the overhead of running an SMP kernel on a uniprocessor ++ host slightly by replacing certain key instruction sequences ++ according to whether we currently have more than one CPU available. ++ This should provide a noticeable boost to performance when ++ running SMP kernels on UP machines, and have negligible impact ++ when running on an true SMP host. ++ ++ If unsure, say N. 
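The help text above summarizes the trick the rest of this patch implements: every lock-prefixed instruction is emitted through a LOCK macro that also records the site in a linker-built table, so prepare_for_smp()/unprepare_for_smp() can patch the prefix in or out as CPUs come and go. A sketch of what a converted primitive looks like (compare the atomic.h and bitops.h hunks below; atomic_inc_sketch is an illustrative name):

    static inline void atomic_inc_sketch(atomic_t *v)
    {
        /* LOCK expands to "lock ; " on a plain SMP build, to "" on UP,
         * and to a patchable nop plus table entry with SMP alternatives. */
        __asm__ __volatile__( LOCK "incl %0" : "+m" (v->counter) );
    }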
++ + config NR_CPUS + int "Maximum number of CPUs (2-255)" + range 2 255 +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/Makefile 2006-02-27 15:46:58.000000000 +0000 ++++ ./arch/i386/kernel/Makefile 2006-02-27 15:55:34.000000000 +0000 +@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI) += efi.o efi_stub.o + obj-$(CONFIG_DOUBLEFAULT) += doublefault.o + obj-$(CONFIG_VM86) += vm86.o + obj-$(CONFIG_EARLY_PRINTK) += early_printk.o ++obj-$(CONFIG_SMP_ALTERNATIVES) += smpalts.o + + EXTRA_AFLAGS := -traditional + +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpalts.c 1970-01-01 01:00:00.000000000 +0100 ++++ ./arch/i386/kernel/smpalts.c 2006-02-27 15:55:34.000000000 +0000 +@@ -0,0 +1,85 @@ ++#include <linux/kernel.h> ++#include <asm/system.h> ++#include <asm/smp_alt.h> ++#include <asm/processor.h> ++#include <asm/string.h> ++ ++struct smp_replacement_record { ++ unsigned char targ_size; ++ unsigned char smp1_size; ++ unsigned char smp2_size; ++ unsigned char up_size; ++ unsigned char feature; ++ unsigned char data[0]; ++}; ++ ++struct smp_alternative_record { ++ void *targ_start; ++ struct smp_replacement_record *repl; ++}; ++ ++extern struct smp_alternative_record __start_smp_alternatives_table, ++ __stop_smp_alternatives_table; ++extern unsigned long __init_begin, __init_end; ++ ++void prepare_for_smp(void) ++{ ++ struct smp_alternative_record *r; ++ printk(KERN_INFO "Enabling SMP...\n"); ++ for (r = &__start_smp_alternatives_table; ++ r != &__stop_smp_alternatives_table; ++ r++) { ++ BUG_ON(r->repl->targ_size < r->repl->smp1_size); ++ BUG_ON(r->repl->targ_size < r->repl->smp2_size); ++ BUG_ON(r->repl->targ_size < r->repl->up_size); ++ if (system_state == SYSTEM_RUNNING && ++ r->targ_start >= (void *)&__init_begin && ++ r->targ_start < (void *)&__init_end) ++ continue; ++ if (r->repl->feature != (unsigned char)-1 && ++ boot_cpu_has(r->repl->feature)) { ++ memcpy(r->targ_start, ++ r->repl->data + r->repl->smp1_size, ++ r->repl->smp2_size); ++ memset(r->targ_start + r->repl->smp2_size, ++ 0x90, ++ r->repl->targ_size - r->repl->smp2_size); ++ } else { ++ memcpy(r->targ_start, ++ r->repl->data, ++ r->repl->smp1_size); ++ memset(r->targ_start + r->repl->smp1_size, ++ 0x90, ++ r->repl->targ_size - r->repl->smp1_size); ++ } ++ } ++ /* Paranoia */ ++ asm volatile ("jmp 1f\n1:"); ++ mb(); ++} ++ ++void unprepare_for_smp(void) ++{ ++ struct smp_alternative_record *r; ++ printk(KERN_INFO "Disabling SMP...\n"); ++ for (r = &__start_smp_alternatives_table; ++ r != &__stop_smp_alternatives_table; ++ r++) { ++ BUG_ON(r->repl->targ_size < r->repl->smp1_size); ++ BUG_ON(r->repl->targ_size < r->repl->smp2_size); ++ BUG_ON(r->repl->targ_size < r->repl->up_size); ++ if (system_state == SYSTEM_RUNNING && ++ r->targ_start >= (void *)&__init_begin && ++ r->targ_start < (void *)&__init_end) ++ continue; ++ memcpy(r->targ_start, ++ r->repl->data + r->repl->smp1_size + r->repl->smp2_size, ++ r->repl->up_size); ++ memset(r->targ_start + r->repl->up_size, ++ 0x90, ++ r->repl->targ_size - r->repl->up_size); ++ } ++ /* Paranoia */ ++ asm volatile ("jmp 1f\n1:"); ++ mb(); ++} +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/smpboot.c 2006-02-27 15:46:58.000000000 +0000 ++++ ./arch/i386/kernel/smpboot.c 2006-02-27 15:55:34.000000000 +0000 +@@ 
-1208,6 +1208,11 @@ static void __init smp_boot_cpus(unsigne + if (max_cpus <= cpucount+1) + continue; + ++#ifdef CONFIG_SMP_ALTERNATIVES ++ if (kicked == 1) ++ prepare_for_smp(); ++#endif ++ + if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu)) + printk("CPU #%d not responding - cannot use it.\n", + apicid); +@@ -1386,6 +1391,11 @@ int __devinit __cpu_up(unsigned int cpu) + return -EIO; + } + ++#ifdef CONFIG_SMP_ALTERNATIVES ++ if (num_online_cpus() == 1) ++ prepare_for_smp(); ++#endif ++ + local_irq_enable(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* Unleash the CPU! */ +diff -pruN ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S +--- ../pristine-linux-2.6.16-rc5/arch/i386/kernel/vmlinux.lds.S 2006-01-03 03:21:10.000000000 +0000 ++++ ./arch/i386/kernel/vmlinux.lds.S 2006-02-27 15:55:34.000000000 +0000 +@@ -34,6 +34,13 @@ SECTIONS + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } + __stop___ex_table = .; + ++ . = ALIGN(16); ++ __start_smp_alternatives_table = .; ++ __smp_alternatives : { *(__smp_alternatives) } ++ __stop_smp_alternatives_table = .; ++ ++ __smp_replacements : { *(__smp_replacements) } ++ + RODATA + + /* writeable */ +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h ./include/asm-i386/atomic.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/atomic.h 2006-02-27 15:47:25.000000000 +0000 ++++ ./include/asm-i386/atomic.h 2006-02-27 15:55:34.000000000 +0000 +@@ -4,18 +4,13 @@ + #include <linux/config.h> + #include <linux/compiler.h> + #include <asm/processor.h> ++#include <asm/smp_alt.h> + + /* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +-#ifdef CONFIG_SMP +-#define LOCK "lock ; " +-#else +-#define LOCK "" +-#endif +- + /* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h ./include/asm-i386/bitops.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/bitops.h 2006-02-27 15:47:25.000000000 +0000 ++++ ./include/asm-i386/bitops.h 2006-02-27 15:55:34.000000000 +0000 +@@ -7,6 +7,7 @@ + + #include <linux/config.h> + #include <linux/compiler.h> ++#include <asm/smp_alt.h> + + /* + * These have to be done with inline assembly: that way the bit-setting +@@ -16,12 +17,6 @@ + * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). 
+ */ + +-#ifdef CONFIG_SMP +-#define LOCK_PREFIX "lock ; " +-#else +-#define LOCK_PREFIX "" +-#endif +- + #define ADDR (*(volatile long *) addr) + + /** +@@ -41,7 +36,7 @@ + */ + static inline void set_bit(int nr, volatile unsigned long * addr) + { +- __asm__ __volatile__( LOCK_PREFIX ++ __asm__ __volatile__( LOCK + "btsl %1,%0" + :"+m" (ADDR) + :"Ir" (nr)); +@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol + */ + static inline void clear_bit(int nr, volatile unsigned long * addr) + { +- __asm__ __volatile__( LOCK_PREFIX ++ __asm__ __volatile__( LOCK + "btrl %1,%0" + :"+m" (ADDR) + :"Ir" (nr)); +@@ -121,7 +116,7 @@ static inline void __change_bit(int nr, + */ + static inline void change_bit(int nr, volatile unsigned long * addr) + { +- __asm__ __volatile__( LOCK_PREFIX ++ __asm__ __volatile__( LOCK + "btcl %1,%0" + :"+m" (ADDR) + :"Ir" (nr)); +@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n + { + int oldbit; + +- __asm__ __volatile__( LOCK_PREFIX ++ __asm__ __volatile__( LOCK + "btsl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); +@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int + { + int oldbit; + +- __asm__ __volatile__( LOCK_PREFIX ++ __asm__ __volatile__( LOCK + "btrl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); +@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in + { + int oldbit; + +- __asm__ __volatile__( LOCK_PREFIX ++ __asm__ __volatile__( LOCK + "btcl %2,%1\n\tsbbl %0,%0" + :"=r" (oldbit),"+m" (ADDR) + :"Ir" (nr) : "memory"); +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h ./include/asm-i386/futex.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/futex.h 2006-02-27 15:47:25.000000000 +0000 ++++ ./include/asm-i386/futex.h 2006-02-27 15:55:34.000000000 +0000 +@@ -28,7 +28,7 @@ + "1: movl %2, %0\n\ + movl %0, %3\n" \ + insn "\n" \ +-"2: " LOCK_PREFIX "cmpxchgl %3, %2\n\ ++"2: " LOCK "cmpxchgl %3, %2\n\ + jnz 1b\n\ + 3: .section .fixup,\"ax\"\n\ + 4: mov %5, %1\n\ +@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op, + #endif + switch (op) { + case FUTEX_OP_ADD: +- __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, ++ __futex_atomic_op1(LOCK "xaddl %0, %2", ret, + oldval, uaddr, oparg); + break; + case FUTEX_OP_OR: +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/rwsem.h 2006-01-03 03:21:10.000000000 +0000 ++++ ./include/asm-i386/rwsem.h 2006-02-27 15:55:34.000000000 +0000 +@@ -40,6 +40,7 @@ + + #include <linux/list.h> + #include <linux/spinlock.h> ++#include <asm/smp_alt.h> + + struct rwsem_waiter; + +@@ -99,7 +100,7 @@ static inline void __down_read(struct rw + { + __asm__ __volatile__( + "# beginning down_read\n\t" +-LOCK_PREFIX " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */ ++LOCK " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */ + " js 2f\n\t" /* jump if we weren't granted the lock */ + "1:\n\t" + LOCK_SECTION_START("") +@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st + " movl %1,%2\n\t" + " addl %3,%2\n\t" + " jle 2f\n\t" +-LOCK_PREFIX " cmpxchgl %2,%0\n\t" ++LOCK " cmpxchgl %2,%0\n\t" + " jnz 1b\n\t" + "2:\n\t" + "# ending __down_read_trylock\n\t" +@@ -150,7 +151,7 @@ static inline void __down_write(struct r + tmp = RWSEM_ACTIVE_WRITE_BIAS; + __asm__ __volatile__( + "# beginning down_write\n\t" +-LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */ ++LOCK " xadd 
%%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */ + " testl %%edx,%%edx\n\t" /* was the count 0 before? */ + " jnz 2f\n\t" /* jump if we weren't granted the lock */ + "1:\n\t" +@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s + __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + __asm__ __volatile__( + "# beginning __up_read\n\t" +-LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */ ++LOCK " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */ + " js 2f\n\t" /* jump if the lock is being waited upon */ + "1:\n\t" + LOCK_SECTION_START("") +@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_ + __asm__ __volatile__( + "# beginning __up_write\n\t" + " movl %2,%%edx\n\t" +-LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */ ++LOCK " xaddl %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */ + " jnz 2f\n\t" /* jump if the lock is being waited upon */ + "1:\n\t" + LOCK_SECTION_START("") +@@ -239,7 +240,7 @@ static inline void __downgrade_write(str + { + __asm__ __volatile__( + "# beginning __downgrade_write\n\t" +-LOCK_PREFIX " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ ++LOCK " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ + " js 2f\n\t" /* jump if the lock is being waited upon */ + "1:\n\t" + LOCK_SECTION_START("") +@@ -263,7 +264,7 @@ LOCK_PREFIX " addl %2,(%%eax)\n\t" + static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) + { + __asm__ __volatile__( +-LOCK_PREFIX "addl %1,%0" ++LOCK "addl %1,%0" + : "=m"(sem->count) + : "ir"(delta), "m"(sem->count)); + } +@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in + int tmp = delta; + + __asm__ __volatile__( +-LOCK_PREFIX "xadd %0,(%2)" ++LOCK "xadd %0,(%2)" + : "+r"(tmp), "=m"(sem->count) + : "r"(sem), "m"(sem->count) + : "memory"); +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/smp_alt.h 1970-01-01 01:00:00.000000000 +0100 ++++ ./include/asm-i386/smp_alt.h 2006-02-27 15:55:34.000000000 +0000 +@@ -0,0 +1,32 @@ ++#ifndef __ASM_SMP_ALT_H__ ++#define __ASM_SMP_ALT_H__ ++ ++#include <linux/config.h> ++ ++#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE) ++#define LOCK \ ++ "6677: nop\n" \ ++ ".section __smp_alternatives,\"a\"\n" \ ++ ".long 6677b\n" \ ++ ".long 6678f\n" \ ++ ".previous\n" \ ++ ".section __smp_replacements,\"a\"\n" \ ++ "6678: .byte 1\n" \ ++ ".byte 1\n" \ ++ ".byte 0\n" \ ++ ".byte 1\n" \ ++ ".byte -1\n" \ ++ "lock\n" \ ++ "nop\n" \ ++ ".previous\n" ++void prepare_for_smp(void); ++void unprepare_for_smp(void); ++#else ++#define LOCK "lock ; " ++#endif ++#else ++#define LOCK "" ++#endif ++ ++#endif /* __ASM_SMP_ALT_H__ */ +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/spinlock.h 2006-01-03 03:21:10.000000000 +0000 ++++ ./include/asm-i386/spinlock.h 2006-02-27 15:55:34.000000000 +0000 +@@ -6,6 +6,7 @@ + #include <asm/page.h> + #include <linux/config.h> + #include <linux/compiler.h> ++#include <asm/smp_alt.h> + + /* + * Your basic SMP spinlocks, allowing only a single CPU anywhere +@@ -23,7 +24,8 @@ + + #define __raw_spin_lock_string \ + "\n1:\t" \ +- "lock ; decb %0\n\t" \ ++ LOCK \ ++ "decb %0\n\t" \ + "jns 3f\n" \ + "2:\t" \ + "rep;nop\n\t" \ +@@ -34,7 +36,8 @@ + + #define __raw_spin_lock_string_flags \ + "\n1:\t" \ +- "lock ; 
decb %0\n\t" \ ++ LOCK \ ++ "decb %0\n\t" \ + "jns 4f\n\t" \ + "2:\t" \ + "testl $0x200, %1\n\t" \ +@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags + static inline int __raw_spin_trylock(raw_spinlock_t *lock) + { + char oldval; ++#ifdef CONFIG_SMP_ALTERNATIVES + __asm__ __volatile__( +- "xchgb %b0,%1" ++ "1:movb %1,%b0\n" ++ "movb $0,%1\n" ++ "2:" ++ ".section __smp_alternatives,\"a\"\n" ++ ".long 1b\n" ++ ".long 3f\n" ++ ".previous\n" ++ ".section __smp_replacements,\"a\"\n" ++ "3: .byte 2b - 1b\n" ++ ".byte 5f-4f\n" ++ ".byte 0\n" ++ ".byte 6f-5f\n" ++ ".byte -1\n" ++ "4: xchgb %b0,%1\n" ++ "5: movb %1,%b0\n" ++ "movb $0,%1\n" ++ "6:\n" ++ ".previous\n" + :"=q" (oldval), "=m" (lock->slock) + :"0" (0) : "memory"); ++#else ++ __asm__ __volatile__( ++ "xchgb %b0,%1\n" ++ :"=q" (oldval), "=m" (lock->slock) ++ :"0" (0) : "memory"); ++#endif + return oldval > 0; + } + +@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra + + static inline void __raw_read_unlock(raw_rwlock_t *rw) + { +- asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory"); ++ asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory"); + } + + static inline void __raw_write_unlock(raw_rwlock_t *rw) + { +- asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0" ++ asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0" + : "=m" (rw->lock) : : "memory"); + } + +diff -pruN ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h ./include/asm-i386/system.h +--- ../pristine-linux-2.6.16-rc5/include/asm-i386/system.h 2006-02-27 15:47:25.000000000 +0000 ++++ ./include/asm-i386/system.h 2006-02-27 15:55:34.000000000 +0000 +@@ -5,7 +5,7 @@ + #include <linux/kernel.h> + #include <asm/segment.h> + #include <asm/cpufeature.h> +-#include <linux/bitops.h> /* for LOCK_PREFIX */ ++#include <asm/smp_alt.h> + + #ifdef __KERNEL__ + +@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo + unsigned long prev; + switch (size) { + case 1: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" ++ __asm__ __volatile__(LOCK "cmpxchgb %b1,%2" + : "=a"(prev) + : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 2: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" ++ __asm__ __volatile__(LOCK "cmpxchgw %w1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); + return prev; + case 4: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" ++ __asm__ __volatile__(LOCK "cmpxchgl %1,%2" + : "=a"(prev) + : "r"(new), "m"(*__xg(ptr)), "0"(old) + : "memory"); +@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc + unsigned long long new) + { + unsigned long long prev; +- __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3" ++ __asm__ __volatile__(LOCK "cmpxchg8b %3" + : "=A"(prev) + : "b"((unsigned long)new), + "c"((unsigned long)(new >> 32)), +@@ -503,11 +503,55 @@ struct alt_instr { + #endif + + #ifdef CONFIG_SMP ++#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE) ++#define smp_alt_mb(instr) \ ++__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \ ++ ".section __smp_alternatives,\"a\"\n" \ ++ ".long 6667b\n" \ ++ ".long 6673f\n" \ ++ ".previous\n" \ ++ ".section __smp_replacements,\"a\"\n" \ ++ "6673:.byte 6668b-6667b\n" \ ++ ".byte 6670f-6669f\n" \ ++ ".byte 6671f-6670f\n" \ ++ ".byte 0\n" \ ++ ".byte %c0\n" \ ++ "6669:lock;addl $0,0(%%esp)\n" \ ++ "6670:" instr "\n" \ ++ "6671:\n" \ ++ ".previous\n" \ ++ : \ ++ : "i" (X86_FEATURE_XMM2) \ ++ : "memory") ++#define smp_rmb() smp_alt_mb("lfence") ++#define smp_mb() smp_alt_mb("mfence") ++#define set_mb(var, value) 
do { \ ++unsigned long __set_mb_temp; \ ++__asm__ __volatile__("6667:movl %1, %0\n6668:\n" \ ++ ".section __smp_alternatives,\"a\"\n" \ ++ ".long 6667b\n" \ ++ ".long 6673f\n" \ ++ ".previous\n" \ ++ ".section __smp_replacements,\"a\"\n" \ ++ "6673: .byte 6668b-6667b\n" \ ++ ".byte 6670f-6669f\n" \ ++ ".byte 0\n" \ ++ ".byte 6671f-6670f\n" \ ++ ".byte -1\n" \ ++ "6669: xchg %1, %0\n" \ ++ "6670:movl %1, %0\n" \ ++ "6671:\n" \ ++ ".previous\n" \ ++ : "=m" (var), "=r" (__set_mb_temp) \ ++ : "1" (value) \ ++ : "memory"); } while (0) ++#else + #define smp_mb() mb() + #define smp_rmb() rmb() ++#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) ++#endif + #define smp_wmb() wmb() + #define smp_read_barrier_depends() read_barrier_depends() +-#define set_mb(var, value) do { (void) xchg(&var, value); } while (0) + #else + #define smp_mb() barrier() + #define smp_rmb() barrier() diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/locking.sh --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/tools/examples/locking.sh Wed Mar 1 19:47:25 2006 @@ -0,0 +1,98 @@ +# +# Copyright (c) 2005 XenSource Ltd. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +# +# Serialisation +# + +LOCK_SLEEPTIME=1 +LOCK_SPINNING_RETRIES=5 +LOCK_RETRIES=10 +LOCK_BASEDIR=/var/run/xen-hotplug + + +claim_lock() +{ + local lockdir="$LOCK_BASEDIR/$1" + mkdir -p "$LOCK_BASEDIR" + _claim_lock "$lockdir" +} + + +release_lock() +{ + _release_lock "$LOCK_BASEDIR/$1" +} + + +_claim_lock() +{ + local lockdir="$1" + local owner=$(_lock_owner "$lockdir") + local retries=0 + + while [ $retries -lt $LOCK_RETRIES ] + do + mkdir "$lockdir" 2>/dev/null && trap "release_lock $1; sigerr" ERR && + _update_lock_info "$lockdir" && return + + local new_owner=$(_lock_owner "$lockdir") + if [ "$new_owner" != "$owner" ] + then + owner="$new_owner" + retries=0 + fi + + if [ $retries -gt $LOCK_SPINNING_RETRIES ] + then + sleep $LOCK_SLEEPTIME + else + sleep 0 + fi + retries=$(($retries + 1)) + done + _steal_lock "$lockdir" +} + + +_release_lock() +{ + trap sigerr ERR + rm -rf "$1" 2>/dev/null || true +} + + +_steal_lock() +{ + local lockdir="$1" + local owner=$(cat "$lockdir/owner" 2>/dev/null || echo "unknown") + log err "Forced to steal lock on $lockdir from $owner!" + _release_lock "$lockdir" + _claim_lock "$lockdir" +} + + +_lock_owner() +{ + cat "$1/owner" 2>/dev/null || echo "unknown" +} + + +_update_lock_info() +{ + echo "$$: $0" >"$1/owner" +} diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/logging.sh --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/tools/examples/logging.sh Wed Mar 1 19:47:25 2006 @@ -0,0 +1,22 @@ +# +# Copyright (c) 2005 XenSource Ltd. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. 
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+log() {
+ local level="$1"
+ shift
+ logger -p "daemon.$level" -- "$0:" "$@" || echo "$0 $@" >&2
+}
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-delete
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/examples/vtpm-delete Wed Mar 1 19:47:25 2006
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+# This script must be called the following way:
+# vtpm-delete <domain name>
+
+dir=$(dirname "$0")
+. "$dir/vtpm-common.sh"
+
+vtpm_delete_instance $1
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/vtpm-hotplug-common.sh
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/examples/vtpm-hotplug-common.sh Wed Mar 1 19:47:25 2006
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2005 IBM Corporation
+# Copyright (c) 2005 XenSource Ltd.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+findCommand "$@"
+if [ "$command" != "online" ] &&
+ [ "$command" != "offline" ] &&
+ [ "$command" != "add" ] &&
+ [ "$command" != "remove" ]
+then
+ log err "Invalid command: $command"
+ exit 1
+fi
+
+
+XENBUS_PATH="${XENBUS_PATH:?}"
+
+. "$dir/vtpm-common.sh"
diff -r 88f97bb8f3ae -r 673f62edbfbe tools/examples/xen-hotplug-cleanup
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/tools/examples/xen-hotplug-cleanup Wed Mar 1 19:47:25 2006
@@ -0,0 +1,21 @@
+#! /bin/sh
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+
+# Claim the lock protecting /etc/xen/scripts/block. This stops a race whereby
+# paths in the store would disappear underneath that script as it attempted to
+# read from the store checking for device sharing.
+# Any other scripts that do similar things will have to have their lock
+# claimed too.
+# This is pretty horrible, but there's not really a nicer way of solving this.
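(Aside: the serialisation in locking.sh above works because mkdir(2) is atomic; the first caller creates the lock directory and every other caller fails with EEXIST until the directory is removed or stolen. A minimal C sketch of the same claim/release discipline follows; the path and retry limit are illustrative only, not taken from the scripts.)

    /* Sketch of locking.sh's mkdir()-based mutual exclusion. */
    #include <errno.h>
    #include <sys/stat.h>
    #include <unistd.h>

    static int claim_lock(const char *lockdir, int retries)
    {
        while (retries-- > 0) {
            if (mkdir(lockdir, 0700) == 0)
                return 0;             /* directory created: lock is ours */
            if (errno != EEXIST)
                return -1;            /* unexpected failure */
            sleep(1);                 /* cf. LOCK_SLEEPTIME */
        }
        return -1;                    /* a caller could now steal, cf. _steal_lock */
    }

    static void release_lock(const char *lockdir)
    {
        rmdir(lockdir);               /* cf. _release_lock's rm -rf */
    }

    int main(void)
    {
        const char *lock = "/var/run/xen-hotplug/block";  /* hypothetical path */
        if (claim_lock(lock, 10) == 0) {
            /* ... touch the relevant xenstore paths safely ... */
            release_lock(lock);
        }
        return 0;
    }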
+claim_lock "block" + +# remove device frontend store entries +xenstore-rm -t $(xenstore-read "$XENBUS_PATH/frontend") || true + +# remove device backend store entries +xenstore-rm -t "$XENBUS_PATH" || true +xenstore-rm -t "error/$XENBUS_PATH" || true + +release_lock "block" diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/01_vtpm-list_pos.py --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/tools/xm-test/tests/vtpm/01_vtpm-list_pos.py Wed Mar 1 19:47:25 2006 @@ -0,0 +1,45 @@ +#!/usr/bin/python + +# Copyright (C) International Business Machines Corp., 2006 +# Author: Stefan Berger <stefanb@xxxxxxxxxx) + +# Positive Test: create domain with virtual TPM attached at build time, +# verify list + + +from XmTestLib import * + +def vtpm_cleanup(domName): + # Since this is only a temporary domain I clean up the domain from the + # virtual TPM directory + traceCommand("/etc/xen/scripts/vtpm-delete %s" % domName) + +if ENABLE_HVM_SUPPORT: + SKIP("vtpm-list not supported for HVM domains") + +config = {"vtpm":"instance=1,backend=0"} +domain = XmTestDomain(extraConfig=config) + +try: + domain.start() +except DomainError, e: + if verbose: + print e.extra + vtpm_cleanup(domain.getName()) + FAIL("Unable to create domain") + +domName = domain.getName() + +status, output = traceCommand("xm vtpm-list %s" % domain.getId()) +eyecatcher = "/local/domain/0/backend/vtpm" +where = output.find(eyecatcher) +if status != 0: + vtpm_cleanup(domName) + FAIL("xm vtpm-list returned bad status, expected 0, status is %i" % status) +elif where < 0: + vtpm_cleanup(domName) + FAIL("Fail to list virtual TPM device") + +domain.stop() + +vtpm_cleanup(domName) diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/02_vtpm-cat_pcrs.py --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/tools/xm-test/tests/vtpm/02_vtpm-cat_pcrs.py Wed Mar 1 19:47:25 2006 @@ -0,0 +1,81 @@ +#!/usr/bin/python + +# Copyright (C) International Business Machines Corp., 2006 +# Author: Stefan Berger <stefanb@xxxxxxxxxx) + +# Positive Test: create domain with virtual TPM attached at build time, +# check list of pcrs + +from XmTestLib import * + +def vtpm_cleanup(domName): + # Since this is only a temporary domain I clean up the domain from the + # virtual TPM directory + traceCommand("/etc/xen/scripts/vtpm-delete %s" % domName) + +if ENABLE_HVM_SUPPORT: + SKIP("vtpm-list not supported for HVM domains") + +status, output = traceCommand("ls /dev/tpm0") +if re.search("No such file or directory",output): + SKIP("This machine has no hardware TPM; cannot run this test") + +status, output = traceCommand("ps aux | grep vtpm_manager | grep -v grep") +if output == "": + FAIL("virtual TPM manager must be started to run this test") + +# vtpm manager has been detected +config = {"vtpm":"instance=1,backend=0"} +domain = XmTestDomain(extraConfig=config) + +try: + domain.start() +except DomainError, e: + if verbose: + print e.extra + vtpm_cleanup(domain.getName()) + FAIL("Unable to create domain") + +domName = domain.getName() + +try: + console = XmConsole(domain.getName()) +except ConsoleError, e: + vtpm_cleanup(domName) + FAIL(str(e)) + +try: + console.sendInput("input") + run = console.runCmd("ls /sys") +except ConsoleError, e: + saveLog(console.getHistory()) + vtpm_cleanup(domName) + FAIL(str(e)) + +if re.search("No such file",run["output"]): + try: + run = console.runCmd("mkdir /sys") + run = console.runCmd("mount -t sysfs /sys /sys") + except ConsoleError, e: + saveLog(console.getHistory()) + vtpm_cleanup(domName) + FAIL(str(e)) + +try: + run = 
console.runCmd("cat /sys/devices/platform/tpm_vtpm/pcrs") +except ConsoleError, e: + saveLog(console.getHistory()) + vtpm_cleanup(domName) + FAIL(str(e)) + +if re.search("No such file",run["output"]): + FAIL("TPM frontend support not compiled into (domU?) kernel") + +console.closeConsole() + +domain.stop() + +vtpm_cleanup(domName) + +if not re.search("PCR-00:",run["output"]): + FAIL("Virtual TPM is not working correctly on /dev/vtpm on backend side") diff -r 88f97bb8f3ae -r 673f62edbfbe tools/xm-test/tests/vtpm/Makefile.am --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/tools/xm-test/tests/vtpm/Makefile.am Wed Mar 1 19:47:25 2006 @@ -0,0 +1,22 @@ + +SUBDIRS = + +TESTS = 01_vtpm-list_pos.test \ + 02_vtpm-cat_pcrs.test + +XFAIL_TESTS = + +EXTRA_DIST = $(TESTS) $(XFAIL_TESTS) + +TESTS_ENVIRONMENT=@TENV@ + +%.test: %.py + cp $< $@ + chmod +x $@ + +clean-local: am_config_clean-local + +am_config_clean-local: + rm -f *test + rm -f *log + rm -f *~ diff -r 88f97bb8f3ae -r 673f62edbfbe xen/arch/x86/x86_32/supervisor_mode_kernel.S --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/xen/arch/x86/x86_32/supervisor_mode_kernel.S Wed Mar 1 19:47:25 2006 @@ -0,0 +1,145 @@ +/* + * Handle stack fixup for guest running in RING 0. + * + * Copyright (c) 2006 Ian Campbell + * + * When a guest kernel is allowed to run in RING 0 a hypercall, + * interrupt or exception interrupting the guest kernel will not cause + * a privilege level change and therefore the stack will not be swapped + * to the Xen stack. + * + * To fix this we look for RING 0 activation frames with a stack + * pointer below HYPERVISOR_VIRT_START (indicating a guest kernel + * frame) and fix this up by locating the Xen stack via the TSS + * and moving the activation frame to the Xen stack. In the process we + * convert the frame into an inter-privilege frame returning to RING 1 + * so that we can catch and reverse the process on exit. + */ + +#include <xen/config.h> +#include <asm/asm_defns.h> +#include <public/xen.h> + + # Upon entry the stack should be the Xen stack and contain: + # %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, SAVE_ALL, RETURN + # On exit the stack should be %ss:%esp (i.e. the guest stack) + # and contain: + # EFLAGS, %cs, %eip, ERROR, SAVE_ALL, RETURN + ALIGN +ENTRY(restore_ring0_guest) + # Point %gs:%esi to guest stack. +RRG0: movw UREGS_ss+4(%esp),%gs + movl UREGS_esp+4(%esp),%esi + + # Copy EFLAGS...EBX, RETURN from Xen stack to guest stack. + movl $(UREGS_kernel_sizeof>>2)+1,%ecx + +1: subl $4,%esi + movl -4(%esp,%ecx,4),%eax +RRG1: movl %eax,%gs:(%esi) + loop 1b + +RRG2: andl $~3,%gs:UREGS_cs+4(%esi) + + movl %gs,%eax + + # We need to do this because these registers are not present + # on the guest stack so they cannot be restored by the code in + # restore_all_guest. 
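(Aside: a loose C rendering of what fixup_ring0_guest_stack below does with the ten-word activation frame may help; this is a sketch of the idea only, with the TSS lookup elided. The frame layout and the cs-at-offset-32 detail come from the comments and the orl instruction in the listing.)

    /* Sketch: relocate a ring-0 guest activation frame onto the Xen stack
     * and mark it as a ring-1 frame so the exit path can reverse the move.
     * guest_sp points at the lowest word of the frame on the guest stack. */
    #include <stdint.h>
    #include <string.h>

    #define FRAME_WORDS 10  /* %esi, %gs, %ds, %ecx, %eax, RETURN,
                               ERROR, %eip, %cs, EFLAGS, low to high */

    static uint32_t *fixup_frame(const uint32_t *guest_sp, uint32_t *xen_sp)
    {
        xen_sp -= FRAME_WORDS;                      /* make room */
        memcpy(xen_sp, guest_sp, FRAME_WORDS * 4);  /* move the frame */
        xen_sp[8] |= 1;                             /* saved %cs |= 1, i.e.
                                                       orl $1,32(%esp) */
        return xen_sp;                              /* new stack pointer */
    }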
+RRG3: mov UREGS_ds+4(%esp),%ds
+RRG4: mov UREGS_es+4(%esp),%es
+RRG5: mov UREGS_fs+4(%esp),%fs
+RRG6: mov UREGS_gs+4(%esp),%gs
+
+RRG7: movl %eax,%ss
+ movl %esi,%esp
+
+ ret
+.section __ex_table,"a"
+ .long RRG0,domain_crash_synchronous
+ .long RRG1,domain_crash_synchronous
+ .long RRG2,domain_crash_synchronous
+ .long RRG3,domain_crash_synchronous
+ .long RRG4,domain_crash_synchronous
+ .long RRG5,domain_crash_synchronous
+ .long RRG6,domain_crash_synchronous
+ .long RRG7,domain_crash_synchronous
+.previous
+
+ # Upon entry the stack should be a guest stack and contain:
+ # EFLAGS, %cs, %eip, ERROR, RETURN
+ # On exit the stack should be the Xen stack and contain:
+ # %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, RETURN
+ ALIGN
+ENTRY(fixup_ring0_guest_stack)
+ pushl %eax
+ pushl %ecx
+ pushl %ds
+ pushl %gs
+ pushl %esi
+
+ movw $__HYPERVISOR_DS,%ax
+ movw %ax,%ds
+
+ # Point %gs:%esi to guest stack frame.
+ movw %ss,%ax
+ movw %ax,%gs
+ movl %esp,%esi
+ # Account for entries on the guest stack:
+ # * Pushed by normal exception/interrupt/hypercall mechanisms
+ # * EFLAGS, %cs, %eip, ERROR == 4 words.
+ # * Pushed by the fixup routine
+ # * [RETURN], %eax, %ecx, %ds, %gs and %esi == 6 words.
+ addl $((6+4)*4),%esi
+
+ # %gs:%esi now points to the guest stack before the
+ # interrupt/exception occurred.
+
+ /*
+ * Reverse the __TSS macro, giving us the CPU number.
+ * The TSS for this cpu is at init_tss + ( cpu * 128 ).
+ */
+ str %ecx
+ shrl $3,%ecx # Calculate GDT index for TSS.
+ subl $(FIRST_RESERVED_GDT_ENTRY+8),%ecx # %ecx = 2*cpu.
+ shll $6,%ecx # Each TSS entry is 0x80 bytes
+ addl $init_tss,%ecx # but we have 2*cpu from above.
+
+ # Load Xen stack from TSS.
+ movw TSS_ss0(%ecx),%ax
+TRP1: movw %ax,%ss
+ movl TSS_esp0(%ecx),%esp
+
+ pushl %gs
+ pushl %esi
+
+ # Move EFLAGS, %cs, %eip, ERROR, RETURN, %eax, %ecx, %ds, %gs, %esi
+ # from guest stack to Xen stack.
+ movl $10,%ecx
+1: subl $4,%esp
+ subl $4,%esi
+TRP2: movl %gs:(%esi),%eax
+ movl %eax,(%esp)
+ loop 1b
+
+ # CS = CS|1 to simulate RING1 stack frame.
+ orl $1,32(%esp)
+
+ popl %esi
+ popl %gs
+ popl %ds
+ popl %ecx
+ popl %eax
+ ret
+.section __ex_table,"a"
+ .long TRP1,domain_crash_synchronous
+ .long TRP2,domain_crash_synchronous
+.previous
+
+domain_crash_synchronous_string:
+ .asciz "domain_crash_sync called from supervisor_mode_kernel.S (%lx)\n"
+
+domain_crash_synchronous:
+ pushl $domain_crash_synchronous_string
+ call printf
+ jmp __domain_crash_synchronous
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/uaccess.h
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/xen/include/asm-ia64/uaccess.h Wed Mar 1 19:47:25 2006
@@ -0,0 +1,285 @@
+#ifndef _ASM_IA64_UACCESS_H
+#define _ASM_IA64_UACCESS_H
+
+/*
+ * This file defines various macros to transfer memory areas across
+ * the user/kernel boundary. This needs to be done carefully because
+ * this code is executed in kernel mode and uses user-specified
+ * addresses. Thus, we need to be careful not to let the user
+ * trick us into accessing kernel memory that would normally be
+ * inaccessible. This code is also fairly performance sensitive,
+ * so we want to spend as little time doing safety checks as
+ * possible.
+ *
+ * To make matters a bit more interesting, these macros are sometimes
+ * also called from within the kernel itself, in which case the address
+ * validity check must be skipped. The get_fs() macro tells us what
+ * to do: if get_fs()==USER_DS, checking is performed, if
+ * get_fs()==KERNEL_DS, checking is bypassed.
+ * + * Note that even if the memory area specified by the user is in a + * valid address range, it is still possible that we'll get a page + * fault while accessing it. This is handled by filling out an + * exception handler fixup entry for each instruction that has the + * potential to fault. When such a fault occurs, the page fault + * handler checks to see whether the faulting instruction has a fixup + * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and + * then resumes execution at the continuation point. + * + * Based on <asm-alpha/uaccess.h>. + * + * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co + * David Mosberger-Tang <davidm@xxxxxxxxxx> + */ + +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/page-flags.h> +#include <linux/mm.h> + +#include <asm/intrinsics.h> +#include <asm/pgtable.h> +#include <asm/io.h> + +#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1) +#define __access_ok(addr) (!IS_VMM_ADDRESS((unsigned long)(addr))) +#define access_ok(addr, size) (__access_ok(addr)) +#define array_access_ok(addr,count,size)( __access_ok(addr)) + +/* + * These are the main single-value transfer routines. They automatically + * use the right size if we just have the right pointer type. + * + * Careful to not + * (a) re-use the arguments for side effects (sizeof/typeof is ok) + * (b) require any knowledge of processes at this stage + */ +#define put_user(x, ptr) __put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs()) +#define get_user(x, ptr) __get_user_check((x), (ptr), sizeof(*(ptr)), get_fs()) + +/* + * The "__xxx" versions do not do address space checking, useful when + * doing multiple accesses to the same area (the programmer has to do the + * checks by hand with "access_ok()") + */ +#define __put_user(x, ptr) __put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr))) +#define __get_user(x, ptr) __get_user_nocheck((x), (ptr), sizeof(*(ptr))) + +extern long __put_user_unaligned_unknown (void); + +#define __put_user_unaligned(x, ptr) \ +({ \ + long __ret; \ + switch (sizeof(*(ptr))) { \ + case 1: __ret = __put_user((x), (ptr)); break; \ + case 2: __ret = (__put_user((x), (u8 __user *)(ptr))) \ + | (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break; \ + case 4: __ret = (__put_user((x), (u16 __user *)(ptr))) \ + | (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \ + case 8: __ret = (__put_user((x), (u32 __user *)(ptr))) \ + | (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break; \ + default: __ret = __put_user_unaligned_unknown(); \ + } \ + __ret; \ +}) + +extern long __get_user_unaligned_unknown (void); + +#define __get_user_unaligned(x, ptr) \ +({ \ + long __ret; \ + switch (sizeof(*(ptr))) { \ + case 1: __ret = __get_user((x), (ptr)); break; \ + case 2: __ret = (__get_user((x), (u8 __user *)(ptr))) \ + | (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break; \ + case 4: __ret = (__get_user((x), (u16 __user *)(ptr))) \ + | (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \ + case 8: __ret = (__get_user((x), (u32 __user *)(ptr))) \ + | (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break; \ + default: __ret = __get_user_unaligned_unknown(); \ + } \ + __ret; \ +}) + +#ifdef ASM_SUPPORTED + struct __large_struct { unsigned long buf[100]; }; +# define __m(x) (*(struct __large_struct __user *)(x)) + +/* We need to declare the __ex_table section before we can use it in .xdata. 
*/ +asm (".section \"__ex_table\", \"a\"\n\t.previous"); + +# define __get_user_size(val, addr, n, err) \ +do { \ + register long __gu_r8 asm ("r8") = 0; \ + register long __gu_r9 asm ("r9"); \ + asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n" \ + "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n" \ + "[1:]" \ + : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8)); \ + (err) = __gu_r8; \ + (val) = __gu_r9; \ +} while (0) + +/* + * The "__put_user_size()" macro tells gcc it reads from memory instead of writing it. This + * is because they do not write to any memory gcc knows about, so there are no aliasing + * issues. + */ +# define __put_user_size(val, addr, n, err) \ +do { \ + register long __pu_r8 asm ("r8") = 0; \ + asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n" \ + "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n" \ + "[1:]" \ + : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8)); \ + (err) = __pu_r8; \ +} while (0) + +#else /* !ASM_SUPPORTED */ +# define RELOC_TYPE 2 /* ip-rel */ +# define __get_user_size(val, addr, n, err) \ +do { \ + __ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE); \ + (err) = ia64_getreg(_IA64_REG_R8); \ + (val) = ia64_getreg(_IA64_REG_R9); \ +} while (0) +# define __put_user_size(val, addr, n, err) \ +do { \ + __st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val)); \ + (err) = ia64_getreg(_IA64_REG_R8); \ +} while (0) +#endif /* !ASM_SUPPORTED */ + +extern void __get_user_unknown (void); + +/* + * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which + * could clobber r8 and r9 (among others). Thus, be careful not to evaluate it while + * using r8/r9. + */ +#define __do_get_user(check, x, ptr, size, segment) \ +({ \ + const __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ + __typeof__ (size) __gu_size = (size); \ + long __gu_err = -EFAULT, __gu_val = 0; \ + \ + if (!check || __access_ok(__gu_ptr)) \ + switch (__gu_size) { \ + case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break; \ + case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break; \ + case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break; \ + case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break; \ + default: __get_user_unknown(); break; \ + } \ + (x) = (__typeof__(*(__gu_ptr))) __gu_val; \ + __gu_err; \ +}) + +#define __get_user_nocheck(x, ptr, size) __do_get_user(0, x, ptr, size, KERNEL_DS) +#define __get_user_check(x, ptr, size, segment) __do_get_user(1, x, ptr, size, segment) + +extern void __put_user_unknown (void); + +/* + * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which + * could clobber r8 (among others). Thus, be careful not to evaluate them while using r8. 
+ */ +#define __do_put_user(check, x, ptr, size, segment) \ +({ \ + __typeof__ (x) __pu_x = (x); \ + __typeof__ (*(ptr)) __user *__pu_ptr = (ptr); \ + __typeof__ (size) __pu_size = (size); \ + long __pu_err = -EFAULT; \ + \ + if (!check || __access_ok(__pu_ptr)) \ + switch (__pu_size) { \ + case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break; \ + case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break; \ + case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break; \ + case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break; \ + default: __put_user_unknown(); break; \ + } \ + __pu_err; \ +}) + +#define __put_user_nocheck(x, ptr, size) __do_put_user(0, x, ptr, size, KERNEL_DS) +#define __put_user_check(x, ptr, size, segment) __do_put_user(1, x, ptr, size, segment) + +/* + * Complex access routines + */ +extern unsigned long __must_check __copy_user (void __user *to, const void __user *from, + unsigned long count); + +static inline unsigned long +__copy_to_user (void __user *to, const void *from, unsigned long count) +{ + return __copy_user(to, (void __user *) from, count); +} + +static inline unsigned long +__copy_from_user (void *to, const void __user *from, unsigned long count) +{ + return __copy_user((void __user *) to, from, count); +} + +#define __copy_to_user_inatomic __copy_to_user +#define __copy_from_user_inatomic __copy_from_user +#define copy_to_user(to, from, n) \ +({ \ + void __user *__cu_to = (to); \ + const void *__cu_from = (from); \ + long __cu_len = (n); \ + \ + if (__access_ok(__cu_to)) \ + __cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len); \ + __cu_len; \ +}) + +#define copy_from_user(to, from, n) \ +({ \ + void *__cu_to = (to); \ + const void __user *__cu_from = (from); \ + long __cu_len = (n); \ + \ + __chk_user_ptr(__cu_from); \ + if (__access_ok(__cu_from)) \ + __cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len); \ + __cu_len; \ +}) + +#define __copy_in_user(to, from, size) __copy_user((to), (from), (size)) + +static inline unsigned long +copy_in_user (void __user *to, const void __user *from, unsigned long n) +{ + if (likely(access_ok(from, n) && access_ok(to, n))) + n = __copy_user(to, from, n); + return n; +} + +#define ARCH_HAS_SORT_EXTABLE +#define ARCH_HAS_SEARCH_EXTABLE + +struct exception_table_entry { + int addr; /* location-relative address of insn this fixup is for */ + int cont; /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */ +}; + +extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e); +extern const struct exception_table_entry *search_exception_tables (unsigned long addr); + +static inline int +ia64_done_with_exception (struct pt_regs *regs) +{ + const struct exception_table_entry *e; + e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri); + if (e) { + ia64_handle_exception(regs, e); + return 1; + } + return 0; +} + +#endif /* _ASM_IA64_UACCESS_H */ diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/public/features.h --- /dev/null Wed Mar 1 17:01:54 2006 +++ b/xen/include/public/features.h Wed Mar 1 19:47:25 2006 @@ -0,0 +1,53 @@ +/****************************************************************************** + * features.h + * + * Feature flags, reported by XENVER_get_features. 
+ *
+ * Copyright (c) 2006, Keir Fraser <keir@xxxxxxxxxxxxx>
+ */
+
+#ifndef __XEN_PUBLIC_FEATURES_H__
+#define __XEN_PUBLIC_FEATURES_H__
+
+/*
+ * If set, the guest does not need to write-protect its pagetables, and can
+ * update them via direct writes.
+ */
+#define XENFEAT_writable_page_tables 0
+
+/*
+ * If set, the guest does not need to write-protect its segment descriptor
+ * tables, and can update them via direct writes.
+ */
+#define XENFEAT_writable_descriptor_tables 1
+
+/*
+ * If set, translation between the guest's 'pseudo-physical' address space
+ * and the host's machine address space is handled by the hypervisor. In this
+ * mode the guest does not need to perform phys-to/from-machine translations
+ * when performing page table operations.
+ */
+#define XENFEAT_auto_translated_physmap 2
+
+/* If set, the guest is running in supervisor mode (e.g., x86 ring 0). */
+#define XENFEAT_supervisor_mode_kernel 3
+
+/*
+ * If set, the guest does not need to allocate x86 PAE page directories
+ * below 4GB. This flag is usually implied by auto_translated_physmap.
+ */
+#define XENFEAT_pae_pgdir_above_4gb 4
+
+#define XENFEAT_NR_SUBMAPS 1
+
+#endif /* __XEN_PUBLIC_FEATURES_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/xen/guest_access.h
--- /dev/null Wed Mar 1 17:01:54 2006
+++ b/xen/include/xen/guest_access.h Wed Mar 1 19:47:25 2006
@@ -0,0 +1,71 @@
+/******************************************************************************
+ * guest_access.h
+ *
+ * Copyright (c) 2006, K A Fraser
+ */
+
+#ifndef __XEN_GUEST_ACCESS_H__
+#define __XEN_GUEST_ACCESS_H__
+
+#include <asm/uaccess.h>
+
+/* Is the guest handle a NULL reference? */
+#define guest_handle_is_null(hnd) ((hnd).p == NULL)
+
+/* Offset the given guest handle into the array it refers to. */
+#define guest_handle_add_offset(hnd, nr) ((hnd).p += (nr))
+
+/* Cast a guest handle to the specified type of handle. */
+#define guest_handle_cast(hnd, type) ({ \
+ type *_x = (hnd).p; \
+ (GUEST_HANDLE(type)) { _x }; \
+})
+
+/*
+ * Copy an array of objects to guest context via a guest handle.
+ * Optionally specify an offset into the guest array.
+ */
+#define copy_to_guest_offset(hnd, off, ptr, nr) ({ \
+ const typeof(ptr) _x = (hnd).p; \
+ const typeof(ptr) _y = (ptr); \
+ copy_to_user(_x+(off), _y, sizeof(*_x)*(nr)); \
+})
+#define copy_to_guest(hnd, ptr, nr) \
+ copy_to_guest_offset(hnd, 0, ptr, nr)
+
+/*
+ * Copy an array of objects from guest context via a guest handle.
+ * Optionally specify an offset into the guest array.
+ */
+#define copy_from_guest_offset(ptr, hnd, off, nr) ({ \
+ const typeof(ptr) _x = (hnd).p; \
+ const typeof(ptr) _y = (ptr); \
+ copy_from_user(_y, _x+(off), sizeof(*_x)*(nr)); \
+})
+#define copy_from_guest(ptr, hnd, nr) \
+ copy_from_guest_offset(ptr, hnd, 0, nr)
+
+/*
+ * Pre-validate a guest handle.
+ * Allows use of faster __copy_* functions.
+ */ +#define guest_handle_okay(hnd, nr) \ + array_access_ok((hnd).p, (nr), sizeof(*(hnd).p)) + +#define __copy_to_guest_offset(hnd, off, ptr, nr) ({ \ + const typeof(ptr) _x = (hnd).p; \ + const typeof(ptr) _y = (ptr); \ + __copy_to_user(_x+(off), _y, sizeof(*_x)*(nr)); \ +}) +#define __copy_to_guest(hnd, ptr, nr) \ + __copy_to_guest_offset(hnd, 0, ptr, nr) + +#define __copy_from_guest_offset(ptr, hnd, off, nr) ({ \ + const typeof(ptr) _x = (hnd).p; \ + const typeof(ptr) _y = (ptr); \ + __copy_from_user(_y, _x+(off), sizeof(*_x)*(nr)); \ +}) +#define __copy_from_guest(ptr, hnd, nr) \ + __copy_from_guest_offset(ptr, hnd, 0, nr) + +#endif /* __XEN_GUEST_ACCESS_H__ */ diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch --- a/patches/linux-2.6.16-rc4/i386-mach-io-check-nmi.patch Wed Mar 1 17:01:54 2006 +++ /dev/null Wed Mar 1 19:47:25 2006 @@ -1,45 +0,0 @@ -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c ./arch/i386/kernel/traps.c ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/traps.c 2006-02-15 20:38:51.000000000 +0000 -+++ ./arch/i386/kernel/traps.c 2006-02-15 20:40:43.000000000 +0000 -@@ -567,18 +567,11 @@ static void mem_parity_error(unsigned ch - - static void io_check_error(unsigned char reason, struct pt_regs * regs) - { -- unsigned long i; -- - printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ -- reason = (reason & 0xf) | 8; -- outb(reason, 0x61); -- i = 2000; -- while (--i) udelay(1000); -- reason &= ~8; -- outb(reason, 0x61); -+ clear_io_check_error(reason); - } - - static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h ./include/asm-i386/mach-default/mach_traps.h ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/mach-default/mach_traps.h 2006-01-03 03:21:10.000000000 +0000 -+++ ./include/asm-i386/mach-default/mach_traps.h 2006-02-15 20:40:43.000000000 +0000 -@@ -15,6 +15,18 @@ static inline void clear_mem_error(unsig - outb(reason, 0x61); - } - -+static inline void clear_io_check_error(unsigned char reason) -+{ -+ unsigned long i; -+ -+ reason = (reason & 0xf) | 8; -+ outb(reason, 0x61); -+ i = 2000; -+ while (--i) udelay(1000); -+ reason &= ~8; -+ outb(reason, 0x61); -+} -+ - static inline unsigned char get_nmi_reason(void) - { - return inb(0x61); diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/net-csum.patch --- a/patches/linux-2.6.16-rc4/net-csum.patch Wed Mar 1 17:01:54 2006 +++ /dev/null Wed Mar 1 19:47:25 2006 @@ -1,41 +0,0 @@ -diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c ./net/ipv4/netfilter/ip_nat_proto_tcp.c ---- ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-02 17:39:51.000000000 +0000 -+++ ./net/ipv4/netfilter/ip_nat_proto_tcp.c 2006-02-02 17:44:18.000000000 +0000 -@@ -129,10 +129,14 @@ tcp_manip_pkt(struct sk_buff **pskb, - if (hdrsize < sizeof(*hdr)) - return 1; - -- hdr->check = ip_nat_cheat_check(~oldip, newip, -+ if ((*pskb)->proto_csum_blank) { -+ hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check); -+ } else { -+ hdr->check = ip_nat_cheat_check(~oldip, newip, - ip_nat_cheat_check(oldport ^ 0xFFFF, - newport, - hdr->check)); -+ } - return 1; - } - -diff -pruN ../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c ./net/ipv4/netfilter/ip_nat_proto_udp.c ---- 
../pristine-linux-2.6.16-rc1-git4/net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-02 17:39:51.000000000 +0000 -+++ ./net/ipv4/netfilter/ip_nat_proto_udp.c 2006-02-02 17:44:18.000000000 +0000 -@@ -113,11 +113,16 @@ udp_manip_pkt(struct sk_buff **pskb, - newport = tuple->dst.u.udp.port; - portptr = &hdr->dest; - } -- if (hdr->check) /* 0 is a special case meaning no checksum */ -- hdr->check = ip_nat_cheat_check(~oldip, newip, -+ if (hdr->check) { /* 0 is a special case meaning no checksum */ -+ if ((*pskb)->proto_csum_blank) { -+ hdr->check = ip_nat_cheat_check(oldip, ~newip, hdr->check); -+ } else { -+ hdr->check = ip_nat_cheat_check(~oldip, newip, - ip_nat_cheat_check(*portptr ^ 0xFFFF, - newport, - hdr->check)); -+ } -+ } - *portptr = newport; - return 1; - } diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/pmd-shared.patch --- a/patches/linux-2.6.16-rc4/pmd-shared.patch Wed Mar 1 17:01:54 2006 +++ /dev/null Wed Mar 1 19:47:25 2006 @@ -1,111 +0,0 @@ -diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c ./arch/i386/mm/pageattr.c ---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pageattr.c 2006-02-02 17:39:29.000000000 +0000 -+++ ./arch/i386/mm/pageattr.c 2006-02-02 17:45:14.000000000 +0000 -@@ -78,7 +78,7 @@ static void set_pmd_pte(pte_t *kpte, uns - unsigned long flags; - - set_pte_atomic(kpte, pte); /* change init_mm */ -- if (PTRS_PER_PMD > 1) -+ if (HAVE_SHARED_KERNEL_PMD) - return; - - spin_lock_irqsave(&pgd_lock, flags); -diff -pruN ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c ./arch/i386/mm/pgtable.c ---- ../pristine-linux-2.6.16-rc1-git4/arch/i386/mm/pgtable.c 2006-01-03 03:21:10.000000000 +0000 -+++ ./arch/i386/mm/pgtable.c 2006-02-02 17:45:14.000000000 +0000 -@@ -215,9 +215,10 @@ void pgd_ctor(void *pgd, kmem_cache_t *c - spin_lock_irqsave(&pgd_lock, flags); - } - -- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, -- swapper_pg_dir + USER_PTRS_PER_PGD, -- KERNEL_PGD_PTRS); -+ if (PTRS_PER_PMD == 1 || HAVE_SHARED_KERNEL_PMD) -+ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, -+ swapper_pg_dir + USER_PTRS_PER_PGD, -+ KERNEL_PGD_PTRS); - if (PTRS_PER_PMD > 1) - return; - -@@ -249,6 +250,30 @@ pgd_t *pgd_alloc(struct mm_struct *mm) - goto out_oom; - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); - } -+ -+ if (!HAVE_SHARED_KERNEL_PMD) { -+ unsigned long flags; -+ -+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { -+ pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); -+ if (!pmd) -+ goto out_oom; -+ set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd))); -+ } -+ -+ spin_lock_irqsave(&pgd_lock, flags); -+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { -+ unsigned long v = (unsigned long)i << PGDIR_SHIFT; -+ pgd_t *kpgd = pgd_offset_k(v); -+ pud_t *kpud = pud_offset(kpgd, v); -+ pmd_t *kpmd = pmd_offset(kpud, v); -+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); -+ memcpy(pmd, kpmd, PAGE_SIZE); -+ } -+ pgd_list_add(pgd); -+ spin_unlock_irqrestore(&pgd_lock, flags); -+ } -+ - return pgd; - - out_oom: -@@ -263,9 +288,23 @@ void pgd_free(pgd_t *pgd) - int i; - - /* in the PAE case user pgd entries are overwritten before usage */ -- if (PTRS_PER_PMD > 1) -- for (i = 0; i < USER_PTRS_PER_PGD; ++i) -- kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); -+ if (PTRS_PER_PMD > 1) { -+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) { -+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); -+ kmem_cache_free(pmd_cache, pmd); -+ } -+ if (!HAVE_SHARED_KERNEL_PMD) { -+ unsigned long flags; -+ spin_lock_irqsave(&pgd_lock, flags); -+ pgd_list_del(pgd); -+ 
spin_unlock_irqrestore(&pgd_lock, flags); -+ for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { -+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); -+ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); -+ kmem_cache_free(pmd_cache, pmd); -+ } -+ } -+ } - /* in the non-PAE case, free_pgtables() clears user pgd entries */ - kmem_cache_free(pgd_cache, pgd); - } -diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h ./include/asm-i386/pgtable-2level-defs.h ---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-2level-defs.h 2006-01-03 03:21:10.000000000 +0000 -+++ ./include/asm-i386/pgtable-2level-defs.h 2006-02-02 17:45:14.000000000 +0000 -@@ -1,6 +1,8 @@ - #ifndef _I386_PGTABLE_2LEVEL_DEFS_H - #define _I386_PGTABLE_2LEVEL_DEFS_H - -+#define HAVE_SHARED_KERNEL_PMD 0 -+ - /* - * traditional i386 two-level paging structure: - */ -diff -pruN ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h ./include/asm-i386/pgtable-3level-defs.h ---- ../pristine-linux-2.6.16-rc1-git4/include/asm-i386/pgtable-3level-defs.h 2006-01-03 03:21:10.000000000 +0000 -+++ ./include/asm-i386/pgtable-3level-defs.h 2006-02-02 17:45:14.000000000 +0000 -@@ -1,6 +1,8 @@ - #ifndef _I386_PGTABLE_3LEVEL_DEFS_H - #define _I386_PGTABLE_3LEVEL_DEFS_H - -+#define HAVE_SHARED_KERNEL_PMD 1 -+ - /* - * PGDIR_SHIFT determines what a top-level page table entry can map - */ diff -r 88f97bb8f3ae -r 673f62edbfbe patches/linux-2.6.16-rc4/smp-alts.patch --- a/patches/linux-2.6.16-rc4/smp-alts.patch Wed Mar 1 17:01:54 2006 +++ /dev/null Wed Mar 1 19:47:25 2006 @@ -1,591 +0,0 @@ -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig ./arch/i386/Kconfig ---- ../pristine-linux-2.6.16-rc3/arch/i386/Kconfig 2006-02-15 20:38:51.000000000 +0000 -+++ ./arch/i386/Kconfig 2006-02-15 20:45:57.000000000 +0000 -@@ -202,6 +202,19 @@ config SMP - - If you don't know what to do here, say N. - -+config SMP_ALTERNATIVES -+ bool "SMP alternatives support (EXPERIMENTAL)" -+ depends on SMP && EXPERIMENTAL -+ help -+ Try to reduce the overhead of running an SMP kernel on a uniprocessor -+ host slightly by replacing certain key instruction sequences -+ according to whether we currently have more than one CPU available. -+ This should provide a noticeable boost to performance when -+ running SMP kernels on UP machines, and have negligible impact -+ when running on an true SMP host. -+ -+ If unsure, say N. 
-+ - config NR_CPUS - int "Maximum number of CPUs (2-255)" - range 2 255 -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile ./arch/i386/kernel/Makefile ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/Makefile 2006-02-15 20:38:51.000000000 +0000 -+++ ./arch/i386/kernel/Makefile 2006-02-15 20:45:57.000000000 +0000 -@@ -37,6 +37,7 @@ obj-$(CONFIG_EFI) += efi.o efi_stub.o - obj-$(CONFIG_DOUBLEFAULT) += doublefault.o - obj-$(CONFIG_VM86) += vm86.o - obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -+obj-$(CONFIG_SMP_ALTERNATIVES) += smpalts.o - - EXTRA_AFLAGS := -traditional - -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c ./arch/i386/kernel/smpalts.c ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpalts.c 1970-01-01 01:00:00.000000000 +0100 -+++ ./arch/i386/kernel/smpalts.c 2006-02-15 20:45:57.000000000 +0000 -@@ -0,0 +1,85 @@ -+#include <linux/kernel.h> -+#include <asm/system.h> -+#include <asm/smp_alt.h> -+#include <asm/processor.h> -+#include <asm/string.h> -+ -+struct smp_replacement_record { -+ unsigned char targ_size; -+ unsigned char smp1_size; -+ unsigned char smp2_size; -+ unsigned char up_size; -+ unsigned char feature; -+ unsigned char data[0]; -+}; -+ -+struct smp_alternative_record { -+ void *targ_start; -+ struct smp_replacement_record *repl; -+}; -+ -+extern struct smp_alternative_record __start_smp_alternatives_table, -+ __stop_smp_alternatives_table; -+extern unsigned long __init_begin, __init_end; -+ -+void prepare_for_smp(void) -+{ -+ struct smp_alternative_record *r; -+ printk(KERN_INFO "Enabling SMP...\n"); -+ for (r = &__start_smp_alternatives_table; -+ r != &__stop_smp_alternatives_table; -+ r++) { -+ BUG_ON(r->repl->targ_size < r->repl->smp1_size); -+ BUG_ON(r->repl->targ_size < r->repl->smp2_size); -+ BUG_ON(r->repl->targ_size < r->repl->up_size); -+ if (system_state == SYSTEM_RUNNING && -+ r->targ_start >= (void *)&__init_begin && -+ r->targ_start < (void *)&__init_end) -+ continue; -+ if (r->repl->feature != (unsigned char)-1 && -+ boot_cpu_has(r->repl->feature)) { -+ memcpy(r->targ_start, -+ r->repl->data + r->repl->smp1_size, -+ r->repl->smp2_size); -+ memset(r->targ_start + r->repl->smp2_size, -+ 0x90, -+ r->repl->targ_size - r->repl->smp2_size); -+ } else { -+ memcpy(r->targ_start, -+ r->repl->data, -+ r->repl->smp1_size); -+ memset(r->targ_start + r->repl->smp1_size, -+ 0x90, -+ r->repl->targ_size - r->repl->smp1_size); -+ } -+ } -+ /* Paranoia */ -+ asm volatile ("jmp 1f\n1:"); -+ mb(); -+} -+ -+void unprepare_for_smp(void) -+{ -+ struct smp_alternative_record *r; -+ printk(KERN_INFO "Disabling SMP...\n"); -+ for (r = &__start_smp_alternatives_table; -+ r != &__stop_smp_alternatives_table; -+ r++) { -+ BUG_ON(r->repl->targ_size < r->repl->smp1_size); -+ BUG_ON(r->repl->targ_size < r->repl->smp2_size); -+ BUG_ON(r->repl->targ_size < r->repl->up_size); -+ if (system_state == SYSTEM_RUNNING && -+ r->targ_start >= (void *)&__init_begin && -+ r->targ_start < (void *)&__init_end) -+ continue; -+ memcpy(r->targ_start, -+ r->repl->data + r->repl->smp1_size + r->repl->smp2_size, -+ r->repl->up_size); -+ memset(r->targ_start + r->repl->up_size, -+ 0x90, -+ r->repl->targ_size - r->repl->up_size); -+ } -+ /* Paranoia */ -+ asm volatile ("jmp 1f\n1:"); -+ mb(); -+} -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c ./arch/i386/kernel/smpboot.c ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/smpboot.c 2006-02-15 20:38:51.000000000 +0000 -+++ ./arch/i386/kernel/smpboot.c 2006-02-15 20:45:57.000000000 +0000 -@@ 
-1214,6 +1214,11 @@ static void __init smp_boot_cpus(unsigne - if (max_cpus <= cpucount+1) - continue; - -+#ifdef CONFIG_SMP_ALTERNATIVES -+ if (kicked == 1) -+ prepare_for_smp(); -+#endif -+ - if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu)) - printk("CPU #%d not responding - cannot use it.\n", - apicid); -@@ -1392,6 +1397,11 @@ int __devinit __cpu_up(unsigned int cpu) - return -EIO; - } - -+#ifdef CONFIG_SMP_ALTERNATIVES -+ if (num_online_cpus() == 1) -+ prepare_for_smp(); -+#endif -+ - local_irq_enable(); - per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; - /* Unleash the CPU! */ -diff -pruN ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S ./arch/i386/kernel/vmlinux.lds.S ---- ../pristine-linux-2.6.16-rc3/arch/i386/kernel/vmlinux.lds.S 2006-01-03 03:21:10.000000000 +0000 -+++ ./arch/i386/kernel/vmlinux.lds.S 2006-02-15 20:45:57.000000000 +0000 -@@ -34,6 +34,13 @@ SECTIONS - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } - __stop___ex_table = .; - -+ . = ALIGN(16); -+ __start_smp_alternatives_table = .; -+ __smp_alternatives : { *(__smp_alternatives) } -+ __stop_smp_alternatives_table = .; -+ -+ __smp_replacements : { *(__smp_replacements) } -+ - RODATA - - /* writeable */ -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h ./include/asm-i386/atomic.h ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/atomic.h 2006-02-15 20:38:57.000000000 +0000 -+++ ./include/asm-i386/atomic.h 2006-02-15 20:45:57.000000000 +0000 -@@ -4,18 +4,13 @@ - #include <linux/config.h> - #include <linux/compiler.h> - #include <asm/processor.h> -+#include <asm/smp_alt.h> - - /* - * Atomic operations that C can't guarantee us. Useful for - * resource counting etc.. - */ - --#ifdef CONFIG_SMP --#define LOCK "lock ; " --#else --#define LOCK "" --#endif -- - /* - * Make sure gcc doesn't try to be clever and move things around - * on us. We need to use _exactly_ the address the user gave us, -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h ./include/asm-i386/bitops.h ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/bitops.h 2006-02-15 20:38:57.000000000 +0000 -+++ ./include/asm-i386/bitops.h 2006-02-15 20:45:57.000000000 +0000 -@@ -7,6 +7,7 @@ - - #include <linux/config.h> - #include <linux/compiler.h> -+#include <asm/smp_alt.h> - - /* - * These have to be done with inline assembly: that way the bit-setting -@@ -16,12 +17,6 @@ - * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). 
- */ - --#ifdef CONFIG_SMP --#define LOCK_PREFIX "lock ; " --#else --#define LOCK_PREFIX "" --#endif -- - #define ADDR (*(volatile long *) addr) - - /** -@@ -41,7 +36,7 @@ - */ - static inline void set_bit(int nr, volatile unsigned long * addr) - { -- __asm__ __volatile__( LOCK_PREFIX -+ __asm__ __volatile__( LOCK - "btsl %1,%0" - :"+m" (ADDR) - :"Ir" (nr)); -@@ -76,7 +71,7 @@ static inline void __set_bit(int nr, vol - */ - static inline void clear_bit(int nr, volatile unsigned long * addr) - { -- __asm__ __volatile__( LOCK_PREFIX -+ __asm__ __volatile__( LOCK - "btrl %1,%0" - :"+m" (ADDR) - :"Ir" (nr)); -@@ -121,7 +116,7 @@ static inline void __change_bit(int nr, - */ - static inline void change_bit(int nr, volatile unsigned long * addr) - { -- __asm__ __volatile__( LOCK_PREFIX -+ __asm__ __volatile__( LOCK - "btcl %1,%0" - :"+m" (ADDR) - :"Ir" (nr)); -@@ -140,7 +135,7 @@ static inline int test_and_set_bit(int n - { - int oldbit; - -- __asm__ __volatile__( LOCK_PREFIX -+ __asm__ __volatile__( LOCK - "btsl %2,%1\n\tsbbl %0,%0" - :"=r" (oldbit),"+m" (ADDR) - :"Ir" (nr) : "memory"); -@@ -180,7 +175,7 @@ static inline int test_and_clear_bit(int - { - int oldbit; - -- __asm__ __volatile__( LOCK_PREFIX -+ __asm__ __volatile__( LOCK - "btrl %2,%1\n\tsbbl %0,%0" - :"=r" (oldbit),"+m" (ADDR) - :"Ir" (nr) : "memory"); -@@ -231,7 +226,7 @@ static inline int test_and_change_bit(in - { - int oldbit; - -- __asm__ __volatile__( LOCK_PREFIX -+ __asm__ __volatile__( LOCK - "btcl %2,%1\n\tsbbl %0,%0" - :"=r" (oldbit),"+m" (ADDR) - :"Ir" (nr) : "memory"); -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h ./include/asm-i386/futex.h ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/futex.h 2006-02-15 20:38:57.000000000 +0000 -+++ ./include/asm-i386/futex.h 2006-02-15 20:45:57.000000000 +0000 -@@ -28,7 +28,7 @@ - "1: movl %2, %0\n\ - movl %0, %3\n" \ - insn "\n" \ --"2: " LOCK_PREFIX "cmpxchgl %3, %2\n\ -+"2: " LOCK "cmpxchgl %3, %2\n\ - jnz 1b\n\ - 3: .section .fixup,\"ax\"\n\ - 4: mov %5, %1\n\ -@@ -68,7 +68,7 @@ futex_atomic_op_inuser (int encoded_op, - #endif - switch (op) { - case FUTEX_OP_ADD: -- __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, -+ __futex_atomic_op1(LOCK "xaddl %0, %2", ret, - oldval, uaddr, oparg); - break; - case FUTEX_OP_OR: -diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h ./include/asm-i386/rwsem.h ---- ../pristine-linux-2.6.16-rc3/include/asm-i386/rwsem.h 2006-01-03 03:21:10.000000000 +0000 -+++ ./include/asm-i386/rwsem.h 2006-02-15 20:45:57.000000000 +0000 -@@ -40,6 +40,7 @@ - - #include <linux/list.h> - #include <linux/spinlock.h> -+#include <asm/smp_alt.h> - - struct rwsem_waiter; - -@@ -99,7 +100,7 @@ static inline void __down_read(struct rw - { - __asm__ __volatile__( - "# beginning down_read\n\t" --LOCK_PREFIX " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */ -+LOCK " incl (%%eax)\n\t" /* adds 0x00000001, returns the old value */ - " js 2f\n\t" /* jump if we weren't granted the lock */ - "1:\n\t" - LOCK_SECTION_START("") -@@ -130,7 +131,7 @@ static inline int __down_read_trylock(st - " movl %1,%2\n\t" - " addl %3,%2\n\t" - " jle 2f\n\t" --LOCK_PREFIX " cmpxchgl %2,%0\n\t" -+LOCK " cmpxchgl %2,%0\n\t" - " jnz 1b\n\t" - "2:\n\t" - "# ending __down_read_trylock\n\t" -@@ -150,7 +151,7 @@ static inline void __down_write(struct r - tmp = RWSEM_ACTIVE_WRITE_BIAS; - __asm__ __volatile__( - "# beginning down_write\n\t" --LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */ -+LOCK " xadd 
%%edx,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
- " testl %%edx,%%edx\n\t" /* was the count 0 before? */
- " jnz 2f\n\t" /* jump if we weren't granted the lock */
- "1:\n\t"
-@@ -188,7 +189,7 @@ static inline void __up_read(struct rw_s
- __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
- __asm__ __volatile__(
- "# beginning __up_read\n\t"
--LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
-+LOCK " xadd %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
- " js 2f\n\t" /* jump if the lock is being waited upon */
- "1:\n\t"
- LOCK_SECTION_START("")
-@@ -214,7 +215,7 @@ static inline void __up_write(struct rw_
- __asm__ __volatile__(
- "# beginning __up_write\n\t"
- " movl %2,%%edx\n\t"
--LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
-+LOCK " xaddl %%edx,(%%eax)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
- " jnz 2f\n\t" /* jump if the lock is being waited upon */
- "1:\n\t"
- LOCK_SECTION_START("")
-@@ -239,7 +240,7 @@ static inline void __downgrade_write(str
- {
- __asm__ __volatile__(
- "# beginning __downgrade_write\n\t"
--LOCK_PREFIX " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
-+LOCK " addl %2,(%%eax)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
- " js 2f\n\t" /* jump if the lock is being waited upon */
- "1:\n\t"
- LOCK_SECTION_START("")
-@@ -263,7 +264,7 @@ LOCK_PREFIX " addl %2,(%%eax)\n\t"
- static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
- {
- __asm__ __volatile__(
--LOCK_PREFIX "addl %1,%0"
-+LOCK "addl %1,%0"
- : "=m"(sem->count)
- : "ir"(delta), "m"(sem->count));
- }
-@@ -276,7 +277,7 @@ static inline int rwsem_atomic_update(in
- int tmp = delta;
-
- __asm__ __volatile__(
--LOCK_PREFIX "xadd %0,(%2)"
-+LOCK "xadd %0,(%2)"
- : "+r"(tmp), "=m"(sem->count)
- : "r"(sem), "m"(sem->count)
- : "memory");
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h ./include/asm-i386/smp_alt.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/smp_alt.h 1970-01-01 01:00:00.000000000 +0100
-+++ ./include/asm-i386/smp_alt.h 2006-02-15 20:45:57.000000000 +0000
-@@ -0,0 +1,32 @@
-+#ifndef __ASM_SMP_ALT_H__
-+#define __ASM_SMP_ALT_H__
-+
-+#include <linux/config.h>
-+
-+#ifdef CONFIG_SMP
-+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
-+#define LOCK \
-+ "6677: nop\n" \
-+ ".section __smp_alternatives,\"a\"\n" \
-+ ".long 6677b\n" \
-+ ".long 6678f\n" \
-+ ".previous\n" \
-+ ".section __smp_replacements,\"a\"\n" \
-+ "6678: .byte 1\n" \
-+ ".byte 1\n" \
-+ ".byte 0\n" \
-+ ".byte 1\n" \
-+ ".byte -1\n" \
-+ "lock\n" \
-+ "nop\n" \
-+ ".previous\n"
-+void prepare_for_smp(void);
-+void unprepare_for_smp(void);
-+#else
-+#define LOCK "lock ; "
-+#endif
-+#else
-+#define LOCK ""
-+#endif
-+
-+#endif /* __ASM_SMP_ALT_H__ */
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h ./include/asm-i386/spinlock.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/spinlock.h 2006-01-03 03:21:10.000000000 +0000
-+++ ./include/asm-i386/spinlock.h 2006-02-15 20:45:57.000000000 +0000
-@@ -6,6 +6,7 @@
- #include <asm/page.h>
- #include <linux/config.h>
- #include <linux/compiler.h>
-+#include <asm/smp_alt.h>
-
- /*
- * Your basic SMP spinlocks, allowing only a single CPU anywhere
-@@ -23,7 +24,8 @@
-
- #define __raw_spin_lock_string \
- "\n1:\t" \
-- "lock ; decb %0\n\t" \
-+ LOCK \
-+ "decb %0\n\t" \
- "jns 3f\n" \
- "2:\t" \
- "rep;nop\n\t" \
-@@ -34,7 +36,8 @@
-
- #define __raw_spin_lock_string_flags \
- "\n1:\t" \
-- "lock ; decb %0\n\t" \
-+ LOCK \
-+ "decb %0\n\t" \
- "jns 4f\n\t" \
- "2:\t" \
- "testl $0x200, %1\n\t" \
-@@ -65,10 +68,34 @@ static inline void __raw_spin_lock_flags
- static inline int __raw_spin_trylock(raw_spinlock_t *lock)
- {
- char oldval;
-+#ifdef CONFIG_SMP_ALTERNATIVES
- __asm__ __volatile__(
-- "xchgb %b0,%1"
-+ "1:movb %1,%b0\n"
-+ "movb $0,%1\n"
-+ "2:"
-+ ".section __smp_alternatives,\"a\"\n"
-+ ".long 1b\n"
-+ ".long 3f\n"
-+ ".previous\n"
-+ ".section __smp_replacements,\"a\"\n"
-+ "3: .byte 2b - 1b\n"
-+ ".byte 5f-4f\n"
-+ ".byte 0\n"
-+ ".byte 6f-5f\n"
-+ ".byte -1\n"
-+ "4: xchgb %b0,%1\n"
-+ "5: movb %1,%b0\n"
-+ "movb $0,%1\n"
-+ "6:\n"
-+ ".previous\n"
- :"=q" (oldval), "=m" (lock->slock)
- :"0" (0) : "memory");
-+#else
-+ __asm__ __volatile__(
-+ "xchgb %b0,%1\n"
-+ :"=q" (oldval), "=m" (lock->slock)
-+ :"0" (0) : "memory");
-+#endif
- return oldval > 0;
- }
-
-@@ -178,12 +205,12 @@ static inline int __raw_write_trylock(ra
-
- static inline void __raw_read_unlock(raw_rwlock_t *rw)
- {
-- asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
-+ asm volatile(LOCK "incl %0" :"=m" (rw->lock) : : "memory");
- }
-
- static inline void __raw_write_unlock(raw_rwlock_t *rw)
- {
-- asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
-+ asm volatile(LOCK "addl $" RW_LOCK_BIAS_STR ", %0"
- : "=m" (rw->lock) : : "memory");
- }
-
-diff -pruN ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h ./include/asm-i386/system.h
---- ../pristine-linux-2.6.16-rc3/include/asm-i386/system.h 2006-02-15 20:38:57.000000000 +0000
-+++ ./include/asm-i386/system.h 2006-02-15 20:45:57.000000000 +0000
-@@ -5,7 +5,7 @@
- #include <linux/kernel.h>
- #include <asm/segment.h>
- #include <asm/cpufeature.h>
--#include <linux/bitops.h> /* for LOCK_PREFIX */
-+#include <asm/smp_alt.h>
-
- #ifdef __KERNEL__
-
-@@ -271,19 +271,19 @@ static inline unsigned long __cmpxchg(vo
- unsigned long prev;
- switch (size) {
- case 1:
-- __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
-+ __asm__ __volatile__(LOCK "cmpxchgb %b1,%2"
- : "=a"(prev)
- : "q"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 2:
-- __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
-+ __asm__ __volatile__(LOCK "cmpxchgw %w1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
- return prev;
- case 4:
-- __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
-+ __asm__ __volatile__(LOCK "cmpxchgl %1,%2"
- : "=a"(prev)
- : "r"(new), "m"(*__xg(ptr)), "0"(old)
- : "memory");
-@@ -336,7 +336,7 @@ static inline unsigned long long __cmpxc
- unsigned long long new)
- {
- unsigned long long prev;
-- __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
-+ __asm__ __volatile__(LOCK "cmpxchg8b %3"
- : "=A"(prev)
- : "b"((unsigned long)new),
- "c"((unsigned long)(new >> 32)),
-@@ -503,11 +503,55 @@ struct alt_instr {
- #endif
-
- #ifdef CONFIG_SMP
-+#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE)
-+#define smp_alt_mb(instr) \
-+__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \
-+ ".section __smp_alternatives,\"a\"\n" \
-+ ".long 6667b\n" \
-+ ".long 6673f\n" \
-+ ".previous\n" \
-+ ".section __smp_replacements,\"a\"\n" \
-+ "6673:.byte 6668b-6667b\n" \
-+ ".byte 6670f-6669f\n" \
-+ ".byte 6671f-6670f\n" \
-+ ".byte 0\n" \
-+ ".byte %c0\n" \
-+ "6669:lock;addl $0,0(%%esp)\n" \
-+ "6670:" instr "\n" \
-+ "6671:\n" \
-+ ".previous\n" \
-+ : \
-+ : "i" (X86_FEATURE_XMM2) \
-+ : "memory")
-+#define smp_rmb() smp_alt_mb("lfence")
-+#define smp_mb() smp_alt_mb("mfence")
-+#define set_mb(var, value) do { \
-+unsigned long __set_mb_temp; \
-+__asm__ __volatile__("6667:movl %1, %0\n6668:\n" \
-+ ".section __smp_alternatives,\"a\"\n" \
-+ ".long 6667b\n" \
-+ ".long 6673f\n" \
-+ ".previous\n" \
-+ ".section __smp_replacements,\"a\"\n" \
-+ "6673: .byte 6668b-6667b\n" \
-+ ".byte 6670f-6669f\n" \
-+ ".byte 0\n" \
-+ ".byte 6671f-6670f\n" \
-+ ".byte -1\n" \
-+ "6669: xchg %1, %0\n" \
-+ "6670:movl %1, %0\n" \
-+ "6671:\n" \
-+ ".previous\n" \
-+ : "=m" (var), "=r" (__set_mb_temp) \
-+ : "1" (value) \
-+ : "memory"); } while (0)
-+#else
- #define smp_mb() mb()
- #define smp_rmb() rmb()
-+#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
-+#endif
- #define smp_wmb() wmb()
- #define smp_read_barrier_depends() read_barrier_depends()
--#define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
- #else
- #define smp_mb() barrier()
- #define smp_rmb() barrier()
diff -r 88f97bb8f3ae -r 673f62edbfbe xen/include/asm-ia64/linux-xen/asm/uaccess.h
--- a/xen/include/asm-ia64/linux-xen/asm/uaccess.h Wed Mar 1 17:01:54 2006
+++ /dev/null Wed Mar 1 19:47:25 2006
@@ -1,415 +0,0 @@
-#ifndef _ASM_IA64_UACCESS_H
-#define _ASM_IA64_UACCESS_H
-
-/*
- * This file defines various macros to transfer memory areas across
- * the user/kernel boundary. This needs to be done carefully because
- * this code is executed in kernel mode and uses user-specified
- * addresses. Thus, we need to be careful not to let the user to
- * trick us into accessing kernel memory that would normally be
- * inaccessible. This code is also fairly performance sensitive,
- * so we want to spend as little time doing safety checks as
- * possible.
- *
- * To make matters a bit more interesting, these macros sometimes also
- * called from within the kernel itself, in which case the address
- * validity check must be skipped. The get_fs() macro tells us what
- * to do: if get_fs()==USER_DS, checking is performed, if
- * get_fs()==KERNEL_DS, checking is bypassed.
- *
- * Note that even if the memory area specified by the user is in a
- * valid address range, it is still possible that we'll get a page
- * fault while accessing it. This is handled by filling out an
- * exception handler fixup entry for each instruction that has the
- * potential to fault. When such a fault occurs, the page fault
- * handler checks to see whether the faulting instruction has a fixup
- * associated and, if so, sets r8 to -EFAULT and clears r9 to 0 and
- * then resumes execution at the continuation point.
- *
- * Based on <asm-alpha/uaccess.h>.
- *
- * Copyright (C) 1998, 1999, 2001-2004 Hewlett-Packard Co
- * David Mosberger-Tang <davidm@xxxxxxxxxx>
- */
-
-#include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/page-flags.h>
-#include <linux/mm.h>
-
-#include <asm/intrinsics.h>
-#include <asm/pgtable.h>
-#include <asm/io.h>
-
-/*
- * For historical reasons, the following macros are grossly misnamed:
- */
-#define KERNEL_DS ((mm_segment_t) { ~0UL }) /* cf. access_ok() */
-#define USER_DS ((mm_segment_t) { TASK_SIZE-1 }) /* cf. access_ok() */
-
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
-#define get_ds() (KERNEL_DS)
-#define get_fs() (current_thread_info()->addr_limit)
-#define set_fs(x) (current_thread_info()->addr_limit = (x))
-
-#define segment_eq(a, b) ((a).seg == (b).seg)
-
-/*
- * When accessing user memory, we need to make sure the entire area really is in
- * user-level space. In order to do this efficiently, we make sure that the page at
- * address TASK_SIZE is never valid. We also need to make sure that the address doesn't
- * point inside the virtually mapped linear page table.
- */
-#ifdef XEN
-#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1)
-#define __access_ok(addr, size, segment) (!IS_VMM_ADDRESS((unsigned long)(addr)))
-#else
-#define __access_ok(addr, size, segment) \
-({ \
- __chk_user_ptr(addr); \
- (likely((unsigned long) (addr) <= (segment).seg) \
- && ((segment).seg == KERNEL_DS.seg \
- || likely(REGION_OFFSET((unsigned long) (addr)) < RGN_MAP_LIMIT))); \
-})
-#endif
-#define access_ok(type, addr, size) __access_ok((addr), (size), get_fs())
-
-/* this function will go away soon - use access_ok() instead */
-static inline int __deprecated
-verify_area (int type, const void __user *addr, unsigned long size)
-{
- return access_ok(type, addr, size) ? 0 : -EFAULT;
-}
-
-/*
- * These are the main single-value transfer routines. They automatically
- * use the right size if we just have the right pointer type.
- *
- * Careful to not
- * (a) re-use the arguments for side effects (sizeof/typeof is ok)
- * (b) require any knowledge of processes at this stage
- */
-#define put_user(x, ptr) __put_user_check((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)), get_fs())
-#define get_user(x, ptr) __get_user_check((x), (ptr), sizeof(*(ptr)), get_fs())
-
-/*
- * The "__xxx" versions do not do address space checking, useful when
- * doing multiple accesses to the same area (the programmer has to do the
- * checks by hand with "access_ok()")
- */
-#define __put_user(x, ptr) __put_user_nocheck((__typeof__(*(ptr))) (x), (ptr), sizeof(*(ptr)))
-#define __get_user(x, ptr) __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
-
-extern long __put_user_unaligned_unknown (void);
-
-#define __put_user_unaligned(x, ptr) \
-({ \
- long __ret; \
- switch (sizeof(*(ptr))) { \
- case 1: __ret = __put_user((x), (ptr)); break; \
- case 2: __ret = (__put_user((x), (u8 __user *)(ptr))) \
- | (__put_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break; \
- case 4: __ret = (__put_user((x), (u16 __user *)(ptr))) \
- | (__put_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \
- case 8: __ret = (__put_user((x), (u32 __user *)(ptr))) \
- | (__put_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break; \
- default: __ret = __put_user_unaligned_unknown(); \
- } \
- __ret; \
-})
-
-extern long __get_user_unaligned_unknown (void);
-
-#define __get_user_unaligned(x, ptr) \
-({ \
- long __ret; \
- switch (sizeof(*(ptr))) { \
- case 1: __ret = __get_user((x), (ptr)); break; \
- case 2: __ret = (__get_user((x), (u8 __user *)(ptr))) \
- | (__get_user((x) >> 8, ((u8 __user *)(ptr) + 1))); break; \
- case 4: __ret = (__get_user((x), (u16 __user *)(ptr))) \
- | (__get_user((x) >> 16, ((u16 __user *)(ptr) + 1))); break; \
- case 8: __ret = (__get_user((x), (u32 __user *)(ptr))) \
- | (__get_user((x) >> 32, ((u32 __user *)(ptr) + 1))); break; \
- default: __ret = __get_user_unaligned_unknown(); \
- } \
- __ret; \
-})
-
-#ifdef ASM_SUPPORTED
- struct __large_struct { unsigned long buf[100]; };
-# define __m(x) (*(struct __large_struct __user *)(x))
-
-/* We need to declare the __ex_table section before we can use it in .xdata. */
-asm (".section \"__ex_table\", \"a\"\n\t.previous");
-
-# define __get_user_size(val, addr, n, err) \
-do { \
- register long __gu_r8 asm ("r8") = 0; \
- register long __gu_r9 asm ("r9"); \
- asm ("\n[1:]\tld"#n" %0=%2%P2\t// %0 and %1 get overwritten by exception handler\n" \
- "\t.xdata4 \"__ex_table\", 1b-., 1f-.+4\n" \
- "[1:]" \
- : "=r"(__gu_r9), "=r"(__gu_r8) : "m"(__m(addr)), "1"(__gu_r8)); \
- (err) = __gu_r8; \
- (val) = __gu_r9; \
-} while (0)
-
-/*
- * The "__put_user_size()" macro tells gcc it reads from memory instead of writing it. This
- * is because they do not write to any memory gcc knows about, so there are no aliasing
- * issues.
- */
-# define __put_user_size(val, addr, n, err) \
-do { \
- register long __pu_r8 asm ("r8") = 0; \
- asm volatile ("\n[1:]\tst"#n" %1=%r2%P1\t// %0 gets overwritten by exception handler\n" \
- "\t.xdata4 \"__ex_table\", 1b-., 1f-.\n" \
- "[1:]" \
- : "=r"(__pu_r8) : "m"(__m(addr)), "rO"(val), "0"(__pu_r8)); \
- (err) = __pu_r8; \
-} while (0)
-
-#else /* !ASM_SUPPORTED */
-# define RELOC_TYPE 2 /* ip-rel */
-# define __get_user_size(val, addr, n, err) \
-do { \
- __ld_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE); \
- (err) = ia64_getreg(_IA64_REG_R8); \
- (val) = ia64_getreg(_IA64_REG_R9); \
-} while (0)
-# define __put_user_size(val, addr, n, err) \
-do { \
- __st_user("__ex_table", (unsigned long) addr, n, RELOC_TYPE, (unsigned long) (val)); \
- (err) = ia64_getreg(_IA64_REG_R8); \
-} while (0)
-#endif /* !ASM_SUPPORTED */
-
-extern void __get_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 and r9 (among others). Thus, be careful not to evaluate it while
- * using r8/r9.
- */
-#define __do_get_user(check, x, ptr, size, segment) \
-({ \
- const __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \
- __typeof__ (size) __gu_size = (size); \
- long __gu_err = -EFAULT, __gu_val = 0; \
- \
- if (!check || __access_ok(__gu_ptr, size, segment)) \
- switch (__gu_size) { \
- case 1: __get_user_size(__gu_val, __gu_ptr, 1, __gu_err); break; \
- case 2: __get_user_size(__gu_val, __gu_ptr, 2, __gu_err); break; \
- case 4: __get_user_size(__gu_val, __gu_ptr, 4, __gu_err); break; \
- case 8: __get_user_size(__gu_val, __gu_ptr, 8, __gu_err); break; \
- default: __get_user_unknown(); break; \
- } \
- (x) = (__typeof__(*(__gu_ptr))) __gu_val; \
- __gu_err; \
-})
-
-#define __get_user_nocheck(x, ptr, size) __do_get_user(0, x, ptr, size, KERNEL_DS)
-#define __get_user_check(x, ptr, size, segment) __do_get_user(1, x, ptr, size, segment)
-
-extern void __put_user_unknown (void);
-
-/*
- * Evaluating arguments X, PTR, SIZE, and SEGMENT may involve subroutine-calls, which
- * could clobber r8 (among others). Thus, be careful not to evaluate them while using r8.
- */
-#define __do_put_user(check, x, ptr, size, segment) \
-({ \
- __typeof__ (x) __pu_x = (x); \
- __typeof__ (*(ptr)) __user *__pu_ptr = (ptr); \
- __typeof__ (size) __pu_size = (size); \
- long __pu_err = -EFAULT; \
- \
- if (!check || __access_ok(__pu_ptr, __pu_size, segment)) \
- switch (__pu_size) { \
- case 1: __put_user_size(__pu_x, __pu_ptr, 1, __pu_err); break; \
- case 2: __put_user_size(__pu_x, __pu_ptr, 2, __pu_err); break; \
- case 4: __put_user_size(__pu_x, __pu_ptr, 4, __pu_err); break; \
- case 8: __put_user_size(__pu_x, __pu_ptr, 8, __pu_err); break; \
- default: __put_user_unknown(); break; \
- } \
- __pu_err; \
-})
-
-#define __put_user_nocheck(x, ptr, size) __do_put_user(0, x, ptr, size, KERNEL_DS)
-#define __put_user_check(x, ptr, size, segment) __do_put_user(1, x, ptr, size, segment)
-
-/*
- * Complex access routines
- */
-extern unsigned long __must_check __copy_user (void __user *to, const void __user *from,
- unsigned long count);
-
-static inline unsigned long
-__copy_to_user (void __user *to, const void *from, unsigned long count)
-{
- return __copy_user(to, (void __user *) from, count);
-}
-
-static inline unsigned long
-__copy_from_user (void *to, const void __user *from, unsigned long count)
-{
- return __copy_user((void __user *) to, from, count);
-}
-
-#define __copy_to_user_inatomic __copy_to_user
-#define __copy_from_user_inatomic __copy_from_user
-#define copy_to_user(to, from, n) \
-({ \
- void __user *__cu_to = (to); \
- const void *__cu_from = (from); \
- long __cu_len = (n); \
- \
- if (__access_ok(__cu_to, __cu_len, get_fs())) \
- __cu_len = __copy_user(__cu_to, (void __user *) __cu_from, __cu_len); \
- __cu_len; \
-})
-
-#define copy_from_user(to, from, n) \
-({ \
- void *__cu_to = (to); \
- const void __user *__cu_from = (from); \
- long __cu_len = (n); \
- \
- __chk_user_ptr(__cu_from); \
- if (__access_ok(__cu_from, __cu_len, get_fs())) \
- __cu_len = __copy_user((void __user *) __cu_to, __cu_from, __cu_len); \
- __cu_len; \
-})
-
-#define __copy_in_user(to, from, size) __copy_user((to), (from), (size))
-
-static inline unsigned long
-copy_in_user (void __user *to, const void __user *from, unsigned long n)
-{
- if (likely(access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n)))
- n = __copy_user(to, from, n);
- return n;
-}
-
-extern unsigned long __do_clear_user (void __user *, unsigned long);
-
-#define __clear_user(to, n) __do_clear_user(to, n)
-
-#define clear_user(to, n) \
-({ \
- unsigned long __cu_len = (n); \
- if (__access_ok(to, __cu_len, get_fs())) \
- __cu_len = __do_clear_user(to, __cu_len); \
- __cu_len; \
-})
-
-
-/*
- * Returns: -EFAULT if exception before terminator, N if the entire buffer filled, else
- * strlen.
- */
-extern long __must_check __strncpy_from_user (char *to, const char __user *from, long to_len);
-
-#define strncpy_from_user(to, from, n) \
-({ \
- const char __user * __sfu_from = (from); \
- long __sfu_ret = -EFAULT; \
- if (__access_ok(__sfu_from, 0, get_fs())) \
- __sfu_ret = __strncpy_from_user((to), __sfu_from, (n)); \
- __sfu_ret; \
-})
-
-/* Returns: 0 if bad, string length+1 (memory size) of string if ok */
-extern unsigned long __strlen_user (const char __user *);
-
-#define strlen_user(str) \
-({ \
- const char __user *__su_str = (str); \
- unsigned long __su_ret = 0; \
- if (__access_ok(__su_str, 0, get_fs())) \
- __su_ret = __strlen_user(__su_str); \
- __su_ret; \
-})
-
-/*
- * Returns: 0 if exception before NUL or reaching the supplied limit
- * (N), a value greater than N if the limit would be exceeded, else
- * strlen.
- */
-extern unsigned long __strnlen_user (const char __user *, long);
-
-#define strnlen_user(str, len) \
-({ \
- const char __user *__su_str = (str); \
- unsigned long __su_ret = 0; \
- if (__access_ok(__su_str, 0, get_fs())) \
- __su_ret = __strnlen_user(__su_str, len); \
- __su_ret; \
-})
-
-/* Generic code can't deal with the location-relative format that we use for compactness. */
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
-
-struct exception_table_entry {
- int addr; /* location-relative address of insn this fixup is for */
- int cont; /* location-relative continuation addr.; if bit 2 is set, r9 is set to 0 */
-};
-
-extern void ia64_handle_exception (struct pt_regs *regs, const struct exception_table_entry *e);
-extern const struct exception_table_entry *search_exception_tables (unsigned long addr);
-
-static inline int
-ia64_done_with_exception (struct pt_regs *regs)
-{
- const struct exception_table_entry *e;
- e = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri);
- if (e) {
- ia64_handle_exception(regs, e);
- return 1;
- }
- return 0;
-}
-
-#ifndef XEN
-#define ARCH_HAS_TRANSLATE_MEM_PTR 1
-static __inline__ char *
-xlate_dev_mem_ptr (unsigned long p)
-{
- struct page *page;
- char * ptr;
-
- page = mfn_to_page(p >> PAGE_SHIFT);
- if (PageUncached(page))
- ptr = (char *)p + __IA64_UNCACHED_OFFSET;
- else
- ptr = __va(p);
-
- return ptr;
-}
-
-/*
- * Convert a virtual cached kernel memory pointer to an uncached pointer
- */
-static __inline__ char *
-xlate_dev_kmem_ptr (char * p)
-{
- struct page *page;
- char * ptr;
-
- page = virt_to_page((unsigned long)p >> PAGE_SHIFT);
- if (PageUncached(page))
- ptr = (char *)__pa(p) + __IA64_UNCACHED_OFFSET;
- else
- ptr = p;
-
- return ptr;
-}
-#endif
-
-#endif /* _ASM_IA64_UACCESS_H */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
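A note on the smp_alt.h hunk above: it plants a nop where a lock prefix would go, and records the site plus its replacement bytes in the __smp_alternatives/__smp_replacements sections, so that prepare_for_smp() can rewrite the sites in place once a second CPU appears. A minimal user-space sketch of that bookkeeping follows; struct smp_alt, prepare_for_smp_sketch(), and the byte buffer standing in for patchable kernel text are all illustrative assumptions, not the kernel's code.

    /* Sketch of the SMP-alternatives bookkeeping, not the kernel code: each
     * record names a patch site (here a nop placeholder) and the bytes to
     * install when going SMP.  The "text" is a plain byte array so this runs
     * anywhere; the kernel patches real instruction bytes instead. */
    #include <stdio.h>
    #include <string.h>

    #define NOP_BYTE  0x90   /* x86 one-byte nop */
    #define LOCK_BYTE 0xF0   /* x86 lock prefix */

    struct smp_alt {                     /* hypothetical table entry */
        unsigned char *site;             /* where the placeholder lives */
        const unsigned char *repl;       /* bytes to install for SMP */
        size_t len;
    };

    /* "nop; decb (%eax)" -- after patching: "lock; decb (%eax)" */
    static unsigned char text[] = { NOP_BYTE, 0xFE, 0x08 };
    static const unsigned char lock_repl[] = { LOCK_BYTE };

    static struct smp_alt table[] = {
        { text, lock_repl, sizeof lock_repl },
    };

    static void prepare_for_smp_sketch(void)   /* cf. prepare_for_smp() */
    {
        size_t i;
        for (i = 0; i < sizeof table / sizeof table[0]; i++)
            memcpy(table[i].site, table[i].repl, table[i].len);
    }

    int main(void)
    {
        printf("UP:  %02x %02x %02x\n", text[0], text[1], text[2]);
        prepare_for_smp_sketch();
        printf("SMP: %02x %02x %02x\n", text[0], text[1], text[2]);
        return 0;
    }

Patching this way lets a uniprocessor boot skip the bus-locking cost entirely while keeping one kernel image for both configurations.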
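The system.h hunks route __cmpxchg() through the same LOCK macro. The contract that "lock cmpxchg" implements is compare-and-exchange: store the new value only if the old value still matches, and return what was there before. A small sketch of how that primitive yields a trylock, using GCC's __sync_val_compare_and_swap builtin as a stand-in for the inline asm; trylock() and lock_word are illustrative names, not from the patch.

    #include <stdio.h>

    static unsigned long lock_word;      /* 0 = free, 1 = held */

    static int trylock(unsigned long *w)
    {
        /* acquired iff the word was still 0 when we swapped it for 1 */
        return __sync_val_compare_and_swap(w, 0UL, 1UL) == 0;
    }

    int main(void)
    {
        printf("first try:  %d\n", trylock(&lock_word));   /* 1: acquired */
        printf("second try: %d\n", trylock(&lock_word));   /* 0: already held */
        return 0;
    }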
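The removed uaccess.h follows the usual check-then-copy contract: copy_from_user() validates the user range with __access_ok() first, then copies, and reports the number of bytes left uncopied (0 on full success). A user-space sketch of that contract, where fake_access_ok(), user_mem, FAKE_TASK_SIZE, and sketch_copy_from_user() are made-up stand-ins for the real address-limit check and fault-tolerant copy loop:

    #include <stdio.h>
    #include <string.h>

    #define FAKE_TASK_SIZE 0x1000UL          /* pretend user/kernel boundary */

    static unsigned char user_mem[FAKE_TASK_SIZE];   /* pretend user memory */

    static int fake_access_ok(unsigned long uaddr, unsigned long n)
    {
        return uaddr + n <= FAKE_TASK_SIZE;  /* whole range below the limit? */
    }

    static unsigned long
    sketch_copy_from_user(void *to, unsigned long uaddr, unsigned long n)
    {
        if (!fake_access_ok(uaddr, n))
            return n;                        /* nothing copied: n bytes left */
        memcpy(to, user_mem + uaddr, n);     /* stand-in for the faulting copy */
        return 0;                            /* everything copied */
    }

    int main(void)
    {
        char buf[8];
        memcpy(user_mem + 64, "hello", 6);   /* stage some "user" data */
        printf("good range: %lu left, got \"%s\"\n",
               sketch_copy_from_user(buf, 64, 6), buf);
        printf("bad range:  %lu left\n",
               sketch_copy_from_user(buf, FAKE_TASK_SIZE - 2, 6));
        return 0;
    }

Returning bytes-remaining rather than an error code is what lets callers such as copy_to_user() report partial progress on a mid-copy fault.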
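Finally, the exception_table_entry machinery at the bottom of the removed header maps each potentially faulting instruction to a fixup continuation; ia64_done_with_exception() searches that table when a fault arrives. A sketch of the lookup half, assuming absolute made-up addresses instead of the header's location-relative ints and an illustrative lookup_fixup() name:

    #include <stdio.h>
    #include <stddef.h>

    struct extable_entry {
        unsigned long addr;   /* faulting instruction */
        unsigned long cont;   /* continuation to resume at */
    };

    static const struct extable_entry table[] = {   /* must stay sorted */
        { 0x1000, 0x1100 },
        { 0x2000, 0x2100 },
        { 0x3000, 0x3100 },
    };

    static const struct extable_entry *lookup_fixup(unsigned long addr)
    {
        size_t lo = 0, hi = sizeof table / sizeof table[0];

        while (lo < hi) {                  /* classic binary search */
            size_t mid = lo + (hi - lo) / 2;
            if (table[mid].addr == addr)
                return &table[mid];
            if (table[mid].addr < addr)
                lo = mid + 1;
            else
                hi = mid;
        }
        return NULL;                       /* no fixup: the fault is fatal */
    }

    int main(void)
    {
        const struct extable_entry *e = lookup_fixup(0x2000);
        printf("0x2000 -> %#lx\n", e ? e->cont : 0UL);
        printf("0x2004 -> %s\n", lookup_fixup(0x2004) ? "fixup" : "fatal");
        return 0;
    }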