Re: [Xen-merge] i386 subarch
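For reference, output like the one below can be regenerated along these lines (a rough sketch: it assumes GNU diffutils plus the diffstat utility; the exclude flags and tree names are copied from the per-file diff headers in the patch, and the output filename is only illustrative):

	# Compare the pristine tree against the Xen sparse tree, excluding the
	# same files/directories the per-file headers below exclude, then
	# summarize the result.
	diff -x mkbuildtree -x include -x xen -x SCCS -urPp \
	    pristine-linux-2.6.12 linux-2.6-xen-sparse > xen-i386.diff
	# -p1 strips the leading tree name so the paths match the listing below.
	diffstat -p1 xen-i386.diff

Since diffstat just post-processes the unified diff, the per-file summary and the patch body that follow are two views of the same comparison.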
Here's that output from the script. The diffstat can serve as a guiding list of the files to be converted (the Kconfig and Makefile changes can be ignored).

 arch/i386/Kconfig                   |  718 +++++----------------
 arch/i386/Makefile                  |  128 ---
 arch/i386/kernel/Makefile           |   79 +-
 arch/i386/kernel/acpi/Makefile      |   15
 arch/i386/kernel/acpi/boot.c        |   26
 arch/i386/kernel/apic.c             | 1201 ------------------------------------
 arch/i386/kernel/cpu/Makefile       |   34 -
 arch/i386/kernel/cpu/common.c       |   58 -
 arch/i386/kernel/cpu/mtrr/Makefile  |   19
 arch/i386/kernel/cpu/mtrr/main.c    |  629 +----------------
 arch/i386/kernel/entry.S            |  358 ++++++----
 arch/i386/kernel/head.S             |  457 +------------
 arch/i386/kernel/i386_ksyms.c       |    2
 arch/i386/kernel/io_apic.c          |  104 ++-
 arch/i386/kernel/ioport.c           |   74 --
 arch/i386/kernel/irq.c              |   66 +
 arch/i386/kernel/ldt.c              |   32
 arch/i386/kernel/microcode.c        |  375 -----------
 arch/i386/kernel/mpparse.c          |   27
 arch/i386/kernel/pci-dma.c          |  141 ++++
 arch/i386/kernel/process.c          |  291 +++----
 arch/i386/kernel/quirks.c           |   11
 arch/i386/kernel/setup.c            |  241 ++++++-
 arch/i386/kernel/signal.c           |    2
 arch/i386/kernel/smp.c              |  208 +++---
 arch/i386/kernel/smpboot.c          |  476 ++++++++++++--
 arch/i386/kernel/time.c             |  553 +++++++++++++++-
 arch/i386/kernel/timers/Makefile    |   16
 arch/i386/kernel/timers/timer_tsc.c |  277 +-------
 arch/i386/kernel/traps.c            |  210 ++----
 arch/i386/kernel/vsyscall.S         |    4
 arch/i386/mach-default/Makefile     |    9
 arch/i386/mm/Makefile               |   22
 arch/i386/mm/fault.c                |   35 -
 arch/i386/mm/highmem.c              |   15
 arch/i386/mm/hypervisor.c           |  363 ++++++++++
 arch/i386/mm/init.c                 |  131 +++
 arch/i386/mm/ioremap.c              |  312 ++++++---
 arch/i386/mm/pgtable.c              |  309 ++++++++-
 arch/i386/pci/Makefile              |   38 -
 arch/i386/pci/irq.c                 |    5
 41 files changed, 3673 insertions(+), 4398 deletions(-)

diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/Kconfig linux-2.6-xen-sparse/arch/i386/Kconfig
--- pristine-linux-2.6.12/arch/i386/Kconfig	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/Kconfig	2005-07-28 13:17:07.000000000 -0700
@@ -3,7 +3,11 @@
 # see Documentation/kbuild/kconfig-language.txt.
 #
 
-mainmenu "Linux Kernel Configuration"
+menu "X86 Processor Configuration"
+
+config XENARCH
+	string
+	default i386
 
 config X86
 	bool
@@ -33,119 +37,6 @@ config GENERIC_IOMAP
 	bool
 	default y
 
-source "init/Kconfig"
-
-menu "Processor type and features"
-
-choice
-	prompt "Subarchitecture Type"
-	default X86_PC
-
-config X86_PC
-	bool "PC-compatible"
-	help
-	  Choose this option if your computer is a standard PC or compatible.
-
-config X86_ELAN
-	bool "AMD Elan"
-	help
-	  Select this for an AMD Elan processor.
-
-	  Do not use this option for K6/Athlon/Opteron processors!
-
-	  If unsure, choose "PC-compatible" instead.
-
-config X86_VOYAGER
-	bool "Voyager (NCR)"
-	help
-	  Voyager is an MCA-based 32-way capable SMP architecture proprietary
-	  to NCR Corp.  Machine classes 345x/35xx/4100/51xx are Voyager-based.
-
-	  *** WARNING ***
-
-	  If you do not specifically know you have a Voyager based machine,
-	  say N here, otherwise the kernel you build will not be bootable.
-
-config X86_NUMAQ
-	bool "NUMAQ (IBM/Sequent)"
-	select DISCONTIGMEM
-	select NUMA
-	help
-	  This option is used for getting Linux to run on a (IBM/Sequent) NUMA
-	  multiquad box. This changes the way that processors are bootstrapped,
-	  and uses Clustered Logical APIC addressing mode instead of Flat Logical.
-	  You will need a new lynxer.elf file to flash your firmware with - send
-	  email to <Martin.Bligh@xxxxxxxxxx>.
- -config X86_SUMMIT - bool "Summit/EXA (IBM x440)" - depends on SMP - help - This option is needed for IBM systems that use the Summit/EXA chipset. - In particular, it is needed for the x440. - - If you don't have one of these computers, you should say N here. - -config X86_BIGSMP - bool "Support for other sub-arch SMP systems with more than 8 CPUs" - depends on SMP - help - This option is needed for the systems that have more than 8 CPUs - and if the system is not of any sub-arch type above. - - If you don't have such a system, you should say N here. - -config X86_VISWS - bool "SGI 320/540 (Visual Workstation)" - help - The SGI Visual Workstation series is an IA32-based workstation - based on SGI systems chips with some legacy PC hardware attached. - - Say Y here to create a kernel to run on the SGI 320 or 540. - - A kernel compiled for the Visual Workstation will not run on PCs - and vice versa. See <file:Documentation/sgi-visws.txt> for details. - -config X86_GENERICARCH - bool "Generic architecture (Summit, bigsmp, ES7000, default)" - depends on SMP - help - This option compiles in the Summit, bigsmp, ES7000, default subarchitectures. - It is intended for a generic binary kernel. - -config X86_ES7000 - bool "Support for Unisys ES7000 IA32 series" - depends on SMP - help - Support for Unisys ES7000 systems. Say 'Y' here if this kernel is - supposed to run on an IA32-based Unisys ES7000 system. - Only choose this option if you have such a system, otherwise you - should say N here. - -endchoice - -config ACPI_SRAT - bool - default y - depends on NUMA && (X86_SUMMIT || X86_GENERICARCH) - -config X86_SUMMIT_NUMA - bool - default y - depends on NUMA && (X86_SUMMIT || X86_GENERICARCH) - -config X86_CYCLONE_TIMER - bool - default y - depends on X86_SUMMIT || X86_GENERICARCH - -config ES7000_CLUSTERED_APIC - bool - default y - depends on SMP && X86_ES7000 && MPENTIUMIII - -if !X86_ELAN - choice prompt "Processor family" default M686 @@ -347,8 +238,6 @@ config X86_GENERIC This is really intended for distributors who need more generic optimizations. -endif - # # Define implied options from the CPU selection here # @@ -444,19 +333,21 @@ config X86_OOSTORE default y config HPET_TIMER - bool "HPET Timer Support" - help - This enables the use of the HPET for the kernel's internal timer. - HPET is the next generation timer replacing legacy 8254s. - You can safely choose Y here. However, HPET will only be - activated if the platform and the BIOS support this feature. - Otherwise the 8254 will be used for timing services. - - Choose N to continue using the legacy 8254 timer. + bool + default n +#config HPET_TIMER +# bool "HPET Timer Support" +# help +# This enables the use of the HPET for the kernel's internal timer. +# HPET is the next generation timer replacing legacy 8254s. +# You can safely choose Y here. However, HPET will only be +# activated if the platform and the BIOS support this feature. +# Otherwise the 8254 will be used for timing services. +# +# Choose N to continue using the legacy 8254 timer. config HPET_EMULATE_RTC - bool "Provide RTC interrupt" - depends on HPET_TIMER && RTC=y + def_bool HPET_TIMER && RTC=y config SMP bool "Symmetric multi-processing support" @@ -487,6 +378,19 @@ config SMP If you don't know what to do here, say N. 
+config SMP_ALTERNATIVES + bool "SMP alternatives support (EXPERIMENTAL)" + depends on SMP && EXPERIMENTAL + help + Try to reduce the overhead of running an SMP kernel on a uniprocessor + host slightly by replacing certain key instruction sequences + according to whether we currently have more than one CPU available. + This should provide a noticeable boost to performance when + running SMP kernels on UP machines, and have negligible impact + when running on an true SMP host. + + If unsure, say N. + config NR_CPUS int "Maximum number of CPUs (2-255)" range 2 255 @@ -534,122 +438,47 @@ config PREEMPT_BKL Say Y here if you are building a kernel for a desktop system. Say N if you are unsure. -config X86_UP_APIC - bool "Local APIC support on uniprocessors" - depends on !SMP && !(X86_VISWS || X86_VOYAGER) - help - A local APIC (Advanced Programmable Interrupt Controller) is an - integrated interrupt controller in the CPU. If you have a single-CPU - system which has a processor with a local APIC, you can say Y here to - enable and use it. If you say Y here even though your machine doesn't - have a local APIC, then the kernel will still run with no slowdown at - all. The local APIC supports CPU-generated self-interrupts (timer, - performance counters), and the NMI watchdog which detects hard - lockups. - -config X86_UP_IOAPIC - bool "IO-APIC support on uniprocessors" - depends on X86_UP_APIC - help - An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an - SMP-capable replacement for PC-style interrupt controllers. Most - SMP systems and many recent uniprocessor systems have one. - - If you have a single-CPU system with an IO-APIC, you can say Y here - to use it. If you say Y here even though your machine doesn't have - an IO-APIC, then the kernel will still run with no slowdown at all. - -config X86_LOCAL_APIC - bool - depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) - default y - -config X86_IO_APIC - bool - depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) - default y - -config X86_VISWS_APIC - bool - depends on X86_VISWS - default y - -config X86_TSC - bool - depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ - default y - -config X86_MCE - bool "Machine Check Exception" - depends on !X86_VOYAGER - ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). - The action the kernel takes depends on the severity of the problem, - ranging from a warning message on the console, to halting the machine. - Your processor must be a Pentium or newer to support this - check the - flags in /proc/cpuinfo for mce. Note that some older Pentium systems - have a design flaw which leads to false MCE events - hence MCE is - disabled on all P5 processors, unless explicitly enabled with "mce" - as a boot argument. Similarly, if MCE is built in and creates a - problem on some new non-standard machine, you can boot with "nomce" - to disable it. MCE support simply ignores non-MCE processors like - the 386 and 486, so nearly everyone can say Y here. 
- -config X86_MCE_NONFATAL - tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_MCE - help - Enabling this feature starts a timer that triggers every 5 seconds which - will look at the machine check registers to see if anything happened. - Non-fatal problems automatically get corrected (but still logged). - Disable this if you don't want to see these messages. - Seeing the messages this option prints out may be indicative of dying hardware, - or out-of-spec (ie, overclocked) hardware. - This option only does something on certain CPUs. - (AMD Athlon/Duron and Intel Pentium 4) - -config X86_MCE_P4THERMAL - bool "check for P4 thermal throttling interrupt." - depends on X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS - help - Enabling this feature will cause a message to be printed when the P4 - enters thermal throttling. - -config TOSHIBA - tristate "Toshiba Laptop support" - ---help--- - This adds a driver to safely access the System Management Mode of - the CPU on Toshiba portables with a genuine Toshiba BIOS. It does - not work on models with a Phoenix BIOS. The System Management Mode - is used to set the BIOS and power saving options on Toshiba portables. - - For information on utilities to make use of this driver see the - Toshiba Linux utilities web site at: - <http://www.buzzard.org.uk/toshiba/>. - - Say Y if you intend to run this kernel on a Toshiba portable. - Say N otherwise. - -config I8K - tristate "Dell laptop support" - ---help--- - This adds a driver to safely access the System Management Mode - of the CPU on the Dell Inspiron 8000. The System Management Mode - is used to read cpu temperature and cooling fan status and to - control the fans on the I8K portables. - - This driver has been tested only on the Inspiron 8000 but it may - also work with other Dell laptops. You can force loading on other - models by passing the parameter `force=1' to the module. Use at - your own risk. - - For information on utilities to make use of this driver see the - I8K Linux utilities web site at: - <http://people.debian.org/~dz/i8k/> - - Say Y if you intend to run this kernel on a Dell Inspiron 8000. - Say N otherwise. +#config X86_TSC +# bool +# depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ +# default y + +#config X86_MCE +# bool "Machine Check Exception" +# depends on !X86_VOYAGER +# ---help--- +# Machine Check Exception support allows the processor to notify the +# kernel if it detects a problem (e.g. overheating, component failure). +# The action the kernel takes depends on the severity of the problem, +# ranging from a warning message on the console, to halting the machine. +# Your processor must be a Pentium or newer to support this - check the +# flags in /proc/cpuinfo for mce. Note that some older Pentium systems +# have a design flaw which leads to false MCE events - hence MCE is +# disabled on all P5 processors, unless explicitly enabled with "mce" +# as a boot argument. Similarly, if MCE is built in and creates a +# problem on some new non-standard machine, you can boot with "nomce" +# to disable it. MCE support simply ignores non-MCE processors like +# the 386 and 486, so nearly everyone can say Y here. 
+ +#config X86_MCE_NONFATAL +# tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" +# depends on X86_MCE +# help +# Enabling this feature starts a timer that triggers every 5 seconds which +# will look at the machine check registers to see if anything happened. +# Non-fatal problems automatically get corrected (but still logged). +# Disable this if you don't want to see these messages. +# Seeing the messages this option prints out may be indicative of dying hardware, +# or out-of-spec (ie, overclocked) hardware. +# This option only does something on certain CPUs. +# (AMD Athlon/Duron and Intel Pentium 4) + +#config X86_MCE_P4THERMAL +# bool "check for P4 thermal throttling interrupt." +# depends on X86_MCE && (X86_UP_APIC || SMP) +# help +# Enabling this feature will cause a message to be printed when the P4 +# enters thermal throttling. config X86_REBOOTFIXUPS bool "Enable X86 board specific fixups for reboot" @@ -671,6 +500,7 @@ config X86_REBOOTFIXUPS config MICROCODE tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" + depends on XEN_PRIVILEGED_GUEST ---help--- If you say Y here and also to "/dev file system support" in the 'File systems' section, you will be able to update the microcode on @@ -686,14 +516,14 @@ config MICROCODE To compile this driver as a module, choose M here: the module will be called microcode. -config X86_MSR - tristate "/dev/cpu/*/msr - Model-specific register support" - help - This device gives privileged processes access to the x86 - Model-Specific Registers (MSRs). It is a character device with - major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. - MSR accesses are directed to a specific CPU on multi-processor - systems. +#config X86_MSR +# tristate "/dev/cpu/*/msr - Model-specific register support" +# help +# This device gives privileged processes access to the x86 +# Model-Specific Registers (MSRs). It is a character device with +# major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. +# MSR accesses are directed to a specific CPU on multi-processor +# systems. config X86_CPUID tristate "/dev/cpu/*/cpuid - CPU information support" @@ -803,95 +633,57 @@ config NEED_NODE_MEMMAP_SIZE depends on DISCONTIGMEM default y -config HIGHPTE - bool "Allocate 3rd-level pagetables from highmem" - depends on HIGHMEM4G || HIGHMEM64G - help - The VM uses one page table entry for each page of physical memory. - For systems with a lot of RAM, this can be wasteful of precious - low memory. Setting this option will put user-space page table - entries in high memory. - -config MATH_EMULATION - bool "Math emulation" - ---help--- - Linux can emulate a math coprocessor (used for floating point - operations) if you don't have one. 486DX and Pentium processors have - a math coprocessor built in, 486SX and 386 do not, unless you added - a 487DX or 387, respectively. (The messages during boot time can - give you some hints here ["man dmesg"].) Everyone needs either a - coprocessor or this emulation. - - If you don't have a math coprocessor, you need to say Y here; if you - say Y here even though you have a coprocessor, the coprocessor will - be used nevertheless. (This behavior can be changed with the kernel - command line option "no387", which comes handy if your coprocessor - is broken. Try "man bootparam" or see the documentation of your boot - loader (lilo or loadlin) about how to pass options to the kernel at - boot time.) 
This means that it is a good idea to say Y here if you - intend to use this kernel on different machines. - - More information about the internals of the Linux math coprocessor - emulation can be found in <file:arch/i386/math-emu/README>. - - If you are not sure, say Y; apart from resulting in a 66 KB bigger - kernel, it won't hurt. +#config HIGHPTE +# bool "Allocate 3rd-level pagetables from highmem" +# depends on HIGHMEM4G || HIGHMEM64G +# help +# The VM uses one page table entry for each page of physical memory. +# For systems with a lot of RAM, this can be wasteful of precious +# low memory. Setting this option will put user-space page table +# entries in high memory. config MTRR - bool "MTRR (Memory Type Range Register) support" - ---help--- - On Intel P6 family processors (Pentium Pro, Pentium II and later) - the Memory Type Range Registers (MTRRs) may be used to control - processor access to memory ranges. This is most useful if you have - a video (VGA) card on a PCI or AGP bus. Enabling write-combining - allows bus write transfers to be combined into a larger transfer - before bursting over the PCI/AGP bus. This can increase performance - of image write operations 2.5 times or more. Saying Y here creates a - /proc/mtrr file which may be used to manipulate your processor's - MTRRs. Typically the X server should use this. - - This code has a reasonably generic interface so that similar - control registers on other processors can be easily supported - as well: - - The Cyrix 6x86, 6x86MX and M II processors have Address Range - Registers (ARRs) which provide a similar functionality to MTRRs. For - these, the ARRs are used to emulate the MTRRs. - The AMD K6-2 (stepping 8 and above) and K6-3 processors have two - MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing - write-combining. All of these processors are supported by this code - and it makes sense to say Y here if you have one of them. - - Saying Y here also fixes a problem with buggy SMP BIOSes which only - set the MTRRs for the boot CPU and not for the secondary CPUs. This - can lead to all sorts of problems, so it's good to say Y here. - - You can safely say Y even if your machine doesn't have MTRRs, you'll - just add about 9 KB to your kernel. - - See <file:Documentation/mtrr.txt> for more information. - -config EFI - bool "Boot from EFI support (EXPERIMENTAL)" - depends on ACPI - default n - ---help--- - This enables the the kernel to boot on EFI platforms using - system configuration information passed to it from the firmware. - This also enables the kernel to use any EFI runtime services that are - available (such as the EFI variable services). - - This option is only useful on systems that have EFI firmware - and will result in a kernel image that is ~8k larger. In addition, - you must use the latest ELILO loader available at - <http://elilo.sourceforge.net> in order to take advantage of - kernel initialization using EFI information (neither GRUB nor LILO know - anything about EFI). However, even with this option, the resultant - kernel should continue to boot on existing non-EFI platforms. + bool + depends on XEN_PRIVILEGED_GUEST + default y + +#config MTRR +# bool "MTRR (Memory Type Range Register) support" +# ---help--- +# On Intel P6 family processors (Pentium Pro, Pentium II and later) +# the Memory Type Range Registers (MTRRs) may be used to control +# processor access to memory ranges. This is most useful if you have +# a video (VGA) card on a PCI or AGP bus. 
Enabling write-combining +# allows bus write transfers to be combined into a larger transfer +# before bursting over the PCI/AGP bus. This can increase performance +# of image write operations 2.5 times or more. Saying Y here creates a +# /proc/mtrr file which may be used to manipulate your processor's +# MTRRs. Typically the X server should use this. +# +# This code has a reasonably generic interface so that similar +# control registers on other processors can be easily supported +# as well: +# +# The Cyrix 6x86, 6x86MX and M II processors have Address Range +# Registers (ARRs) which provide a similar functionality to MTRRs. For +# these, the ARRs are used to emulate the MTRRs. +# The AMD K6-2 (stepping 8 and above) and K6-3 processors have two +# MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing +# write-combining. All of these processors are supported by this code +# and it makes sense to say Y here if you have one of them. +# +# Saying Y here also fixes a problem with buggy SMP BIOSes which only +# set the MTRRs for the boot CPU and not for the secondary CPUs. This +# can lead to all sorts of problems, so it's good to say Y here. +# +# You can safely say Y even if your machine doesn't have MTRRs, you'll +# just add about 9 KB to your kernel. +# +# See <file:Documentation/mtrr.txt> for more information. config IRQBALANCE bool "Enable kernel irq balancing" - depends on SMP && X86_IO_APIC + depends on SMP && X86_IO_APIC && !XEN default y help The default yes will allow the kernel to do irq load balancing. @@ -922,186 +714,59 @@ config REGPARM generate incorrect output with certain kernel constructs when -mregparm=3 is used. -config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" - depends on PROC_FS +config X86_LOCAL_APIC + bool + depends on XEN_PRIVILEGED_GUEST && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)) default y - help - This kernel feature is useful for number crunching applications - that may need to compute untrusted bytecode during their - execution. By using pipes or other transports made available to - the process as file descriptors supporting the read/write - syscalls, it's possible to isolate those applications in - their own address space using seccomp. Once seccomp is - enabled via /proc/<pid>/seccomp, it cannot be disabled - and the task is only allowed to execute a few safe syscalls - defined by each seccomp mode. - - If unsure, say Y. Only embedded should say N here. - -endmenu - - -menu "Power management options (ACPI, APM)" - depends on !X86_VOYAGER - -source kernel/power/Kconfig -source "drivers/acpi/Kconfig" +config X86_IO_APIC + bool + depends on XEN_PRIVILEGED_GUEST && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))) + default y -menu "APM (Advanced Power Management) BIOS Support" -depends on PM && !X86_VISWS +config X86_VISWS_APIC + bool + depends on X86_VISWS + default y -config APM - tristate "APM (Advanced Power Management) BIOS support" - depends on PM +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" + depends on SMP && HOTPLUG && EXPERIMENTAL ---help--- - APM is a BIOS specification for saving power using several different - techniques. This is mostly useful for battery powered laptops with - APM compliant BIOSes. If you say Y here, the system time will be - reset after a RESUME operation, the /proc/apm device will provide - battery status information, and user-space programs will receive - notification of APM "events" (e.g. battery status change). 
- - If you select "Y" here, you can disable actual use of the APM - BIOS by passing the "apm=off" option to the kernel at boot time. - - Note that the APM support is almost completely disabled for - machines with more than one CPU. - - In order to use APM, you will need supporting software. For location - and more information, read <file:Documentation/pm.txt> and the - Battery Powered Linux mini-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. + Say Y here to experiment with turning CPUs off and on. CPUs + can be controlled through /sys/devices/system/cpu. - This driver does not spin down disk drives (see the hdparm(8) - manpage ("man 8 hdparm") for that), and it doesn't turn off - VESA-compliant "green" monitors. - - This driver does not support the TI 4000M TravelMate and the ACER - 486/DX4/75 because they don't have compliant BIOSes. Many "green" - desktop machines also don't have compliant BIOSes, and this driver - may cause those machines to panic during the boot phase. - - Generally, if you don't have a battery in your machine, there isn't - much point in using this driver and you should say N. If you get - random kernel OOPSes or reboots that don't seem to be related to - anything, try disabling/enabling this option (or disabling/enabling - APM in your BIOS). - - Some other things you should try when experiencing seemingly random, - "weird" problems: - - 1) make sure that you have enough swap space and that it is - enabled. - 2) pass the "no-hlt" option to the kernel - 3) switch on floating point emulation in the kernel and pass - the "no387" option to the kernel - 4) pass the "floppy=nodma" option to the kernel - 5) pass the "mem=4M" option to the kernel (thereby disabling - all but the first 4 MB of RAM) - 6) make sure that the CPU is not over clocked. - 7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/> - 8) disable the cache from your BIOS settings - 9) install a fan for the video card or exchange video RAM - 10) install a better fan for the CPU - 11) exchange RAM chips - 12) exchange the motherboard. + Say N. - To compile this driver as a module, choose M here: the - module will be called apm. -config APM_IGNORE_USER_SUSPEND - bool "Ignore USER SUSPEND" - depends on APM - help - This option will ignore USER SUSPEND requests. On machines with a - compliant APM BIOS, you want to say N. However, on the NEC Versa M - series notebooks, it is necessary to say Y because of a BIOS bug. - -config APM_DO_ENABLE - bool "Enable PM at boot time" - depends on APM - ---help--- - Enable APM features at boot time. From page 36 of the APM BIOS - specification: "When disabled, the APM BIOS does not automatically - power manage devices, enter the Standby State, enter the Suspend - State, or take power saving steps in response to CPU Idle calls." - This driver will make CPU Idle calls when Linux is idle (unless this - feature is turned off -- see "Do CPU IDLE calls", below). This - should always save battery power, but more complicated APM features - will be dependent on your BIOS implementation. You may need to turn - this option off if your computer hangs at boot time when using APM - support, or if it beeps continuously instead of suspending. Turn - this off if you have a NEC UltraLite Versa 33/C or a Toshiba - T400CDT. This is off by default since most machines do fine without - this feature. - -config APM_CPU_IDLE - bool "Make CPU Idle calls when idle" - depends on APM - help - Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop. 
- On some machines, this can activate improved power savings, such as - a slowed CPU clock rate, when the machine is idle. These idle calls - are made after the idle loop has run for some length of time (e.g., - 333 mS). On some machines, this will cause a hang at boot time or - whenever the CPU becomes idle. (On machines with more than one CPU, - this option does nothing.) - -config APM_DISPLAY_BLANK - bool "Enable console blanking using APM" - depends on APM - help - Enable console blanking using the APM. Some laptops can use this to - turn off the LCD backlight when the screen blanker of the Linux - virtual console blanks the screen. Note that this is only used by - the virtual console screen blanker, and won't turn off the backlight - when using the X Window system. This also doesn't have anything to - do with your VESA-compliant power-saving monitor. Further, this - option doesn't work for all laptops -- it might not turn off your - backlight at all, or it might print a lot of errors to the console, - especially if you are using gpm. - -config APM_RTC_IS_GMT - bool "RTC stores time in GMT" - depends on APM - help - Say Y here if your RTC (Real Time Clock a.k.a. hardware clock) - stores the time in GMT (Greenwich Mean Time). Say N if your RTC - stores localtime. - - It is in fact recommended to store GMT in your RTC, because then you - don't have to worry about daylight savings time changes. The only - reason not to use GMT in your RTC is if you also run a broken OS - that doesn't understand GMT. - -config APM_ALLOW_INTS - bool "Allow interrupts during APM BIOS calls" - depends on APM - help - Normally we disable external interrupts while we are making calls to - the APM BIOS as a measure to lessen the effects of a badly behaving - BIOS implementation. The BIOS should reenable interrupts if it - needs to. Unfortunately, some BIOSes do not -- especially those in - many of the newer IBM Thinkpads. If you experience hangs when you - suspend, try setting this to Y. Otherwise, say N. - -config APM_REAL_MODE_POWER_OFF - bool "Use real mode APM BIOS call to power off" - depends on APM - help - Use real mode APM BIOS calls to switch off the computer. This is - a work-around for a number of buggy BIOSes. Switch this option on if - your computer crashes instead of powering off properly. +if XEN_PHYSDEV_ACCESS -endmenu +menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" -source "arch/i386/kernel/cpu/cpufreq/Kconfig" +config X86_UP_APIC + bool "Local APIC support on uniprocessors" + depends on !SMP && !(X86_VISWS || X86_VOYAGER) + help + A local APIC (Advanced Programmable Interrupt Controller) is an + integrated interrupt controller in the CPU. If you have a single-CPU + system which has a processor with a local APIC, you can say Y here to + enable and use it. If you say Y here even though your machine doesn't + have a local APIC, then the kernel will still run with no slowdown at + all. The local APIC supports CPU-generated self-interrupts (timer, + performance counters), and the NMI watchdog which detects hard + lockups. -endmenu +config X86_UP_IOAPIC + bool "IO-APIC support on uniprocessors" + depends on X86_UP_APIC + help + An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an + SMP-capable replacement for PC-style interrupt controllers. Most + SMP systems and many recent uniprocessor systems have one. -menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" + If you have a single-CPU system with an IO-APIC, you can say Y here + to use it. 
If you say Y here even though your machine doesn't have + an IO-APIC, then the kernel will still run with no slowdown at all. config PCI bool "PCI support" if !X86_VISWS @@ -1232,25 +897,7 @@ source "drivers/pci/hotplug/Kconfig" endmenu -menu "Executable file formats" - -source "fs/Kconfig.binfmt" - -endmenu - -source "drivers/Kconfig" - -source "fs/Kconfig" - -source "arch/i386/oprofile/Kconfig" - -source "arch/i386/Kconfig.debug" - -source "security/Kconfig" - -source "crypto/Kconfig" - -source "lib/Kconfig" +endif # # Use the generic interrupt handling code in kernel/irq/: @@ -1268,10 +915,10 @@ config X86_SMP depends on SMP && !X86_VOYAGER default y -config X86_HT - bool - depends on SMP && !(X86_VISWS || X86_VOYAGER) - default y +#config X86_HT +# bool +# depends on SMP && !(X86_VISWS || X86_VOYAGER) +# default y config X86_BIOS_REBOOT bool @@ -1287,3 +934,22 @@ config PC bool depends on X86 && !EMBEDDED default y + +config SECCOMP + bool "Enable seccomp to safely compute untrusted bytecode" + depends on PROC_FS + default y + help + This kernel feature is useful for number crunching applications + that may need to compute untrusted bytecode during their + execution. By using pipes or other transports made available to + the process as file descriptors supporting the read/write + syscalls, it's possible to isolate those applications in + their own address space using seccomp. Once seccomp is + enabled via /proc/<pid>/seccomp, it cannot be disabled + and the task is only allowed to execute a few safe syscalls + defined by each seccomp mode. + + If unsure, say Y. Only embedded should say N here. + +endmenu diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/acpi/boot.c linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot.c --- pristine-linux-2.6.12/arch/i386/kernel/acpi/boot.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot.c 2005-07-28 13:17:07.000000000 -0700 @@ -36,6 +36,11 @@ #include <asm/io.h> #include <asm/irq.h> #include <asm/mpspec.h> +#ifdef CONFIG_XEN +#include <asm/fixmap.h> +#endif + +void (*pm_power_off)(void) = NULL; #ifdef CONFIG_X86_64 @@ -100,7 +105,7 @@ EXPORT_SYMBOL(x86_acpiid_to_apicid); */ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) /* rely on all ACPI tables being in the direct mapping */ char *__acpi_map_table(unsigned long phys_addr, unsigned long size) @@ -133,8 +138,10 @@ char *__acpi_map_table(unsigned long phy unsigned long base, offset, mapped_size; int idx; +#ifndef CONFIG_XEN if (phys + size < 8*1024*1024) return __va(phys); +#endif offset = phys & (PAGE_SIZE - 1); mapped_size = PAGE_SIZE - offset; @@ -462,18 +469,6 @@ unsigned int acpi_register_gsi(u32 gsi, unsigned int irq; unsigned int plat_gsi = gsi; -#ifdef CONFIG_PCI - /* - * Make sure all (legacy) PCI IRQs are set as level-triggered. - */ - if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - extern void eisa_set_level_irq(unsigned int irq); - - if (edge_level == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); - } -#endif - #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { plat_gsi = mp_register_gsi(gsi, edge_level, active_high_low); @@ -513,13 +508,14 @@ acpi_scan_rsdp ( { unsigned long offset = 0; unsigned long sig_len = sizeof("RSD PTR ") - 1; + unsigned long vstart = (unsigned long)isa_bus_to_virt(start); /* * Scan all 16-byte boundaries of the physical memory region for the * RSDP signature. 
*/ for (offset = 0; offset < length; offset += 16) { - if (strncmp((char *) (start + offset), "RSD PTR ", sig_len)) + if (strncmp((char *) (vstart + offset), "RSD PTR ", sig_len)) continue; return (start + offset); } @@ -652,6 +648,8 @@ acpi_find_rsdp (void) if (!rsdp_phys) rsdp_phys = acpi_scan_rsdp (0xE0000, 0x20000); + set_fixmap(FIX_ACPI_RSDP_PAGE, rsdp_phys); + return rsdp_phys; } diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/acpi/Makefile linux-2.6-xen-sparse/arch/i386/kernel/acpi/Makefile --- pristine-linux-2.6.12/arch/i386/kernel/acpi/Makefile 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/acpi/Makefile 2005-07-28 13:17:07.000000000 -0700 @@ -1,4 +1,13 @@ -obj-$(CONFIG_ACPI_BOOT) := boot.o -obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o -obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o +obj-$(CONFIG_ACPI_BOOT) := boot.o +c-obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o +c-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): + @ln -fsn $(srctree)/arch/i386/kernel/acpi/$(notdir $@) $@ + +obj-y += $(c-obj-y) $(s-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) +clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link)) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/apic.c linux-2.6-xen-sparse/arch/i386/kernel/apic.c --- pristine-linux-2.6.12/arch/i386/kernel/apic.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/apic.c 2005-07-28 13:17:07.000000000 -0700 @@ -44,8 +44,10 @@ */ int apic_verbosity; - -static void apic_pm_activate(void); +int get_physical_broadcast(void) +{ + return 0xff; +} /* * 'what should we do if we get a hw irq event on an illegal vector'. @@ -65,1212 +67,17 @@ void ack_bad_irq(unsigned int irq) ack_APIC_irq(); } -void __init apic_intr_init(void) -{ -#ifdef CONFIG_SMP - smp_intr_init(); -#endif - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* thermal monitor LVT interrupt */ -#ifdef CONFIG_X86_MCE_P4THERMAL - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -} - -/* Using APIC to generate smp_local_timer_interrupt? */ -int using_apic_timer = 0; - -static DEFINE_PER_CPU(int, prof_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_counter) = 1; - -static int enabled_via_apicbase; - -void enable_NMI_through_LVT0 (void * dummy) -{ - unsigned int v, ver; - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - v = APIC_DM_NMI; /* unmask and set to NMI */ - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - v |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT0, v); -} - -int get_physical_broadcast(void) -{ - unsigned int lvr, version; - lvr = apic_read(APIC_LVR); - version = GET_APIC_VERSION(lvr); - if (!APIC_INTEGRATED(version) || version >= 0x14) - return 0xff; - else - return 0xf; -} - -int get_maxlvt(void) -{ - unsigned int v, ver, maxlvt; - - v = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(v); - /* 82489DXs do not report # of LVT entries. */ - maxlvt = APIC_INTEGRATED(ver) ? 
GET_APIC_MAXLVT(v) : 2; - return maxlvt; -} - -void clear_local_APIC(void) -{ - int maxlvt; - unsigned long v; - - maxlvt = get_maxlvt(); - - /* - * Masking an LVT entry on a P6 can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. - */ - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); - } - -/* lets not touch this if we didn't frob it */ -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) { - v = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); - } -#endif - /* - * Clean APIC state for other OSs: - */ - apic_write_around(APIC_LVTT, APIC_LVT_MASKED); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - apic_write_around(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); - -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); -#endif - v = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_INTEGRATED(v)) { /* !82489DX */ - if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } -} - -void __init connect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } - enable_apic_mode(); -} - -void disconnect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. - */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - } -} - -void disable_local_APIC(void) -{ - unsigned long value; - - clear_local_APIC(); - - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write_around(APIC_SPIV, value); - - if (enabled_via_apicbase) { - unsigned int l, h; - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_ENABLE; - wrmsr(MSR_IA32_APICBASE, l, h); - } -} - -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. 
If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - -void __init sync_Arb_IDs(void) -{ - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (ver >= 0x14) /* P4 or higher */ - return; - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); -} - -extern void __error_in_apic_c (void); - -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned long value, ver; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; - - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - - /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) - value &= ~APIC_SPIV_FOCUS_DISABLED; - else - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); -} - -void __init setup_local_APIC (void) -{ - unsigned long oldvalue, value, ver, maxlvt; - - /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - } - - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); - - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); - - /* - * Double-check whether this APIC is really registered. - */ - if (!apic_id_registered()) - BUG(); - - /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ - init_apic_ldr(); - - /* - * Set Task Priority to 'accept all'. We never change this - * later on. 
- */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write_around(APIC_TASKPRI, value); - - /* - * Now that we are all set up, enable the APIC - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - /* - * Enable APIC - */ - value |= APIC_SPIV_APIC_ENABLED; - - /* - * Some unknown Intel IO/APIC (or APIC) errata is biting us with - * certain networking cards. If high frequency interrupts are - * happening on a particular IOAPIC pin, plus the IOAPIC routing - * entry is masked/unmasked at a high rate as well then sooner or - * later IOAPIC line gets 'stuck', no more interrupts are received - * from the device. If focus CPU is disabled then the hang goes - * away, oh well :-( - * - * [ This bug can be reproduced easily with a level-triggered - * PCI Ne2000 networking cards and PII/PIII processors, dual - * BX chipset. ] - */ - /* - * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more - * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro - */ -#if 1 - /* Enable focus processor (bit==0) */ - value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; -#endif - /* - * Set spurious IRQ vector - */ - value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); - - /* - * Set up LVT0, LVT1: - * - * set up through-local-APIC on the BP's LINT0. This is not - * strictly necessery in pure symmetric-IO mode, but sometimes - * we delegate interrupts to the 8259A. - */ - /* - * TODO: set up through-local-APIC from through-I/O-APIC? --macro - */ - value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && (pic_mode || !value)) { - value = APIC_DM_EXTINT; - apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", - smp_processor_id()); - } else { - value = APIC_DM_EXTINT | APIC_LVT_MASKED; - apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", - smp_processor_id()); - } - apic_write_around(APIC_LVT0, value); - - /* - * only the BP should see the LINT1 NMI signal, obviously. - */ - if (!smp_processor_id()) - value = APIC_DM_NMI; - else - value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); - - if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ - maxlvt = get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); - - value = ERROR_APIC_VECTOR; // enables sending errors - apic_write_around(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - if (esr_disable) - /* - * Something untraceble is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk("Leaving ESR disabled.\n"); - else - printk("No ESR for 82489DX.\n"); - } - - if (nmi_watchdog == NMI_LOCAL_APIC) - setup_apic_nmi_watchdog(); - apic_pm_activate(); -} - -/* - * If Linux enabled the LAPIC against the BIOS default - * disable it down before re-entering the BIOS on shutdown. 
- * Otherwise the BIOS may get confused and not power-off. - */ -void lapic_shutdown(void) -{ - if (!cpu_has_apic || !enabled_via_apicbase) - return; - - local_irq_disable(); - disable_local_APIC(); - local_irq_enable(); -} - -#ifdef CONFIG_PM - -static struct { - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - - if (!apic_pm_state.active) - return 0; - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - - if (!apic_pm_state.active) - return 0; - - local_irq_save(flags); - - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - local_irq_restore(flags); - return 0; -} - -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. - */ - -static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __init apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? 
*/ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - */ - -/* - * Knob to control our willingness to enable the local APIC. - */ -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ - -static int __init lapic_disable(char *str) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return 0; -} -__setup("nolapic", lapic_disable); - -static int __init lapic_enable(char *str) -{ - enable_local_apic = 1; - return 0; -} -__setup("lapic", lapic_enable); - -static int __init apic_set_verbosity(char *str) -{ - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - else - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug", str); - - return 0; -} - -__setup("apic=", apic_set_verbosity); - -static int __init detect_init_APIC (void) -{ - u32 h, l, features; - extern void get_cpu_vendor(struct cpuinfo_x86*); - - /* Disabled by kernel option? */ - if (enable_local_apic < 0) - return -1; - - /* Workaround for us being called before identify_cpu(). */ - get_cpu_vendor(&boot_cpu_data); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || - (boot_cpu_data.x86 == 15)) - break; - goto no_apic; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && cpu_has_apic)) - break; - goto no_apic; - default: - goto no_apic; - } - - if (!cpu_has_apic) { - /* - * Over-ride BIOS and try to enable the local - * APIC only if "lapic" specified. - */ - if (enable_local_apic <= 0) { - printk("Local APIC disabled by BIOS -- " - "you can enable it with \"lapic\"\n"); - return -1; - } - /* - * Some BIOSes disable the local APIC in the - * APIC_BASE MSR. This can only be done in - * software for Intel P6 or later and AMD K7 - * (Model > 1) or later. - */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk("Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - printk("Could not enable APIC!\n"); - return -1; - } - set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - if (nmi_watchdog != NMI_NONE) - nmi_watchdog = NMI_LOCAL_APIC; - - printk("Found and enabled local APIC!\n"); - - apic_pm_activate(); - - return 0; - -no_apic: - printk("No local APIC present or hardware disabled\n"); - return -1; -} - -void __init init_apic_mappings(void) -{ - unsigned long apic_phys; - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. 
- */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, - apic_phys); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); - -#ifdef CONFIG_X86_IO_APIC - { - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } - } else { -fake_ioapic_page: - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - } - } -#endif -} - -/* - * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts - * per second. We assume that the caller has already set up the local - * APIC. - * - * The APIC timer is not exactly sync with the external timer chip, it - * closely follows bus clocks. - */ - -/* - * The timer chip is already set up at HZ interrupts per second here, - * but we do not accept timer interrupts yet. We only allow the BP - * to calibrate. - */ -static unsigned int __init get_8254_timer_count(void) -{ - extern spinlock_t i8253_lock; - unsigned long flags; - - unsigned int count; - - spin_lock_irqsave(&i8253_lock, flags); - - outb_p(0x00, PIT_MODE); - count = inb_p(PIT_CH0); - count |= inb_p(PIT_CH0) << 8; - - spin_unlock_irqrestore(&i8253_lock, flags); - - return count; -} - -/* next tick in 8254 can be caught by catching timer wraparound */ -static void __init wait_8254_wraparound(void) -{ - unsigned int curr_count, prev_count; - - curr_count = get_8254_timer_count(); - do { - prev_count = curr_count; - curr_count = get_8254_timer_count(); - - /* workaround for broken Mercury/Neptune */ - if (prev_count >= curr_count + 0x100) - curr_count = get_8254_timer_count(); - - } while (prev_count >= curr_count); -} - -/* - * Default initialization for 8254 timers. If we use other timers like HPET, - * we override this later - */ -void (*wait_timer_tick)(void) __initdata = wait_8254_wraparound; - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. 
- */ - -#define APIC_DIVISOR 16 - -static void __setup_APIC_LVTT(unsigned int clocks) -{ - unsigned int lvtt_value, tmp_value, ver; - - ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; - if (!APIC_INTEGRATED(ver)) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - apic_write_around(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); -} - -static void __init setup_APIC_timer(unsigned int clocks) -{ - unsigned long flags; - - local_irq_save(flags); - - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); - - __setup_APIC_LVTT(clocks); - - local_irq_restore(flags); -} - -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -static int __init calibrate_APIC_clock(void) -{ - unsigned long long t1 = 0, t2 = 0; - long tt1, tt2; - long result; - int i; - const int LOOPS = HZ/10; - - apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n"); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - */ - __setup_APIC_LVTT(1000000000); - - /* - * The timer chip counts down to zero. Let's wait - * for a wraparound to start exact measurement: - * (the current tick might have been already half done) - */ - - wait_timer_tick(); - - /* - * We wrapped around just now. Let's start: - */ - if (cpu_has_tsc) - rdtscll(t1); - tt1 = apic_read(APIC_TMCCT); - - /* - * Let's wait LOOPS wraprounds: - */ - for (i = 0; i < LOOPS; i++) - wait_timer_tick(); - - tt2 = apic_read(APIC_TMCCT); - if (cpu_has_tsc) - rdtscll(t2); - - /* - * The APIC bus clock counter is 32 bits only, it - * might have overflown, but note that we use signed - * longs, thus no extra care needed. - * - * underflown to be exact, as the timer counts down ;) - */ - - result = (tt1-tt2)*APIC_DIVISOR/LOOPS; - - if (cpu_has_tsc) - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - ((long)(t2-t1)/LOOPS)/(1000000/HZ), - ((long)(t2-t1)/LOOPS)%(1000000/HZ)); - - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%ld.%04ld MHz.\n", - result/(1000000/HZ), - result%(1000000/HZ)); - - return result; -} - -static unsigned int calibration_result; - -void __init setup_boot_APIC_clock(void) -{ - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); - using_apic_timer = 1; - - local_irq_disable(); - - calibration_result = calibrate_APIC_clock(); - /* - * Now set up the timer for real. 
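
A units check on calibrate_APIC_clock() above, since the mixed divisors are easy to misread:

    /*
     * tt1 - tt2       : APIC timer counts over LOOPS (= HZ/10) jiffies;
     *                   the counter runs at bus_clock / APIC_DIVISOR
     * * APIC_DIVISOR  : converts back to raw bus clocks
     * / LOOPS         : bus clocks per jiffy
     *
     * e.g. a 66 MHz bus at HZ=1000 gives result ~= 66000, and the final
     * printk renders result/(1000000/HZ) = 66 MHz.
     */
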
- */ - setup_APIC_timer(calibration_result); - - local_irq_enable(); -} - -void __init setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(calibration_result); -} - -void __init disable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); - } -} - -void enable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); - } -} - -/* - * the frequency of the profiling timer can be changed - * by writing a multiplier value into /proc/profile. - */ -int setup_profiling_timer(unsigned int multiplier) -{ - int i; - - /* - * Sanity check. [at least 500 APIC cycles should be - * between APIC interrupts as a rule of thumb, to avoid - * irqs flooding us] - */ - if ( (!multiplier) || (calibration_result/multiplier < 500)) - return -EINVAL; - - /* - * Set the new multiplier for each CPU. CPUs don't start using the - * new values until the next timer interrupt in which they do process - * accounting. At that time they also adjust their APIC timers - * accordingly. - */ - for (i = 0; i < NR_CPUS; ++i) - per_cpu(prof_multiplier, i) = multiplier; - - return 0; -} - -#undef APIC_DIVISOR - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -inline void smp_local_timer_interrupt(struct pt_regs * regs) -{ - int cpu = smp_processor_id(); - - profile_tick(CPU_PROFILING, regs); - if (--per_cpu(prof_counter, cpu) <= 0) { - /* - * The multiplier may have changed since the last time we got - * to this point as a result of the user writing to - * /proc/profile. In this case we need to adjust the APIC - * timer accordingly. - * - * Interrupts are already masked off at this point. - */ - per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); - if (per_cpu(prof_counter, cpu) != - per_cpu(prof_old_multiplier, cpu)) { - __setup_APIC_LVTT( - calibration_result/ - per_cpu(prof_counter, cpu)); - per_cpu(prof_old_multiplier, cpu) = - per_cpu(prof_counter, cpu); - } - -#ifdef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif - } - - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ - -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat, cpu).apic_timer_irqs++; - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). 
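
The 500-cycle floor in setup_profiling_timer() above bounds the multiplier like so:

    /*
     * calibration_result is bus clocks per tick, so with the 66 MHz bus
     * at HZ=1000 from the example above (calibration_result ~= 66000),
     * any multiplier beyond ~132 would space local APIC timer interrupts
     * closer than 500 bus clocks apart and is rejected with -EINVAL.
     */
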
- * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - irq_enter(); - smp_local_timer_interrupt(regs); - irq_exit(); -} - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -fastcall void smp_spurious_interrupt(struct pt_regs *regs) -{ - unsigned long v; - - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", - smp_processor_id()); - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ - -fastcall void smp_error_interrupt(struct pt_regs *regs) -{ - unsigned long v, v1; - - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - /* * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ int __init APIC_init_uniprocessor (void) { - if (enable_local_apic < 0) - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - - if (!smp_found_config && !cpu_has_apic) - return -1; - - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - return -1; - } - - verify_local_APIC(); - - connect_bsp_APIC(); - - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); - - setup_local_APIC(); - #ifdef CONFIG_X86_IO_APIC if (smp_found_config) if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); #endif - setup_boot_APIC_clock(); return 0; } diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/common.c linux-2.6-xen-sparse/arch/i386/kernel/cpu/common.c --- pristine-linux-2.6.12/arch/i386/kernel/cpu/common.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/common.c 2005-07-28 13:17:07.000000000 -0700 @@ -15,6 +15,7 @@ #include <asm/apic.h> #include <mach_apic.h> #endif +#include <asm-xen/hypervisor.h> #include "cpu.h" @@ -32,6 +33,8 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM extern void mcheck_init(struct cpuinfo_x86 *c); +extern void machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c); + extern int disable_pse; static void default_init(struct cpuinfo_x86 * c) @@ -409,6 +412,8 @@ void __init identify_cpu(struct cpuinfo_ c->x86_vendor, c->x86_model); } + machine_specific_modify_cpu_capabilities(c); + /* Now the feature flags better reflect actual CPU features! 
*/ printk(KERN_DEBUG "CPU: After all inits, caps:"); @@ -554,6 +559,24 @@ void __init early_cpu_init(void) disable_pse = 1; #endif } + +void __init cpu_gdt_init(struct Xgt_desc_struct *gdt_descr) +{ + unsigned long frames[16]; + unsigned long va; + int f; + + for (va = gdt_descr->address, f = 0; + va < gdt_descr->address + gdt_descr->size; + va += PAGE_SIZE, f++) { + frames[f] = virt_to_machine(va) >> PAGE_SHIFT; + make_page_readonly((void *)va); + } + if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8)) + BUG(); + lgdt_finish(); +} + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -565,7 +588,6 @@ void __init cpu_init (void) int cpu = smp_processor_id(); struct tss_struct * t = &per_cpu(init_tss, cpu); struct thread_struct *thread = &current->thread; - __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu); if (cpu_test_and_set(cpu, cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); @@ -573,7 +595,7 @@ void __init cpu_init (void) } printk(KERN_INFO "Initializing CPU#%d\n", cpu); - if (cpu_has_vme || cpu_has_tsc || cpu_has_de) + if (cpu_has_vme || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); if (tsc_disable && cpu_has_tsc) { printk(KERN_NOTICE "Disabling TSC...\n"); @@ -583,30 +605,12 @@ void __init cpu_init (void) } /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - memcpy(&per_cpu(cpu_gdt_table, cpu), cpu_gdt_table, - GDT_SIZE); - - /* Set up GDT entry for 16bit stack */ - *(__u64 *)&(per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_ESPFIX_SS]) |= - ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | - (CPU_16BIT_STACK_SIZE - 1); - - cpu_gdt_descr[cpu].size = GDT_SIZE - 1; - cpu_gdt_descr[cpu].address = - (unsigned long)&per_cpu(cpu_gdt_table, cpu); - - /* * Set up the per-thread TLS descriptor cache: */ - memcpy(thread->tls_array, &per_cpu(cpu_gdt_table, cpu), - GDT_ENTRY_TLS_ENTRIES * 8); + memcpy(thread->tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN], + GDT_ENTRY_TLS_ENTRIES * 8); - __asm__ __volatile__("lgdt %0" : : "m" (cpu_gdt_descr[cpu])); - __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); + cpu_gdt_init(&cpu_gdt_descr[cpu]); /* * Delete NT @@ -623,19 +627,15 @@ void __init cpu_init (void) enter_lazy_tlb(&init_mm, current); load_esp0(t, thread); - set_tss_desc(cpu,t); - load_TR_desc(); - load_LDT(&init_mm.context); - /* Set up doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); + load_LDT(&init_mm.context); /* Clear %fs and %gs.
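
The new cpu_gdt_init() above is the heart of this hunk: a paravirtualized guest cannot lgdt a writable page, so each GDT frame is made read-only and the machine frame list is handed to Xen. make_page_readonly() is defined in the mm/ part of this patch; in rough outline it is something like the sketch below, where virt_to_pte() stands in for the real page-table walk:

    void make_page_readonly(void *va)
    {
        pte_t *pte = virt_to_pte(va);           /* hypothetical helper */
        /* PTEs are not directly writable either, so clear _PAGE_RW
           through the hypervisor interface */
        HYPERVISOR_update_va_mapping((unsigned long)va,
                                     pte_wrprotect(*pte), 0);
    }

Once HYPERVISOR_set_gdt() succeeds, lgdt_finish() (see the head.S hunk further down) reloads the segment registers against the newly installed table.
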
*/ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); /* Clear all 6 debug registers: */ -#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); +#define CD(register) HYPERVISOR_set_debugreg(register, 0) CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/Makefile linux-2.6-xen-sparse/arch/i386/kernel/cpu/Makefile --- pristine-linux-2.6.12/arch/i386/kernel/cpu/Makefile 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/Makefile 2005-07-28 13:17:07.000000000 -0700 @@ -2,18 +2,30 @@ # Makefile for x86-compatible CPU details and quirks # -obj-y := common.o proc.o +CFLAGS += -Iarch/i386/kernel/cpu -obj-y += amd.o -obj-y += cyrix.o -obj-y += centaur.o -obj-y += transmeta.o -obj-y += intel.o intel_cacheinfo.o -obj-y += rise.o -obj-y += nexgen.o -obj-y += umc.o +obj-y := common.o +c-obj-y += proc.o -obj-$(CONFIG_X86_MCE) += mcheck/ +c-obj-y += amd.o +c-obj-y += cyrix.o +c-obj-y += centaur.o +c-obj-y += transmeta.o +c-obj-y += intel.o intel_cacheinfo.o +c-obj-y += rise.o +c-obj-y += nexgen.o +c-obj-y += umc.o + +#obj-$(CONFIG_X86_MCE) += ../../../../i386/kernel/cpu/mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ +#obj-$(CONFIG_CPU_FREQ) += ../../../../i386/kernel/cpu/cpufreq/ + +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): + @ln -fsn $(srctree)/arch/i386/kernel/cpu/$(notdir $@) $@ + +obj-y += $(c-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/main.c linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/main.c --- pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/main.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/main.c 2005-07-28 13:17:07.000000000 -0700 @@ -1,116 +1,46 @@ -/* Generic MTRR (Memory Type Range Register) driver. - - Copyright (C) 1997-2000 Richard Gooch - Copyright (c) 2002 Patrick Mochel - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - - Richard Gooch may be reached by email at rgooch@xxxxxxxxxxxxx - The postal address is: - Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. - - Source: "Pentium Pro Family Developer's Manual, Volume 3: - Operating System Writer's Guide" (Intel document number 242692), - section 11.11.7 - - This was cleaned and made readable by Patrick Mochel <mochel@xxxxxxxx> - on 6-7 March 2002. - Source: Intel Architecture Software Developers Manual, Volume 3: - System Programming Guide; Section 9.11. (1997 edition - PPro). 
-*/ - -#include <linux/module.h> #include <linux/init.h> -#include <linux/pci.h> -#include <linux/smp.h> -#include <linux/cpu.h> +#include <linux/proc_fs.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <asm/uaccess.h> #include <asm/mtrr.h> - -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/msr.h> #include "mtrr.h" -#define MTRR_VERSION "2.0 (20020519)" - -u32 num_var_ranges = 0; - -unsigned int *usage_table; -static DECLARE_MUTEX(main_lock); - -u32 size_or_mask, size_and_mask; - -static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; - -struct mtrr_ops * mtrr_if = NULL; - -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type); +void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned int *size, mtrr_type * type) +{ + dom0_op_t op; -extern int arr3_protected; + op.cmd = DOM0_READ_MEMTYPE; + op.u.read_memtype.reg = reg; + (void)HYPERVISOR_dom0_op(&op); -void set_mtrr_ops(struct mtrr_ops * ops) -{ - if (ops->vendor && ops->vendor < X86_VENDOR_NUM) - mtrr_ops[ops->vendor] = ops; + *size = op.u.read_memtype.nr_pfns; + *base = op.u.read_memtype.pfn; + *type = op.u.read_memtype.type; } -/* Returns non-zero if we have the write-combining memory type */ -static int have_wrcomb(void) -{ - struct pci_dev *dev; - u8 rev; - - if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { - /* ServerWorks LE chipsets < rev 6 have problems with write-combining - Don't allow it and leave room for other chipsets to be tagged */ - if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && - dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { - pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); - if (rev <= 5) { - printk(KERN_INFO "mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); - pci_dev_put(dev); - return 0; - } - } - /* Intel 450NX errata # 23. Non ascending cacheline evictions to - write combining memory may resulting in data corruption */ - if (dev->vendor == PCI_VENDOR_ID_INTEL && - dev->device == PCI_DEVICE_ID_INTEL_82451NX) { - printk(KERN_INFO "mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); - pci_dev_put(dev); - return 0; - } - pci_dev_put(dev); - } - return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); -} +struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, + .get = generic_get_mtrr, +}; + +struct mtrr_ops *mtrr_if = &generic_mtrr_ops; +unsigned int num_var_ranges; +unsigned int *usage_table; -/* This function returns the number of variable MTRRs */ static void __init set_num_var_ranges(void) { - unsigned long config = 0, dummy; + dom0_op_t op; - if (use_intel()) { - rdmsr(MTRRcap_MSR, config, dummy); - } else if (is_cpu(AMD)) - config = 2; - else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) - config = 8; - num_var_ranges = config & 0xff; + for (num_var_ranges = 0; ; num_var_ranges++) { + op.cmd = DOM0_READ_MEMTYPE; + op.u.read_memtype.reg = num_var_ranges; + if (HYPERVISOR_dom0_op(&op) != 0) + break; + } } static void __init init_table(void) @@ -124,293 +54,28 @@ static void __init init_table(void) return; } for (i = 0; i < max; i++) - usage_table[i] = 1; -} - -struct set_mtrr_data { - atomic_t count; - atomic_t gate; - unsigned long smp_base; - unsigned long smp_size; - unsigned int smp_reg; - mtrr_type smp_type; -}; - -#ifdef CONFIG_SMP - -static void ipi_handler(void *info) -/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. - [RETURNS] Nothing. 
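
Note how set_num_var_ranges() above sizes the MTRR set: it issues DOM0_READ_MEMTYPE with increasing register numbers until the hypervisor refuses. The same pattern makes a convenient dump loop for a privileged guest, e.g. (a sketch using only the fields visible above):

    dom0_op_t op;
    unsigned int reg;

    for (reg = 0; ; reg++) {
        op.cmd = DOM0_READ_MEMTYPE;
        op.u.read_memtype.reg = reg;
        if (HYPERVISOR_dom0_op(&op) != 0)
            break;          /* first invalid register ends the scan */
        printk(KERN_INFO "MTRR%u: pfn %lx, %lx pfns, type %u\n", reg,
               (unsigned long)op.u.read_memtype.pfn,
               (unsigned long)op.u.read_memtype.nr_pfns,
               (unsigned int)op.u.read_memtype.type);
    }
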
-*/ -{ - struct set_mtrr_data *data = info; - unsigned long flags; - - local_irq_save(flags); - - atomic_dec(&data->count); - while(!atomic_read(&data->gate)) - cpu_relax(); - - /* The master has cleared me to execute */ - if (data->smp_reg != ~0U) - mtrr_if->set(data->smp_reg, data->smp_base, - data->smp_size, data->smp_type); - else - mtrr_if->set_all(); - - atomic_dec(&data->count); - while(atomic_read(&data->gate)) - cpu_relax(); - - atomic_dec(&data->count); - local_irq_restore(flags); + usage_table[i] = 0; } -#endif - -/** - * set_mtrr - update mtrrs on all processors - * @reg: mtrr in question - * @base: mtrr base - * @size: mtrr size - * @type: mtrr type - * - * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: - * - * 1. Send IPI to do the following: - * 2. Disable Interrupts - * 3. Wait for all procs to do so - * 4. Enter no-fill cache mode - * 5. Flush caches - * 6. Clear PGE bit - * 7. Flush all TLBs - * 8. Disable all range registers - * 9. Update the MTRRs - * 10. Enable all range registers - * 11. Flush all TLBs and caches again - * 12. Enter normal cache mode and reenable caching - * 13. Set PGE - * 14. Wait for buddies to catch up - * 15. Enable interrupts. - * - * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait - * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each - * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it - * differently, so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate to - * be reset. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. - * Everyone then enables interrupts and we all continue on. - * - * Note that the mechanism is the same for UP systems, too; all the SMP stuff - * becomes nops. - */ -static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -{ - struct set_mtrr_data data; - unsigned long flags; - - data.smp_reg = reg; - data.smp_base = base; - data.smp_size = size; - data.smp_type = type; - atomic_set(&data.count, num_booting_cpus() - 1); - atomic_set(&data.gate,0); - - /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 1, 0) != 0) - panic("mtrr: timed out waiting for other CPUs\n"); - - local_irq_save(flags); - - while(atomic_read(&data.count)) - cpu_relax(); - - /* ok, reset count and toggle gate */ - atomic_set(&data.count, num_booting_cpus() - 1); - atomic_set(&data.gate,1); - - /* do our MTRR business */ - - /* HACK! - * We use this same function to initialize the mtrrs on boot. - * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. - * If we're doing that @reg is set to something special... 
- */ - if (reg != ~0U) - mtrr_if->set(reg,base,size,type); - - /* wait for the others */ - while(atomic_read(&data.count)) - cpu_relax(); - - atomic_set(&data.count, num_booting_cpus() - 1); - atomic_set(&data.gate,0); - - /* - * Wait here for everyone to have seen the gate change - * So we're the last ones to touch 'data' - */ - while(atomic_read(&data.count)) - cpu_relax(); - - local_irq_restore(flags); -} - -/** - * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (4 KB) - * @size: Physical size of region in pages (4 KB) - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region - * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. - * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. - * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. - * - * The available types are - * - * %MTRR_TYPE_UNCACHABLE - No caching - * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever - * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts - * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes - * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. 
*/ - int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, char increment) { - int i; - mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; int error; + dom0_op_t op; - if (!mtrr_if) - return -ENXIO; - - if ((error = mtrr_if->validate_add_page(base,size,type))) + op.cmd = DOM0_ADD_MEMTYPE; + op.u.add_memtype.pfn = base; + op.u.add_memtype.nr_pfns = size; + op.u.add_memtype.type = type; + if ((error = HYPERVISOR_dom0_op(&op))) return error; - if (type >= MTRR_NUM_TYPES) { - printk(KERN_WARNING "mtrr: type: %u invalid\n", type); - return -EINVAL; - } - - /* If the type is WC, check that this processor supports it */ - if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - printk(KERN_WARNING - "mtrr: your processor doesn't support write-combining\n"); - return -ENOSYS; - } - - if (base & size_or_mask || size & size_or_mask) { - printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); - return -EINVAL; - } + if (increment) + ++usage_table[op.u.add_memtype.reg]; - error = -EINVAL; - - /* Search for existing MTRR */ - down(&main_lock); - for (i = 0; i < num_var_ranges; ++i) { - mtrr_if->get(i, &lbase, &lsize, &ltype); - if (base >= lbase + lsize) - continue; - if ((base < lbase) && (base + size <= lbase)) - continue; - /* At this point we know there is some kind of overlap/enclosure */ - if ((base < lbase) || (base + size > lbase + lsize)) { - printk(KERN_WARNING - "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%x000\n", base, size, lbase, - lsize); - goto out; - } - /* New region is enclosed by an existing region */ - if (ltype != type) { - if (type == MTRR_TYPE_UNCACHABLE) - continue; - printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", - base, size, mtrr_attrib_to_str(ltype), - mtrr_attrib_to_str(type)); - goto out; - } - if (increment) - ++usage_table[i]; - error = i; - goto out; - } - /* Search for an empty MTRR */ - i = mtrr_if->get_free_region(base, size); - if (i >= 0) { - set_mtrr(i, base, size, type); - usage_table[i] = 1; - } else - printk(KERN_INFO "mtrr: no more MTRRs available\n"); - error = i; - out: - up(&main_lock); - return error; + return op.u.add_memtype.reg; } -/** - * mtrr_add - Add a memory type region - * @base: Physical base address of region - * @size: Physical size of region - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region - * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. - * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. - * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors.
* - * The available types are - * - * %MTRR_TYPE_UNCACHABLE - No caching - * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever - * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts - * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes - * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. - */ - int mtrr_add(unsigned long base, unsigned long size, unsigned int type, char increment) @@ -424,21 +89,6 @@ mtrr_add(unsigned long base, unsigned lo increment); } -/** - * mtrr_del_page - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region - * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. - * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. - */ - int mtrr_del_page(int reg, unsigned long base, unsigned long size) { int i, max; @@ -446,12 +96,9 @@ int mtrr_del_page(int reg, unsigned long unsigned long lbase; unsigned int lsize; int error = -EINVAL; - - if (!mtrr_if) - return -ENXIO; + dom0_op_t op; max = num_var_ranges; - down(&main_lock); if (reg < 0) { /* Search for existing MTRR */ for (i = 0; i < max; ++i) { @@ -467,46 +114,20 @@ int mtrr_del_page(int reg, unsigned long goto out; } } - if (reg >= max) { - printk(KERN_WARNING "mtrr: register: %d too big\n", reg); - goto out; - } - if (is_cpu(CYRIX) && !use_intel()) { - if ((reg == 3) && arr3_protected) { - printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n"); - goto out; - } - } - mtrr_if->get(reg, &lbase, &lsize, &ltype); - if (lsize < 1) { - printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); - goto out; - } if (usage_table[reg] < 1) { printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); goto out; } - if (--usage_table[reg] < 1) - set_mtrr(reg, 0, 0, 0); + if (--usage_table[reg] < 1) { + op.cmd = DOM0_DEL_MEMTYPE; + op.u.del_memtype.handle = 0; + op.u.add_memtype.reg = reg; + (void)HYPERVISOR_dom0_op(&op); + } error = reg; out: - up(&main_lock); return error; } -/** - * mtrr_del - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region - * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. - * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. - */ int mtrr_del(int reg, unsigned long base, unsigned long size) @@ -522,157 +143,23 @@ mtrr_del(int reg, unsigned long base, un EXPORT_SYMBOL(mtrr_add); EXPORT_SYMBOL(mtrr_del); -/* HACK ALERT! - * These should be called implicitly, but we can't yet until all the initcall - * stuff is done...
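
Since mtrr_add()/mtrr_del() keep their exported signatures, callers need no change; a typical framebuffer-style user (fb_base/fb_len are hypothetical here) still does:

    int reg = mtrr_add(fb_base, fb_len, MTRR_TYPE_WRCOMB, 1);
    if (reg < 0)
        printk(KERN_WARNING "fb: could not enable write-combining\n");
    /* ... later, on teardown ... */
    if (reg >= 0)
        mtrr_del(reg, fb_base, fb_len);

The difference is only that the request is now validated and applied by Xen via dom0_op instead of by the IPI rendezvous code deleted above.
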
- */ -extern void amd_init_mtrr(void); -extern void cyrix_init_mtrr(void); -extern void centaur_init_mtrr(void); - -static void __init init_ifs(void) -{ - amd_init_mtrr(); - cyrix_init_mtrr(); - centaur_init_mtrr(); -} - -static void __init init_other_cpus(void) +static int __init mtrr_init(void) { - if (use_intel()) - get_mtrr_state(); - - /* bring up the other processors */ - set_mtrr(~0U,0,0,0); - - if (use_intel()) { - finalize_mtrr_state(); - mtrr_state_warn(); - } -} + struct cpuinfo_x86 *c = &boot_cpu_data; + if (!(xen_start_info.flags & SIF_PRIVILEGED)) + return -ENODEV; -struct mtrr_value { - mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; -}; + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; -static struct mtrr_value * mtrr_state; + set_num_var_ranges(); + init_table(); -static int mtrr_save(struct sys_device * sysdev, u32 state) -{ - int i; - int size = num_var_ranges * sizeof(struct mtrr_value); - - mtrr_state = kmalloc(size,GFP_ATOMIC); - if (mtrr_state) - memset(mtrr_state,0,size); - else - return -ENOMEM; - - for (i = 0; i < num_var_ranges; i++) { - mtrr_if->get(i, - &mtrr_state[i].lbase, - &mtrr_state[i].lsize, - &mtrr_state[i].ltype); - } return 0; } -static int mtrr_restore(struct sys_device * sysdev) -{ - int i; - - for (i = 0; i < num_var_ranges; i++) { - if (mtrr_state[i].lsize) - set_mtrr(i, - mtrr_state[i].lbase, - mtrr_state[i].lsize, - mtrr_state[i].ltype); - } - kfree(mtrr_state); - return 0; -} - - - -static struct sysdev_driver mtrr_sysdev_driver = { - .suspend = mtrr_save, - .resume = mtrr_restore, -}; - - -/** - * mtrr_init - initialize mtrrs on the boot CPU - * - * This needs to be called early; before any of the other CPUs are - * initialized (i.e. before smp_init()). - * - */ -static int __init mtrr_init(void) -{ - init_ifs(); - - if (cpu_has_mtrr) { - mtrr_if = &generic_mtrr_ops; - size_or_mask = 0xff000000; /* 36 bits */ - size_and_mask = 0x00f00000; - - /* This is an AMD specific MSR, but we assume(hope?) that - Intel will implement it to when they extend the address - bus of the Xeon. 
*/ - if (cpuid_eax(0x80000000) >= 0x80000008) { - u32 phys_addr; - phys_addr = cpuid_eax(0x80000008) & 0xff; - size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); - size_and_mask = ~size_or_mask & 0xfff00000; - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && - boot_cpu_data.x86 == 6) { - /* VIA C* family have Intel style MTRRs, but - don't support PAE */ - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } - } else { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (cpu_has_k6_mtrr) { - /* Pre-Athlon (K6) AMD CPU MTRRs */ - mtrr_if = mtrr_ops[X86_VENDOR_AMD]; - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } - break; - case X86_VENDOR_CENTAUR: - if (cpu_has_centaur_mcr) { - mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } - break; - case X86_VENDOR_CYRIX: - if (cpu_has_cyrix_arr) { - mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } - break; - default: - break; - } - } - printk(KERN_INFO "mtrr: v%s\n",MTRR_VERSION); - - if (mtrr_if) { - set_num_var_ranges(); - init_table(); - init_other_cpus(); - - return sysdev_driver_register(&cpu_sysdev_class, - &mtrr_sysdev_driver); - } - return -ENXIO; -} - subsys_initcall(mtrr_init); diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/Makefile linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/Makefile --- pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/Makefile 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/Makefile 2005-07-28 13:17:07.000000000 -0700 @@ -1,5 +1,16 @@ -obj-y := main.o if.o generic.o state.o -obj-y += amd.o -obj-y += cyrix.o -obj-y += centaur.o +obj-y := main.o +c-obj-y := if.o +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): $(obj)/mtrr.h + @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/$(notdir $@) $@ + +$(patsubst %.o,$(obj)/%.c,$(obj-y)): $(obj)/mtrr.h + +$(obj)/mtrr.h: + @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/mtrr.h $@ + +obj-y += $(c-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/entry.S linux-2.6-xen-sparse/arch/i386/kernel/entry.S --- pristine-linux-2.6.12/arch/i386/kernel/entry.S 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/entry.S 2005-07-28 13:17:07.000000000 -0700 @@ -47,8 +47,8 @@ #include <asm/segment.h> #include <asm/smp.h> #include <asm/page.h> -#include <asm/desc.h> #include "irq_vectors.h" +#include <asm-xen/xen-public/xen.h> #define nr_syscalls ((syscall_table_size)/4) @@ -64,6 +64,7 @@ ES = 0x20 ORIG_EAX = 0x24 EIP = 0x28 CS = 0x2C +EVENT_MASK = 0x2E EFLAGS = 0x30 OLDESP = 0x34 OLDSS = 0x38 @@ -75,11 +76,43 @@ DF_MASK = 0x00000400 NT_MASK = 0x00004000 VM_MASK = 0x00020000 +/* Offsets into shared_info_t. 
*/ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define sizeof_vcpu_shift 3 + +#ifdef CONFIG_SMP +#define preempt_disable(reg) incl TI_preempt_count(reg) +#define preempt_enable(reg) decl TI_preempt_count(reg) +#define XEN_GET_VCPU_INFO(reg) preempt_disable(%ebp) ; \ + movl TI_cpu(%ebp),reg ; \ + shl $sizeof_vcpu_shift,reg ; \ + addl HYPERVISOR_shared_info,reg +#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%ebp) +#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff +#else +#define XEN_GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg +#define XEN_PUT_VCPU_INFO(reg) +#define XEN_PUT_VCPU_INFO_fixup +#endif + +#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) +#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) +#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ + XEN_LOCKED_BLOCK_EVENTS(reg) ; \ + XEN_PUT_VCPU_INFO(reg) +#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ + XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \ + XEN_PUT_VCPU_INFO(reg) +#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) + #ifdef CONFIG_PREEMPT -#define preempt_stop cli +#define preempt_stop GET_THREAD_INFO(%ebp) ; \ + XEN_BLOCK_EVENTS(%esi) #else #define preempt_stop -#define resume_kernel restore_nocheck +#define resume_kernel restore_all #endif #define SAVE_ALL \ @@ -123,6 +156,23 @@ VM_MASK = 0x00020000 .previous +#define RESTORE_ALL \ + RESTORE_REGS \ + addl $4, %esp; \ +1: iret; \ +.section .fixup,"ax"; \ +2: movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; \ + movl $11,%eax; \ + call do_exit; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,2b; \ +.previous + + ENTRY(ret_from_fork) pushl %eax call schedule_tail @@ -145,10 +195,10 @@ ret_from_intr: GET_THREAD_INFO(%ebp) movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al - testl $(VM_MASK | 3), %eax - jz resume_kernel + testl $(VM_MASK | 2), %eax + jz resume_kernel # returning to kernel or vm86-space ENTRY(resume_userspace) - cli # make sure we don't miss an interrupt + XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -159,15 +209,15 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cli + XEN_BLOCK_EVENTS(%esi) cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_nocheck + jnz restore_all need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl jz restore_all - testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + testb $0xFF,EVENT_MASK(%esp) # interrupts off (exception path) ? 
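
In C terms, the XEN_*_EVENTS macros above replace cli/sti with plain byte stores to the vCPU's upcall mask in the shared-info page: no privileged instruction and no hypercall in the common case. Roughly (field names as I remember them from the xen-public headers of this vintage, so treat this strictly as a sketch):

    /* what XEN_BLOCK_EVENTS / XEN_UNBLOCK_EVENTS boil down to */
    static inline void block_events(void)
    {
        HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]
                .evtchn_upcall_mask = 1;
        barrier();
    }

    static inline void unblock_events(void)
    {
        vcpu_info_t *vi =
                &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()];

        vi->evtchn_upcall_mask = 0;
        barrier();
        /* the assembly path instead falls through to
           restore_all_enable_events when a delivery is pending */
    }

sizeof_vcpu_shift above is just this per-vCPU indexing (8-byte entries) done by hand.
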
+ jnz restore_all call preempt_schedule_irq jmp need_resched #endif @@ -202,8 +252,7 @@ sysenter_past_esp: SAVE_ALL GET_THREAD_INFO(%ebp) - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -227,8 +276,7 @@ ENTRY(system_call) SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation - /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp) + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -236,63 +284,31 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) # store the return value syscall_exit: - cli # make sure we don't miss an interrupt + XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work - restore_all: - movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS - # Warning: OLDSS(%esp) contains the wrong/random values if we - # are returning to the kernel. - # See comments in process.c:copy_thread() for details. - movb OLDSS(%esp), %ah - movb CS(%esp), %al - andl $(VM_MASK | (4 << 8) | 3), %eax - cmpl $((4 << 8) | 3), %eax - je ldt_ss # returning to user-space with LDT SS -restore_nocheck: - RESTORE_REGS - addl $4, %esp -1: iret -.section .fixup,"ax" -iret_exc: - sti - pushl $0 # no error code - pushl $do_iret_error - jmp error_code -.previous -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous + testl $VM_MASK, EFLAGS(%esp) + jnz resume_vm86 + movb EVENT_MASK(%esp), %al + notb %al # %al == ~saved_mask + XEN_GET_VCPU_INFO(%esi) + andb evtchn_upcall_mask(%esi),%al + andb $1,%al # %al == mask & ~saved_mask + jnz restore_all_enable_events # != 0 => reenable event delivery + XEN_PUT_VCPU_INFO(%esi) + RESTORE_ALL -ldt_ss: - larl OLDSS(%esp), %eax - jnz restore_nocheck - testl $0x00400000, %eax # returning to 32bit stack? - jnz restore_nocheck # allright, normal return - /* If returning to userspace with 16bit stack, - * try to fix the higher word of ESP, as the CPU - * won't restore it. - * This is an "official" bug of all the x86-compatible - * CPUs, which we can try to work around to make - * dosemu and wine happy. */ - subl $8, %esp # reserve space for switch16 pointer - cli - movl %esp, %eax - /* Set up the 16bit stack frame with switch32 pointer on top, - * and a switch16 pointer on top of the current frame. 
*/ - call setup_x86_bogus_stack +resume_vm86: + XEN_UNBLOCK_EVENTS(%esi) RESTORE_REGS - lss 20+4(%esp), %esp # switch to 16bit stack -1: iret -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous + movl %eax,(%esp) + movl $__HYPERVISOR_switch_vm86,%eax + int $0x82 + ud2 # perform work that needs to be done immediately before resumption ALIGN @@ -301,7 +317,7 @@ work_pending: jz work_notifysig work_resched: call schedule - cli # make sure we don't miss an interrupt + XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -348,7 +364,7 @@ syscall_trace_entry: syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending - sti # could let do_syscall_trace() call + XEN_UNBLOCK_EVENTS(%esi) # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx @@ -368,27 +384,7 @@ syscall_badsys: movl $-ENOSYS,EAX(%esp) jmp resume_userspace -#define FIXUP_ESPFIX_STACK \ - movl %esp, %eax; \ - /* switch to 32bit stack using the pointer on top of 16bit stack */ \ - lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ - /* copy data from 16bit stack to 32bit stack */ \ - call fixup_x86_bogus_stack; \ - /* put ESP to the proper location */ \ - movl %eax, %esp; -#define UNWIND_ESPFIX_STACK \ - pushl %eax; \ - movl %ss, %eax; \ - /* see if on 16bit stack */ \ - cmpw $__ESPFIX_SS, %ax; \ - jne 28f; \ - movl $__KERNEL_DS, %edx; \ - movl %edx, %ds; \ - movl %edx, %es; \ - /* switch to 32bit stack */ \ - FIXUP_ESPFIX_STACK \ -28: popl %eax; - +#if 0 /* XEN */ /* * Build the entry stubs and pointer table with * some assembler magic. @@ -426,6 +422,7 @@ ENTRY(name) \ /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" +#endif /* XEN */ ENTRY(divide_error) pushl $0 # no error code @@ -443,9 +440,7 @@ error_code: pushl %ecx pushl %ebx cld - pushl %es - UNWIND_ESPFIX_STACK - popl %ecx + movl %es, %ecx movl ES(%esp), %edi # get the function address movl ORIG_EAX(%esp), %edx # get the error code movl %eax, ORIG_EAX(%esp) @@ -457,6 +452,118 @@ error_code: call *%edi jmp ret_from_exception +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +ENTRY(hypervisor_callback) + pushl %eax + SAVE_ALL + movl EIP(%esp),%eax + cmpl $scrit,%eax + jb 11f + cmpl $ecrit,%eax + jb critical_region_fixup +11: push %esp + call evtchn_do_upcall + add $4,%esp + jmp ret_from_intr + + ALIGN +restore_all_enable_events: + XEN_LOCKED_UNBLOCK_EVENTS(%esi) +scrit: /**** START OF CRITICAL REGION ****/ + XEN_TEST_PENDING(%esi) + jnz 14f # process more events if necessary... + XEN_PUT_VCPU_INFO(%esi) + RESTORE_ALL +14: XEN_LOCKED_BLOCK_EVENTS(%esi) + XEN_PUT_VCPU_INFO(%esi) + jmp 11b +ecrit: /**** END OF CRITICAL REGION ****/ +# [How we do the fixup]. 
We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +critical_region_fixup: + addl $critical_fixup_table-scrit,%eax + movzbl (%eax),%eax # %eax contains num bytes popped + cmpb $0xff,%al # 0xff => vcpu_info critical region + jne 15f + GET_THREAD_INFO(%ebp) + XEN_PUT_VCPU_INFO(%esi) # abort vcpu_info critical region + xorl %eax,%eax +15: mov %esp,%esi + add %eax,%esi # %esi points at end of src region + mov %esp,%edi + add $0x34,%edi # %edi points at end of dst region + mov %eax,%ecx + shr $2,%ecx # convert words to bytes + je 17f # skip loop if nothing to copy +16: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 16b +17: movl %edi,%esp # final %edi is top of merged stack + jmp 11b + +critical_fixup_table: + .byte 0xff,0xff,0xff # testb $0xff,(%esi) = XEN_TEST_PENDING + .byte 0xff,0xff # jnz 14f + XEN_PUT_VCPU_INFO_fixup + .byte 0x00 # pop %ebx + .byte 0x04 # pop %ecx + .byte 0x08 # pop %edx + .byte 0x0c # pop %esi + .byte 0x10 # pop %edi + .byte 0x14 # pop %ebp + .byte 0x18 # pop %eax + .byte 0x1c # pop %ds + .byte 0x20 # pop %es + .byte 0x24,0x24,0x24 # add $4,%esp + .byte 0x28 # iret + .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi) + XEN_PUT_VCPU_INFO_fixup + .byte 0x00,0x00 # jmp 11b + +# Hypervisor uses this for application faults while it executes. +ENTRY(failsafe_callback) +1: popl %ds +2: popl %es +3: popl %fs +4: popl %gs + subl $4,%esp + SAVE_ALL + jmp ret_from_exception +.section .fixup,"ax"; \ +6: movl $0,(%esp); \ + jmp 1b; \ +7: movl $0,(%esp); \ + jmp 2b; \ +8: movl $0,(%esp); \ + jmp 3b; \ +9: movl $0,(%esp); \ + jmp 4b; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,6b; \ + .long 2b,7b; \ + .long 3b,8b; \ + .long 4b,9b; \ +.previous + ENTRY(coprocessor_error) pushl $0 pushl $do_coprocessor_error @@ -470,17 +577,9 @@ ENTRY(simd_coprocessor_error) ENTRY(device_not_available) pushl $-1 # mark this as an int SAVE_ALL - movl %cr0, %eax - testl $0x4, %eax # EM (math emulation bit) - jne device_not_available_emulate preempt_stop call math_state_restore jmp ret_from_exception -device_not_available_emulate: - pushl $0 # temporary storage for ORIG_EIP - call math_emulate - addl $4, %esp - jmp ret_from_exception /* * Debug traps and NMI can happen at the one SYSENTER instruction @@ -516,6 +615,7 @@ debug_stack_correct: call do_debug jmp ret_from_exception +#if 0 /* XEN */ /* * NMI is doubly nasty. It can happen _while_ we're handling * a debug fault, and the debug fault hasn't yet been able to @@ -525,11 +625,6 @@ debug_stack_correct: * fault happened on the sysenter path. 
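
For anyone who finds the pre-decrement copy at labels 15:-17: above dense, here is the same merge in C (a sketch; 'popped' is the byte count fetched from critical_fixup_table, and 0x34 is the size of a full SAVE_ALL frame up to and including the iret words):

    /* 'esp' and 'popped' are in bytes */
    unsigned long *src = (unsigned long *)(esp + popped);  /* end of src region */
    unsigned long *dst = (unsigned long *)(esp + 0x34);    /* end of dst region */
    unsigned long n = popped / 4;                          /* shr $2,%ecx */

    while (n--)
        *--dst = *--src;
    esp = (unsigned long)dst;       /* movl %edi,%esp */

Whatever the interrupted activation had already popped gets re-stacked, the two frames collapse into one, and the handler restarts at 11b on a single coherent frame.
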
*/ ENTRY(nmi) - pushl %eax - movl %ss, %eax - cmpw $__ESPFIX_SS, %ax - popl %eax - je nmi_16bit_stack cmpl $sysenter_entry,(%esp) je nmi_stack_fixup pushl %eax @@ -549,7 +644,7 @@ nmi_stack_correct: xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_all + RESTORE_ALL nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) @@ -564,29 +659,7 @@ nmi_debug_stack_check: nmi_debug_stack_fixup: FIX_STACK(24,nmi_stack_correct, 1) jmp nmi_stack_correct - -nmi_16bit_stack: - /* create the pointer to lss back */ - pushl %ss - pushl %esp - movzwl %sp, %esp - addw $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - .endr - pushl %eax - SAVE_ALL - FIXUP_ESPFIX_STACK # %eax == %esp - xorl %edx,%edx # zero error code - call do_nmi - RESTORE_REGS - lss 12+4(%esp), %esp # back to 16bit stack -1: iret -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous +#endif /* XEN */ ENTRY(int3) pushl $-1 # mark this as an int @@ -636,9 +709,33 @@ ENTRY(alignment_check) pushl $do_alignment_check jmp error_code +# This handler is special, because it gets an extra value on its stack, +# which is the linear faulting address. +# fastcall register usage: %eax = pt_regs, %edx = error code, +# %ecx = fault address ENTRY(page_fault) - pushl $do_page_fault - jmp error_code + pushl %ds + pushl %eax + xorl %eax, %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax /* eax = -1 */ + pushl %ecx + pushl %ebx + cld + movl %es,%edi + movl ES(%esp), %ecx /* get the faulting address */ + movl ORIG_EAX(%esp), %edx /* get the error code */ + movl %eax, ORIG_EAX(%esp) + movl %edi, ES(%esp) + movl $(__KERNEL_DS),%eax + movl %eax, %ds + movl %eax, %es + movl %esp,%eax /* pt_regs pointer */ + call do_page_fault + jmp ret_from_exception #ifdef CONFIG_X86_MCE ENTRY(machine_check) @@ -647,9 +744,8 @@ ENTRY(machine_check) jmp error_code #endif -ENTRY(spurious_interrupt_bug) - pushl $0 - pushl $do_spurious_interrupt_bug +ENTRY(fixup_4gb_segment) + pushl $do_fixup_4gb_segment jmp error_code #include "syscall_table.S" diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/head.S linux-2.6-xen-sparse/arch/i386/kernel/head.S --- pristine-linux-2.6.12/arch/i386/kernel/head.S 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/head.S 2005-07-28 13:17:07.000000000 -0700 @@ -1,24 +1,25 @@ -/* - * linux/arch/i386/kernel/head.S -- the 32-bit startup code. - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Enhanced CPU detection and feature setting code by Mike Jagdis - * and Martin Mares, November 1997. - */ -.text #include <linux/config.h> + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=3.0" + .ascii ",VIRT_BASE=0xC0000000" +#ifdef CONFIG_X86_PAE + .ascii ",PAE=yes" +#else + .ascii ",PAE=no" +#endif + .ascii ",LOADER=generic" + .byte 0 + +.text #include <linux/threads.h> #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/desc.h> -#include <asm/cache.h> #include <asm/thread_info.h> #include <asm/asm_offsets.h> -#include <asm/setup.h> +#include <asm-xen/xen-public/arch-x86_32.h> /* * References to members of the new_cpu_data structure. @@ -33,239 +34,24 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id -/* - * This is how much memory *in addition to the memory covered up to - * and including _end* we need mapped initially. 
We need one bit for - * each possible page, but only in low memory, which means - * 2^32/4096/8 = 128K worst case (4G/4G split.) - * - * Modulo rounding, each megabyte assigned here requires a kilobyte of - * memory, which is currently unreclaimed. - * - * This should be a multiple of a page. - */ -#define INIT_MAP_BEYOND_END (128*1024) - - -/* - * 32-bit kernel entrypoint; only used by the boot CPU. On entry, - * %esi points to the real-mode code as a 32-bit pointer. - * CS and DS must be 4 GB flat segments, but we don't depend on - * any particular GDT layout, because we load our own as soon as we - * can. - */ ENTRY(startup_32) - -/* - * Set segments to known values. - */ cld - lgdt boot_gdt_descr - __PAGE_OFFSET - movl $(__BOOT_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs -/* - * Clear BSS first so that there are no surprises... - * No need to cld as DF is already clear from cld above... - */ - xorl %eax,%eax - movl $__bss_start - __PAGE_OFFSET,%edi - movl $__bss_stop - __PAGE_OFFSET,%ecx - subl %edi,%ecx - shrl $2,%ecx - rep ; stosl - -/* - * Initialize page tables. This creates a PDE and a set of page - * tables, which are located immediately beyond _end. The variable - * init_pg_tables_end is set up to point to the first "safe" location. - * Mappings are created both at virtual address 0 (identity mapping) - * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. - * - * Warning: don't use %esi or the stack in this code. However, %esp - * can be used as a GPR if you really need it... - */ -page_pde_offset = (__PAGE_OFFSET >> 20); - - movl $(pg0 - __PAGE_OFFSET), %edi - movl $(swapper_pg_dir - __PAGE_OFFSET), %edx - movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ -10: - leal 0x007(%edi),%ecx /* Create PDE entry */ - movl %ecx,(%edx) /* Store identity PDE entry */ - movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ - addl $4,%edx - movl $1024, %ecx -11: - stosl - addl $0x1000,%eax - loop 11b - /* End condition: we must map up to and including INIT_MAP_BEYOND_END */ - /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */ - leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp - cmpl %ebp,%eax - jb 10b - movl %edi,(init_pg_tables_end - __PAGE_OFFSET) + /* Copy the necessary stuff from xen_start_info structure. */ + mov $xen_start_info_union,%edi + mov $512,%ecx + rep movsl #ifdef CONFIG_SMP - xorl %ebx,%ebx /* This is the boot CPU (BSP) */ - jmp 3f - -/* - * Non-boot CPU entry point; entered from trampoline.S - * We can't lgdt here, because lgdt itself uses a data segment, but - * we know the trampoline has already loaded the boot_gdt_table GDT - * for us. - */ ENTRY(startup_32_smp) cld - movl $(__BOOT_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs - -/* - * New page tables may be in 4Mbyte page mode and may - * be using the global pages. - * - * NOTE! If we are on a 486 we may have no cr4 at all! - * So we do not try to touch it unless we really have - * some bits in it to set. This won't work if the BSP - * implements cr4 but this AP does not -- very unlikely - * but be warned! The same applies to the pse feature - * if not equally supported. --macro - * - * NOTE! We have to correct for the fact that we're - * not yet offset PAGE_OFFSET.. - */ -#define cr4_bits mmu_cr4_features-__PAGE_OFFSET - movl cr4_bits,%edx - andl %edx,%edx - jz 6f - movl %cr4,%eax # Turn on paging options (PSE,PAE,..) 
- orl %edx,%eax - movl %eax,%cr4 - - btl $5, %eax # check if PAE is enabled - jnc 6f - - /* Check if extended functions are implemented */ - movl $0x80000000, %eax - cpuid - cmpl $0x80000000, %eax - jbe 6f - mov $0x80000001, %eax - cpuid - /* Execute Disable bit supported? */ - btl $20, %edx - jnc 6f - - /* Setup EFER (Extended Feature Enable Register) */ - movl $0xc0000080, %ecx - rdmsr - - btsl $11, %eax - /* Make changes effective */ - wrmsr - -6: - /* This is a secondary processor (AP) */ - xorl %ebx,%ebx - incl %ebx - -3: #endif /* CONFIG_SMP */ -/* - * Enable paging - */ - movl $swapper_pg_dir-__PAGE_OFFSET,%eax - movl %eax,%cr3 /* set the page table pointer.. */ - movl %cr0,%eax - orl $0x80000000,%eax - movl %eax,%cr0 /* ..and set paging (PG) bit */ - ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ -1: /* Set up the stack pointer */ lss stack_start,%esp -/* - * Initialize eflags. Some BIOS's leave bits like NT set. This would - * confuse the debugger if this code is traced. - * XXX - best to initialize before switching to protected mode. - */ - pushl $0 - popfl - -#ifdef CONFIG_SMP - andl %ebx,%ebx - jz 1f /* Initial CPU cleans BSS */ - jmp checkCPUtype -1: -#endif /* CONFIG_SMP */ - -/* - * start system 32-bit setup. We need to re-do some of the things done - * in 16-bit mode for the "real" operations. - */ - call setup_idt - -/* - * Copy bootup parameters out of the way. - * Note: %esi still has the pointer to the real-mode data. - */ - movl $boot_params,%edi - movl $(PARAM_SIZE/4),%ecx - cld - rep - movsl - movl boot_params+NEW_CL_POINTER,%esi - andl %esi,%esi - jnz 2f # New command line protocol - cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR - jne 1f - movzwl OLD_CL_OFFSET,%esi - addl $(OLD_CL_BASE_ADDR),%esi -2: - movl $saved_command_line,%edi - movl $(COMMAND_LINE_SIZE/4),%ecx - rep - movsl -1: checkCPUtype: - movl $-1,X86_CPUID # -1 for no CPUID initially - -/* check if it is 486 or 386. */ -/* - * XXX - this does a lot of unnecessary setup. Alignment checks don't - * apply at our cpl of 0 and the stack ought to be aligned already, and - * we don't need to preserve eflags. - */ - - movb $3,X86 # at least 386 - pushfl # push EFLAGS - popl %eax # get EFLAGS - movl %eax,%ecx # save original EFLAGS - xorl $0x240000,%eax # flip AC and ID bits in EFLAGS - pushl %eax # copy to EFLAGS - popfl # set EFLAGS - pushfl # get new EFLAGS - popl %eax # put it in eax - xorl %ecx,%eax # change in flags - pushl %ecx # restore original EFLAGS - popfl - testl $0x40000,%eax # check if AC bit changed - je is386 - - movb $4,X86 # at least 486 - testl $0x200000,%eax # check if ID bit changed - je is486 - /* get vendor info */ xorl %eax,%eax # call CPUID with 0 -> return vendor ID cpuid @@ -274,9 +60,6 @@ checkCPUtype: movl %edx,X86_VENDOR_ID+4 # next 4 chars movl %ecx,X86_VENDOR_ID+8 # last 4 chars - orl %eax,%eax # do we have processor info as well? - je is486 - movl $1,%eax # Use the CPUID instruction to get CPU type cpuid movb %al,%cl # save reg for future use @@ -289,32 +72,13 @@ checkCPUtype: movb %cl,X86_MASK movl %edx,X86_CAPABILITY -is486: movl $0x50022,%ecx # set AM, WP, NE and MP - jmp 2f - -is386: movl $2,%ecx # set MP -2: movl %cr0,%eax - andl $0x80000011,%eax # Save PG,PE,ET - orl %ecx,%eax - movl %eax,%cr0 - - call check_x87 incb ready - lgdt cpu_gdt_descr - lidt idt_descr - ljmp $(__KERNEL_CS),$1f -1: movl $(__KERNEL_DS),%eax # reload all the segment registers - movl %eax,%ss # after changing gdt. 
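
The rewritten startup_32 above is this short because the domain builder has already done everything the deleted code used to do: a guest is entered with paging enabled and a usable GDT, so there is no cr0/cr3/cr4 dance, no EFER poking and no identity mapping to build. The one chore left, the "mov $512,%ecx ; rep movsl", amounts to (a sketch; %esi is presumed to point at the start-of-day info per the loader protocol):

    /* stash Xen's start-of-day info: 512 longs = 2048 bytes */
    memcpy(&xen_start_info_union, start_info_ptr, 512 * sizeof(long));
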
- - movl $(__USER_DS),%eax # DS/ES contains default USER segment - movl %eax,%ds - movl %eax,%es xorl %eax,%eax # Clear FS/GS and LDT movl %eax,%fs movl %eax,%gs - lldt %ax cld # gcc2 wants the direction flag cleared at all times + #ifdef CONFIG_SMP movb ready, %cl cmpb $1,%cl @@ -329,100 +93,18 @@ L6: jmp L6 # main should never return here, but # just in case, we know what happens. -/* - * We depend on ET to be correct. This checks for 287/387. - */ -check_x87: - movb $0,X86_HARD_MATH - clts - fninit - fstsw %ax - cmpb $0,%al - je 1f - movl %cr0,%eax /* no coprocessor: have to set bits */ - xorl $4,%eax /* set EM */ - movl %eax,%cr0 - ret - ALIGN -1: movb $1,X86_HARD_MATH - .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ - ret +ENTRY(lgdt_finish) + movl $(__KERNEL_DS),%eax # reload all the segment registers + movw %ax,%ss # after changing gdt. -/* - * setup_idt - * - * sets up a idt with 256 entries pointing to - * ignore_int, interrupt gates. It doesn't actually load - * idt - that can be done only after paging has been enabled - * and the kernel moved to PAGE_OFFSET. Interrupts - * are enabled elsewhere, when we can be relatively - * sure everything is ok. - * - * Warning: %esi is live across this function. - */ -setup_idt: - lea ignore_int,%edx - movl $(__KERNEL_CS << 16),%eax - movw %dx,%ax /* selector = 0x0010 = cs */ - movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ - - lea idt_table,%edi - mov $256,%ecx -rp_sidt: - movl %eax,(%edi) - movl %edx,4(%edi) - addl $8,%edi - dec %ecx - jne rp_sidt - ret + movl $(__USER_DS),%eax # DS/ES contains default USER segment + movw %ax,%ds + movw %ax,%es -/* This is the default interrupt "handler" :-) */ - ALIGN -ignore_int: - cld -#ifdef CONFIG_PRINTK + popl %eax # reload CS by intersegment return + pushl $(__KERNEL_CS) pushl %eax - pushl %ecx - pushl %edx - pushl %es - pushl %ds - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - pushl 16(%esp) - pushl 24(%esp) - pushl 32(%esp) - pushl 40(%esp) - pushl $int_msg - call printk - addl $(5*4),%esp - popl %ds - popl %es - popl %edx - popl %ecx - popl %eax -#endif - iret - -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) - -/* - * BSS section - */ -.section ".bss.page_aligned","w" -ENTRY(swapper_pg_dir) - .fill 1024,4,0 -ENTRY(empty_zero_page) - .fill 4096,1,0 - -/* - * This starts the data section. - */ -.data + lret ENTRY(stack_start) .long init_thread_union+THREAD_SIZE @@ -430,27 +112,10 @@ ENTRY(stack_start) ready: .byte 0 -int_msg: - .asciz "Unknown interrupt or fault at EIP %p %p %p\n" - -/* - * The IDT and GDT 'descriptors' are a strange 48-bit object - * only used by the lidt and lgdt instructions. They are not - * like usual segment descriptors - they consist of a 16-bit - * segment size, and 32-bit linear address value: - */ - -.globl boot_gdt_descr .globl idt_descr .globl cpu_gdt_descr ALIGN -# early boot GDT descriptor (must use 1:1 address mapping) - .word 0 # 32 bit align gdt_desc.address -boot_gdt_descr: - .word __BOOT_DS+7 - .long boot_gdt_table - __PAGE_OFFSET - .word 0 # 32-bit align idt_desc.address idt_descr: .word IDT_ENTRIES*8-1 # idt contains 256 entries @@ -459,25 +124,18 @@ idt_descr: # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address cpu_gdt_descr: - .word GDT_ENTRIES*8-1 + .word GDT_SIZE .long cpu_gdt_table .fill NR_CPUS-1,8,0 # space for the other GDT descriptors -/* - * The boot_gdt_table must mirror the equivalent in setup.S and is - * used only for booting. 
- */ - .align L1_CACHE_BYTES -ENTRY(boot_gdt_table) - .fill GDT_ENTRY_BOOT_CS,8,0 - .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ - .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ +.org 0x1000 +ENTRY(empty_zero_page) -/* - * The Global Descriptor Table contains 28 quadwords, per-CPU. - */ - .align PAGE_SIZE_asm +.org 0x2000 +ENTRY(swapper_pg_dir) + +.org 0x3000 ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ @@ -492,32 +150,49 @@ ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* 0x53 reserved */ .quad 0x0000000000000000 /* 0x5b reserved */ - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ +#ifdef CONFIG_X86_PAE + .quad 0x00cfbb00000067ff /* 0x60 kernel 4GB code at 0x00000000 */ + .quad 0x00cfb300000067ff /* 0x68 kernel 4GB data at 0x00000000 */ + .quad 0x00cffb00000067ff /* 0x73 user 4GB code at 0x00000000 */ + .quad 0x00cff300000067ff /* 0x7b user 4GB data at 0x00000000 */ +#else + .quad 0x00cfbb000000c3ff /* 0x60 kernel 4GB code at 0x00000000 */ + .quad 0x00cfb3000000c3ff /* 0x68 kernel 4GB data at 0x00000000 */ + .quad 0x00cffb000000c3ff /* 0x73 user 4GB code at 0x00000000 */ + .quad 0x00cff3000000c3ff /* 0x7b user 4GB data at 0x00000000 */ +#endif .quad 0x0000000000000000 /* 0x80 TSS descriptor */ .quad 0x0000000000000000 /* 0x88 LDT descriptor */ /* Segments used for calling PnP BIOS */ - .quad 0x00c09a0000000000 /* 0x90 32-bit code */ - .quad 0x00809a0000000000 /* 0x98 16-bit code */ - .quad 0x0080920000000000 /* 0xa0 16-bit data */ - .quad 0x0080920000000000 /* 0xa8 16-bit data */ - .quad 0x0080920000000000 /* 0xb0 16-bit data */ + .quad 0x0000000000000000 /* 0x90 32-bit code */ + .quad 0x0000000000000000 /* 0x98 16-bit code */ + .quad 0x0000000000000000 /* 0xa0 16-bit data */ + .quad 0x0000000000000000 /* 0xa8 16-bit data */ + .quad 0x0000000000000000 /* 0xb0 16-bit data */ /* * The APM segments have byte granularity and their bases * and limits are set at run time. 
*/ - .quad 0x00409a0000000000 /* 0xb8 APM CS code */ - .quad 0x00009a0000000000 /* 0xc0 APM CS 16 code (16 bit) */ - .quad 0x0040920000000000 /* 0xc8 APM DS data */ + .quad 0x0000000000000000 /* 0xb8 APM CS code */ + .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ + .quad 0x0000000000000000 /* 0xc8 APM DS data */ - .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ + .quad 0x0000000000000000 /* 0xd0 - unused */ .quad 0x0000000000000000 /* 0xd8 - unused */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ .quad 0x0000000000000000 /* 0xf0 - unused */ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ + .fill GDT_ENTRIES-32,8,0 +.org 0x4000 +ENTRY(default_ldt) + +.org 0x5000 +/* + * Real beginning of normal "text" segment + */ +ENTRY(stext) +ENTRY(_stext) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/i386_ksyms.c linux-2.6-xen-sparse/arch/i386/kernel/i386_ksyms.c --- pristine-linux-2.6.12/arch/i386/kernel/i386_ksyms.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/i386_ksyms.c 2005-07-28 13:17:07.000000000 -0700 @@ -76,7 +76,9 @@ EXPORT_SYMBOL(ioremap_nocache); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(pm_idle); +#ifdef CONFIG_ACPI_BOOT EXPORT_SYMBOL(pm_power_off); +#endif EXPORT_SYMBOL(get_cmos_time); EXPORT_SYMBOL(cpu_khz); EXPORT_SYMBOL(apm_info); diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/io_apic.c linux-2.6-xen-sparse/arch/i386/kernel/io_apic.c --- pristine-linux-2.6.12/arch/i386/kernel/io_apic.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/io_apic.c 2005-07-28 13:17:07.000000000 -0700 @@ -42,6 +42,48 @@ #include "io_ports.h" +#ifdef CONFIG_XEN + +#include <asm-xen/xen-public/xen.h> +#include <asm-xen/xen-public/physdev.h> + +/* Fake i8259 */ +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) +#define disable_8259A_irq(_irq) ((void)0) +#define i8259A_irq_pending(_irq) (0) + +unsigned long io_apic_irqs; + +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) +{ + physdev_op_t op; + int ret; + + op.cmd = PHYSDEVOP_APIC_READ; + op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid; + op.u.apic_op.offset = reg; + ret = HYPERVISOR_physdev_op(&op); + if (ret) + return ret; + return op.u.apic_op.value; +} + +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + physdev_op_t op; + + op.cmd = PHYSDEVOP_APIC_WRITE; + op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid; + op.u.apic_op.offset = reg; + op.u.apic_op.value = value; + HYPERVISOR_physdev_op(&op); +} + +#define io_apic_read(a,r) xen_io_apic_read(a,r) +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) + +#endif /* CONFIG_XEN */ + int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; @@ -107,6 +149,7 @@ static void add_pin_to_irq(unsigned int entry->pin = pin; } +#ifndef CONFIG_XEN /* * Reroute an IRQ to a different pin. 
*/ @@ -243,6 +286,9 @@ static void set_ioapic_affinity_irq(unsi } spin_unlock_irqrestore(&ioapic_lock, flags); } +#else +#define clear_IO_APIC() ((void)0) +#endif #if defined(CONFIG_IRQBALANCE) # include <asm/processor.h> /* kernel_thread() */ @@ -664,6 +710,7 @@ static inline void move_irq(int irq) { } #ifndef CONFIG_SMP void fastcall send_IPI_self(int vector) { +#ifndef CONFIG_XEN unsigned int cfg; /* @@ -675,6 +722,7 @@ void fastcall send_IPI_self(int vector) * Send the IPI. The write to APIC_ICR fires this off. */ apic_write_around(APIC_ICR, cfg); +#endif } #endif /* !CONFIG_SMP */ @@ -744,6 +792,7 @@ static int find_irq_entry(int apic, int return -1; } +#ifndef CONFIG_XEN /* * Find the pin to which IRQ[irq] (ISA) is connected */ @@ -766,6 +815,7 @@ static int find_isa_irq_pin(int irq, int } return -1; } +#endif /* * Find a specific PCI IRQ entry. @@ -813,6 +863,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, return best_guess; } +#ifndef CONFIG_XEN /* * This function currently is only a helper for the i386 smp boot process where * we need to reprogram the ioredtbls to cater for the cpus which have come online @@ -836,6 +887,7 @@ void __init setup_ioapic_dest(void) } } +#endif /* !CONFIG_XEN */ /* * EISA Edge/Level control register, ELCR @@ -1125,26 +1177,22 @@ static inline int IO_APIC_irq_trigger(in } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; +u8 irq_vector[NR_IRQ_VECTORS]; /* = { FIRST_DEVICE_VECTOR , 0 }; */ int assign_irq_vector(int irq) { - static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + static int current_vector = FIRST_DEVICE_VECTOR; + physdev_op_t op; BUG_ON(irq >= NR_IRQ_VECTORS); if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) return IO_APIC_VECTOR(irq); -next: - current_vector += 8; - if (current_vector == SYSCALL_VECTOR) - goto next; - - if (current_vector >= FIRST_SYSTEM_VECTOR) { - offset++; - if (!(offset%8)) - return -ENOSPC; - current_vector = FIRST_DEVICE_VECTOR + offset; - } + + op.cmd = PHYSDEVOP_ASSIGN_VECTOR; + op.u.irq_op.irq = irq; + if (HYPERVISOR_physdev_op(&op)) + return -ENOSPC; + current_vector = op.u.irq_op.vector; vector_irq[current_vector] = irq; if (irq != AUTO_ASSIGN) @@ -1153,6 +1201,7 @@ next: return current_vector; } +#ifndef CONFIG_XEN static struct hw_interrupt_type ioapic_level_type; static struct hw_interrupt_type ioapic_edge_type; @@ -1178,6 +1227,9 @@ static inline void ioapic_register_intr( set_intr_gate(vector, interrupt[irq]); } } +#else +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) +#endif static void __init setup_IO_APIC_irqs(void) { @@ -1233,7 +1285,7 @@ static void __init setup_IO_APIC_irqs(vo else add_pin_to_irq(irq, apic, pin); - if (!apic && !IO_APIC_IRQ(irq)) + if (/*!apic &&*/ !IO_APIC_IRQ(irq)) continue; if (IO_APIC_IRQ(irq)) { @@ -1258,6 +1310,7 @@ static void __init setup_IO_APIC_irqs(vo /* * Set up the 8259A-master output pin: */ +#ifndef CONFIG_XEN static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) { struct IO_APIC_route_entry entry; @@ -1452,8 +1505,6 @@ void __init print_IO_APIC(void) return; } -#if 0 - static void print_APIC_bitfield (int base) { unsigned int v; @@ -1595,8 +1646,9 @@ void /*__init*/ print_PIC(void) v = inb(0x4d1) << 8 | inb(0x4d0); printk(KERN_DEBUG "... 
PIC ELCR: %04x\n", v); } - -#endif /* 0 */ +#else +void __init print_IO_APIC(void) { } +#endif /* !CONFIG_XEN */ static void __init enable_IO_APIC(void) { @@ -1638,7 +1690,9 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); +#ifndef CONFIG_XEN disconnect_bsp_APIC(); +#endif } /* @@ -1648,7 +1702,7 @@ void disable_IO_APIC(void) * by Matt Domsch <Matt_Domsch@xxxxxxxx> Tue Dec 21 12:25:05 CST 1999 */ -#ifndef CONFIG_X86_NUMAQ +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) static void __init setup_ioapic_ids_from_mpc(void) { union IO_APIC_reg_00 reg_00; @@ -1755,6 +1809,7 @@ static void __init setup_ioapic_ids_from static void __init setup_ioapic_ids_from_mpc(void) { } #endif +#ifndef CONFIG_XEN /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1979,6 +2034,7 @@ static struct hw_interrupt_type ioapic_l .end = end_level_ioapic, .set_affinity = set_ioapic_affinity, }; +#endif /* !CONFIG_XEN */ static inline void init_IO_APIC_traps(void) { @@ -2010,13 +2066,16 @@ static inline void init_IO_APIC_traps(vo */ if (irq < 16) make_8259A_irq(irq); +#ifndef CONFIG_XEN else /* Strange. Oh, well.. */ irq_desc[irq].handler = &no_irq_type; +#endif } } } +#ifndef CONFIG_XEN static void enable_lapic_irq (unsigned int irq) { unsigned long v; @@ -2243,6 +2302,9 @@ static inline void check_timer(void) panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option"); } +#else +#define check_timer() ((void)0) +#endif /* * @@ -2269,7 +2331,9 @@ void __init setup_IO_APIC(void) */ if (!acpi_ioapic) setup_ioapic_ids_from_mpc(); +#ifndef CONFIG_XEN sync_Arb_IDs(); +#endif setup_IO_APIC_irqs(); init_IO_APIC_traps(); check_timer(); @@ -2391,6 +2455,7 @@ device_initcall(ioapic_init_sysfs); int __init io_apic_get_unique_id (int ioapic, int apic_id) { +#ifndef CONFIG_XEN union IO_APIC_reg_00 reg_00; static physid_mask_t apic_id_map = PHYSID_MASK_NONE; physid_mask_t tmp; @@ -2457,6 +2522,7 @@ int __init io_apic_get_unique_id (int io apic_printk(APIC_VERBOSE, KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); +#endif /* !CONFIG_XEN */ return apic_id; } diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/ioport.c linux-2.6-xen-sparse/arch/i386/kernel/ioport.c --- pristine-linux-2.6.12/arch/i386/kernel/ioport.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/ioport.c 2005-07-28 13:17:07.000000000 -0700 @@ -15,6 +15,7 @@ #include <linux/stddef.h> #include <linux/slab.h> #include <linux/thread_info.h> +#include <asm-xen/xen-public/physdev.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) @@ -56,10 +57,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - unsigned long i, max_long, bytes, bytes_updated; struct thread_struct * t = &current->thread; - struct tss_struct * tss; unsigned long *bitmap; + physdev_op_t op; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; @@ -78,41 +78,15 @@ asmlinkage long sys_ioperm(unsigned long memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; - } - /* - * do it in the per-thread copy and in the TSS ...
- * - * Disable preemption via get_cpu() - we must not switch away - * because the ->io_bitmap_max value must match the bitmap - * contents: - */ - tss = &per_cpu(init_tss, get_cpu()); + op.cmd = PHYSDEVOP_SET_IOBITMAP; + op.u.set_iobitmap.bitmap = (unsigned long)bitmap; + op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS; + HYPERVISOR_physdev_op(&op); + } set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - /* - * Search for a (possibly new) maximum. This is simple and stupid, - * to keep it obviously correct: - */ - max_long = 0; - for (i = 0; i < IO_BITMAP_LONGS; i++) - if (t->io_bitmap_ptr[i] != ~0UL) - max_long = i; - - bytes = (max_long + 1) * sizeof(long); - bytes_updated = max(bytes, t->io_bitmap_max); - - t->io_bitmap_max = bytes; - - /* - * Sets the lazy trigger so that the next I/O operation will - * reload the correct bitmap. - */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; - - put_cpu(); - return 0; } @@ -127,21 +101,29 @@ asmlinkage long sys_ioperm(unsigned long * code. */ -asmlinkage long sys_iopl(unsigned long unused) +asmlinkage long sys_iopl(unsigned int new_io_pl) { - volatile struct pt_regs * regs = (struct pt_regs *) &unused; - unsigned int level = regs->ebx; - unsigned int old = (regs->eflags >> 12) & 3; + unsigned int old_io_pl = current->thread.io_pl; + physdev_op_t op; - if (level > 3) + if (new_io_pl > 3) return -EINVAL; - /* Trying to gain more privileges? */ - if (level > old) { - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - } - regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12); - /* Make sure we return the long way (not sysenter) */ - set_thread_flag(TIF_IRET); + + /* Need "raw I/O" privileges for direct port access. */ + if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* Maintain OS privileges even if user attempts to relinquish them. */ + if (new_io_pl == 0) + new_io_pl = 1; + + /* Change our version of the privilege levels. */ + current->thread.io_pl = new_io_pl; + + /* Force the change at ring 0. 
*/ + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = new_io_pl; + HYPERVISOR_physdev_op(&op); + return 0; } diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/irq.c linux-2.6-xen-sparse/arch/i386/kernel/irq.c --- pristine-linux-2.6.12/arch/i386/kernel/irq.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/irq.c 2005-07-28 13:17:07.000000000 -0700 @@ -15,6 +15,9 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/delay.h> DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); @@ -51,7 +54,7 @@ static union irq_ctx *softirq_ctx[NR_CPU fastcall unsigned int do_IRQ(struct pt_regs *regs) { /* high bits used in ret_from_ code */ - int irq = regs->orig_eax & 0xff; + int irq = regs->orig_eax & __IRQ_MASK(HARDIRQ_BITS); #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; u32 *isp; @@ -210,9 +213,8 @@ int show_interrupts(struct seq_file *p, if (i == 0) { seq_printf(p, " "); - for (j=0; j<NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "CPU%d ",j); + for_each_cpu(j) + seq_printf(p, "CPU%d ",j); seq_putc(p, '\n'); } @@ -225,9 +227,8 @@ int show_interrupts(struct seq_file *p, #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + for_each_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif seq_printf(p, " %14s", irq_desc[i].handler->typename); seq_printf(p, " %s", action->name); @@ -240,16 +241,13 @@ skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", nmi_count(j)); + for_each_cpu(j) + seq_printf(p, "%10u ", nmi_count(j)); seq_putc(p, '\n'); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).apic_timer_irqs); + for_each_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).apic_timer_irqs); seq_putc(p, '\n'); #endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); @@ -259,3 +257,43 @@ skip: } return 0; } + +#ifdef CONFIG_HOTPLUG_CPU + +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action) + printk("Cannot set affinity for irq %i\n", irq); + } + +#if 0 + barrier(); + /* Ingo Molnar says: "after the IO-APIC masks have been redirected + [note the nop - the interrupt-enable boundary on x86 is two + instructions from sti] - to flush out pending hardirqs and + IPIs. After this point nothing is supposed to reach this CPU." */ + __asm__ __volatile__("sti; nop; cli"); + barrier(); +#else + /* That doesn't seem sufficient. Give it 1ms. 
*/ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +#endif +} +#endif + diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/ldt.c linux-2.6-xen-sparse/arch/i386/kernel/ldt.c --- pristine-linux-2.6.12/arch/i386/kernel/ldt.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/ldt.c 2005-07-28 13:17:07.000000000 -0700 @@ -18,6 +18,7 @@ #include <asm/system.h> #include <asm/ldt.h> #include <asm/desc.h> +#include <asm/mmu_context.h> #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) @@ -58,16 +59,20 @@ static int alloc_ldt(mm_context_t *pc, i #ifdef CONFIG_SMP cpumask_t mask; preempt_disable(); +#endif + make_pages_readonly(pc->ldt, (pc->size * LDT_ENTRY_SIZE) / + PAGE_SIZE); load_LDT(pc); +#ifdef CONFIG_SMP mask = cpumask_of_cpu(smp_processor_id()); if (!cpus_equal(current->mm->cpu_vm_mask, mask)) smp_call_function(flush_ldt, NULL, 1, 1); preempt_enable(); -#else - load_LDT(pc); #endif } if (oldsize) { + make_pages_writable(oldldt, (oldsize * LDT_ENTRY_SIZE) / + PAGE_SIZE); if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else @@ -82,6 +87,8 @@ static inline int copy_ldt(mm_context_t if (err < 0) return err; memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + make_pages_readonly(new->ldt, (new->size * LDT_ENTRY_SIZE) / + PAGE_SIZE); return 0; } @@ -94,14 +101,19 @@ int init_new_context(struct task_struct struct mm_struct * old_mm; int retval = 0; + memset(&mm->context, 0, sizeof(mm->context)); init_MUTEX(&mm->context.sem); - mm->context.size = 0; old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); retval = copy_ldt(&mm->context, &old_mm->context); up(&old_mm->context.sem); } + if (retval == 0) { + spin_lock(&mm_unpinned_lock); + list_add(&mm->context.unpinned, &mm_unpinned); + spin_unlock(&mm_unpinned_lock); + } return retval; } @@ -113,12 +125,20 @@ void destroy_context(struct mm_struct *m if (mm->context.size) { if (mm == current->active_mm) clear_LDT(); + make_pages_writable(mm->context.ldt, + (mm->context.size * LDT_ENTRY_SIZE) / + PAGE_SIZE); if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else kfree(mm->context.ldt); mm->context.size = 0; } + if (!mm->context.pinned) { + spin_lock(&mm_unpinned_lock); + list_del(&mm->context.unpinned); + spin_unlock(&mm_unpinned_lock); + } } static int read_ldt(void __user * ptr, unsigned long bytecount) @@ -178,6 +198,7 @@ static int write_ldt(void __user * ptr, { struct mm_struct * mm = current->mm; __u32 entry_1, entry_2, *lp; + unsigned long mach_lp; int error; struct user_desc ldt_info; @@ -206,6 +227,7 @@ static int write_ldt(void __user * ptr, } lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + mach_lp = arbitrary_virt_to_machine(lp); /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { @@ -223,9 +245,7 @@ static int write_ldt(void __user * ptr, /* Install the new entry ... 
*/ install: - *lp = entry_1; - *(lp+1) = entry_2; - error = 0; + error = HYPERVISOR_update_descriptor(mach_lp, entry_1, entry_2); out_unlock: up(&mm->context.sem); diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/Makefile linux-2.6-xen-sparse/arch/i386/kernel/Makefile --- pristine-linux-2.6.12/arch/i386/kernel/Makefile 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/Makefile 2005-07-28 13:17:07.000000000 -0700 @@ -2,41 +2,52 @@ # Makefile for the linux kernel. # -extra-y := head.o init_task.o vmlinux.lds +XENARCH := $(subst ",,$(CONFIG_XENARCH)) -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ - ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ - pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o quirks.o +CFLAGS += -Iarch/$(XENARCH)/kernel + +extra-y := head.o init_task.o + +obj-y := process.o signal.o entry.o traps.o \ + time.o ioport.o ldt.o setup.o \ + pci-dma.o i386_ksyms.o irq.o quirks.o + +c-obj-y := semaphore.o vm86.o \ + ptrace.o sys_i386.o \ + i387.o dmi_scan.o bootflag.o \ + doublefault.o +s-obj-y := obj-y += cpu/ -obj-y += timers/ +#obj-y += timers/ obj-$(CONFIG_ACPI_BOOT) += acpi/ -obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o -obj-$(CONFIG_MCA) += mca.o -obj-$(CONFIG_X86_MSR) += msr.o -obj-$(CONFIG_X86_CPUID) += cpuid.o +#c-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o +c-obj-$(CONFIG_MCA) += mca.o +c-obj-$(CONFIG_X86_MSR) += msr.o +c-obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_APM) += apm.o +c-obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +#obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o +c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o -obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o -obj-$(CONFIG_X86_NUMAQ) += numaq.o -obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o -obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_MODULES) += module.o -obj-y += sysenter.o vsyscall.o -obj-$(CONFIG_ACPI_SRAT) += srat.o -obj-$(CONFIG_HPET_TIMER) += time_hpet.o -obj-$(CONFIG_EFI) += efi.o efi_stub.o -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +c-obj-$(CONFIG_X86_REBOOTFIXUPS)+= reboot_fixups.o +c-obj-$(CONFIG_X86_NUMAQ) += numaq.o +c-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o +c-obj-$(CONFIG_MODULES) += module.o +c-obj-y += sysenter.o +obj-y += vsyscall.o +c-obj-$(CONFIG_ACPI_SRAT) += srat.o +c-obj-$(CONFIG_HPET_TIMER) += time_hpet.o +c-obj-$(CONFIG_EFI) += efi.o efi_stub.o +c-obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +c-obj-$(CONFIG_SMP_ALTERNATIVES)+= smpalts.o EXTRA_AFLAGS := -traditional -obj-$(CONFIG_SCx200) += scx200.o +c-obj-$(CONFIG_SCx200) += scx200.o # vsyscall.o contains the vsyscall DSO images as __initdata. # We must build both images before we can assemble it. 
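[Aside on the ldt.c hunk above, since the same pattern recurs all through this patch: once make_pages_readonly() has been applied to the LDT pages, the guest can no longer write descriptors directly and must ask Xen to validate and install each one. A minimal sketch of that write path, using only interfaces that already appear in this patch -- the helper name xen_set_ldt_entry is invented for illustration:

static int xen_set_ldt_entry(mm_context_t *ctx, int slot, __u32 a, __u32 b)
{
	__u32 *lp = (__u32 *)((char *)ctx->ldt + (slot << 3));

	/* The LDT pages are read-only in the guest, so "*lp = a" would
	 * fault; hand Xen the machine address and let it validate and
	 * install the descriptor. */
	return HYPERVISOR_update_descriptor(arbitrary_virt_to_machine(lp), a, b);
}

The matching make_pages_writable() calls in alloc_ldt()/destroy_context() exist for the same reason: the pages must be writable again before vfree()/kfree() hands them back.]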
@@ -58,7 +69,7 @@ SYSCFLAGS_vsyscall-int80.so = $(vsyscall $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \ $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \ - $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE + $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) # We also create a special relocatable object that should mirror the symbol @@ -70,5 +81,21 @@ $(obj)/built-in.o: ld_flags += -R $(obj) SYSCFLAGS_vsyscall-syms.o = -r $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ - $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE + $(obj)/vsyscall-sysenter.o FORCE $(call if_changed,syscall) + +c-link := init_task.o +s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o syscall_table.o + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-obj-m) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): + @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $@ + +$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S + +$(obj)/entry.o: $(src)/entry.S $(src)/syscall_table.S + +obj-y += $(c-obj-y) $(s-obj-y) +obj-m += $(c-obj-m) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-m) $(c-obj-) $(c-link)) +clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link)) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/microcode.c linux-2.6-xen-sparse/arch/i386/kernel/microcode.c --- pristine-linux-2.6.12/arch/i386/kernel/microcode.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/microcode.c 2005-07-28 13:17:07.000000000 -0700 @@ -18,55 +18,6 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian <tigran@xxxxxxx> - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian <tigran@xxxxxxx> - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian <tigran@xxxxxxx> - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian <tigran@xxxxxxx> - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer <simon@xxxxxxxxxxx> - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer <simon@xxxxxxxxxxx> - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian <tigran@xxxxxxxxxxx> - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian <tigran@xxxxxxxxxxx> - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman <ebiederman@xxxxxxxx> for the fix. - * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@xxxxxxxxx> and - * Tigran Aivazian <tigran@xxxxxxxxxxx> - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian <tigran@xxxxxxxxxxx> - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. 
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@xxxxxxxxx> and - * Tigran Aivazian <tigran@xxxxxxxxxxx>, - * Serialize updates as required on HT processors due to speculative - * nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian <tigran@xxxxxxxxxxx> - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@xxxxxxxxx>, - * Jun Nakajima <jun.nakajima@xxxxxxxxx> - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian <tigran@xxxxxxxxxxx> - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian <tigran@xxxxxxxxxxx> - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. */ //#define DEBUG /* pr_debug */ @@ -79,6 +30,7 @@ #include <linux/miscdevice.h> #include <linux/spinlock.h> #include <linux/mm.h> +#include <linux/syscalls.h> #include <asm/msr.h> #include <asm/uaccess.h> @@ -88,342 +40,41 @@ MODULE_DESCRIPTION("Intel CPU (IA-32) Mi MODULE_AUTHOR("Tigran Aivazian <tigran@xxxxxxxxxxx>"); MODULE_LICENSE("GPL"); -#define MICROCODE_VERSION "1.14" +#define MICROCODE_VERSION "1.14-xen" #define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ #define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ -#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ -#define DWSIZE (sizeof (u32)) -#define get_totalsize(mc) \ - (((microcode_t *)mc)->hdr.totalsize ? \ - ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) -#define get_datasize(mc) \ - (((microcode_t *)mc)->hdr.datasize ? \ - ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) - -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - -#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) - -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DECLARE_MUTEX(microcode_sem); static void __user *user_buffer; /* user area microcode data buffer */ static unsigned int user_buffer_size; /* it's size */ - -typedef enum mc_error_code { - MC_SUCCESS = 0, - MC_NOTFOUND = 1, - MC_MARKED = 2, - MC_ALLOCATED = 3, -} mc_error_code_t; - -static struct ucode_cpu_info { - unsigned int sig; - unsigned int pf; - unsigned int rev; - unsigned int cksum; - mc_error_code_t err; - microcode_t *mc; -} ucode_cpu_info[NR_CPUS]; static int microcode_open (struct inode *unused1, struct file *unused2) { return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; } -static void collect_cpu_info (void *unused) -{ - int cpu_num = smp_processor_id(); - struct cpuinfo_x86 *c = cpu_data + cpu_num; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - unsigned int val[2]; - - uci->sig = uci->pf = uci->rev = uci->cksum = 0; - uci->err = MC_NOTFOUND; - uci->mc = NULL; - - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel processor\n", cpu_num); - return; - } else { - uci->sig = cpuid_eax(0x00000001); - - if ((c->x86_model >= 5) || (c->x86 > 6)) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - uci->pf = 1 << ((val[1] >> 18) & 7); - } - } - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - uci->sig, uci->pf, uci->rev); -} - -static inline void mark_microcode_update (int cpu_num, microcode_header_t *mc_header, int sig, int pf, int cksum) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - pr_debug("Microcode Found.\n"); - pr_debug(" Header Revision 0x%x\n", mc_header->hdrver); - pr_debug(" Loader Revision 0x%x\n", mc_header->ldrver); - pr_debug(" Revision 0x%x \n", mc_header->rev); - pr_debug(" Date %x/%x/%x\n", - ((mc_header->date >> 24 ) & 0xff), - ((mc_header->date >> 16 ) & 0xff), - (mc_header->date & 0xFFFF)); - pr_debug(" Signature 0x%x\n", sig); - pr_debug(" Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n", - ((sig >> 12) & 0x3), - ((sig >> 8) & 0xf), - ((sig >> 4) & 0xf), - ((sig & 0xf))); - pr_debug(" Processor Flags 0x%x\n", pf); - pr_debug(" Checksum 0x%x\n", cksum); - - if (mc_header->rev < uci->rev) { - printk(KERN_ERR "microcode: CPU%d not 'upgrading' to earlier revision" - " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev); - goto out; - } else if (mc_header->rev == uci->rev) { - /* notify the caller of success on this cpu */ - uci->err = MC_SUCCESS; - printk(KERN_ERR "microcode: CPU%d already at revision" - " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev); - goto out; - } - - pr_debug("microcode: CPU%d found a matching microcode update with " - " revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev); - uci->cksum = cksum; - uci->pf = pf; /* keep the original mc pf for cksum calculation */ - uci->err = MC_MARKED; /* found the match */ -out: - return; -} - -static int find_matching_ucodes (void) -{ - int cursor = 0; - int error = 0; - - while (cursor + MC_HEADER_SIZE < user_buffer_size) { - microcode_header_t mc_header; - void *newmc = NULL; - int i, sum, cpu_num, allocated_flag, total_size, data_size, ext_table_size; - - if (copy_from_user(&mc_header, user_buffer + cursor, MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - error = -EFAULT; - goto out; - } - - total_size = get_totalsize(&mc_header); - if ((cursor + total_size > user_buffer_size) || (total_size < DEFAULT_UCODE_TOTALSIZE)) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - error = -EINVAL; - goto out; - } - - data_size = get_datasize(&mc_header); - if ((data_size + MC_HEADER_SIZE > total_size) || (data_size < DEFAULT_UCODE_DATASIZE)) { - printk(KERN_ERR "microcode: error! 
Bad data in microcode data file\n"); - error = -EINVAL; - goto out; - } - - if (mc_header.ldrver != 1 || mc_header.hdrver != 1) { - printk(KERN_ERR "microcode: error! Unknown microcode update format\n"); - error = -EINVAL; - goto out; - } - - for (cpu_num = 0; cpu_num < num_online_cpus(); cpu_num++) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - if (uci->err != MC_NOTFOUND) /* already found a match or not an online cpu*/ - continue; - - if (sigmatch(mc_header.sig, uci->sig, mc_header.pf, uci->pf)) - mark_microcode_update(cpu_num, &mc_header, mc_header.sig, mc_header.pf, mc_header.cksum); - } - - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - struct extended_sigtable ext_header; - struct extended_signature ext_sig; - int ext_sigcount; - - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - error = -EINVAL; - goto out; - } - if (copy_from_user(&ext_header, user_buffer + cursor - + MC_HEADER_SIZE + data_size, EXT_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - error = -EFAULT; - goto out; - } - if (ext_table_size != exttable_size(&ext_header)) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - error = -EFAULT; - goto out; - } - - ext_sigcount = ext_header.count; - - for (i = 0; i < ext_sigcount; i++) { - if (copy_from_user(&ext_sig, user_buffer + cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE - + EXT_SIGNATURE_SIZE * i, EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - error = -EFAULT; - goto out; - } - for (cpu_num = 0; cpu_num < num_online_cpus(); cpu_num++) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - if (uci->err != MC_NOTFOUND) /* already found a match or not an online cpu*/ - continue; - if (sigmatch(ext_sig.sig, uci->sig, ext_sig.pf, uci->pf)) { - mark_microcode_update(cpu_num, &mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum); - } - } - } - } - /* now check if any cpu has matched */ - for (cpu_num = 0, allocated_flag = 0, sum = 0; cpu_num < num_online_cpus(); cpu_num++) { - if (ucode_cpu_info[cpu_num].err == MC_MARKED) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - if (!allocated_flag) { - allocated_flag = 1; - newmc = vmalloc(total_size); - if (!newmc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - error = -ENOMEM; - goto out; - } - if (copy_from_user(newmc + MC_HEADER_SIZE, - user_buffer + cursor + MC_HEADER_SIZE, - total_size - MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! 
Can not read user data\n"); - vfree(newmc); - error = -EFAULT; - goto out; - } - memcpy(newmc, &mc_header, MC_HEADER_SIZE); - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int * ext_tablep = (((void *) newmc) + MC_HEADER_SIZE + data_size); - i = ext_table_size / DWSIZE; - while (i--) ext_table_sum += ext_tablep[i]; - if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, bad extended signature table checksum\n"); - vfree(newmc); - error = -EINVAL; - goto out; - } - } - - /* calculate the checksum */ - i = (MC_HEADER_SIZE + data_size) / DWSIZE; - while (i--) sum += ((int *)newmc)[i]; - sum -= (mc_header.sig + mc_header.pf + mc_header.cksum); - } - ucode_cpu_info[cpu_num].mc = newmc; - ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /* mc updated */ - if (sum + uci->sig + uci->pf + uci->cksum != 0) { - printk(KERN_ERR "microcode: CPU%d aborting, bad checksum\n", cpu_num); - error = -EINVAL; - goto out; - } - } - } - cursor += total_size; /* goto the next update patch */ - } /* end of while */ -out: - return error; -} - -static void do_update_one (void * unused) -{ - unsigned long flags; - unsigned int val[2]; - int cpu_num = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - if (uci->mc == NULL) { - printk(KERN_INFO "microcode: No new microcode data for CPU%d\n", cpu_num); - return; - } - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(&microcode_update_lock, flags); - - /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) uci->mc->bits, - (unsigned long) uci->mc->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - /* notify the caller of success on this cpu */ - uci->err = MC_SUCCESS; - spin_unlock_irqrestore(&microcode_update_lock, flags); - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %08x \n", - cpu_num, uci->rev, val[1], uci->mc->hdr.date); - return; -} static int do_microcode_update (void) { - int i, error; + int err; + dom0_op_t op; - if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) { - printk(KERN_ERR "microcode: Error! Could not run on all processors\n"); - error = -EIO; - goto out; - } + err = sys_mlock((unsigned long)user_buffer, user_buffer_size); + if (err != 0) + return err; - if ((error = find_matching_ucodes())) { - printk(KERN_ERR "microcode: Error in the microcode data\n"); - goto out_free; - } + op.cmd = DOM0_MICROCODE; + op.u.microcode.data = user_buffer; + op.u.microcode.length = user_buffer_size; + err = HYPERVISOR_dom0_op(&op); - if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) { - printk(KERN_ERR "microcode: Error!
Could not run on all processors\n"); - error = -EIO; - } + (void)sys_munlock((unsigned long)user_buffer, user_buffer_size); -out_free: - for (i = 0; i < num_online_cpus(); i++) { - if (ucode_cpu_info[i].mc) { - int j; - void *tmp = ucode_cpu_info[i].mc; - vfree(tmp); - for (j = i; j < num_online_cpus(); j++) { - if (ucode_cpu_info[j].mc == tmp) - ucode_cpu_info[j].mc = NULL; - } - } - } -out: - return error; + return err; } static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/mpparse.c linux-2.6-xen-sparse/arch/i386/kernel/mpparse.c --- pristine-linux-2.6.12/arch/i386/kernel/mpparse.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/mpparse.c 2005-07-28 13:17:07.000000000 -0700 @@ -109,7 +109,7 @@ static int MP_valid_apicid(int apicid, i { return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf; } -#else +#elif !defined(CONFIG_XEN) static int MP_valid_apicid(int apicid, int version) { if (version >= 0x14) @@ -119,6 +119,7 @@ static int MP_valid_apicid(int apicid, i } #endif +#ifndef CONFIG_XEN static void __init MP_processor_info (struct mpc_config_processor *m) { int ver, apicid; @@ -217,6 +218,12 @@ static void __init MP_processor_info (st apic_version[m->mpc_apicid] = ver; bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; } +#else +void __init MP_processor_info (struct mpc_config_processor *m) +{ + num_processors++; +} +#endif /* CONFIG_XEN */ static void __init MP_bus_info (struct mpc_config_bus *m) { @@ -690,7 +697,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc((void *)mpf->mpf_physptr)) { + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); @@ -725,7 +732,7 @@ void __init get_smp_config (void) static int __init smp_scan_config (unsigned long base, unsigned long length) { - unsigned long *bp = phys_to_virt(base); + unsigned long *bp = isa_bus_to_virt(base); struct intel_mp_floating *mpf; Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); @@ -741,6 +748,7 @@ static int __init smp_scan_config (unsig || (mpf->mpf_specification == 4)) ) { smp_found_config = 1; +#ifndef CONFIG_XEN printk(KERN_INFO "found SMP MP-table at %08lx\n", virt_to_phys(mpf)); reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); @@ -760,6 +768,10 @@ static int __init smp_scan_config (unsig size = end - mpf->mpf_physptr; reserve_bootmem(mpf->mpf_physptr, size); } +#else + printk(KERN_INFO "found SMP MP-table at %08lx\n", + ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); +#endif mpf_found = mpf; return 1; @@ -803,9 +815,11 @@ void __init find_smp_config (void) * MP1.4 SPEC states to only scan first 1K of 4K EBDA. 
*/ +#ifndef CONFIG_XEN address = get_bios_ebda(); if (address) smp_scan_config(address, 0x400); +#endif } /* -------------------------------------------------------------------------- @@ -817,14 +831,14 @@ void __init find_smp_config (void) void __init mp_register_lapic_address ( u64 address) { +#ifndef CONFIG_XEN mp_lapic_addr = (unsigned long) address; - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); +#endif } @@ -844,6 +858,7 @@ void __init mp_register_lapic ( if (id == boot_cpu_physical_apicid) boot_cpu = 1; +#ifndef CONFIG_XEN processor.mpc_type = MP_PROCESSOR; processor.mpc_apicid = id; processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); @@ -854,6 +869,7 @@ void __init mp_register_lapic ( processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; +#endif MP_processor_info(&processor); } @@ -913,7 +929,6 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; mp_ioapics[idx].mpc_apicaddr = address; - set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/pci-dma.c linux-2.6-xen-sparse/arch/i386/kernel/pci-dma.c --- pristine-linux-2.6.12/arch/i386/kernel/pci-dma.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/pci-dma.c 2005-07-28 13:17:07.000000000 -0700 @@ -11,7 +11,10 @@ #include <linux/mm.h> #include <linux/string.h> #include <linux/pci.h> +#include <linux/version.h> #include <asm/io.h> +#include <asm-xen/balloon.h> +#include <asm/tlbflush.h> struct dma_coherent_mem { void *virt_base; @@ -26,7 +29,8 @@ void *dma_alloc_coherent(struct device * { void *ret; struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - int order = get_order(size); + unsigned int order = get_order(size); + unsigned long vstart; /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); @@ -46,11 +50,14 @@ void *dma_alloc_coherent(struct device * if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) gfp |= GFP_DMA; - ret = (void *)__get_free_pages(gfp, order); + vstart = __get_free_pages(gfp, order); + ret = (void *)vstart; if (ret != NULL) { + xen_contig_memory(vstart, order); + memset(ret, 0, size); - *dma_handle = virt_to_phys(ret); + *dma_handle = virt_to_bus(ret); } return ret; } @@ -145,3 +152,131 @@ void *dma_mark_declared_memory_occupied( return mem->virt_base + (pos << PAGE_SHIFT); } EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +static LIST_HEAD(dma_map_head); +static DEFINE_SPINLOCK(dma_map_lock); +struct dma_map_entry { + struct list_head list; + dma_addr_t dma; + char *bounce, *host; + size_t size; +}; +#define DMA_MAP_MATCHES(e,d) (((e)->dma<=(d)) && (((e)->dma+(e)->size)>(d))) + +dma_addr_t +dma_map_single(struct device *dev, void *ptr, size_t size, + enum dma_data_direction direction) +{ + struct dma_map_entry *ent; + void *bnc; + dma_addr_t dma; + unsigned long flags; + + BUG_ON(direction == DMA_NONE); + + /* + * Even if size is sub-page, the buffer may still straddle a page + * boundary. Take into account buffer start offset. All other calls are + * conservative and always search the dma_map list if it's non-empty. 
+ */ + if ((((unsigned int)ptr & ~PAGE_MASK) + size) <= PAGE_SIZE) { + dma = virt_to_bus(ptr); + } else { + BUG_ON((bnc = dma_alloc_coherent(dev, size, &dma, 0)) == NULL); + BUG_ON((ent = kmalloc(sizeof(*ent), GFP_KERNEL)) == NULL); + if (direction != DMA_FROM_DEVICE) + memcpy(bnc, ptr, size); + ent->dma = dma; + ent->bounce = bnc; + ent->host = ptr; + ent->size = size; + spin_lock_irqsave(&dma_map_lock, flags); + list_add(&ent->list, &dma_map_head); + spin_unlock_irqrestore(&dma_map_lock, flags); + } + + flush_write_buffers(); + return dma; +} +EXPORT_SYMBOL(dma_map_single); + +void +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction direction) +{ + struct dma_map_entry *ent; + unsigned long flags; + + BUG_ON(direction == DMA_NONE); + + /* Fast-path check: are there any multi-page DMA mappings? */ + if (!list_empty(&dma_map_head)) { + spin_lock_irqsave(&dma_map_lock, flags); + list_for_each_entry ( ent, &dma_map_head, list ) { + if (DMA_MAP_MATCHES(ent, dma_addr)) { + list_del(&ent->list); + break; + } + } + spin_unlock_irqrestore(&dma_map_lock, flags); + if (&ent->list != &dma_map_head) { + BUG_ON(dma_addr != ent->dma); + BUG_ON(size != ent->size); + if (direction != DMA_TO_DEVICE) + memcpy(ent->host, ent->bounce, size); + dma_free_coherent(dev, size, ent->bounce, ent->dma); + kfree(ent); + } + } +} +EXPORT_SYMBOL(dma_unmap_single); + +void +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + struct dma_map_entry *ent; + unsigned long flags, off; + + /* Fast-path check: are there any multi-page DMA mappings? */ + if (!list_empty(&dma_map_head)) { + spin_lock_irqsave(&dma_map_lock, flags); + list_for_each_entry ( ent, &dma_map_head, list ) + if (DMA_MAP_MATCHES(ent, dma_handle)) + break; + spin_unlock_irqrestore(&dma_map_lock, flags); + if (&ent->list != &dma_map_head) { + off = dma_handle - ent->dma; + BUG_ON((off + size) > ent->size); + /*if (direction != DMA_TO_DEVICE)*/ + memcpy(ent->host+off, ent->bounce+off, size); + } + } +} +EXPORT_SYMBOL(dma_sync_single_for_cpu); + +void +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + struct dma_map_entry *ent; + unsigned long flags, off; + + /* Fast-path check: are there any multi-page DMA mappings? 
*/ + if (!list_empty(&dma_map_head)) { + spin_lock_irqsave(&dma_map_lock, flags); + list_for_each_entry ( ent, &dma_map_head, list ) + if (DMA_MAP_MATCHES(ent, dma_handle)) + break; + spin_unlock_irqrestore(&dma_map_lock, flags); + if (&ent->list != &dma_map_head) { + off = dma_handle - ent->dma; + BUG_ON((off + size) > ent->size); + /*if (direction != DMA_FROM_DEVICE)*/ + memcpy(ent->bounce+off, ent->host+off, size); + } + } + + flush_write_buffers(); +} +EXPORT_SYMBOL(dma_sync_single_for_device); diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/process.c linux-2.6-xen-sparse/arch/i386/kernel/process.c --- pristine-linux-2.6.12/arch/i386/kernel/process.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/process.c 2005-07-28 13:17:07.000000000 -0700 @@ -13,6 +13,7 @@ #include <stdarg.h> +#include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/fs.h> @@ -47,6 +48,7 @@ #include <asm/i387.h> #include <asm/irq.h> #include <asm/desc.h> +#include <asm-xen/xen-public/physdev.h> #ifdef CONFIG_MATH_EMULATION #include <asm/math_emu.h> #endif @@ -54,6 +56,9 @@ #include <linux/irq.h> #include <linux/err.h> +#include <asm/tlbflush.h> +#include <asm/cpu.h> + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); static int hlt_counter; @@ -89,54 +94,48 @@ void enable_hlt(void) EXPORT_SYMBOL(enable_hlt); -/* - * We use this if we don't have any better - * idle routine.. - */ -void default_idle(void) +/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */ +extern void stop_hz_timer(void); +extern void start_hz_timer(void); +void xen_idle(void) { - if (!hlt_counter && boot_cpu_data.hlt_works_ok) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); + local_irq_disable(); + + if (need_resched()) { + local_irq_enable(); } else { - cpu_relax(); + stop_hz_timer(); + HYPERVISOR_block(); /* implicit local_irq_enable() */ + start_hz_timer(); } } -/* - * On SMP it's slightly faster (but much more power-consuming!) - * to poll the ->work.need_resched flag instead of waiting for the - * cross-CPU IPI to arrive. Use this option with caution. - */ -static void poll_idle (void) -{ - int oldval; - +#ifdef CONFIG_HOTPLUG_CPU +#include <asm/nmi.h> +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* We shouldn't have to disable interrupts while dead, but + * some interrupts just don't seem to go away, and this makes + * it "work" for testing purposes. */ + /* Death loop */ + while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) + HYPERVISOR_yield(); + + local_irq_disable(); + __flush_tlb_all(); + cpu_set(smp_processor_id(), cpu_online_map); local_irq_enable(); - - /* - * Deal with another CPU just having chosen a thread to - * run here: - */ - oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED); - - if (!oldval) { - set_thread_flag(TIF_POLLING_NRFLAG); - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); - - clear_thread_flag(TIF_POLLING_NRFLAG); - } else { - set_need_resched(); - } } +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ /* * The idle thread. 
There's no useful work to be @@ -146,22 +145,26 @@ static void poll_idle (void) */ void cpu_idle (void) { + int cpu = _smp_processor_id(); + /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { - void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; - rmb(); - idle = pm_idle; - if (!idle) - idle = default_idle; + if (cpu_is_offline(cpu)) { +#if defined(CONFIG_XEN) && defined(CONFIG_HOTPLUG_CPU) + /* Tell hypervisor to take vcpu down. */ + HYPERVISOR_vcpu_down(cpu); +#endif + play_dead(); + } __get_cpu_var(irq_stat).idle_timestamp = jiffies; - idle(); + xen_idle(); } schedule(); } @@ -195,74 +198,18 @@ void cpu_idle_wait(void) } EXPORT_SYMBOL_GPL(cpu_idle_wait); -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - */ -static void mwait_idle(void) -{ - local_irq_enable(); - - if (!need_resched()) { - set_thread_flag(TIF_POLLING_NRFLAG); - do { - __monitor((void *)&current_thread_info()->flags, 0, 0); - if (need_resched()) - break; - __mwait(0, 0); - } while (!need_resched()); - clear_thread_flag(TIF_POLLING_NRFLAG); - } -} - -void __init select_idle_routine(const struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_MWAIT)) { - printk("monitor/mwait feature present.\n"); - /* - * Skip, if setup has overridden idle. - * One CPU supports mwait => All CPUs supports mwait - */ - if (!pm_idle) { - printk("using mwait in idle threads.\n"); - pm_idle = mwait_idle; - } - } -} - -static int __init idle_setup (char *str) -{ - if (!strncmp(str, "poll", 4)) { - printk("using polling idle threads.\n"); - pm_idle = poll_idle; -#ifdef CONFIG_X86_SMP - if (smp_num_siblings > 1) - printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); -#endif - } else if (!strncmp(str, "halt", 4)) { - printk("using halt in idle threads.\n"); - pm_idle = default_idle; - } - - boot_option_idle_override = 1; - return 1; -} - -__setup("idle=", idle_setup); +/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */ +/* Always use xen_idle() instead. */ +void __init select_idle_routine(const struct cpuinfo_x86 *c) {} void show_regs(struct pt_regs * regs) { - unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; - printk("\n"); printk("Pid: %d, comm: %20s\n", current->pid, current->comm); printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); print_symbol("EIP is at %s\n", regs->eip); - if (regs->xcs & 3) + if (regs->xcs & 2) printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); printk(" EFLAGS: %08lx %s (%s)\n", regs->eflags, print_tainted(), system_utsname.release); @@ -273,17 +220,6 @@ void show_regs(struct pt_regs * regs) printk(" DS: %04x ES: %04x\n", 0xffff & regs->xds,0xffff & regs->xes); - __asm__("movl %%cr0, %0": "=r" (cr0)); - __asm__("movl %%cr2, %0": "=r" (cr2)); - __asm__("movl %%cr3, %0": "=r" (cr3)); - /* This could fault if %cr4 does not exist */ - __asm__("1: movl %%cr4, %0 \n" "2: \n" ".section __ex_table,\"a\" \n" ".long 1b,2b \n" ".previous \n" : "=r" (cr4): "0" (0)); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); show_trace(NULL, &regs->esp); } @@ -336,20 +272,11 @@ void exit_thread(void) /* The process may have allocated an io port bitmap... nuke it.
*/ if (unlikely(NULL != t->io_bitmap_ptr)) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - + physdev_op_t op = { 0 }; + op.cmd = PHYSDEVOP_SET_IOBITMAP; + HYPERVISOR_physdev_op(&op); kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); - t->io_bitmap_max = 0; - tss->io_bitmap_owner = NULL; - tss->io_bitmap_max = 0; - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; - put_cpu(); } } @@ -458,6 +385,8 @@ int copy_thread(int nr, unsigned long cl desc->b = LDT_entry_b(&info); } + p->thread.io_pl = current->thread.io_pl; + err = 0; out: if (err && p->thread.io_bitmap_ptr) { @@ -525,40 +454,10 @@ int dump_task_regs(struct task_struct *t elf_core_copy_regs(regs, &ptregs); + boot_option_idle_override = 1; return 1; } -static inline void -handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) -{ - if (!next->io_bitmap_ptr) { - /* - * Disable the bitmap via an invalid offset. We still cache - * the previous bitmap owner and the IO bitmap contents: - */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; - return; - } - if (likely(next == tss->io_bitmap_owner)) { - /* - * Previous owner of the bitmap (hence the bitmap content) - * matches the next task, we dont have to do anything but - * to set a valid offset in the TSS: - */ - tss->io_bitmap_base = IO_BITMAP_OFFSET; - return; - } - /* - * Lazy TSS's I/O bitmap copy. We set an invalid offset here - * and we let the task to get a GPF in case an I/O instruction - * is performed. The handler of the GPF will verify that the - * faulting task has a valid I/O bitmap and, it true, does the - * real copy and restart the instruction. This will save us - * redundant copies when the currently switched task does not - * perform any I/O during its timeslice. - */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; -} /* * switch_to(x,yn) should switch tasks from x to y. @@ -593,32 +492,77 @@ struct task_struct fastcall * __switch_t *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + physdev_op_t iopl_op, iobmp_op; + multicall_entry_t _mcl[8], *mcl = _mcl; - /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ - __unlazy_fpu(prev_p); + /* + * This is basically '__unlazy_fpu', except that we queue a + * multicall to indicate FPU task switch, rather than + * synchronously trapping to Xen. + */ + if (prev_p->thread_info->status & TS_USEDFPU) { + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; + mcl++; + } /* * Reload esp0, LDT and the page table pointer: + * This is load_esp0(tss, next) with a multicall. */ - load_esp0(tss, next); + tss->esp0 = next->esp0; + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = tss->ss0; + mcl->args[1] = tss->esp0; + mcl++; /* * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. 
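The batching idiom used throughout __switch_to() is worth spelling out
once: pending hypercalls are queued into an on-stack multicall_entry_t
array and issued as a single HYPERVISOR_multicall(), so a context
switch pays one ring transition no matter how many operations it
needs. A minimal sketch of the pattern, using only the interface
already visible in this function (ss0/esp0 stand for the stack values
loaded above):

        multicall_entry_t mcl[2], *m = mcl;

        m->op      = __HYPERVISOR_stack_switch;   // queued, not trapped
        m->args[0] = ss0;
        m->args[1] = esp0;
        m++;
        m->op      = __HYPERVISOR_fpu_taskswitch;
        m->args[0] = 1;
        m++;
        (void)HYPERVISOR_multicall(mcl, m - mcl); // one trap for the batch

The C(i) macro below queues up to three more entries (one per modified
TLS slot) into the same batch before it is flushed.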
*/ - load_TLS(next, cpu); +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu) \ + [GDT_ENTRY_TLS_MIN + i]); \ + mcl->args[1] = ((u32 *)&next->tls_array[i])[0]; \ + mcl->args[2] = ((u32 *)&next->tls_array[i])[1]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->io_pl != next->io_pl)) { + iopl_op.cmd = PHYSDEVOP_SET_IOPL; + iopl_op.u.set_iopl.iopl = next->io_pl; + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = (unsigned long)&iopl_op; + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + iobmp_op.cmd = + PHYSDEVOP_SET_IOBITMAP; + iobmp_op.u.set_iobitmap.bitmap = + (unsigned long)next->io_bitmap_ptr; + iobmp_op.u.set_iobitmap.nr_ports = + next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = (unsigned long)&iobmp_op; + mcl++; + } - /* - * Save away %fs and %gs. No need to save %es and %ds, as - * those are always kernel segments while inside the kernel. - */ - asm volatile("mov %%fs,%0":"=m" (prev->fs)); - asm volatile("mov %%gs,%0":"=m" (prev->gs)); + (void)HYPERVISOR_multicall(_mcl, mcl - _mcl); /* * Restore %fs and %gs if needed. */ - if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) { + if (unlikely(next->fs | next->gs)) { loadsegment(fs, next->fs); loadsegment(gs, next->gs); } @@ -636,9 +580,6 @@ struct task_struct fastcall * __switch_t loaddebug(next, 7); } - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) - handle_io_bitmap(next, tss); - return prev_p; } diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/quirks.c linux-2.6-xen-sparse/arch/i386/kernel/quirks.c --- pristine-linux-2.6.12/arch/i386/kernel/quirks.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/quirks.c 2005-07-28 13:17:07.000000000 -0700 @@ -32,14 +32,11 @@ static void __devinit quirk_intel_irqbal raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); if (!(word & (1 << 13))) { + dom0_op_t op; printk(KERN_INFO "Disabling irq balancing and affinity\n"); -#ifdef CONFIG_IRQBALANCE - irqbalance_disable(""); -#endif - noirqdebug_setup(""); -#ifdef CONFIG_PROC_FS - no_irq_affinity = 1; -#endif + op.cmd = DOM0_PLATFORM_QUIRK; + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; + (void)HYPERVISOR_dom0_op(&op); } config &= ~0x2; diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/setup.c linux-2.6-xen-sparse/arch/i386/kernel/setup.c --- pristine-linux-2.6.12/arch/i386/kernel/setup.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/setup.c 2005-07-28 13:17:07.000000000 -0700 @@ -41,6 +41,9 @@ #include <linux/init.h> #include <linux/edd.h> #include <linux/nodemask.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/notifier.h> #include <video/edid.h> #include <asm/e820.h> #include <asm/mpspec.h> @@ -50,13 +53,18 @@ #include <asm/io_apic.h> #include <asm/ist.h> #include <asm/io.h> +#include <asm-xen/hypervisor.h> +#include <asm-xen/xen-public/physdev.h> #include "setup_arch_pre.h" #include <bios_ebda.h> -/* This value is set up by the early boot code to point to the value - immediately after the boot time page tables. It contains a *physical* - address, and must not be in the .bss segment! 
*/ -unsigned long init_pg_tables_end __initdata = ~0UL; +/* Allows setting of maximum possible memory size */ +static unsigned long xen_override_max_pfn; + +static int xen_panic_event(struct notifier_block *, unsigned long, void *); +static struct notifier_block xen_panic_block = { + xen_panic_event, NULL, 0 /* try to go last */ +}; int disable_pse __initdata = 0; @@ -70,9 +78,9 @@ EXPORT_SYMBOL(efi_enabled); #endif /* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 0, 1, 0, -1 }; /* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 0, 1, 0, -1 }; unsigned long mmu_cr4_features; @@ -146,6 +154,7 @@ static struct resource code_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST static struct resource system_rom_resource = { .name = "System ROM", .start = 0xf0000, @@ -201,6 +210,7 @@ static struct resource video_rom_resourc .end = 0xc7fff, .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM }; +#endif static struct resource video_ram_resource = { .name = "Video RAM area", @@ -259,6 +269,7 @@ static struct resource standard_io_resou #define STANDARD_IO_RESOURCES \ (sizeof standard_io_resources / sizeof standard_io_resources[0]) +#ifdef CONFIG_XEN_PRIVILEGED_GUEST #define romsignature(x) (*(unsigned short *)(x) == 0xaa55) static int __init romchecksum(unsigned char *rom, unsigned long length) @@ -276,6 +287,10 @@ static void __init probe_roms(void) unsigned char *rom; int i; + /* Nothing to do if not running in dom0. */ + if (!(xen_start_info.flags & SIF_INITDOMAIN)) + return; + /* video rom */ upper = adapter_rom_resources[0].start; for (start = video_rom_resource.start; start < upper; start += 2048) { @@ -334,6 +349,20 @@ static void __init probe_roms(void) start = adapter_rom_resources[i++].end & ~2047UL; } } +#endif + +/* + * Point at the empty zero page to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; +EXPORT_SYMBOL(HYPERVISOR_shared_info); + +unsigned int *phys_to_machine_mapping, *pfn_to_mfn_frame_list; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/* Raw start-of-day parameters from the hypervisor. */ +union xen_start_info_union xen_start_info_union; static void __init limit_regions(unsigned long long size) { @@ -414,6 +443,7 @@ static void __init print_memory_map(char } } +#if 0 /* * Sanitize the BIOS e820 map. 
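(Both this sanitizer and copy_e820_map() below are bracketed by the
new #if 0 / #endif: a paravirtualized guest never sees a raw BIOS map;
its pseudo-physical memory is described by xen_start_info.nr_pages,
which is what the replacement find_max_pfn() further down consults
instead.)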
* @@ -633,6 +663,7 @@ static int __init copy_e820_map(struct e } while (biosmap++,--nr_map); return 0; } +#endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; @@ -666,11 +697,14 @@ static inline void copy_edd(void) static void __init parse_cmdline_early (char ** cmdline_p) { char c = ' ', *to = command_line, *from = saved_command_line; - int len = 0; + int len = 0, max_cmdline; int userdef = 0; + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + max_cmdline = COMMAND_LINE_SIZE; + memcpy(saved_command_line, xen_start_info.cmd_line, max_cmdline); /* Save unparsed command line copy for /proc/cmdline */ - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; + saved_command_line[max_cmdline-1] = '\0'; for (;;) { if (c != ' ') @@ -702,8 +736,13 @@ static void __init parse_cmdline_early ( unsigned long long mem_size; mem_size = memparse(from+4, &from); +#if 0 limit_regions(mem_size); userdef=1; +#else + xen_override_max_pfn = + (unsigned long)(mem_size>>PAGE_SHIFT); +#endif } } @@ -744,7 +783,7 @@ static void __init parse_cmdline_early ( noexec_setup(from + 7); -#ifdef CONFIG_X86_SMP +#ifdef CONFIG_X86_MPPARSE /* * If the BIOS enumerates physical processors before logical, * maxcpus=N at enumeration-time can be used to disable HT. @@ -846,6 +885,7 @@ static void __init parse_cmdline_early ( } } +#if 0 /* !XEN */ /* * Callback for efi_memory_walk. */ @@ -889,6 +929,15 @@ void __init find_max_pfn(void) max_pfn = end; } } +#else +/* We don't use the fake e820 because we need to respond to user override. */ +void __init find_max_pfn(void) +{ + if ( xen_override_max_pfn < xen_start_info.nr_pages ) + xen_override_max_pfn = xen_start_info.nr_pages; + max_pfn = xen_override_max_pfn; +} +#endif /* XEN */ /* * Determine low and high memory ranges: @@ -1011,6 +1060,7 @@ static void __init register_bootmem_low_ } } +#ifndef CONFIG_XEN /* * workaround for Dell systems that neglect to reserve EBDA */ @@ -1021,16 +1071,18 @@ static void __init reserve_ebda_region(v if (addr) reserve_bootmem(addr, PAGE_SIZE); } +#endif #ifndef CONFIG_DISCONTIGMEM void __init setup_bootmem_allocator(void); static unsigned long __init setup_memory(void) { + /* * partially used pages are not usable - thus * we are rounding upwards: */ - min_low_pfn = PFN_UP(init_pg_tables_end); + min_low_pfn = PFN_UP(__pa(xen_start_info.pt_base)) + xen_start_info.nr_pt_frames; find_max_pfn(); @@ -1057,7 +1109,14 @@ void __init zone_sizes_init(void) unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; unsigned int max_dma, low; - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + /* + * XEN: Our notion of "DMA memory" is fake when running over Xen. + * We simply put all RAM in the DMA zone so that those drivers which + * needlessly specify GFP_DMA do not get starved of RAM unnecessarily. + * Those drivers that *do* require lowmem are screwed anyway when + * running over Xen! + */ + max_dma = max_low_pfn; low = max_low_pfn; if (low < max_dma) @@ -1095,6 +1154,7 @@ void __init setup_bootmem_allocator(void reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) + bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); +#ifndef CONFIG_XEN /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. 
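(Everything from the page-0 reservation here through
acpi_reserve_bootmem() moves under #ifndef CONFIG_XEN: BIOS page 0,
the EBDA and the boot-time ACPI tables are host-physical artifacts a
paravirtualized guest does not own; dom0 reaches the real ones through
privileged interfaces instead.)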
@@ -1125,20 +1185,15 @@ void __init setup_bootmem_allocator(void */ acpi_reserve_bootmem(); #endif -#ifdef CONFIG_X86_FIND_SMP_CONFIG - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); -#endif +#endif /* !CONFIG_XEN */ #ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START) { + if (xen_start_info.mod_start) { if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { - reserve_bootmem(INITRD_START, INITRD_SIZE); - initrd_start = - INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/ + initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start+INITRD_SIZE; + initrd_below_start_ok = 1; } else { printk(KERN_ERR "initrd extends beyond end of memory " @@ -1149,6 +1204,8 @@ void __init setup_bootmem_allocator(void } } #endif + + phys_to_machine_mapping = (unsigned int *)xen_start_info.mfn_list; } /* @@ -1178,7 +1235,9 @@ legacy_init_iomem_resources(struct resou { int i; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST probe_roms(); +#endif for (i = 0; i < e820.nr_map; i++) { struct resource *res; if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) @@ -1220,8 +1279,9 @@ static void __init register_memory(void) else legacy_init_iomem_resources(&code_resource, &data_resource); - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); + if (xen_start_info.flags & SIF_INITDOMAIN) + /* EFI systems may still have VGA */ + request_resource(&iomem_resource, &video_ram_resource); /* request I/O space for devices used on all i[345]86 PCs */ for (i = 0; i < STANDARD_IO_RESOURCES; i++) @@ -1396,10 +1456,23 @@ static void set_mca_bus(int x) { } */ void __init setup_arch(char **cmdline_p) { + int i, j; + physdev_op_t op; unsigned long max_low_pfn; + /* Force a quick death if the kernel panics. */ + extern int panic_timeout; + if (panic_timeout == 0) + panic_timeout = 1; + + /* Register a call for panic conditions. */ + notifier_chain_register(&panic_notifier_list, &xen_panic_block); + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables); + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); - pre_setup_arch_hook(); early_cpu_init(); /* @@ -1414,7 +1487,10 @@ void __init setup_arch(char **cmdline_p) efi_enabled = 1; #endif - ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); + /* This must be initialized to UNNAMED_MAJOR for ipconfig to work + properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd. + */ + ROOT_DEV = MKDEV(UNNAMED_MAJOR,0); drive_info = DRIVE_INFO; screen_info = SCREEN_INFO; edid_info = EDID_INFO; @@ -1429,6 +1505,16 @@ void __init setup_arch(char **cmdline_p) } bootloader_type = LOADER_TYPE; +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + /* This is drawn from a dump from vgacon:startup in standard Linux. 
*/ + screen_info.orig_video_mode = 3; + screen_info.orig_video_isVGA = 1; + screen_info.orig_video_lines = 25; + screen_info.orig_video_cols = 80; + screen_info.orig_video_ega_bx = 3; + screen_info.orig_video_points = 16; +#endif + #ifdef CONFIG_BLK_DEV_RAM rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); @@ -1449,12 +1535,14 @@ void __init setup_arch(char **cmdline_p) init_mm.start_code = (unsigned long) _text; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; - init_mm.brk = init_pg_tables_end + PAGE_OFFSET; + init_mm.brk = (PFN_UP(__pa(xen_start_info.pt_base)) + + xen_start_info.nr_pt_frames) << PAGE_SHIFT; - code_resource.start = virt_to_phys(_text); - code_resource.end = virt_to_phys(_etext)-1; - data_resource.start = virt_to_phys(_etext); - data_resource.end = virt_to_phys(_edata)-1; + /* XEN: This is nonsense: kernel may not even be contiguous in RAM. */ + /*code_resource.start = virt_to_phys(_text);*/ + /*code_resource.end = virt_to_phys(_etext)-1;*/ + /*data_resource.start = virt_to_phys(_etext);*/ + /*data_resource.end = virt_to_phys(_edata)-1;*/ parse_cmdline_early(cmdline_p); @@ -1477,6 +1565,51 @@ void __init setup_arch(char **cmdline_p) remapped_pgdat_init(); zone_sizes_init(); +#ifdef CONFIG_X86_FIND_SMP_CONFIG + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); +#endif + + /* Make sure we have a correctly sized P->M table. */ + if (max_pfn != xen_start_info.nr_pages) { + phys_to_machine_mapping = alloc_bootmem_low_pages( + max_pfn * sizeof(unsigned long)); + + if (max_pfn > xen_start_info.nr_pages) { + /* set to INVALID_P2M_ENTRY */ + memset(phys_to_machine_mapping, ~0, + max_pfn * sizeof(unsigned long)); + memcpy(phys_to_machine_mapping, + (unsigned long *)xen_start_info.mfn_list, + xen_start_info.nr_pages * sizeof(unsigned long)); + } else { + memcpy(phys_to_machine_mapping, + (unsigned long *)xen_start_info.mfn_list, + max_pfn * sizeof(unsigned long)); + if (HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, + (unsigned long *)xen_start_info.mfn_list + max_pfn, + xen_start_info.nr_pages - max_pfn, 0) != + (xen_start_info.nr_pages - max_pfn)) BUG(); + } + free_bootmem( + __pa(xen_start_info.mfn_list), + PFN_PHYS(PFN_UP(xen_start_info.nr_pages * + sizeof(unsigned long)))); + } + + pfn_to_mfn_frame_list = alloc_bootmem_low_pages(PAGE_SIZE); + for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) + { + pfn_to_mfn_frame_list[j] = + virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; + } + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = + virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; + + /* * NOTE: at this point the bootmem allocator is fully available. */ @@ -1502,6 +1635,18 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled) efi_map_memmap(); + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = current->thread.io_pl = 1; + HYPERVISOR_physdev_op(&op); + +#ifdef CONFIG_ACPI_BOOT + if (!(xen_start_info.flags & SIF_INITDOMAIN)) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + acpi_disabled = 1; + acpi_ht = 0; + } +#endif + #ifdef CONFIG_ACPI_BOOT /* * Parse the ACPI tables for possible boot-time SMP configuration. @@ -1515,16 +1660,46 @@ void __init setup_arch(char **cmdline_p) get_smp_config(); #endif + /* XXX Disable irqdebug until we have a way to avoid interrupt + * conflicts. 
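A sizing note on the pfn_to_mfn_frame_list built earlier in
setup_arch(): on i386 each page of the p2m array holds
PAGE_SIZE / sizeof(unsigned long) = 1024 entries, so slot j of the
frame list names the machine frame backing pfns 1024*j through
1024*j + 1023, and the single page allocated for the list (1024 slots)
describes up to 4GB of pseudo-physical memory -- the whole 32-bit
space.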
*/ + noirqdebug_setup(""); + register_memory(); + if (xen_start_info.flags & SIF_INITDOMAIN) { + if (!(xen_start_info.flags & SIF_PRIVILEGED)) + panic("Xen granted us console access " + "but not privileged status"); + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) - if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) - conswitchp = &vga_con; + if (!efi_enabled || + (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; + conswitchp = &dummy_con; +#endif #endif + } else { +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + extern const struct consw xennull_con; + extern int console_use_vt; +#if defined(CONFIG_VGA_CONSOLE) + /* disable VGA driver */ + ORIG_VIDEO_ISVGA = VIDEO_TYPE_VLFB; #endif + conswitchp = &xennull_con; + console_use_vt = 0; +#endif + } +} + +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + HYPERVISOR_crash(); + /* we're never actually going to get here... */ + return NOTIFY_DONE; } #include "setup_arch_post.h" diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/signal.c linux-2.6-xen-sparse/arch/i386/kernel/signal.c --- pristine-linux-2.6.12/arch/i386/kernel/signal.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/signal.c 2005-08-02 00:59:44.000000000 -0700 @@ -599,7 +599,7 @@ int fastcall do_signal(struct pt_regs *r * kernel mode. Just return without doing anything * if so. */ - if ((regs->xcs & 3) != 3) + if ((regs->xcs & 2) != 2) return 1; if (current->flags & PF_FREEZE) { diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/smpboot.c linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c --- pristine-linux-2.6.12/arch/i386/kernel/smpboot.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c 2005-07-28 13:17:07.000000000 -0700 @@ -44,6 +44,9 @@ #include <linux/smp_lock.h> #include <linux/irq.h> #include <linux/bootmem.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/percpu.h> #include <linux/delay.h> #include <linux/mc146818rtc.h> @@ -51,7 +54,11 @@ #include <asm/desc.h> #include <asm/arch_hooks.h> -#include <mach_apic.h> +#include <asm/smp_alt.h> + +#ifndef CONFIG_X86_IO_APIC +#define Dprintk(args...) +#endif #include <mach_wakecpu.h> #include <smpboot_hooks.h> @@ -79,6 +86,7 @@ u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); +#if 0 /* * Trampoline 80x86 program as an array. */ @@ -87,9 +95,19 @@ extern unsigned char trampoline_data []; extern unsigned char trampoline_end []; static unsigned char *trampoline_base; static int trampoline_exec; +#endif -static void map_cpu_to_logical_apicid(void); +#ifdef CONFIG_HOTPLUG_CPU +/* State of each CPU. */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; +#endif + +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static char resched_name[NR_CPUS][15]; +static char callfunc_name[NR_CPUS][15]; +#if 0 /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. 
The caller @@ -101,6 +119,9 @@ static unsigned long __init setup_trampo memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(trampoline_base); } +#endif + +static void map_cpu_to_logical_apicid(void); /* * We are called very early to get the low memory for the @@ -108,6 +129,15 @@ static unsigned long __init setup_trampo */ void __init smp_alloc_memory(void) { +#if 1 + int cpu; + + for (cpu = 1; cpu < NR_CPUS; cpu++) { + cpu_gdt_descr[cpu].address = (unsigned long) + alloc_bootmem_low_pages(PAGE_SIZE); + /* XXX free unused pages later */ + } +#else trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); /* * Has to be in very low memory so we can execute @@ -119,6 +149,7 @@ void __init smp_alloc_memory(void) * Make the SMP trampoline executable: */ trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1); +#endif } /* @@ -179,6 +210,7 @@ valid_k7: ; } +#if 0 /* * TSC synchronization. * @@ -315,6 +347,7 @@ static void __init synchronize_tsc_ap (v } } #undef NR_LOOPS +#endif extern void calibrate_delay(void); @@ -325,6 +358,7 @@ static void __init smp_callin(void) int cpuid, phys_id; unsigned long timeout; +#if 0 /* * If waken up by an INIT in an 82489DX configuration * we may get here before an INIT-deassert IPI reaches @@ -332,11 +366,12 @@ static void __init smp_callin(void) * lock up on an APIC access. */ wait_for_init_deassert(&init_deasserted); +#endif /* * (This works even if the APIC is not enabled.) */ - phys_id = GET_APIC_ID(apic_read(APIC_ID)); + phys_id = smp_processor_id(); cpuid = smp_processor_id(); if (cpu_isset(cpuid, cpu_callin_map)) { printk("huh, phys CPU#%d, CPU#%d already present??\n", @@ -372,6 +407,7 @@ static void __init smp_callin(void) BUG(); } +#if 0 /* * the boot CPU has finished the init stage and is spinning * on callin_map until we finish. We are free to set up this @@ -382,6 +418,7 @@ static void __init smp_callin(void) Dprintk("CALLIN, before setup_local_APIC().\n"); smp_callin_clear_local_apic(); setup_local_APIC(); +#endif map_cpu_to_logical_apicid(); /* @@ -395,22 +432,49 @@ static void __init smp_callin(void) */ smp_store_cpu_info(cpuid); +#if 0 disable_APIC_timer(); +#endif /* * Allow the master to continue. */ cpu_set(cpuid, cpu_callin_map); +#if 0 /* * Synchronize the TSC with the BP */ if (cpu_has_tsc && cpu_khz) synchronize_tsc_ap(); +#endif } static int cpucount; + +static irqreturn_t ldebug_interrupt( + int irq, void *dev_id, struct pt_regs *regs) +{ + return IRQ_HANDLED; +} + +static DEFINE_PER_CPU(int, ldebug_irq); +static char ldebug_name[NR_CPUS][15]; + +void ldebug_setup(void) +{ + int cpu = smp_processor_id(); + + per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG); + sprintf(ldebug_name[cpu], "ldebug%d", cpu); + BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt, + SA_INTERRUPT, ldebug_name[cpu], NULL)); +} + + +extern void local_setup_timer(void); + /* * Activate a secondary processor. */ @@ -425,13 +489,10 @@ static void __init start_secondary(void smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); - setup_secondary_APIC_clock(); - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); - enable_8259A_irq(0); - } - enable_APIC_timer(); + local_setup_timer(); + ldebug_setup(); + smp_intr_init(); + local_irq_enable(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. 
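One pattern worth naming, since it recurs in ldebug_setup() above and
again in smp_intr_init() at the bottom of this file: every per-cpu
interrupt source becomes a dynamically bound event channel feeding the
ordinary irq layer, never a fixed IDT vector. The generic shape, as a
sketch (names here are illustrative; the handler signature is 2.6.12's
request_irq contract):

        static irqreturn_t my_handler(int irq, void *dev_id,
                                      struct pt_regs *regs)
        {
                return IRQ_HANDLED;   // no APIC, so nothing to EOI
        }

        static void my_setup(void)
        {
                int irq = bind_virq_to_irq(VIRQ_DEBUG); // or bind_ipi_on_cpu_to_irq()
                BUG_ON(request_irq(irq, my_handler, SA_INTERRUPT,
                                   "my_evtchn", NULL));
        }

set_intr_gate() never appears in the Xen build; the evtchn upcall
demultiplexes into the normal do_IRQ path.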
@@ -510,7 +571,7 @@ u8 cpu_2_logical_apicid[NR_CPUS] = { [0 static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); - int apicid = logical_smp_processor_id(); + int apicid = smp_processor_id(); cpu_2_logical_apicid[cpu] = apicid; map_cpu_to_node(cpu, apicid_to_node(apicid)); @@ -560,6 +621,7 @@ static inline void __inquire_remote_apic } #endif +#if 0 #ifdef WAKE_SECONDARY_VIA_NMI /* * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal @@ -745,6 +807,7 @@ wakeup_secondary_cpu(int phys_apicid, un return (send_status | accept_status); } #endif /* WAKE_SECONDARY_VIA_INIT */ +#endif extern cpumask_t cpu_initialized; @@ -759,7 +822,15 @@ static int __init do_boot_cpu(int apicid unsigned long boot_error; int timeout, cpu; unsigned long start_eip; +#if 0 unsigned short nmi_high = 0, nmi_low = 0; +#endif + vcpu_guest_context_t ctxt; + extern void startup_32_smp(void); + extern void hypervisor_callback(void); + extern void failsafe_callback(void); + extern void smp_trap_init(trap_info_t *); + int i; cpu = ++cpucount; /* @@ -771,7 +842,7 @@ static int __init do_boot_cpu(int apicid panic("failed fork for CPU %d", cpu); idle->thread.eip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ - start_eip = setup_trampoline(); + start_eip = (unsigned long)startup_32_smp; /* So we see what's up */ printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); @@ -787,6 +858,107 @@ static int __init do_boot_cpu(int apicid atomic_set(&init_deasserted, 0); +#if 1 + if (cpu_gdt_descr[0].size > PAGE_SIZE) + BUG(); + cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; + printk("GDT: copying %d bytes from %lx to %lx\n", + cpu_gdt_descr[0].size, cpu_gdt_descr[0].address, + cpu_gdt_descr[cpu].address); + memcpy((void *)cpu_gdt_descr[cpu].address, + (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size); + + memset(&ctxt, 0, sizeof(ctxt)); + + ctxt.user_regs.ds = __USER_DS; + ctxt.user_regs.es = __USER_DS; + ctxt.user_regs.fs = 0; + ctxt.user_regs.gs = 0; + ctxt.user_regs.ss = __KERNEL_DS; + ctxt.user_regs.cs = __KERNEL_CS; + ctxt.user_regs.eip = start_eip; + ctxt.user_regs.esp = idle->thread.esp; + ctxt.user_regs.eflags = (1<<9) | (1<<2) | (idle->thread.io_pl<<12); + + /* FPU is set up to default initial state. */ + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + /* Virtual IDT is empty at start-of-day. */ + for ( i = 0; i < 256; i++ ) + { + ctxt.trap_ctxt[i].vector = i; + ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS; + } + smp_trap_init(ctxt.trap_ctxt); + + /* No LDT. */ + ctxt.ldt_ents = 0; + + { + unsigned long va; + int f; + + for (va = cpu_gdt_descr[cpu].address, f = 0; + va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size; + va += PAGE_SIZE, f++) { + ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT; + make_page_readonly((void *)va); + } + ctxt.gdt_ents = cpu_gdt_descr[cpu].size / 8; + } + + /* Ring 1 stack is the initial stack. */ + ctxt.kernel_ss = __KERNEL_DS; + ctxt.kernel_sp = idle->thread.esp; + + /* Callback handlers. */ + ctxt.event_callback_cs = __KERNEL_CS; + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_cs = __KERNEL_CS; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + + ctxt.ctrlreg[3] = (unsigned long)virt_to_machine(swapper_pg_dir); + + boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt); + printk("boot error: %ld\n", boot_error); + + if (!boot_error) { + /* + * allow APs to start initializing. 
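Note what is absent here relative to native: no trampoline copy, no
INIT/SIPI dance, no warm-reset vector. HYPERVISOR_boot_vcpu() above
hands Xen a complete vcpu_guest_context_t, so the new vcpu begins
execution already in protected mode at startup_32_smp with its GDT,
page tables and event callbacks installed; the only thing left to
synchronize is the callin handshake below.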
+ */ + Dprintk("Before Callout %d.\n", cpu); + cpu_set(cpu, cpu_callout_map); + Dprintk("After Callout %d.\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (cpu_isset(cpu, cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + Dprintk("OK.\n"); + printk("CPU%d: ", cpu); + print_cpu_info(&cpu_data[cpu]); + Dprintk("CPU has booted.\n"); + } else { + boot_error= 1; + } + } + x86_cpu_to_apicid[cpu] = apicid; + if (boot_error) { + /* Try to put things back the way they were before ... */ + unmap_cpu_to_logical_apicid(cpu); + cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ + cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ + cpucount--; + } + +#else Dprintk("Setting warm reset code and vector.\n"); store_NMI_vector(&nmi_high, &nmi_low); @@ -844,6 +1016,7 @@ static int __init do_boot_cpu(int apicid /* mark "stuck" area as not stuck */ *((volatile unsigned long *)trampoline_base) = 0; +#endif return boot_error; } @@ -882,7 +1055,9 @@ static void smp_tune_scheduling (void) * Cycle through the processors sending APIC IPIs to boot each. */ +#if 0 static int boot_cpu_logical_apicid; +#endif /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; @@ -892,8 +1067,11 @@ EXPORT_SYMBOL(cpu_core_map); static void __init smp_boot_cpus(unsigned int max_cpus) { - int apicid, cpu, bit, kicked; + int cpu, kicked; unsigned long bogosum = 0; +#if 0 + int apicid, bit; +#endif /* * Setup boot CPU information @@ -902,9 +1080,15 @@ static void __init smp_boot_cpus(unsigne printk("CPU%d: ", 0); print_cpu_info(&cpu_data[0]); +#if 0 boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); boot_cpu_logical_apicid = logical_smp_processor_id(); x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; +#else + // boot_cpu_physical_apicid = 0; + // boot_cpu_logical_apicid = 0; + x86_cpu_to_apicid[0] = 0; +#endif current_thread_info()->cpu = 0; smp_tune_scheduling(); @@ -914,6 +1098,7 @@ static void __init smp_boot_cpus(unsigne cpus_clear(cpu_core_map[0]); cpu_set(0, cpu_core_map[0]); +#ifdef CONFIG_X86_IO_APIC /* * If we couldn't find an SMP configuration at boot time, * get out of here now! @@ -921,16 +1106,22 @@ static void __init smp_boot_cpus(unsigne if (!smp_found_config && !acpi_lapic) { printk(KERN_NOTICE "SMP motherboard not detected.\n"); smpboot_clear_io_apic_irqs(); +#if 0 phys_cpu_present_map = physid_mask_of_physid(0); +#endif +#ifdef CONFIG_X86_LOCAL_APIC if (APIC_init_uniprocessor()) printk(KERN_NOTICE "Local APIC not detected." " Using dummy APIC emulation.\n"); +#endif map_cpu_to_logical_apicid(); cpu_set(0, cpu_sibling_map[0]); cpu_set(0, cpu_core_map[0]); return; } +#endif +#if 0 /* * Should not be necessary because the MP table should list the boot * CPU too, but we do it for the sake of robustness anyway. @@ -953,27 +1144,35 @@ static void __init smp_boot_cpus(unsigne phys_cpu_present_map = physid_mask_of_physid(0); cpu_set(0, cpu_sibling_map[0]); cpu_set(0, cpu_core_map[0]); + cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); return; } verify_local_APIC(); +#endif /* * If SMP should be disabled, then really disable it! 
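Under Xen the processor count is simply
HYPERVISOR_shared_info->n_vcpu -- there is no MP table or APIC
enumeration to fall back on -- so "really disabling" SMP amounts to
clamping that field to 1, which is exactly what the replacement line
below does.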
*/ if (!max_cpus) { - smp_found_config = 0; + HYPERVISOR_shared_info->n_vcpu = 1; printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); smpboot_clear_io_apic_irqs(); +#if 0 phys_cpu_present_map = physid_mask_of_physid(0); - cpu_set(0, cpu_sibling_map[0]); - cpu_set(0, cpu_core_map[0]); +#endif return; } + smp_intr_init(); + +#if 0 connect_bsp_APIC(); setup_local_APIC(); +#endif map_cpu_to_logical_apicid(); +#if 0 setup_portio_remap(); @@ -986,32 +1185,33 @@ static void __init smp_boot_cpus(unsigne * clustered apic ID. */ Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); +#endif + Dprintk("CPU present map: %lx\n", + (1UL << HYPERVISOR_shared_info->n_vcpu) - 1); kicked = 1; - for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { - apicid = cpu_present_to_apicid(bit); - /* - * Don't even attempt to start the boot CPU! - */ - if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID)) - continue; - - if (!check_apicid_present(bit)) - continue; + for (cpu = 1; kicked < NR_CPUS && + cpu < HYPERVISOR_shared_info->n_vcpu; cpu++) { if (max_cpus <= cpucount+1) continue; - if (do_boot_cpu(apicid)) +#ifdef CONFIG_SMP_ALTERNATIVES + if (kicked == 1) + prepare_for_smp(); +#endif + if (do_boot_cpu(cpu)) printk("CPU #%d not responding - cannot use it.\n", - apicid); + cpu); else ++kicked; } +#if 0 /* * Cleanup possible dangling ends... */ smpboot_restore_warm_reset_vector(); +#endif /* * Allow the user to impress friends. @@ -1078,7 +1278,6 @@ static void __init smp_boot_cpus(unsigne printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); smp_num_siblings = siblings; } - if (c->x86_num_cores > 1) { for (i = 0; i < NR_CPUS; i++) { if (!cpu_isset(i, cpu_callout_map)) @@ -1094,6 +1293,7 @@ static void __init smp_boot_cpus(unsigne smpboot_setup_io_apic(); +#if 0 setup_boot_APIC_clock(); /* @@ -1101,12 +1301,16 @@ static void __init smp_boot_cpus(unsigne */ if (cpu_has_tsc && cpucount && cpu_khz) synchronize_tsc_bp(); +#endif } /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) { + smp_commenced_mask = cpumask_of_cpu(0); + cpu_callin_map = cpumask_of_cpu(0); + mb(); smp_boot_cpus(max_cpus); } @@ -1116,20 +1320,189 @@ void __devinit smp_prepare_boot_cpu(void cpu_set(smp_processor_id(), cpu_callout_map); } -int __devinit __cpu_up(unsigned int cpu) +#ifdef CONFIG_HOTPLUG_CPU +#include <asm-xen/ctrl_if.h> + +/* hotplug down/up funtion pointer and target vcpu */ +struct vcpu_hotplug_handler_t { + void (*fn)(int vcpu); + u32 vcpu; +}; +static struct vcpu_hotplug_handler_t vcpu_hotplug_handler; + +/* must be called with the cpucontrol mutex held */ +static int __devinit cpu_enable(unsigned int cpu) +{ +#ifdef CONFIG_SMP_ALTERNATIVES + if (num_online_cpus() == 1) + prepare_for_smp(); +#endif + + /* get the target out of its holding state */ + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + wmb(); + + /* wait for the processor to ack it. timeout? */ + while (!cpu_online(cpu)) + cpu_relax(); + + fixup_irqs(cpu_online_map); + + /* counter the disable in fixup_irqs() */ + local_irq_enable(); + return 0; +} + +int __cpu_disable(void) { - /* This only works at boot for x86. See "rewrite" above. 
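For orientation, the down/up handshake implied by cpu_enable() above
and __cpu_die() below, together with play_dead() over in process.c, is
roughly:

        dying vcpu (play_dead)             controlling cpu
        ----------------------             ------------------------------
        cpu_state = CPU_DEAD          -->  __cpu_die() polls for CPU_DEAD
        loop: HYPERVISOR_yield()
          until cpu_state ==
          CPU_UP_PREPARE              <--  cpu_enable() sets CPU_UP_PREPARE
        re-enter cpu_online_map       -->  cpu_enable() spins on
                                           cpu_online(cpu), then fixup_irqs()

The vcpu never actually disappears; it parks in a yield loop, which is
why __cpu_die() has nothing to do beyond waiting for the CPU_DEAD
acknowledgement.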
*/ - if (cpu_isset(cpu, smp_commenced_mask)) { - local_irq_enable(); - return -ENOSYS; + cpumask_t map = cpu_online_map; + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + cpu_clear(cpu, map); + fixup_irqs(map); + + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + +#ifdef CONFIG_SMP_ALTERNATIVES + if (num_online_cpus() == 1) + unprepare_for_smp(); +#endif + + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) + return; + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} + +static int vcpu_hotplug_cpu_process(void *unused) +{ + struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler; + + if (handler->fn) { + (*(handler->fn))(handler->vcpu); + handler->fn = NULL; } + return 0; +} + +static void __vcpu_hotplug_handler(void *unused) +{ + int err; + + err = kernel_thread(vcpu_hotplug_cpu_process, + NULL, CLONE_FS | CLONE_FILES); + if (err < 0) + printk(KERN_ALERT "Error creating hotplug_cpu process!\n"); + +} + +static void vcpu_hotplug_event_handler(ctrl_msg_t *msg, unsigned long id) +{ + static DECLARE_WORK(vcpu_hotplug_work, __vcpu_hotplug_handler, NULL); + vcpu_hotplug_t *req = (vcpu_hotplug_t *)&msg->msg[0]; + struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler; + ssize_t ret; + + if (msg->length != sizeof(vcpu_hotplug_t)) + goto parse_error; + + /* grab target vcpu from msg */ + handler->vcpu = req->vcpu; + + /* determine which function to call based on msg subtype */ + switch (msg->subtype) { + case CMSG_VCPU_HOTPLUG_OFF: + handler->fn = (void *)&cpu_down; + ret = schedule_work(&vcpu_hotplug_work); + req->status = (u32) ret; + break; + case CMSG_VCPU_HOTPLUG_ON: + handler->fn = (void *)&cpu_up; + ret = schedule_work(&vcpu_hotplug_work); + req->status = (u32) ret; + break; + default: + goto parse_error; + } + + ctrl_if_send_response(msg); + return; + parse_error: + msg->length = 0; + ctrl_if_send_response(msg); +} + +static int __init setup_vcpu_hotplug_event(void) +{ + struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler; + + handler->fn = NULL; + ctrl_if_register_receiver(CMSG_VCPU_HOTPLUG, + vcpu_hotplug_event_handler, 0); + + return 0; +} + +__initcall(setup_vcpu_hotplug_event); +#else /* ... !CONFIG_HOTPLUG_CPU */ +int __cpu_disable(void) +{ + return -ENOSYS; +} + +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +int __devinit __cpu_up(unsigned int cpu) +{ /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { + printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); local_irq_enable(); return -EIO; } +#ifdef CONFIG_HOTPLUG_CPU +#ifdef CONFIG_XEN + /* Tell hypervisor to bring vcpu up. */ + HYPERVISOR_vcpu_up(cpu); +#endif + /* Already up, and in cpu_quiescent now? */ + if (cpu_isset(cpu, smp_commenced_mask)) { + cpu_enable(cpu); + return 0; + } +#endif + local_irq_enable(); /* Unleash the CPU! 
*/ cpu_set(cpu, smp_commenced_mask); @@ -1140,6 +1513,8 @@ int __devinit __cpu_up(unsigned int cpu) void __init smp_cpus_done(unsigned int max_cpus) { +#if 1 +#else #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif @@ -1148,25 +1523,26 @@ void __init smp_cpus_done(unsigned int m * Disable executability of the SMP trampoline: */ set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); +#endif } +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); + void __init smp_intr_init(void) { - /* - * IRQ0 must be given a fixed assignment and initialized, - * because it's used before the IO-APIC is set up. - */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); - - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + int cpu = smp_processor_id(); - /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + per_cpu(resched_irq, cpu) = + bind_ipi_on_cpu_to_irq(RESCHEDULE_VECTOR); + sprintf(resched_name[cpu], "resched%d", cpu); + BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt, + SA_INTERRUPT, resched_name[cpu], NULL)); + + per_cpu(callfunc_irq, cpu) = + bind_ipi_on_cpu_to_irq(CALL_FUNCTION_VECTOR); + sprintf(callfunc_name[cpu], "callfunc%d", cpu); + BUG_ON(request_irq(per_cpu(callfunc_irq, cpu), + smp_call_function_interrupt, + SA_INTERRUPT, callfunc_name[cpu], NULL)); } diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/smp.c linux-2.6-xen-sparse/arch/i386/kernel/smp.c --- pristine-linux-2.6.12/arch/i386/kernel/smp.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/smp.c 2005-07-28 13:17:07.000000000 -0700 @@ -19,10 +19,16 @@ #include <linux/mc146818rtc.h> #include <linux/cache.h> #include <linux/interrupt.h> +#include <linux/cpu.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> +#if 0 #include <mach_apic.h> +#endif +#include <asm-xen/evtchn.h> + +#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg) /* * Some notes on x86 processor bugs affecting SMP operation: @@ -121,31 +127,49 @@ static inline int __prepare_ICR2 (unsign return SET_APIC_DEST_FIELD(mask); } -void __send_IPI_shortcut(unsigned int shortcut, int vector) +DECLARE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]); + +static inline void __send_IPI_one(unsigned int cpu, int vector) { - /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; + unsigned int evtchn; - /* - * Wait for idle. 
- */ - apic_wait_icr_idle(); + evtchn = per_cpu(ipi_to_evtchn, cpu)[vector]; + // printk("send_IPI_mask_bitmask cpu %d vector %d evtchn %d\n", cpu, vector, evtchn); + if (evtchn) { +#if 0 + shared_info_t *s = HYPERVISOR_shared_info; + while (synch_test_bit(evtchn, &s->evtchn_pending[0]) || + synch_test_bit(evtchn, &s->evtchn_mask[0])) + ; +#endif + notify_via_evtchn(evtchn); + } else + printk("send_IPI to unbound port %d/%d", + cpu, vector); +} - /* - * No need to touch the target chip field - */ - cfg = __prepare_ICR(shortcut, vector); +void __send_IPI_shortcut(unsigned int shortcut, int vector) +{ + int cpu; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write_around(APIC_ICR, cfg); + switch (shortcut) { + case APIC_DEST_SELF: + __send_IPI_one(smp_processor_id(), vector); + break; + case APIC_DEST_ALLBUT: + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu == smp_processor_id()) + continue; + if (cpu_isset(cpu, cpu_online_map)) { + __send_IPI_one(cpu, vector); + } + } + break; + default: + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, + vector); + break; + } } void fastcall send_IPI_self(int vector) @@ -156,81 +180,32 @@ void fastcall send_IPI_self(int vector) /* * This is only used on smaller machines. */ -void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) +void send_IPI_mask_bitmask(cpumask_t mask, int vector) { - unsigned long mask = cpus_addr(cpumask)[0]; - unsigned long cfg; unsigned long flags; + unsigned int cpu; local_irq_save(flags); - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - apic_write_around(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write_around(APIC_ICR, cfg); + WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]); + + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu_isset(cpu, mask)) { + __send_IPI_one(cpu, vector); + } + } local_irq_restore(flags); } void send_IPI_mask_sequence(cpumask_t mask, int vector) { - unsigned long cfg, flags; - unsigned int query_cpu; - - /* - * Hack. The clustered APIC addressing mode doesn't allow us to send - * to an arbitrary mask, so I do a unicasts to each CPU instead. This - * should be modified to do 1 message per cluster ID - mbligh - */ - local_irq_save(flags); - - for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { - if (cpu_isset(query_cpu, mask)) { - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu)); - apic_write_around(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write_around(APIC_ICR, cfg); - } - } - local_irq_restore(flags); + send_IPI_mask_bitmask(mask, vector); } #include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */ +#if 0 /* XEN */ /* * Smarter SMP flushing macros. * c/o Linus Torvalds. @@ -308,7 +283,8 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. 
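Since this whole IPI-driven path is compiled out for XEN, the #else
branch at the end of the block routes the flush_tlb_* entry points to
xen_tlb_flush_mask() / xen_invlpg_mask() from the sparse tree's
arch/i386/mm/hypervisor.c. Roughly what such a helper amounts to -- a
sketch against the mmuext interface of this era, not the literal
implementation:

        void xen_tlb_flush_mask(cpumask_t *mask)
        {
                struct mmuext_op op;
                op.cmd = MMUEXT_TLB_FLUSH_MULTI;  // shoot down a set of vcpus
                op.arg2.vcpumask = mask->bits;
                BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
        }

The hypervisor performs the cross-vcpu shootdown itself, so there is
no smp_invalidate_interrupt to acknowledge and no flush_cpumask to
spin on.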
*/ -fastcall void smp_invalidate_interrupt(struct pt_regs *regs) +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id, + struct pt_regs *regs) { unsigned long cpu; @@ -334,32 +310,33 @@ fastcall void smp_invalidate_interrupt(s } else leave_mm(cpu); } - ack_APIC_irq(); smp_mb__before_clear_bit(); cpu_clear(cpu, flush_cpumask); smp_mb__after_clear_bit(); out: put_cpu_no_resched(); + + return IRQ_HANDLED; } static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { - cpumask_t tmp; /* * A couple of (to be removed) sanity checks: * - * - we do not send IPIs to not-yet booted CPUs. * - current CPU must not be in mask * - mask must exist :) */ BUG_ON(cpus_empty(cpumask)); - - cpus_and(tmp, cpumask, cpu_online_map); - BUG_ON(!cpus_equal(cpumask, tmp)); BUG_ON(cpu_isset(smp_processor_id(), cpumask)); BUG_ON(!mm); + /* If a CPU which we ran on has gone down, OK. */ + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return; + /* * i'm not happy about this global shared spinlock in the * MM hot path, but we'll see how contended it is. @@ -443,7 +420,7 @@ void flush_tlb_page(struct vm_area_struc if (current->active_mm == mm) { if(current->mm) __flush_tlb_one(va); - else + else leave_mm(smp_processor_id()); } @@ -467,6 +444,22 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1, 1); } +#else + +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id, + struct pt_regs *regs) +{ return 0; } +void flush_tlb_current_task(void) +{ xen_tlb_flush_mask(¤t->mm->cpu_vm_mask); } +void flush_tlb_mm(struct mm_struct * mm) +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); } +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); } +void flush_tlb_all(void) +{ xen_tlb_flush_all(); } + +#endif /* XEN */ + /* * this function sends a 'reschedule' IPI to another CPU. * it goes straight through and wastes no time serializing @@ -474,6 +467,7 @@ void flush_tlb_all(void) */ void smp_send_reschedule(int cpu) { + WARN_ON(cpu_is_offline(cpu)); send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } @@ -514,10 +508,16 @@ int smp_call_function (void (*func) (voi */ { struct call_data_struct data; - int cpus = num_online_cpus()-1; + int cpus; - if (!cpus) + /* Holding any lock stops cpus from going down. 
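(Concretely: cpus is now sampled inside call_lock, so a vcpu cannot be
hot-unplugged between counting the targets and raising the IPIs;
without this, the atomic_read(&data.started) != cpus wait below could
spin forever on an acknowledgement from a CPU that is no longer
there.)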
*/ + spin_lock(&call_lock); + cpus = num_online_cpus()-1; + + if (!cpus) { + spin_unlock(&call_lock); return 0; + } /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -529,7 +529,6 @@ int smp_call_function (void (*func) (voi if (wait) atomic_set(&data.finished, 0); - spin_lock(&call_lock); call_data = &data; mb(); @@ -538,11 +537,11 @@ int smp_call_function (void (*func) (voi /* Wait for response */ while (atomic_read(&data.started) != cpus) - cpu_relax(); + barrier(); if (wait) while (atomic_read(&data.finished) != cpus) - cpu_relax(); + barrier(); spin_unlock(&call_lock); return 0; @@ -555,7 +554,11 @@ static void stop_this_cpu (void * dummy) */ cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); +#if 1 + xxprint("stop_this_cpu disable_local_APIC\n"); +#else disable_local_APIC(); +#endif if (cpu_data[smp_processor_id()].hlt_works_ok) for(;;) __asm__("hlt"); for (;;); @@ -570,7 +573,11 @@ void smp_send_stop(void) smp_call_function(stop_this_cpu, NULL, 1, 0); local_irq_disable(); +#if 1 + xxprint("smp_send_stop disable_local_APIC\n"); +#else disable_local_APIC(); +#endif local_irq_enable(); } @@ -579,18 +586,21 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id, + struct pt_regs *regs) { - ack_APIC_irq(); + + return IRQ_HANDLED; } -fastcall void smp_call_function_interrupt(struct pt_regs *regs) +#include <linux/kallsyms.h> +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id, + struct pt_regs *regs) { void (*func) (void *info) = call_data->func; void *info = call_data->info; int wait = call_data->wait; - ack_APIC_irq(); /* * Notify initiating CPU that I've grabbed the data and am * about to execute the function @@ -608,5 +618,7 @@ fastcall void smp_call_function_interrup mb(); atomic_inc(&call_data->finished); } + + return IRQ_HANDLED; } diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/time.c linux-2.6-xen-sparse/arch/i386/kernel/time.c --- pristine-linux-2.6.12/arch/i386/kernel/time.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/time.c 2005-07-28 13:17:07.000000000 -0700 @@ -46,6 +46,8 @@ #include <linux/bcd.h> #include <linux/efi.h> #include <linux/mca.h> +#include <linux/sysctl.h> +#include <linux/percpu.h> #include <asm/io.h> #include <asm/smp.h> @@ -71,13 +73,24 @@ extern spinlock_t i8259A_lock; int pit_latch_buggy; /* extern */ -#include "do_timer.h" - u64 jiffies_64 = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); +#if defined(__x86_64__) +unsigned long vxtime_hz = PIT_TICK_RATE; +struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ +volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; +struct timespec __xtime __section_xtime; +struct timezone __sys_tz __section_sys_tz; +#endif + +#if defined(__x86_64__) +unsigned int cpu_khz; /* Detected as we calibrate the TSC */ +#else unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +#endif extern unsigned long wall_jiffies; @@ -86,7 +99,210 @@ DEFINE_SPINLOCK(rtc_lock); DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -struct timer_opts *cur_timer = &timer_none; +extern struct init_timer_opts timer_tsc_init; +extern struct timer_opts timer_tsc; +struct timer_opts *cur_timer = &timer_tsc; + +/* These are peridically updated 
in shared_info, and then copied here. */ +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + u32 tsc_to_usec_mul; + int tsc_shift; + u32 version; +}; +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); +static struct timeval shadow_tv; + +/* Keep track of last time we did processing/updating of jiffies and xtime. */ +static u64 processed_system_time; /* System time (ns) at last processing. */ +static DEFINE_PER_CPU(u64, processed_system_time); + +#define NS_PER_TICK (1000000000ULL/HZ) + +#define HANDLE_USEC_UNDERFLOW(_tv) do { \ + while ((_tv).tv_usec < 0) { \ + (_tv).tv_usec += USEC_PER_SEC; \ + (_tv).tv_sec--; \ + } \ +} while (0) +#define HANDLE_USEC_OVERFLOW(_tv) do { \ + while ((_tv).tv_usec >= USEC_PER_SEC) { \ + (_tv).tv_usec -= USEC_PER_SEC; \ + (_tv).tv_sec++; \ + } \ +} while (0) +static inline void __normalize_time(time_t *sec, s64 *nsec) +{ + while (*nsec >= NSEC_PER_SEC) { + (*nsec) -= NSEC_PER_SEC; + (*sec)++; + } + while (*nsec < 0) { + (*nsec) += NSEC_PER_SEC; + (*sec)--; + } +} + +/* Does this guest OS track Xen time, or set its wall clock independently? */ +static int independent_wallclock = 0; +static int __init __independent_wallclock(char *str) +{ + independent_wallclock = 1; + return 1; +} +__setup("independent_wallclock", __independent_wallclock); +#define INDEPENDENT_WALLCLOCK() \ + (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN)) + +int tsc_disable __initdata = 0; + +static void delay_tsc(unsigned long loops) +{ + unsigned long bclock, now; + + rdtscl(bclock); + do + { + rep_nop(); + rdtscl(now); + } while ((now-bclock) < loops); +} + +struct timer_opts timer_tsc = { + .name = "tsc", + .delay = delay_tsc, +}; + +static inline u32 down_shift(u64 time, int shift) +{ + if ( shift < 0 ) + return (u32)(time >> -shift); + return (u32)((u32)time << shift); +} + +/* + * 32-bit multiplication of integer multiplicand and fractional multiplier + * yielding 32-bit integer product. + */ +static inline u32 mul_frac(u32 multiplicand, u32 multiplier) +{ + u32 product_int, product_frac; + __asm__ ( + "mul %3" + : "=a" (product_frac), "=d" (product_int) + : "0" (multiplicand), "r" (multiplier) ); + return product_int; +} + +void init_cpu_khz(void) +{ + u64 __cpu_khz = 1000000ULL << 32; + struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_time[0]; + do_div(__cpu_khz, info->tsc_to_system_mul); + cpu_khz = down_shift(__cpu_khz, -info->tsc_shift); + printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now; + u32 delta; + rdtscll(now); + delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift); + return mul_frac(delta, shadow->tsc_to_nsec_mul); +} + +static unsigned long get_usec_offset(struct shadow_time_info *shadow) +{ + u64 now; + u32 delta; + rdtscll(now); + delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift); + return mul_frac(delta, shadow->tsc_to_usec_mul); +} + +static void update_wallclock(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + long wtm_nsec, xtime_nsec; + time_t wtm_sec, xtime_sec; + u64 tmp, usec; + + shadow_tv.tv_sec = s->wc_sec; + shadow_tv.tv_usec = s->wc_usec; + + if (INDEPENDENT_WALLCLOCK()) + return; + + if ((time_status & STA_UNSYNC) != 0) + return; + + /* Adjust wall-clock time base based on wall_jiffies ticks. 
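Dimensionally, the computation below is

        usec = processed_system_time / 1000             (ns since boot -> us)
             + wc_sec * 10^6 + wc_usec                  (Xen wall clock at boot)
             - (jiffies - wall_jiffies) * 10^6 / HZ     (back off unapplied ticks)

i.e. wall time as of the last tick actually folded into xtime, which
is the instant at which xtime and wall_to_monotonic are defined.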
*/ + usec = processed_system_time; + do_div(usec, 1000); + usec += (u64)shadow_tv.tv_sec * 1000000ULL; + usec += (u64)shadow_tv.tv_usec; + usec -= (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ); + + /* Split wallclock base into seconds and nanoseconds. */ + tmp = usec; + xtime_nsec = do_div(tmp, 1000000) * 1000ULL; + xtime_sec = (time_t)tmp; + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec); + + set_normalized_timespec(&xtime, xtime_sec, xtime_nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. Must be called with the xtime_lock held for writing. + */ +static void __get_time_values_from_xen(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &s->vcpu_time[smp_processor_id()]; + dst = &per_cpu(shadow_time, smp_processor_id()); + + do { + dst->version = src->time_version2; + rmb(); + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); + } + while (dst->version != src->time_version1); + + dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000; + + if ((shadow_tv.tv_sec != s->wc_sec) || + (shadow_tv.tv_usec != s->wc_usec)) + update_wallclock(); +} + +static inline int time_values_up_to_date(int cpu) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + src = &HYPERVISOR_shared_info->vcpu_time[smp_processor_id()]; + dst = &per_cpu(shadow_time, smp_processor_id()); + + return (dst->version == src->time_version2); +} + +#define TIME_VALUES_UP_TO_DATE \ + ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); }) /* * This is a special lock that is owned by the CPU and holds the index @@ -126,13 +342,20 @@ void do_gettimeofday(struct timeval *tv) unsigned long seq; unsigned long usec, sec; unsigned long max_ntp_tick; + unsigned long flags; + s64 nsec; + unsigned int cpu; + struct shadow_time_info *shadow; + + cpu = get_cpu(); + shadow = &per_cpu(shadow_time, cpu); do { unsigned long lost; seq = read_seqbegin(&xtime_lock); - usec = cur_timer->get_offset(); + usec = get_usec_offset(shadow); lost = jiffies - wall_jiffies; /* @@ -151,11 +374,31 @@ void do_gettimeofday(struct timeval *tv) usec += lost * (USEC_PER_SEC / HZ); sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); + usec += (xtime.tv_nsec / NSEC_PER_USEC); + + nsec = shadow->system_timestamp - processed_system_time; + __normalize_time(&sec, &nsec); + usec += (long)nsec / NSEC_PER_USEC; + + if (unlikely(!time_values_up_to_date(cpu))) { + /* + * We may have blocked for a long time, + * rendering our calculations invalid + * (e.g. the time delta may have + * overflowed). Detect that and recalculate + * with fresh values. 
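For reference, the version dance in __get_time_values_from_xen() above
is a seqlock whose writer sits on the other side of the hypervisor
boundary: Xen bumps time_version1 before touching a vcpu_time_info and
sets time_version2 equal to it afterwards (paraphrasing the hypervisor
side, which is not part of this patch). A snapshot is consistent
exactly when the version2 read first matches the version1 read last,
and time_values_up_to_date() re-checks the same counter later to
detect staleness; the guest only ever retries, it never writes these
fields.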
+ */ + write_seqlock_irqsave(&xtime_lock, flags); + __get_time_values_from_xen(); + write_sequnlock_irqrestore(&xtime_lock, flags); + continue; + } } while (read_seqretry(&xtime_lock, seq)); - while (usec >= 1000000) { - usec -= 1000000; + put_cpu(); + + while (usec >= USEC_PER_SEC) { + usec -= USEC_PER_SEC; sec++; } @@ -168,21 +411,49 @@ EXPORT_SYMBOL(do_gettimeofday); int do_settimeofday(struct timespec *tv) { time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; + long wtm_nsec; + s64 nsec; + struct timespec xentime; + unsigned int cpu; + struct shadow_time_info *shadow; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; + if (!INDEPENDENT_WALLCLOCK()) + return 0; /* Silent failure? */ + + cpu = get_cpu(); + shadow = &per_cpu(shadow_time, cpu); + write_seqlock_irq(&xtime_lock); + + /* + * Ensure we don't get blocked for a long time so that our time delta + * overflows. If that were to happen then our shadow time values would + * be stale, so we can retry with fresh ones. + */ + again: + nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow); + if (unlikely(!time_values_up_to_date(cpu))) { + __get_time_values_from_xen(); + goto again; + } + + __normalize_time(&sec, &nsec); + set_normalized_timespec(&xentime, sec, nsec); + /* * This is revolting. We need to set "xtime" correctly. However, the * value in this location is the value at the most recent update of * wall time. Discover what correction gettimeofday() would have * made, and then undo it! */ - nsec -= cur_timer->get_offset() * NSEC_PER_USEC; nsec -= (jiffies - wall_jiffies) * TICK_NSEC; + nsec -= (shadow->system_timestamp - processed_system_time); + + __normalize_time(&sec, &nsec); wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); @@ -193,13 +464,29 @@ int do_settimeofday(struct timespec *tv) time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; - write_sequnlock_irq(&xtime_lock); + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + if (xen_start_info.flags & SIF_INITDOMAIN) { + dom0_op_t op; + op.cmd = DOM0_SETTIME; + op.u.settime.secs = xentime.tv_sec; + op.u.settime.usecs = xentime.tv_nsec / NSEC_PER_USEC; + op.u.settime.system_time = shadow->system_timestamp; + write_sequnlock_irq(&xtime_lock); + HYPERVISOR_dom0_op(&op); + } else +#endif + write_sequnlock_irq(&xtime_lock); + + put_cpu(); + clock_was_set(); return 0; } EXPORT_SYMBOL(do_settimeofday); +#ifdef CONFIG_XEN_PRIVILEGED_GUEST static int set_rtc_mmss(unsigned long nowtime) { int retval; @@ -216,9 +503,12 @@ static int set_rtc_mmss(unsigned long no return retval; } - - -int timer_ack; +#else +static int set_rtc_mmss(unsigned long nowtime) +{ + return 0; +} +#endif /* monotonic_clock(): returns # of nanoseconds passed since time_init() * Note: This function is required to return accurate @@ -226,10 +516,31 @@ int timer_ack; */ unsigned long long monotonic_clock(void) { - return cur_timer->monotonic_clock(); + int cpu = get_cpu(); + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + s64 off; + unsigned long flags; + + for ( ; ; ) { + off = get_nsec_offset(shadow); + if (time_values_up_to_date(cpu)) + break; + write_seqlock_irqsave(&xtime_lock, flags); + __get_time_values_from_xen(); + write_sequnlock_irqrestore(&xtime_lock, flags); + } + + put_cpu(); + + return shadow->system_timestamp + off; } EXPORT_SYMBOL(monotonic_clock); +unsigned long long sched_clock(void) +{ + return monotonic_clock(); +} + #if defined(CONFIG_SMP) && 
defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { @@ -250,37 +561,47 @@ EXPORT_SYMBOL(profile_pc); static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { -#ifdef CONFIG_X86_IO_APIC - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to reset the IRR bit for do_slow_gettimeoffset(). - * This will also deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); - } -#endif + s64 delta, delta_cpu; + int cpu = smp_processor_id(); + struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + + do { + __get_time_values_from_xen(); - do_timer_interrupt_hook(regs); + delta = delta_cpu = + shadow->system_timestamp + get_nsec_offset(shadow); + delta -= processed_system_time; + delta_cpu -= per_cpu(processed_system_time, cpu); + } + while (!time_values_up_to_date(cpu)); + if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) { + printk("Timer ISR/%d: Time went backwards: " + "delta=%lld cpu_delta=%lld shadow=%lld " + "off=%lld processed=%lld cpu_processed=%lld\n", + cpu, delta, delta_cpu, shadow->system_timestamp, + (s64)get_nsec_offset(shadow), + processed_system_time, + per_cpu(processed_system_time, cpu)); + for (cpu = 0; cpu < num_online_cpus(); cpu++) + printk(" %d: %lld\n", cpu, + per_cpu(processed_system_time, cpu)); + return; + } - if (MCA_bus) { - /* The PS/2 uses level-triggered interrupts. You can't - turn them off, nor would you want to (any attempt to - enable edge-triggered interrupts usually gets intercepted by a - special hardware circuit). Hence we have to acknowledge - the timer interrupt. Through some incredibly stupid - design idea, the reset for IRQ 0 is done by setting the - high bit of the PPI port B (0x61). Note that some PS/2s, - notably the 55SX, work fine if this is removed. */ + /* System-wide jiffy work. */ + while (delta >= NS_PER_TICK) { + delta -= NS_PER_TICK; + processed_system_time += NS_PER_TICK; + do_timer(regs); + } - irq = inb_p( 0x61 ); /* read the current state */ - outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ + /* Local CPU jiffy work. */ + while (delta_cpu >= NS_PER_TICK) { + delta_cpu -= NS_PER_TICK; + per_cpu(processed_system_time, cpu) += NS_PER_TICK; + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING, regs); } } @@ -299,11 +620,7 @@ irqreturn_t timer_interrupt(int irq, voi * locally disabled. -arca */ write_seqlock(&xtime_lock); - - cur_timer->mark_offset(); - do_timer_interrupt(irq, NULL, regs); - write_sequnlock(&xtime_lock); return IRQ_HANDLED; } @@ -452,6 +769,14 @@ static void __init hpet_time_init(void) } #endif +/* Dynamically-mapped IRQ. 
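
(The reworked timer ISR above no longer assumes one tick per interrupt: a domain can be descheduled across several ticks, so it converts the accumulated nanosecond delta into however many jiffies fit and carries the remainder forward. The same arithmetic, runnable stand-alone with an invented 35 ms delta:)

    #include <stdint.h>
    #include <stdio.h>

    #define HZ 100
    #define NS_PER_TICK (1000000000ULL / HZ)

    int main(void)
    {
        uint64_t processed = 0;     /* plays processed_system_time */
        int64_t  delta = 35000000;  /* 35 ms since last processing */
        unsigned ticks = 0;

        while (delta >= (int64_t)NS_PER_TICK) {
            delta     -= NS_PER_TICK;
            processed += NS_PER_TICK;
            ticks++;                /* do_timer(regs) goes here */
        }
        printf("%u ticks, %lld ns carried over\n",
               ticks, (long long)delta);   /* 3 ticks, 5000000 ns */
        return 0;
    }
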
*/ +static DEFINE_PER_CPU(int, timer_irq); + +static struct irqaction irq_timer = { + timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer0", + NULL, NULL +}; + void __init time_init(void) { #ifdef CONFIG_HPET_TIMER @@ -464,13 +789,141 @@ void __init time_init(void) return; } #endif - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + __get_time_values_from_xen(); + xtime.tv_sec = shadow_tv.tv_sec; + xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); + init_cpu_khz(); - time_init_hook(); +#if defined(__x86_64__) + vxtime.mode = VXTIME_TSC; + vxtime.quot = (1000000L << 32) / vxtime_hz; + vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.hz = vxtime_hz; + sync_core(); + rdtscll(vxtime.last_tsc); +#endif + + per_cpu(timer_irq, 0) = bind_virq_to_irq(VIRQ_TIMER); + (void)setup_irq(per_cpu(timer_irq, 0), &irq_timer); +} + +/* Convert jiffies to system time. */ +static inline u64 jiffies_to_st(unsigned long j) +{ + unsigned long seq; + long delta; + u64 st; + + do { + seq = read_seqbegin(&xtime_lock); + delta = j - jiffies; + /* NB. The next check can trigger in some wrap-around cases, + * but that's ok: we'll just end up with a shorter timeout. */ + if (delta < 1) + delta = 1; + st = processed_system_time + (delta * NS_PER_TICK); + } while (read_seqretry(&xtime_lock, seq)); + + return st; } + +/* + * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu + * These functions are based on implementations from arch/s390/kernel/time.c + */ +void stop_hz_timer(void) +{ + unsigned int cpu = smp_processor_id(); + unsigned long j; + + /* s390 does this /before/ checking rcu_pending(). We copy them. */ + cpu_set(cpu, nohz_cpu_mask); + + /* Leave ourselves in 'tick mode' if rcu or softirq pending. */ + if (rcu_pending(cpu) || local_softirq_pending()) { + cpu_clear(cpu, nohz_cpu_mask); + j = jiffies + 1; + } else { + j = next_timer_interrupt(); + } + + BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0); +} + +void start_hz_timer(void) +{ + cpu_clear(smp_processor_id(), nohz_cpu_mask); +} + +void time_suspend(void) +{ + /* nothing */ +} + +/* No locking required. We are only CPU running, and interrupts are off. */ +void time_resume(void) +{ + init_cpu_khz(); + + /* Get timebases for new environment. */ + __get_time_values_from_xen(); + + /* Reset our own concept of passage of system time. */ + processed_system_time = + per_cpu(shadow_time, smp_processor_id()).system_timestamp; + per_cpu(processed_system_time, 0) = processed_system_time; +} + +#ifdef CONFIG_SMP +static char timer_name[NR_CPUS][15]; +void local_setup_timer(void) +{ + int seq, cpu = smp_processor_id(); + + do { + seq = read_seqbegin(&xtime_lock); + per_cpu(processed_system_time, cpu) = + per_cpu(shadow_time, cpu).system_timestamp; + } while (read_seqretry(&xtime_lock, seq)); + + per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER); + sprintf(timer_name[cpu], "timer%d", cpu); + BUG_ON(request_irq(per_cpu(timer_irq, cpu), timer_interrupt, + SA_INTERRUPT, timer_name[cpu], NULL)); +} +#endif + +/* + * /proc/sys/xen: This really belongs in another file. It can stay here for + * now however. 
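
(jiffies_to_st() above turns a jiffies deadline into absolute Xen system time so that stop_hz_timer() can program a one-shot timeout via HYPERVISOR_set_timer_op(). A sketch of the conversion, with the seqlock-protected globals passed in as plain parameters, which is a simplification:)

    #include <stdint.h>
    #include <stdio.h>

    #define HZ 100
    #define NS_PER_TICK (1000000000ULL / HZ)

    /* Signed subtraction copes with jiffies wrap; a stale deadline
     * degenerates to a one-tick timeout rather than the distant past. */
    static uint64_t jiffies_to_st(unsigned long j, unsigned long now,
                                  uint64_t processed_system_time)
    {
        long delta = (long)(j - now);
        if (delta < 1)
            delta = 1;
        return processed_system_time + (uint64_t)delta * NS_PER_TICK;
    }

    int main(void)
    {
        /* deadline three ticks past "now": 5e9 + 3 * 1e7 ns */
        printf("%llu\n", (unsigned long long)
               jiffies_to_st(1003, 1000, 5000000000ULL));
        return 0;
    }
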
+ */ +static ctl_table xen_subtable[] = { + {1, "independent_wallclock", &independent_wallclock, + sizeof(independent_wallclock), 0644, NULL, proc_dointvec}, + {0} +}; +static ctl_table xen_table[] = { + {123, "xen", NULL, 0, 0555, xen_subtable}, + {0} +}; +static int __init xen_sysctl_init(void) +{ + (void)register_sysctl_table(xen_table, 0); + return 0; +} +__initcall(xen_sysctl_init); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/timers/Makefile linux-2.6-xen-sparse/arch/i386/kernel/timers/Makefile --- pristine-linux-2.6.12/arch/i386/kernel/timers/Makefile 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/timers/Makefile 2005-07-28 13:17:07.000000000 -0700 @@ -2,8 +2,16 @@ # Makefile for x86 timers # -obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o +XENARCH := $(subst ",,$(CONFIG_XENARCH)) -obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o -obj-$(CONFIG_HPET_TIMER) += timer_hpet.o -obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o +obj-y := timer_tsc.o +c-obj-y := + +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): + @ln -fsn $(srctree)/arch/i386/kernel/timers/$(notdir $@) $@ + +obj-y += $(c-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/timers/timer_tsc.c linux-2.6-xen-sparse/arch/i386/kernel/timers/timer_tsc.c --- pristine-linux-2.6.12/arch/i386/kernel/timers/timer_tsc.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/timers/timer_tsc.c 2005-07-28 13:17:07.000000000 -0700 @@ -1,10 +1,6 @@ /* * This code largely moved from arch/i386/kernel/time.c. * See comments there for proper credits. - * - * 2004-06-25 Jesper Juhl - * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 - * failing to inline. */ #include <linux/spinlock.h> @@ -38,12 +34,9 @@ int tsc_disable __initdata = 0; extern spinlock_t i8253_lock; static int use_tsc; -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ static unsigned long long monotonic_base; +static u32 monotonic_offset; static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; /* convert from cycles(64bits) => nanoseconds (64bits) @@ -74,8 +67,6 @@ static inline unsigned long long cycles_ return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; } -static int count2; /* counter for mark_offset_tsc() */ - /* Cached *multiplier* to convert TSC counts to microseconds. * (see the equation below). * Equal to 2^32 * (1 / (clocks per usec) ). @@ -83,6 +74,9 @@ static int count2; /* counter for mark_o */ static unsigned long fast_gettimeoffset_quotient; +extern u32 shadow_tsc_stamp; +extern u64 shadow_system_time; + static unsigned long get_offset_tsc(void) { register unsigned long eax, edx; @@ -92,7 +86,7 @@ static unsigned long get_offset_tsc(void rdtsc(eax,edx); /* .. 
relative to previous jiffy (32 bits is enough) */ - eax -= last_tsc_low; /* tsc_low delta */ + eax -= shadow_tsc_stamp; /* * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient @@ -109,7 +103,7 @@ static unsigned long get_offset_tsc(void "0" (eax)); /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + edx; + return edx; } static unsigned long long monotonic_clock_tsc(void) @@ -120,7 +114,7 @@ static unsigned long long monotonic_cloc /* atomically read monotonic base & last_offset */ do { seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; + last_offset = monotonic_offset; base = monotonic_base; } while (read_seqretry(&monotonic_lock, seq)); @@ -155,6 +149,17 @@ unsigned long long sched_clock(void) return cycles_2_ns(this_offset); } + +static void mark_offset_tsc(void) +{ + + /* update the monotonic base value */ + write_seqlock(&monotonic_lock); + monotonic_base = shadow_system_time; + monotonic_offset = shadow_tsc_stamp; + write_sequnlock(&monotonic_lock); +} + static void delay_tsc(unsigned long loops) { unsigned long bclock, now; @@ -320,245 +325,39 @@ core_initcall(cpufreq_tsc); static inline void cpufreq_delayed_get(void) { return; } #endif -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned long cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - init_cpu_khz(); - cpu_data[0].loops_per_jiffy = - cpufreq_scale(cpu_data[0].loops_per_jiffy, - cpu_khz_old, - cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} -EXPORT_SYMBOL(recalibrate_cpu_khz); -static void mark_offset_tsc(void) +static int init_tsc(char* override) { - unsigned long lost,delay; - unsigned long delta = last_tsc_low; - int count; - int countmp; - static int count1 = 0; - unsigned long long this_offset, last_offset; - static int lost_count = 0; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - - /* read Pentium cycle counter */ - - rdtsc(last_tsc_low, last_tsc_high); - - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb(PIT_CH0) << 8; - - /* - * VIA686a test code... 
reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - spin_unlock(&i8253_lock); + u64 __cpu_khz; - if (pit_latch_buggy) { - /* get center value of last 3 time lutch */ - if ((count2 >= count && count >= count1) - || (count1 >= count && count >= count2)) { - count2 = count1; count1 = count; - } else if ((count1 >= count2 && count2 >= count) - || (count >= count2 && count2 >= count1)) { - countmp = count;count = count2; - count2 = count1;count1 = countmp; - } else { - count2 = count1; count1 = count; count = count1; - } - } + __cpu_khz = HYPERVISOR_shared_info->cpu_freq; + do_div(__cpu_khz, 1000); + cpu_khz = (u32)__cpu_khz; + printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); - /* lost tick compensation */ - delta = last_tsc_low - delta; + /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = + (2^32 * 1 / (clocks/us)) */ { - register unsigned long eax, edx; - eax = delta; - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - delta = edx; - } - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) { - jiffies_64 += lost-1; - - /* sanity check to ensure we're not always losing ticks */ - if (lost_count++ > 100) { - printk(KERN_WARNING "Losing too many ticks!\n"); - printk(KERN_WARNING "TSC cannot be used as a timesource. \n"); - printk(KERN_WARNING "Possible reasons for this are:\n"); - printk(KERN_WARNING " You're running with Speedstep,\n"); - printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n"); - printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n"); - printk(KERN_WARNING "Falling back to a sane timesource now.\n"); - - clock_fallback(); - } - /* ... but give the TSC a fair chance */ - if (lost_count > 25) - cpufreq_delayed_get(); - } else - lost_count = 0; - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - /* catch corner case where tick rollover occured - * between tsc and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static int __init init_tsc(char* override) -{ - - /* check clock override */ - if (override[0] && strncmp(override,"tsc",3)) { -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) { - printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); - } else -#endif - { - return -ENODEV; - } + unsigned long eax=0, edx=1000; + __asm__("divl %2" + :"=a" (fast_gettimeoffset_quotient), "=d" (edx) + :"r" (cpu_khz), + "0" (eax), "1" (edx)); } - /* - * If we have APM enabled or the CPU clock speed is variable - * (CPU stops clock on HLT or slows clock to save power) - * then the TSC timestamps may diverge by up to 1 jiffy from - * 'real time' but nothing will break. - * The most frequent case is that the CPU is "woken" from a halt - * state by the timer interrupt itself, so we get 0 error. In the - * rare cases where a driver would "wake" the CPU and request a - * timestamp, the maximum error is < 1 jiffy. 
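
(The divl in the new init_tsc() computes fast_gettimeoffset_quotient = (1000 << 32) / cpu_khz, i.e. 2^32 divided by TSC clocks per microsecond, so get_offset_tsc() can turn a cycle delta into microseconds with one multiply and a shift. The same computation in portable C, with an invented 2.4 GHz clock:)

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t gettimeoffset_quotient(uint32_t cpu_khz)
    {
        return (uint32_t)((1000ULL << 32) / cpu_khz);
    }

    int main(void)
    {
        uint32_t q = gettimeoffset_quotient(2400000);  /* 2.4 GHz */
        uint64_t tsc_delta = 2400000;                  /* ~1 ms of cycles */
        /* usec offset = (delta * q) >> 32; prints 999, the truncated
         * quotient costs well under a microsecond per millisecond. */
        printf("offset = %llu us\n",
               (unsigned long long)((tsc_delta * q) >> 32));
        return 0;
    }
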
But timestamps are - * still perfectly ordered. - * Note that the TSC counter will be reset if APM suspends - * to disk; this won't break the kernel, though, 'cuz we're - * smart. See arch/i386/kernel/apm.c. - */ - /* - * Firstly we have to do a CPU check for chips with - * a potentially buggy TSC. At this point we haven't run - * the ident/bugs checks so we must run this hook as it - * may turn off the TSC flag. - * - * NOTE: this doesn't yet handle SMP 486 machines where only - * some CPU's have a TSC. Thats never worked and nobody has - * moaned if you have the only one in the world - you fix it! - */ - - count2 = LATCH; /* initialize counter for mark_offset_tsc() */ + set_cyc2ns_scale(cpu_khz/1000); - if (cpu_has_tsc) { - unsigned long tsc_quotient; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) { - unsigned long result, remain; - printk("Using TSC for gettimeofday\n"); - tsc_quotient = calibrate_tsc_hpet(NULL); - timer_tsc.mark_offset = &mark_offset_tsc_hpet; - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_tsc_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, - 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ + use_tsc = 1; - hpet_usec_quotient = result; - } else -#endif - { - tsc_quotient = calibrate_tsc(); - } - - if (tsc_quotient) { - fast_gettimeoffset_quotient = tsc_quotient; - use_tsc = 1; - /* - * We could be more selective here I suspect - * and just enable this for the next intel chips ? - */ - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz/1000); - return 0; - } - } - return -ENODEV; + return 0; } -#ifndef CONFIG_X86_TSC -/* disable flag for tsc. Takes effect by clearing the TSC cpu flag - * in cpu/common.c */ static int __init tsc_setup(char *str) { - tsc_disable = 1; + printk(KERN_WARNING "notsc: cannot disable TSC in Xen/Linux.\n"); return 1; } -#else -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); - return 1; -} -#endif __setup("notsc", tsc_setup); @@ -566,7 +365,7 @@ __setup("notsc", tsc_setup); /************************************************************/ /* tsc timer_opts struct */ -static struct timer_opts timer_tsc = { +struct timer_opts timer_tsc = { .name = "tsc", .mark_offset = mark_offset_tsc, .get_offset = get_offset_tsc, @@ -574,7 +373,7 @@ static struct timer_opts timer_tsc = { .delay = delay_tsc, }; -struct init_timer_opts __initdata timer_tsc_init = { +struct init_timer_opts timer_tsc_init = { .init = init_tsc, .opts = &timer_tsc, }; diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/traps.c linux-2.6-xen-sparse/arch/i386/kernel/traps.c --- pristine-linux-2.6.12/arch/i386/kernel/traps.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/traps.c 2005-07-28 13:17:07.000000000 -0700 @@ -58,9 +58,6 @@ asmlinkage int system_call(void); -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 } }; - /* Do we ignore FPU interrupts ? 
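
(Side note on why timer_tsc and timer_tsc_init lose their static/__initdata qualifiers above: the Xen time code instantiates the ops table directly instead of selecting it through select_timer(). The struct itself is an ordinary function-pointer vtable; a trimmed, stand-alone illustration with a stubbed delay, not the real spin loop:)

    #include <stdio.h>

    /* Shape of the i386 timer_opts vtable, trimmed to two members. */
    struct timer_opts {
        const char *name;
        void (*delay)(unsigned long loops);
    };

    /* Stub: the real delay_tsc() spins on rdtsc until 'loops' cycles pass. */
    static void delay_tsc(unsigned long loops)
    {
        printf("would spin for %lu TSC cycles\n", loops);
    }

    static struct timer_opts timer_tsc = {
        .name  = "tsc",
        .delay = delay_tsc,
    };

    int main(void)
    {
        timer_tsc.delay(1000);
        return 0;
    }
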
*/ char ignore_fpu_irq = 0; @@ -88,7 +85,7 @@ asmlinkage void page_fault(void); asmlinkage void coprocessor_error(void); asmlinkage void simd_coprocessor_error(void); asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); +asmlinkage void fixup_4gb_segment(void); asmlinkage void machine_check(void); static int kstack_depth_to_print = 24; @@ -209,7 +206,7 @@ void show_registers(struct pt_regs *regs esp = (unsigned long) (®s->esp); ss = __KERNEL_DS; - if (regs->xcs & 3) { + if (regs->xcs & 2) { in_kernel = 0; esp = regs->esp; ss = regs->xss & 0xffff; @@ -265,7 +262,7 @@ static void handle_BUG(struct pt_regs *r char c; unsigned long eip; - if (regs->xcs & 3) + if (regs->xcs & 2) goto no_bug; /* Not in kernel */ eip = regs->eip; @@ -353,7 +350,7 @@ void die(const char * str, struct pt_reg static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) { - if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) + if (!(regs->eflags & VM_MASK) && !(2 & regs->xcs)) die(str, regs, err); } @@ -366,7 +363,7 @@ static void do_trap(int trapnr, int sign goto trap_signal; } - if (!(regs->xcs & 3)) + if (!(regs->xcs & 2)) goto kernel_trap; trap_signal: { @@ -446,49 +443,37 @@ DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) +DO_VM86_ERROR( 7, SIGSEGV, "device not available", device_not_available) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0) +#ifdef CONFIG_X86_MCE +DO_ERROR(18, SIGBUS, "machine check", machine_check) +#endif fastcall void do_general_protection(struct pt_regs * regs, long error_code) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - struct thread_struct *thread = ¤t->thread; - /* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS - * and we set the offset field correctly. Then we let the CPU to - * restart the faulting instruction. - */ - if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). - */ - if (thread->io_bitmap_max < tss->io_bitmap_max) - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - tss->io_bitmap_max = thread->io_bitmap_max; - tss->io_bitmap_base = IO_BITMAP_OFFSET; - put_cpu(); - return; + * If we trapped on an LDT access then ensure that the default_ldt is + * loaded, if nothing else. We load default_ldt lazily because LDT + * switching costs time and many applications don't need it. 
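
(All of the "regs->xcs & 3" tests in traps.c become "& 2" above because a Xen guest kernel runs in ring 1, not ring 0: kernel CS now carries RPL 1, so testing for a non-zero RPL would misclassify the kernel as user mode, while bit 1 alone still separates rings 0/1 from ring 3. A two-line demonstration with hypothetical selector values:)

    #include <stdio.h>

    static int user_mode_native(unsigned xcs) { return (xcs & 3) != 0; }
    static int user_mode_xen(unsigned xcs)    { return (xcs & 2) != 0; }

    int main(void)
    {
        unsigned kernel_cs = 0x61;   /* hypothetical selector, RPL=1 */
        unsigned user_cs   = 0x73;   /* hypothetical selector, RPL=3 */
        printf("old test on ring-1 kernel: %d  (wrongly 'user')\n",
               user_mode_native(kernel_cs));
        printf("new test: kernel=%d user=%d\n",
               user_mode_xen(kernel_cs), user_mode_xen(user_cs));
        return 0;
    }
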
+ */ + if (unlikely((error_code & 6) == 4)) { + unsigned long ldt; + __asm__ __volatile__ ("sldt %0" : "=r" (ldt)); + if (ldt == 0) { + xen_set_ldt((unsigned long)&default_ldt[0], 5); + return; + } } - put_cpu(); if (regs->eflags & VM_MASK) goto gp_in_vm86; - if (!(regs->xcs & 3)) + if (!(regs->xcs & 2)) goto gp_in_kernel; current->thread.error_code = error_code; @@ -624,6 +609,14 @@ fastcall void do_nmi(struct pt_regs * re nmi_enter(); cpu = smp_processor_id(); + +#ifdef CONFIG_HOTPLUG_CPU + if (!cpu_online(cpu)) { + nmi_exit(); + return; + } +#endif + ++nmi_count(cpu); if (!nmi_callback(regs, cpu)) @@ -682,14 +675,16 @@ fastcall void do_debug(struct pt_regs * unsigned int condition; struct task_struct *tsk = current; - __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); + condition = HYPERVISOR_get_debugreg(6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; +#if 0 /* It's safe to allow irq's after DR6 has been saved */ if (regs->eflags & X86_EFLAGS_IF) local_irq_enable(); +#endif /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { @@ -713,7 +708,7 @@ fastcall void do_debug(struct pt_regs * * check for kernel mode by just checking the CPL * of CS. */ - if ((regs->xcs & 3) == 0) + if ((regs->xcs & 2) == 0) goto clear_TF_reenable; } @@ -724,9 +719,7 @@ fastcall void do_debug(struct pt_regs * * the signal is delivered. */ clear_dr7: - __asm__("movl %0,%%db7" - : /* no output */ - : "r" (0)); + HYPERVISOR_set_debugreg(7, 0); return; debug_vm86: @@ -878,15 +871,6 @@ fastcall void do_simd_coprocessor_error( } } -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, - long error_code) -{ -#if 0 - /* No need to warn about this any longer. */ - printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - fastcall void setup_x86_bogus_stack(unsigned char * stk) { unsigned long *switch16_ptr, *switch32_ptr; @@ -947,7 +931,7 @@ asmlinkage void math_state_restore(struc struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; - clts(); /* Allow maths ops (or we recurse) */ + /* NB. 'clts' is done for us by Xen during virtual trap. */ if (!tsk_used_math(tsk)) init_fpu(tsk); restore_fpu(tsk); @@ -980,100 +964,58 @@ void __init trap_init_f00f_bug(void) } #endif -#define _set_gate(gate_addr,type,dpl,addr,seg) \ -do { \ - int __d0, __d1; \ - __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ - "movw %4,%%dx\n\t" \ - "movl %%eax,%0\n\t" \ - "movl %%edx,%1" \ - :"=m" (*((long *) (gate_addr))), \ - "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ - :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ - "3" ((char *) (addr)),"2" ((seg) << 16)); \ -} while (0) - - -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. - */ -void set_intr_gate(unsigned int n, void *addr) -{ - _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); -} - -/* - * This routine sets up an interrupt gate at directory privilege level 3. 
- */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS); -} - -static void __init set_trap_gate(unsigned int n, void *addr) -{ - _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); -} - -static void __init set_system_gate(unsigned int n, void *addr) -{ - _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); -} - -static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); -} +/* NB. All these are "trap gates" (i.e. events_mask isn't cleared). */ +static trap_info_t trap_table[] = { + { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, + { 1, 0, __KERNEL_CS, (unsigned long)debug }, + { 3, 3, __KERNEL_CS, (unsigned long)int3 }, + { 4, 3, __KERNEL_CS, (unsigned long)overflow }, + { 5, 3, __KERNEL_CS, (unsigned long)bounds }, + { 6, 0, __KERNEL_CS, (unsigned long)invalid_op }, + { 7, 0, __KERNEL_CS, (unsigned long)device_not_available }, + { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, + { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS }, + { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present }, + { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, + { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, + { 14, 0, __KERNEL_CS, (unsigned long)page_fault }, + { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, + { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, + { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, +#ifdef CONFIG_X86_MCE + { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, +#endif + { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, + { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call }, + { 0, 0, 0, 0 } +}; void __init trap_init(void) { -#ifdef CONFIG_EISA - void __iomem *p = ioremap(0x0FFFD9, 4); - if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { - EISA_bus = 1; - } - iounmap(p); -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - init_apic_mappings(); -#endif - - set_trap_gate(0,÷_error); - set_intr_gate(1,&debug); - set_intr_gate(2,&nmi); - set_system_intr_gate(3, &int3); /* int3-5 can be called from all */ - set_system_gate(4,&overflow); - set_system_gate(5,&bounds); - set_trap_gate(6,&invalid_op); - set_trap_gate(7,&device_not_available); - set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS); - set_trap_gate(9,&coprocessor_segment_overrun); - set_trap_gate(10,&invalid_TSS); - set_trap_gate(11,&segment_not_present); - set_trap_gate(12,&stack_segment); - set_trap_gate(13,&general_protection); - set_intr_gate(14,&page_fault); - set_trap_gate(15,&spurious_interrupt_bug); - set_trap_gate(16,&coprocessor_error); - set_trap_gate(17,&alignment_check); -#ifdef CONFIG_X86_MCE - set_trap_gate(18,&machine_check); -#endif - set_trap_gate(19,&simd_coprocessor_error); + HYPERVISOR_set_trap_table(trap_table); - set_system_gate(SYSCALL_VECTOR,&system_call); + /* + * default LDT is a single-entry callgate to lcall7 for iBCS + * and a callgate to lcall27 for Solaris/x86 binaries + */ + make_lowmem_page_readonly(&default_ldt[0]); /* * Should be a barrier for any external CPU state. 
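
(Instead of poking gate descriptors into the IDT, trap_init() above hands the whole trap_table to Xen with HYPERVISOR_set_trap_table(); secondary CPUs get the same table scattered into a 256-entry per-vcpu context, as the smp_trap_init() a little further down does. A stand-alone model of that scatter, with a locally defined trap_info_t that only mirrors the patch and one hypothetical handler address:)

    #include <stdio.h>
    #include <string.h>

    typedef struct {
        unsigned char  vector, flags;
        unsigned short cs;
        unsigned long  address;
    } trap_info_t;

    /* Copy a zero-terminated table into slot ctxt[vector]. */
    static void fill_trap_ctxt(const trap_info_t *table, trap_info_t ctxt[256])
    {
        const trap_info_t *t;
        memset(ctxt, 0, 256 * sizeof(trap_info_t));
        for (t = table; t->address; t++)
            ctxt[t->vector] = *t;
    }

    int main(void)
    {
        trap_info_t table[] = {
            { 13, 0, 0x61, 0xc0100000UL },  /* hypothetical #GP handler */
            { 0, 0, 0, 0 }                  /* terminator */
        };
        trap_info_t ctxt[256];
        fill_trap_ctxt(table, ctxt);
        printf("vector 13 -> %#lx\n", ctxt[13].address);
        return 0;
    }
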
*/ cpu_init(); +} - trap_init_hook(); +void smp_trap_init(trap_info_t *trap_ctxt) +{ + trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } } static int __init kstack_setup(char *s) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/vsyscall.S linux-2.6-xen-sparse/arch/i386/kernel/vsyscall.S --- pristine-linux-2.6.12/arch/i386/kernel/vsyscall.S 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/kernel/vsyscall.S 2005-07-28 13:17:07.000000000 -0700 @@ -4,12 +4,12 @@ __INITDATA .globl vsyscall_int80_start, vsyscall_int80_end vsyscall_int80_start: - .incbin "arch/i386/kernel/vsyscall-int80.so" + .incbin "arch/xen/i386/kernel/vsyscall-int80.so" vsyscall_int80_end: .globl vsyscall_sysenter_start, vsyscall_sysenter_end vsyscall_sysenter_start: - .incbin "arch/i386/kernel/vsyscall-sysenter.so" + .incbin "arch/xen/i386/kernel/vsyscall-sysenter.so" vsyscall_sysenter_end: __FINIT diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mach-default/Makefile linux-2.6-xen-sparse/arch/i386/mach-default/Makefile --- pristine-linux-2.6.12/arch/i386/mach-default/Makefile 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/mach-default/Makefile 2005-07-28 13:17:07.000000000 -0700 @@ -2,4 +2,11 @@ # Makefile for the linux kernel. # -obj-y := setup.o topology.o +c-obj-y := topology.o + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y)): + @ln -fsn $(srctree)/arch/i386/mach-default/$(notdir $@) $@ + +obj-y += $(c-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-)) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/Makefile linux-2.6-xen-sparse/arch/i386/Makefile --- pristine-linux-2.6.12/arch/i386/Makefile 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/Makefile 2005-07-28 13:17:07.000000000 -0700 @@ -17,15 +17,19 @@ # 20050320 Kianusch Sayah Karadji <kianusch@xxxxxxxxxxx> # Added support for GEODE CPU +XENARCH := $(subst ",,$(CONFIG_XENARCH)) + LDFLAGS := -m elf_i386 -OBJCOPYFLAGS := -O binary -R .note -R .comment -S LDFLAGS_vmlinux := -CHECKFLAGS += -D__i386__ +CHECK := $(CHECK) -D__i386__=1 + +CFLAGS += -m32 +AFLAGS += -m32 CFLAGS += -pipe -msoft-float # prevent gcc from keeping the stack 16 byte aligned -CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) +CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2,) align := $(cc-option-align) cflags-$(CONFIG_M386) += -march=i386 @@ -59,116 +63,46 @@ cflags-$(CONFIG_MGEODEGX1) += $(call cc # -mregparm=3 works ok on gcc-3.0 and later # -GCC_VERSION := $(call cc-version) +GCC_VERSION := $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC)) cflags-$(CONFIG_REGPARM) += $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;) # Disable unit-at-a-time mode, it makes gcc use a lot more stack # due to the lack of sharing of stacklots. 
-CFLAGS += $(call cc-option,-fno-unit-at-a-time) +CFLAGS += $(call cc-option,-fno-unit-at-a-time,) CFLAGS += $(cflags-y) -# Default subarch .c files -mcore-y := mach-default - -# Voyager subarch support -mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-i386/mach-voyager -mcore-$(CONFIG_X86_VOYAGER) := mach-voyager - -# VISWS subarch support -mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-i386/mach-visws -mcore-$(CONFIG_X86_VISWS) := mach-visws - -# NUMAQ subarch support -mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-i386/mach-numaq -mcore-$(CONFIG_X86_NUMAQ) := mach-default - -# BIGSMP subarch support -mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-i386/mach-bigsmp -mcore-$(CONFIG_X86_BIGSMP) := mach-default - -#Summit subarch support -mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit -mcore-$(CONFIG_X86_SUMMIT) := mach-default - -# generic subarchitecture -mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic -mcore-$(CONFIG_X86_GENERICARCH) := mach-default -core-$(CONFIG_X86_GENERICARCH) += arch/i386/mach-generic/ - -# ES7000 subarch support -mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000 -mcore-$(CONFIG_X86_ES7000) := mach-default -core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ - -# default subarch .h files -mflags-y += -Iinclude/asm-i386/mach-default - -head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o +head-y := arch/xen/i386/kernel/head.o arch/xen/i386/kernel/init_task.o libs-y += arch/i386/lib/ -core-y += arch/i386/kernel/ \ - arch/i386/mm/ \ - arch/i386/$(mcore-y)/ \ +core-y += arch/xen/i386/kernel/ \ + arch/xen/i386/mm/ \ + arch/xen/i386/mach-default/ \ arch/i386/crypto/ +# \ +# arch/xen/$(mcore-y)/ drivers-$(CONFIG_MATH_EMULATION) += arch/i386/math-emu/ -drivers-$(CONFIG_PCI) += arch/i386/pci/ +drivers-$(CONFIG_PCI) += arch/xen/i386/pci/ # must be linked after kernel/ drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/ drivers-$(CONFIG_PM) += arch/i386/power/ -CFLAGS += $(mflags-y) -AFLAGS += $(mflags-y) - -boot := arch/i386/boot - -.PHONY: zImage bzImage compressed zlilo bzlilo \ - zdisk bzdisk fdimage fdimage144 fdimage288 install kernel_install - -all: bzImage - -# KBUILD_IMAGE specify target image being built - KBUILD_IMAGE := $(boot)/bzImage -zImage zlilo zdisk: KBUILD_IMAGE := arch/i386/boot/zImage +# for clean +obj- += kernel/ mm/ pci/ +#obj- += ../../i386/lib/ ../../i386/mm/ +#../../i386/$(mcore-y)/ +#obj- += ../../i386/pci/ ../../i386/oprofile/ ../../i386/power/ + +xenflags-y += -Iinclude/asm-xen/asm-i386/mach-xen \ + -Iinclude/asm-i386/mach-default +CFLAGS += $(xenflags-y) +AFLAGS += $(xenflags-y) -zImage bzImage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) +prepare: include/asm-$(XENARCH)/asm_offsets.h +CLEAN_FILES += include/asm-$(XENARCH)/asm_offsets.h -compressed: zImage +arch/$(XENARCH)/kernel/asm-offsets.s: include/asm include/.asm-ignore \ + include/linux/version.h include/config/MARKER -zlilo bzlilo: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo - -zdisk bzdisk: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk - -fdimage fdimage144 fdimage288: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ - -install: vmlinux -install kernel_install: - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install - -prepare: include/asm-$(ARCH)/asm_offsets.h -CLEAN_FILES += include/asm-$(ARCH)/asm_offsets.h - -arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \ - include/config/MARKER - -include/asm-$(ARCH)/asm_offsets.h: 
arch/$(ARCH)/kernel/asm-offsets.s +include/asm-$(XENARCH)/asm_offsets.h: arch/$(XENARCH)/kernel/asm-offsets.s $(call filechk,gen-asm-offsets) - -archclean: - $(Q)$(MAKE) $(clean)=arch/i386/boot - -define archhelp - echo '* bzImage - Compressed kernel image (arch/$(ARCH)/boot/bzImage)' - echo ' install - Install kernel using' - echo ' (your) ~/bin/installkernel or' - echo ' (distribution) /sbin/installkernel or' - echo ' install to $$(INSTALL_PATH) and run lilo' - echo ' bzdisk - Create a boot floppy in /dev/fd0' - echo ' fdimage - Create a boot floppy image' -endef - -CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/fault.c linux-2.6-xen-sparse/arch/i386/mm/fault.c --- pristine-linux-2.6.12/arch/i386/mm/fault.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/mm/fault.c 2005-07-28 13:17:07.000000000 -0700 @@ -21,6 +21,7 @@ #include <linux/vt_kern.h> /* For unblank_screen() */ #include <linux/highmem.h> #include <linux/module.h> +#include <linux/percpu.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -29,6 +30,8 @@ extern void die(const char *,struct pt_regs *,long); +DEFINE_PER_CPU(pgd_t *, cur_pgd); + /* * Unlock any spinlocks which will prevent us from getting the * message out @@ -77,7 +80,7 @@ static inline unsigned long get_segment_ u32 seg_ar, seg_limit, base, *desc; /* The standard kernel/user address space limit. */ - *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg; + *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg; /* Unlikely, but must come before segment checks. */ if (unlikely((regs->eflags & VM_MASK) != 0)) @@ -107,7 +110,7 @@ static inline unsigned long get_segment_ desc = (void *)desc + (seg & ~7); } else { /* Must disable preemption while reading the GDT. */ - desc = (u32 *)&per_cpu(cpu_gdt_table, get_cpu()); + desc = (u32 *)get_cpu_gdt_table(get_cpu()); desc = (void *)desc + (seg & ~7); } @@ -211,25 +214,30 @@ fastcall void do_invalid_op(struct pt_re * bit 1 == 0 means read, 1 means write * bit 2 == 0 means kernel, 1 means user-mode */ -fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code) +fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct * vma; - unsigned long address; unsigned long page; int write; siginfo_t info; - /* get the address */ - __asm__("movl %%cr2,%0":"=r" (address)); + /* Set the "privileged fault" bit to something sane. */ + error_code &= 3; + error_code |= (regs->xcs & 2) << 1; + if (regs->eflags & X86_EFLAGS_VM) + error_code |= 4; if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; +#if 0 /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) local_irq_enable(); +#endif tsk = current; @@ -446,9 +454,10 @@ no_context: printk(" at virtual address %08lx\n",address); printk(KERN_ALERT " printing eip:\n"); printk("%08lx\n", regs->eip); - asm("movl %%cr3,%0":"=r" (page)); - page = ((unsigned long *) __va(page))[address >> 22]; - printk(KERN_ALERT "*pde = %08lx\n", page); + page = ((unsigned long *) per_cpu(cur_pgd, smp_processor_id())) + [address >> 22]; + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, + machine_to_phys(page)); /* * We must not directly access the pte in the highpte * case, the page table might be allocated in highmem. 
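
(In the fault.c hunk above, Xen delivers the faulting address as a third argument instead of via %cr2, and the error code needs its "user mode" bit rebuilt from CS, since ring-1 kernel faults no longer look like ring-0 ones to the hardware. The bit surgery from do_page_fault(), runnable stand-alone:)

    #include <stdio.h>

    /* bit 0: protection violation, bit 1: write, bit 2: user mode. */
    static unsigned long normalize_error_code(unsigned long error_code,
                                              unsigned long xcs, int vm86)
    {
        error_code &= 3;               /* keep present/write bits */
        error_code |= (xcs & 2) << 1;  /* CS bit 1 becomes the user bit */
        if (vm86)
            error_code |= 4;           /* vm86 always counts as user */
        return error_code;
    }

    int main(void)
    {
        printf("%#lx\n", normalize_error_code(3, 0x73, 0)); /* 0x7: user write */
        printf("%#lx\n", normalize_error_code(1, 0x61, 0)); /* 0x1: kernel read */
        return 0;
    }
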
@@ -459,8 +468,10 @@ no_context: if (page & 1) { page &= PAGE_MASK; address &= 0x003ff000; + page = machine_to_phys(page); page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte = %08lx\n", page); + printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page, + machine_to_phys(page)); } #endif die("Oops", regs, error_code); @@ -514,14 +525,12 @@ vmalloc_fault: * an interrupt in the middle of a task switch.. */ int index = pgd_index(address); - unsigned long pgd_paddr; pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; - asm("movl %%cr3,%0":"=r" (pgd_paddr)); - pgd = index + (pgd_t *)__va(pgd_paddr); + pgd = index + per_cpu(cur_pgd, smp_processor_id()); pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/highmem.c linux-2.6-xen-sparse/arch/i386/mm/highmem.c --- pristine-linux-2.6.12/arch/i386/mm/highmem.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/mm/highmem.c 2005-07-28 13:17:07.000000000 -0700 @@ -25,7 +25,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; @@ -41,12 +41,23 @@ void *kmap_atomic(struct page *page, enu if (!pte_none(*(kmap_pte-idx))) BUG(); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); __flush_tlb_one(vaddr); return (void*) vaddr; } +void *kmap_atomic(struct page *page, enum km_type type) +{ + return __kmap_atomic(page, type, kmap_prot); +} + +/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */ +void *kmap_atomic_pte(struct page *page, enum km_type type) +{ + return __kmap_atomic(page, type, PAGE_KERNEL_RO); +} + void kunmap_atomic(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/hypervisor.c linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c --- pristine-linux-2.6.12/arch/i386/mm/hypervisor.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c 2005-07-28 13:17:07.000000000 -0700 @@ -0,0 +1,363 @@ +/****************************************************************************** + * mm/hypervisor.c + * + * Update page tables via the hypervisor. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/config.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm-xen/hypervisor.h> +#include <asm-xen/balloon.h> +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#include <linux/percpu.h> +#include <asm/tlbflush.h> +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) +#define pte_offset_kernel pte_offset +#define pud_t pgd_t +#define pud_offset(d, va) d +#elif defined(CONFIG_X86_64) +#define pmd_val_ma(v) (v).pmd +#else +#ifdef CONFIG_X86_PAE +# define pmd_val_ma(v) ((v).pmd) +# define pud_val_ma(v) ((v).pgd.pgd) +#else +# define pmd_val_ma(v) ((v).pud.pgd.pgd) +#endif +#endif + +#ifndef CONFIG_XEN_SHADOW_MODE +void xen_l1_entry_update(pte_t *ptr, pte_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = pte_val_ma(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} + +void xen_l2_entry_update(pmd_t *ptr, pmd_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = pmd_val_ma(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} + +#ifdef CONFIG_X86_PAE +void xen_l3_entry_update(pud_t *ptr, pud_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = pud_val_ma(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} +#endif + +#ifdef CONFIG_X86_64 +void xen_l3_entry_update(pud_t *ptr, pud_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = val.pud; + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} + +void xen_l4_entry_update(pgd_t *ptr, pgd_t val) +{ + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = val.pgd; + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_XEN_SHADOW_MODE */ + +void xen_machphys_update(unsigned long mfn, unsigned long pfn) +{ + mmu_update_t u; + u.ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + u.val = pfn; + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pt_switch(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_NEW_BASEPTR; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_new_user_pt(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_NEW_USER_BASEPTR; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_tlb_flush(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_invlpg(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_LOCAL; + op.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +#ifdef CONFIG_SMP + +void xen_tlb_flush_all(void) +{ + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_ALL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_tlb_flush_mask(cpumask_t *mask) +{ + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_TLB_FLUSH_MULTI; + op.vcpumask = mask->bits; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_invlpg_all(unsigned long ptr) +{ + struct mmuext_op 
op; + op.cmd = MMUEXT_INVLPG_ALL; + op.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr) +{ + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_INVLPG_MULTI; + op.vcpumask = mask->bits; + op.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +#endif /* CONFIG_SMP */ + +#ifndef CONFIG_XEN_SHADOW_MODE +void xen_pgd_pin(unsigned long ptr) +{ + struct mmuext_op op; +#ifdef CONFIG_X86_64 + op.cmd = MMUEXT_PIN_L4_TABLE; +#elif defined(CONFIG_X86_PAE) + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L2_TABLE; +#endif + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pgd_unpin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pte_pin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L1_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pte_unpin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +#ifdef CONFIG_X86_64 +void xen_pud_pin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L3_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pud_unpin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pmd_pin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L2_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_pmd_unpin(unsigned long ptr) +{ + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_XEN_SHADOW_MODE */ + +void xen_set_ldt(unsigned long ptr, unsigned long len) +{ + struct mmuext_op op; + op.cmd = MMUEXT_SET_LDT; + op.linear_addr = ptr; + op.nr_ents = len; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +void xen_contig_memory(unsigned long vstart, unsigned int order) +{ + /* + * Ensure multi-page extents are contiguous in machine memory. This code + * could be cleaned up some, and the number of hypercalls reduced. + */ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long mfn, i, flags; + + scrub_pages(vstart, 1 << order); + + balloon_lock(flags); + + /* 1. Zap current PTEs, giving away the underlying pages. */ + for (i = 0; i < (1<<order); i++) { + pgd = pgd_offset_k(vstart + (i*PAGE_SIZE)); + pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); + pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); + pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); + mfn = pte_mfn(*pte); + HYPERVISOR_update_va_mapping( + vstart + (i*PAGE_SIZE), __pte_ma(0), 0); + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = + INVALID_P2M_ENTRY; + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, &mfn, 1, 0) != 1); + } + + /* 2. Get a new contiguous memory extent. 
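
(Each wrapper above issues a one-element HYPERVISOR_mmu_update() or HYPERVISOR_mmuext_op(), but both hypercalls take arrays, so a caller touching many PTEs can amortize the guest/hypervisor transition. A toy user-space model of the batching idea; mmu_update_t is declared locally to mirror the interface, and fake_mmu_update() is a stand-in, obviously not the real hypercall:)

    #include <stdint.h>
    #include <stdio.h>

    /* Local mirror of Xen's request: ptr is the machine address of the
     * PTE (low bits select the command), val the new entry. */
    typedef struct { uint64_t ptr; uint64_t val; } mmu_update_t;

    static int fake_mmu_update(const mmu_update_t *req, int count)
    {
        (void)req;
        printf("apply %d PTE updates in one transition\n", count);
        return 0;
    }

    int main(void)
    {
        mmu_update_t batch[16];
        int i, n = 0;
        for (i = 0; i < 16; i++) {
            batch[n].ptr = 0x100000ULL + (uint64_t)i * 8; /* hypothetical */
            batch[n].val = 0;                             /* zap entries */
            n++;
        }
        fake_mmu_update(batch, n);  /* one hypercall instead of sixteen */
        return 0;
    }
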
*/ + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_increase_reservation, &mfn, 1, order) != 1); + + /* 3. Map the new extent in place of old pages. */ + for (i = 0; i < (1<<order); i++) { + HYPERVISOR_update_va_mapping( + vstart + (i*PAGE_SIZE), + __pte_ma(((mfn+i)<<PAGE_SHIFT)|__PAGE_KERNEL), 0); + xen_machphys_update(mfn+i, (__pa(vstart)>>PAGE_SHIFT)+i); + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = mfn+i; + } + + flush_tlb_all(); + + balloon_unlock(flags); +} + +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + +unsigned long allocate_empty_lowmem_region(unsigned long pages) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long *pfn_array; + unsigned long vstart; + unsigned long i; + unsigned int order = get_order(pages*PAGE_SIZE); + + vstart = __get_free_pages(GFP_KERNEL, order); + if ( vstart == 0 ) + return 0UL; + + scrub_pages(vstart, 1 << order); + + pfn_array = vmalloc((1<<order) * sizeof(*pfn_array)); + if ( pfn_array == NULL ) + BUG(); + + for ( i = 0; i < (1<<order); i++ ) + { + pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE))); + pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); + pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); + pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); + pfn_array[i] = pte_mfn(*pte); +#ifdef CONFIG_X86_64 + xen_l1_entry_update(pte, __pte(0)); +#else + HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0); +#endif + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = + INVALID_P2M_ENTRY; + } + + flush_tlb_all(); + + balloon_put_pages(pfn_array, 1 << order); + + vfree(pfn_array); + + return vstart; +} + +#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/init.c linux-2.6-xen-sparse/arch/i386/mm/init.c --- pristine-linux-2.6.12/arch/i386/mm/init.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/mm/init.c 2005-07-28 13:17:07.000000000 -0700 @@ -39,6 +39,7 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm-xen/hypervisor.h> unsigned int __VMALLOC_RESERVE = 128 << 20; @@ -56,9 +57,10 @@ static pmd_t * __init one_md_table_init( { pud_t *pud; pmd_t *pmd_table; - + #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + make_page_readonly(pmd_table); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); if (pmd_table != pmd_offset(pud, 0)) @@ -79,6 +81,7 @@ static pte_t * __init one_page_table_ini { if (pmd_none(*pmd)) { pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + make_page_readonly(page_table); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); if (page_table != pte_offset_kernel(pmd, 0)) BUG(); @@ -119,7 +122,7 @@ static void __init page_table_range_init pud = pud_offset(pgd, vaddr); pmd = pmd_offset(pud, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (pmd_none(*pmd)) + if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd)) one_page_table_init(pmd); vaddr += PMD_SIZE; @@ -148,16 +151,36 @@ static void __init kernel_physical_mappi pte_t *pte; int pgd_idx, pmd_idx, pte_ofs; + unsigned long max_ram_pfn = xen_start_info.nr_pages; + if (max_ram_pfn > max_low_pfn) + max_ram_pfn = max_low_pfn; + pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; pfn = 0; + pmd_idx = pmd_index(PAGE_OFFSET); + pte_ofs = pte_index(PAGE_OFFSET); for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { +#ifdef CONFIG_XEN + /* + * Native linux hasn't PAE-paging enabled yet at this + * point. 
When running as xen domain we are in PAE + * mode already, thus we can't simply hook a empty + * pmd. That would kill the mappings we are currently + * using ... + */ + pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET); +#else pmd = one_md_table_init(pgd); +#endif if (pfn >= max_low_pfn) continue; - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { + pmd += pmd_idx; + for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; + if (address >= HYPERVISOR_VIRT_START) + continue; /* Map with big pages if possible, otherwise create normal page tables. */ if (cpu_has_pse) { @@ -171,14 +194,20 @@ static void __init kernel_physical_mappi } else { pte = one_page_table_init(pmd); - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { + pte += pte_ofs; + for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { + /* XEN: Only map initial RAM allocation. */ + if ((pfn >= max_ram_pfn) || pte_present(*pte)) + continue; if (is_kernel_text(address)) set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); else set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); } + pte_ofs = 0; } } + pmd_idx = 0; } } @@ -271,7 +300,8 @@ void __init one_highpage_init(struct pag ClearPageReserved(page); set_bit(PG_highmem, &page->flags); set_page_count(page, 1); - __free_page(page); + if (pfn < xen_start_info.nr_pages) + __free_page(page); totalhigh_pages++; } else SetPageReserved(page); @@ -308,6 +338,7 @@ static void __init pagetable_init (void) { unsigned long vaddr; pgd_t *pgd_base = swapper_pg_dir; + pgd_t *old_pgd = (pgd_t *)xen_start_info.pt_base; #ifdef CONFIG_X86_PAE int i; @@ -328,6 +359,45 @@ static void __init pagetable_init (void) __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; } + /* + * Switch to proper mm_init page directory. Initialise from the current + * page directory, write-protect the new page directory, then switch to + * it. We clean up by write-enabling and then freeing the old page dir. + */ +#ifndef CONFIG_X86_PAE + memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t)); + make_page_readonly(pgd_base); + xen_pgd_pin(__pa(pgd_base)); + load_cr3(pgd_base); + xen_pgd_unpin(__pa(old_pgd)); + make_page_writable(old_pgd); + __flush_tlb_all(); + free_bootmem(__pa(old_pgd), PAGE_SIZE); +#else + { + pud_t *old_pud = pud_offset(old_pgd+3, PAGE_OFFSET); + pmd_t *old_pmd = pmd_offset(old_pud, PAGE_OFFSET); + pmd_t *new_pmd = alloc_bootmem_low_pages(PAGE_SIZE); + + memcpy(new_pmd, old_pmd, PAGE_SIZE); + memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t)); + set_pgd(&pgd_base[3], __pgd(__pa(new_pmd) | _PAGE_PRESENT)); + + make_page_readonly(new_pmd); + make_page_readonly(pgd_base); + xen_pgd_pin(__pa(pgd_base)); + load_cr3(pgd_base); + xen_pgd_unpin(__pa(old_pgd)); + make_page_writable(old_pgd); + make_page_writable(old_pmd); + __flush_tlb_all(); + + free_bootmem(__pa(old_pgd), PAGE_SIZE); + free_bootmem(__pa(old_pmd), PAGE_SIZE); + } +#endif + + init_mm.context.pinned = 1; kernel_physical_mapping_init(pgd_base); remap_numa_kva(); @@ -340,7 +410,7 @@ static void __init pagetable_init (void) permanent_kmaps_init(pgd_base); -#ifdef CONFIG_X86_PAE +#if 0 /* def CONFIG_X86_PAE */ /* * Add low memory identity-mappings - SMP needs it when * starting up on an AP from real-mode. In the non-PAE @@ -348,7 +418,7 @@ static void __init pagetable_init (void) * All user-space mappings are explicitly cleared after * SMP startup. 
*/ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; + set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); #endif } @@ -383,7 +453,7 @@ void zap_low_mappings (void) * us, because pgd_clear() is a no-op on i386. */ for (i = 0; i < USER_PTRS_PER_PGD; i++) -#ifdef CONFIG_X86_PAE +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else set_pgd(swapper_pg_dir+i, __pgd(0)); @@ -470,6 +540,10 @@ out: */ void __init paging_init(void) { +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + int i; +#endif + #ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) @@ -478,12 +552,12 @@ void __init paging_init(void) pagetable_init(); - load_cr3(swapper_pg_dir); - -#ifdef CONFIG_X86_PAE +#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN) /* * We will bail out later - printk doesn't work right now so * the user would just see a hanging kernel. + * when running as xen domain we are already in PAE mode at + * this point. */ if (cpu_has_pae) set_in_cr4(X86_CR4_PAE); @@ -491,6 +565,22 @@ void __init paging_init(void) __flush_tlb_all(); kmap_init(); + + /* Switch to the real shared_info page, and clear the dummy page. */ + set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + memset(empty_zero_page, 0, sizeof(empty_zero_page)); + +#ifdef CONFIG_XEN_PHYSDEV_ACCESS + /* Setup mapping of lower 1st MB */ + for (i = 0; i < NR_FIX_ISAMAPS; i++) + if (xen_start_info.flags & SIF_PRIVILEGED) + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); + else + __set_fixmap(FIX_ISAMAP_BEGIN - i, + virt_to_machine(empty_zero_page), + PAGE_KERNEL_RO); +#endif } /* @@ -539,6 +629,7 @@ void __init mem_init(void) int codesize, reservedpages, datasize, initsize; int tmp; int bad_ppro; + unsigned long pfn; #ifndef CONFIG_DISCONTIGMEM if (!mem_map) @@ -564,9 +655,18 @@ void __init mem_init(void) #else high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif - + printk("vmalloc area: %lx-%lx, maxmem %lx\n", + VMALLOC_START,VMALLOC_END,MAXMEM); + BUG_ON(VMALLOC_START > VMALLOC_END); + /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); + /* XEN: init and count low-mem pages outside initial allocation. */ + for (pfn = xen_start_info.nr_pages; pfn < max_low_pfn; pfn++) { + ClearPageReserved(&mem_map[pfn]); + set_page_count(&mem_map[pfn], 1); + totalram_pages++; + } reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) @@ -630,11 +730,16 @@ void __init pgtable_cache_init(void) panic("pgtable_cache_init(): cannot create pmd cache"); } pgd_cache = kmem_cache_create("pgd", +#if 0 /* How the heck _this_ works in native linux ??? */ PTRS_PER_PGD*sizeof(pgd_t), PTRS_PER_PGD*sizeof(pgd_t), +#else + PAGE_SIZE, + PAGE_SIZE, +#endif 0, pgd_ctor, - PTRS_PER_PMD == 1 ? 
pgd_dtor : NULL); + pgd_dtor); if (!pgd_cache) panic("pgtable_cache_init(): Cannot create pgd cache"); } diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/ioremap.c linux-2.6-xen-sparse/arch/i386/mm/ioremap.c --- pristine-linux-2.6.12/arch/i386/mm/ioremap.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/mm/ioremap.c 2005-07-28 13:17:07.000000000 -0700 @@ -11,91 +11,54 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/module.h> #include <asm/io.h> #include <asm/fixmap.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/pgtable.h> +#include <asm/pgalloc.h> -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 +#ifndef CONFIG_XEN_PHYSDEV_ACCESS -static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, unsigned long phys_addr, unsigned long flags) +void * __ioremap(unsigned long phys_addr, unsigned long size, + unsigned long flags) { - pte_t *pte; - unsigned long pfn; - - pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel(&init_mm, pmd, addr); - if (!pte) - return -ENOMEM; - do { - BUG_ON(!pte_none(*pte)); - set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW | - _PAGE_DIRTY | _PAGE_ACCESSED | flags))); - pfn++; - } while (pte++, addr += PAGE_SIZE, addr != end); - return 0; + return NULL; } -static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, unsigned long phys_addr, unsigned long flags) +void *ioremap_nocache (unsigned long phys_addr, unsigned long size) { - pmd_t *pmd; - unsigned long next; - - phys_addr -= addr; - pmd = pmd_alloc(&init_mm, pud, addr); - if (!pmd) - return -ENOMEM; - do { - next = pmd_addr_end(addr, end); - if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, flags)) - return -ENOMEM; - } while (pmd++, addr = next, addr != end); - return 0; + return NULL; } -static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end, unsigned long phys_addr, unsigned long flags) +void iounmap(volatile void __iomem *addr) { - pud_t *pud; - unsigned long next; +} - phys_addr -= addr; - pud = pud_alloc(&init_mm, pgd, addr); - if (!pud) - return -ENOMEM; - do { - next = pud_addr_end(addr, end); - if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, flags)) - return -ENOMEM; - } while (pud++, addr = next, addr != end); - return 0; +void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) +{ + return NULL; } -static int ioremap_page_range(unsigned long addr, - unsigned long end, unsigned long phys_addr, unsigned long flags) +void __init bt_iounmap(void *addr, unsigned long size) { - pgd_t *pgd; - unsigned long next; - int err; +} - BUG_ON(addr >= end); - flush_cache_all(); - phys_addr -= addr; - pgd = pgd_offset_k(addr); - spin_lock(&init_mm.page_table_lock); - do { - next = pgd_addr_end(addr, end); - err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags); - if (err) - break; - } while (pgd++, addr = next, addr != end); - spin_unlock(&init_mm.page_table_lock); - flush_tlb_all(); - return err; +#else + +/* + * Does @address reside within a non-highmem page that is local to this virtual + * machine (i.e., not an I/O page, nor a memory page belonging to another VM). + * See the comment that accompanies pte_pfn() in pgtable-2level.h to understand + * why this works. 
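The reason this works, briefly: the machine-to-physical table is global, so mfn_to_pfn() can return a pseudo-physical frame number that in fact belongs to another domain or to an I/O range; only the round trip through this guest's own phys-to-machine table confirms the frame is local. A minimal sketch of the test that follows, with mfn_to_pfn()/pfn_to_mfn() taken to be the usual M2P/P2M table lookups:

    /* Sketch only: mirrors the is_local_lowmem() test below.
     * mfn_to_pfn()/pfn_to_mfn() are assumed M2P/P2M lookups. */
    static inline int frame_is_local_lowmem(unsigned long mfn)
    {
        unsigned long pfn = mfn_to_pfn(mfn);  /* global M2P, may alias */
        /* the frame is ours only if our own P2M maps pfn back to mfn */
        return (pfn < max_low_pfn) && (pfn_to_mfn(pfn) == mfn);
    }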
+ */ +static inline int is_local_lowmem(unsigned long address) +{ + extern unsigned long max_low_pfn; + unsigned long mfn = address >> PAGE_SHIFT; + unsigned long pfn = mfn_to_pfn(mfn); + return ((pfn < max_low_pfn) && (pfn_to_mfn(pfn) == mfn)); } /* @@ -116,31 +79,36 @@ void __iomem * __ioremap(unsigned long p void __iomem * addr; struct vm_struct * area; unsigned long offset, last_addr; + domid_t domid = DOMID_IO; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; if (!size || last_addr < phys_addr) return NULL; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST /* * Don't remap the low PCI/ISA area, it's always mapped.. */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (void __iomem *) phys_to_virt(phys_addr); + if (phys_addr >= 0x0 && last_addr < 0x100000) + return isa_bus_to_virt(phys_addr); +#endif /* * Don't allow anybody to remap normal RAM that we're using.. */ - if (phys_addr <= virt_to_phys(high_memory - 1)) { + if (is_local_lowmem(phys_addr)) { char *t_addr, *t_end; struct page *page; - t_addr = __va(phys_addr); + t_addr = bus_to_virt(phys_addr); t_end = t_addr + (size - 1); for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) if(!PageReserved(page)) return NULL; + + domid = DOMID_SELF; } /* @@ -158,8 +126,10 @@ void __iomem * __ioremap(unsigned long p return NULL; area->phys_addr = phys_addr; addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long) addr, - (unsigned long) addr + size, phys_addr, flags)) { + if (direct_remap_area_pages(&init_mm, (unsigned long) addr, phys_addr, + size, __pgprot(_PAGE_PRESENT | _PAGE_RW | + _PAGE_DIRTY | _PAGE_ACCESSED + | flags), domid)) { vunmap((void __force *) addr); return NULL; } @@ -199,8 +169,8 @@ void __iomem *ioremap_nocache (unsigned /* Guaranteed to be > phys_addr, as per __ioremap() */ last_addr = phys_addr + size - 1; - if (last_addr < virt_to_phys(high_memory) - 1) { - struct page *ppage = virt_to_page(__va(phys_addr)); + if (is_local_lowmem(last_addr)) { + struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); unsigned long npages; phys_addr &= PAGE_MASK; @@ -227,32 +197,24 @@ void iounmap(volatile void __iomem *addr { struct vm_struct *p; if ((void __force *) addr <= high_memory) + return; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) return; - - /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. 
- */ - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) - return; - - write_lock(&vmlist_lock); - p = __remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr)); +#endif + p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr)); if (!p) { - printk("iounmap: bad address %p\n", addr); - goto out_unlock; + printk("__iounmap: bad address %p\n", addr); + return; } - if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { - change_page_attr(virt_to_page(__va(p->phys_addr)), - p->size >> PAGE_SHIFT, - PAGE_KERNEL); + if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { + /* p->size includes the guard page, but cpa doesn't like that */ + change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), + (p->size - PAGE_SIZE) >> PAGE_SHIFT, + PAGE_KERNEL); global_flush_tlb(); } -out_unlock: - write_unlock(&vmlist_lock); kfree(p); } @@ -267,11 +229,13 @@ void __init *bt_ioremap(unsigned long ph if (!size || last_addr < phys_addr) return NULL; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST /* * Don't remap the low PCI/ISA area, it's always mapped.. */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return phys_to_virt(phys_addr); + if (phys_addr >= 0x0 && last_addr < 0x100000) + return isa_bus_to_virt(phys_addr); +#endif /* * Mappings have to be page-aligned @@ -310,6 +274,10 @@ void __init bt_iounmap(void *addr, unsig virt_addr = (unsigned long)addr; if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) return; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) + return; +#endif offset = virt_addr & ~PAGE_MASK; nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; @@ -320,3 +288,155 @@ void __init bt_iounmap(void *addr, unsig --nrpages; } } + +#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ + +/* These hacky macros avoid phys->machine translations. */ +#define __direct_pte(x) ((pte_t) { (x) } ) +#define __direct_mk_pte(page_nr,pgprot) \ + __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) +#define direct_mk_pte_phys(physpage, pgprot) \ + __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot) + +static inline void direct_remap_area_pte(pte_t *pte, + unsigned long address, + unsigned long size, + mmu_update_t **v) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + if (address >= end) + BUG(); + + do { + (*v)->ptr = virt_to_machine(pte); + (*v)++; + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int direct_remap_area_pmd(struct mm_struct *mm, + pmd_t *pmd, + unsigned long address, + unsigned long size, + mmu_update_t **v) +{ + unsigned long end; + + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + if (address >= end) + BUG(); + do { + pte_t *pte = (mm == &init_mm) ? 
+ pte_alloc_kernel(mm, pmd, address) : + pte_alloc_map(mm, pmd, address); + if (!pte) + return -ENOMEM; + direct_remap_area_pte(pte, address, end - address, v); + pte_unmap(pte); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +int __direct_remap_area_pages(struct mm_struct *mm, + unsigned long address, + unsigned long size, + mmu_update_t *v) +{ + pgd_t * dir; + unsigned long end = address + size; + int error; + + dir = pgd_offset(mm, address); + if (address >= end) + BUG(); + spin_lock(&mm->page_table_lock); + do { + pud_t *pud; + pmd_t *pmd; + + error = -ENOMEM; + pud = pud_alloc(mm, dir, address); + if (!pud) + break; + pmd = pmd_alloc(mm, pud, address); + if (!pmd) + break; + error = 0; + direct_remap_area_pmd(mm, pmd, address, end - address, &v); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + + } while (address && (address < end)); + spin_unlock(&mm->page_table_lock); + return error; +} + + +int direct_remap_area_pages(struct mm_struct *mm, + unsigned long address, + unsigned long machine_addr, + unsigned long size, + pgprot_t prot, + domid_t domid) +{ + int i; + unsigned long start_address; +#define MAX_DIRECTMAP_MMU_QUEUE 130 + mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u; + + start_address = address; + + flush_cache_all(); + + for (i = 0; i < size; i += PAGE_SIZE) { + if ((v - u) == MAX_DIRECTMAP_MMU_QUEUE) { + /* Fill in the PTE pointers. */ + __direct_remap_area_pages(mm, + start_address, + address-start_address, + u); + + if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) + return -EFAULT; + v = u; + start_address = address; + } + + /* + * Fill in the machine address: PTE ptr is done later by + * __direct_remap_area_pages(). + */ + v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot); + + machine_addr += PAGE_SIZE; + address += PAGE_SIZE; + v++; + } + + if (v != u) { + /* get the ptep's filled in */ + __direct_remap_area_pages(mm, + start_address, + address-start_address, + u); + if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) + return -EFAULT; + } + + flush_tlb_all(); + + return 0; +} + +EXPORT_SYMBOL(direct_remap_area_pages); diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/Makefile linux-2.6-xen-sparse/arch/i386/mm/Makefile --- pristine-linux-2.6.12/arch/i386/mm/Makefile 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/mm/Makefile 2005-07-28 13:17:07.000000000 -0700 @@ -2,9 +2,23 @@ # Makefile for the linux i386-specific parts of the memory manager. 
# -obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o +XENARCH := $(subst ",,$(CONFIG_XENARCH)) -obj-$(CONFIG_DISCONTIGMEM) += discontig.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +CFLAGS += -Iarch/$(XENARCH)/mm + +obj-y := init.o pgtable.o fault.o ioremap.o hypervisor.o +c-obj-y := extable.o mmap.o pageattr.o + +c-obj-$(CONFIG_DISCONTIGMEM) += discontig.o +c-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem.o -obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o +c-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o + +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): + @ln -fsn $(srctree)/arch/i386/mm/$(notdir $@) $@ + +obj-y += $(c-obj-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/pgtable.c linux-2.6-xen-sparse/arch/i386/mm/pgtable.c --- pristine-linux-2.6.12/arch/i386/mm/pgtable.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/mm/pgtable.c 2005-07-28 13:17:07.000000000 -0700 @@ -21,6 +21,10 @@ #include <asm/e820.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/io.h> +#include <asm/mmu_context.h> + +#include <asm-xen/foreign_page.h> void show_mem(void) { @@ -93,6 +97,44 @@ static void set_pte_pfn(unsigned long va } /* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn, + pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* <pfn,flags> stored as-is, to permit clearing entries */ + set_pte(pte, pfn_pte_ma(pfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +/* * Associate a large virtual page frame with a given physical page frame * and protection flags for that frame. pfn is for the base of the page, * vaddr is what the page gets mapped to - both must be properly aligned. 
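Since the next hunk teaches __set_fixmap() to choose between set_pte_pfn() and the new set_pte_pfn_ma(), the distinction is worth making explicit: the normal path runs the frame number through the phys-to-machine table before building the PTE, whereas the _ma variant treats its argument as an already-machine frame (as with xen_start_info.shared_info) and stores it untranslated. A rough sketch under those assumptions, with pfn_to_mfn() taken to be the guest's P2M lookup and __pte_ma() the raw-PTE constructor from the Xen headers:

    /* Sketch: pseudo-physical vs. machine PTE construction. */
    static pte_t sketch_pfn_pte(unsigned long pfn, pgprot_t prot)
    {
        /* usual case: translate the guest pfn to a machine frame */
        return __pte_ma((pfn_to_mfn(pfn) << PAGE_SHIFT) | pgprot_val(prot));
    }

    static pte_t sketch_pfn_pte_ma(unsigned long mfn, pgprot_t prot)
    {
        /* caller already holds a machine frame: store it as-is */
        return __pte_ma((mfn << PAGE_SHIFT) | pgprot_val(prot));
    }

Fixmap slots that native code owns outright (FIX_WP_TEST, FIX_VSYSCALL) keep the translated path; frames handed to us by the hypervisor go through the machine-address path.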
@@ -135,12 +177,26 @@ void __set_fixmap (enum fixed_addresses BUG(); return; } - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + switch (idx) { + case FIX_WP_TEST: + case FIX_VSYSCALL: +#ifdef CONFIG_X86_F00F_BUG + case FIX_F00F_IDT: +#endif + set_pte_pfn(address, phys >> PAGE_SHIFT, flags); + break; + default: + set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags); + break; + } } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + if (pte) + make_page_readonly(pte); + return pte; } struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) @@ -151,10 +207,29 @@ struct page *pte_alloc_one(struct mm_str pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); #else pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + if (pte) { + SetPageForeign(pte, pte_free); + set_page_count(pte, 1); + } #endif + return pte; } +void pte_free(struct page *pte) +{ + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT); + + if (!pte_write(*virt_to_ptep(va))) + HYPERVISOR_update_va_mapping( + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0); + + ClearPageForeign(pte); + set_page_count(pte, 1); + + __free_page(pte); +} + void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) { memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); @@ -199,14 +274,14 @@ void pgd_ctor(void *pgd, kmem_cache_t *c { unsigned long flags; - if (PTRS_PER_PMD == 1) + if (!HAVE_SHARED_KERNEL_PMD) spin_lock_irqsave(&pgd_lock, flags); memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - if (PTRS_PER_PMD > 1) + if (HAVE_SHARED_KERNEL_PMD) return; pgd_list_add(pgd); @@ -214,11 +289,13 @@ void pgd_ctor(void *pgd, kmem_cache_t *c memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); } -/* never called when PTRS_PER_PMD > 1 */ void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ + if (HAVE_SHARED_KERNEL_PMD) + return; + spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); @@ -226,12 +303,30 @@ void pgd_dtor(void *pgd, kmem_cache_t *c pgd_t *pgd_alloc(struct mm_struct *mm) { - int i; + int i = 0; pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); if (PTRS_PER_PMD == 1 || !pgd) return pgd; + if (!HAVE_SHARED_KERNEL_PMD) { + /* alloc and copy kernel pmd */ + unsigned long flags; + pgd_t *copy_pgd = pgd_offset_k(PAGE_OFFSET); + pud_t *copy_pud = pud_offset(copy_pgd, PAGE_OFFSET); + pmd_t *copy_pmd = pmd_offset(copy_pud, PAGE_OFFSET); + pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + if (0 == pmd) + goto out_oom; + + spin_lock_irqsave(&pgd_lock, flags); + memcpy(pmd, copy_pmd, PAGE_SIZE); + spin_unlock_irqrestore(&pgd_lock, flags); + make_page_readonly(pmd); + set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd))); + } + + /* alloc user pmds */ for (i = 0; i < USER_PTRS_PER_PGD; ++i) { pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); if (!pmd) @@ -250,11 +345,207 @@ out_oom: void pgd_free(pgd_t *pgd) { int i; + pte_t *ptep = virt_to_ptep(pgd); + + if (!pte_write(*ptep)) { + xen_pgd_unpin(__pa(pgd)); + HYPERVISOR_update_va_mapping( + (unsigned long)pgd, + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), + 0); + } /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 
0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + if (PTRS_PER_PMD > 1) { + for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); + kmem_cache_free(pmd_cache, pmd); + } + if (!HAVE_SHARED_KERNEL_PMD) { + pmd_t *pmd = (void *)__va(pgd_val(pgd[USER_PTRS_PER_PGD])-1); + make_page_writable(pmd); + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); + kmem_cache_free(pmd_cache, pmd); + } + } /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } + +#ifndef CONFIG_XEN_SHADOW_MODE +void make_lowmem_page_readonly(void *va) +{ + pte_t *pte = virt_to_ptep(va); + set_pte(pte, pte_wrprotect(*pte)); +} + +void make_lowmem_page_writable(void *va) +{ + pte_t *pte = virt_to_ptep(va); + set_pte(pte, pte_mkwrite(*pte)); +} + +void make_page_readonly(void *va) +{ + pte_t *pte = virt_to_ptep(va); + set_pte(pte, pte_wrprotect(*pte)); + if ( (unsigned long)va >= (unsigned long)high_memory ) + { + unsigned long phys; + phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK); +#ifdef CONFIG_HIGHMEM + if ( (phys >> PAGE_SHIFT) < highstart_pfn ) +#endif + make_lowmem_page_readonly(phys_to_virt(phys)); + } +} + +void make_page_writable(void *va) +{ + pte_t *pte = virt_to_ptep(va); + set_pte(pte, pte_mkwrite(*pte)); + if ( (unsigned long)va >= (unsigned long)high_memory ) + { + unsigned long phys; + phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK); +#ifdef CONFIG_HIGHMEM + if ( (phys >> PAGE_SHIFT) < highstart_pfn ) +#endif + make_lowmem_page_writable(phys_to_virt(phys)); + } +} + +void make_pages_readonly(void *va, unsigned int nr) +{ + while ( nr-- != 0 ) + { + make_page_readonly(va); + va = (void *)((unsigned long)va + PAGE_SIZE); + } +} + +void make_pages_writable(void *va, unsigned int nr) +{ + while ( nr-- != 0 ) + { + make_page_writable(va); + va = (void *)((unsigned long)va + PAGE_SIZE); + } +} +#endif /* CONFIG_XEN_SHADOW_MODE */ + +LIST_HEAD(mm_unpinned); +DEFINE_SPINLOCK(mm_unpinned_lock); + +static inline void mm_walk_set_prot(void *pt, pgprot_t flags) +{ + struct page *page = virt_to_page(pt); + unsigned long pfn = page_to_pfn(page); + + if (PageHighMem(page)) + return; + HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, flags), 0); +} + +static void mm_walk(struct mm_struct *mm, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g,u,m; + + pgd = mm->pgd; + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + mm_walk_set_prot(pud,flags); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + mm_walk_set_prot(pmd,flags); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + pte = pte_offset_kernel(pmd,0); + mm_walk_set_prot(pte,flags); + } + } + } +} + +void mm_pin(struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + + mm_walk(mm, PAGE_KERNEL_RO); + HYPERVISOR_update_va_mapping( + (unsigned long)mm->pgd, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), + UVMF_TLB_FLUSH); + xen_pgd_pin(__pa(mm->pgd)); + mm->context.pinned = 1; + spin_lock(&mm_unpinned_lock); + list_del(&mm->context.unpinned); + spin_unlock(&mm_unpinned_lock); + + spin_unlock(&mm->page_table_lock); +} + +void mm_unpin(struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + + 
xen_pgd_unpin(__pa(mm->pgd)); + HYPERVISOR_update_va_mapping( + (unsigned long)mm->pgd, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0); + mm_walk(mm, PAGE_KERNEL); + xen_tlb_flush(); + mm->context.pinned = 0; + spin_lock(&mm_unpinned_lock); + list_add(&mm->context.unpinned, &mm_unpinned); + spin_unlock(&mm_unpinned_lock); + + spin_unlock(&mm->page_table_lock); +} + +void mm_pin_all(void) +{ + while (!list_empty(&mm_unpinned)) + mm_pin(list_entry(mm_unpinned.next, struct mm_struct, + context.unpinned)); +} + +void _arch_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. + */ + if ( tsk->active_mm == mm ) + { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) ) + mm_unpin(mm); +} diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/pci/irq.c linux-2.6-xen-sparse/arch/i386/pci/irq.c --- pristine-linux-2.6.12/arch/i386/pci/irq.c 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/pci/irq.c 2005-07-28 13:17:07.000000000 -0700 @@ -68,7 +68,8 @@ static struct irq_routing_table * __init int i; u8 sum; - for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) { rt = (struct irq_routing_table *) addr; if (rt->signature != PIRQ_SIGNATURE || rt->version != PIRQ_VERSION || @@ -83,6 +84,8 @@ static struct irq_routing_table * __init return rt; } } +#endif + return NULL; } diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/pci/Makefile linux-2.6-xen-sparse/arch/i386/pci/Makefile --- pristine-linux-2.6.12/arch/i386/pci/Makefile 2005-06-17 12:48:29.000000000 -0700 +++ linux-2.6-xen-sparse/arch/i386/pci/Makefile 2005-07-28 13:17:07.000000000 -0700 @@ -1,14 +1,32 @@ -obj-y := i386.o +XENARCH := $(subst ",,$(CONFIG_XENARCH)) -obj-$(CONFIG_PCI_BIOS) += pcbios.o -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o -obj-$(CONFIG_PCI_DIRECT) += direct.o +CFLAGS += -Iarch/$(XENARCH)/pci -pci-y := fixup.o -pci-$(CONFIG_ACPI_PCI) += acpi.o -pci-y += legacy.o irq.o +c-obj-y := i386.o -pci-$(CONFIG_X86_VISWS) := visws.o fixup.o -pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o +c-obj-$(CONFIG_PCI_BIOS) += pcbios.o +c-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o +c-obj-$(CONFIG_PCI_DIRECT) += direct.o -obj-y += $(pci-y) common.o +c-pci-y := fixup.o +c-pci-$(CONFIG_ACPI_PCI) += acpi.o +c-pci-y += legacy.o +# Make sure irq.o gets linked in after legacy.o +l-pci-y += irq.o + +c-pci-$(CONFIG_X86_VISWS) := visws.o fixup.o +pci-$(CONFIG_X86_VISWS) := +c-pci-$(CONFIG_X86_NUMAQ) := numa.o +pci-$(CONFIG_X86_NUMAQ) := irq.o + +obj-y += $(pci-y) +c-obj-y += $(c-pci-y) common.o + +c-link := + +$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): + @ln -fsn $(srctree)/arch/i386/pci/$(notdir $@) $@ + +obj-y += $(c-obj-y) $(l-pci-y) + +clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) _______________________________________________ Xen-merge mailing list Xen-merge@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-merge
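One pattern in the ioremap.c rewrite above deserves a closing note: direct_remap_area_pages() does not issue a hypercall per page. It queues mmu_update_t entries, flushes the queue of 130 once full (and once more at the end), and lets __direct_remap_area_pages() fill in each entry's PTE machine pointer just before the flush. A stripped-down sketch of that loop; the queue length and the ->val encoding are taken from the patch, while flush_queue() is a hypothetical stand-in for the fill-pointers-then-HYPERVISOR_mmu_update() step:

    /* Sketch of the batched-update loop in direct_remap_area_pages().
     * flush_queue() stands in for: fill in each entry's PTE pointer
     * via __direct_remap_area_pages(), then HYPERVISOR_mmu_update(). */
    #define QUEUE_LEN 130
    mmu_update_t u[QUEUE_LEN], *v = u;
    unsigned long off;

    for (off = 0; off < size; off += PAGE_SIZE) {
        if ((v - u) == QUEUE_LEN) {          /* queue full: one hypercall */
            if (flush_queue(u, v - u) < 0)
                return -EFAULT;
            v = u;                           /* restart the queue */
        }
        /* queue only the new machine address; the target PTE's
         * machine pointer (->ptr) is filled in at flush time */
        v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot);
        machine_addr += PAGE_SIZE;
        v++;
    }
    if ((v != u) && (flush_queue(u, v - u) < 0))  /* drain the tail */
        return -EFAULT;

Batching matters here because every update is a ring transition; one hypercall per 130 PTEs keeps remapping a large region from being dominated by hypercall overhead.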