[Xen-changelog] upgrade the sparse repository to 2.6.12
# HG changeset patch
# User vh249@xxxxxxxxxxxxxxxxxxxxxxxx
# Node ID de310533c48375f3008a0484c3a770b807aee5c3
# Parent  fa660d79f69573459949c847743f6341f466c7d0
upgrade the sparse repository to 2.6.12

Signed-off-by: Vincent Hanquez <vincent@xxxxxxxxxxxxx>

diff -r fa660d79f695 -r de310533c483 buildconfigs/mk.linux-2.6-xen0
--- a/buildconfigs/mk.linux-2.6-xen0	Tue Aug 9 15:17:45 2005
+++ b/buildconfigs/mk.linux-2.6-xen0	Tue Aug 9 23:57:17 2005
@@ -2,7 +2,7 @@
 OS = linux
 LINUX_SERIES = 2.6
-LINUX_VER = 2.6.11
+LINUX_VER = 2.6.12
 EXTRAVERSION = xen0
diff -r fa660d79f695 -r de310533c483 buildconfigs/mk.linux-2.6-xenU
--- a/buildconfigs/mk.linux-2.6-xenU	Tue Aug 9 15:17:45 2005
+++ b/buildconfigs/mk.linux-2.6-xenU	Tue Aug 9 23:57:17 2005
@@ -2,7 +2,7 @@
 OS = linux
 LINUX_SERIES = 2.6
-LINUX_VER = 2.6.11
+LINUX_VER = 2.6.12
 EXTRAVERSION = xenU
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/Kconfig
--- a/linux-2.6-xen-sparse/arch/xen/Kconfig	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/Kconfig	Tue Aug 9 23:57:17 2005
@@ -150,3 +150,5 @@
 source "crypto/Kconfig"
 source "lib/Kconfig"
+
+source "arch/xen/Kconfig.debug"
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig
--- a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig	Tue Aug 9 23:57:17 2005
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.11-xen0
-# Tue May 3 13:22:55 2005
+# Linux kernel version: 2.6.12-xen0
+# Sat Jul 9 09:19:47 2005
 #
 CONFIG_XEN=y
 CONFIG_ARCH_XEN=y
@@ -31,6 +31,7 @@
 CONFIG_BROKEN=y
 CONFIG_BROKEN_ON_SMP=y
 CONFIG_LOCK_KERNEL=y
+CONFIG_INIT_ENV_ARG_LIMIT=32

 #
 # General setup
@@ -42,7 +43,6 @@
 # CONFIG_BSD_PROCESS_ACCT is not set
 CONFIG_SYSCTL=y
 # CONFIG_AUDIT is not set
-CONFIG_LOG_BUF_SHIFT=14
 CONFIG_HOTPLUG=y
 CONFIG_KOBJECT_UEVENT=y
 # CONFIG_IKCONFIG is not set
@@ -50,15 +50,18 @@
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
 CONFIG_EPOLL=y
-# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SHMEM=y
 CONFIG_CC_ALIGN_FUNCTIONS=0
 CONFIG_CC_ALIGN_LABELS=0
 CONFIG_CC_ALIGN_LOOPS=0
 CONFIG_CC_ALIGN_JUMPS=0
 # CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=0

 #
 # Loadable module support
@@ -97,6 +100,7 @@
 # CONFIG_MWINCHIPC6 is not set
 # CONFIG_MWINCHIP2 is not set
 # CONFIG_MWINCHIP3D is not set
+# CONFIG_MGEODEGX1 is not set
 # CONFIG_MCYRIXIII is not set
 # CONFIG_MVIAC3_2 is not set
 # CONFIG_X86_GENERIC is not set
@@ -117,6 +121,7 @@
 # CONFIG_SMP is not set
 CONFIG_PREEMPT=y
 CONFIG_PREEMPT_BKL=y
+# CONFIG_X86_REBOOTFIXUPS is not set
 CONFIG_MICROCODE=y
 CONFIG_X86_CPUID=y
@@ -137,6 +142,8 @@
 CONFIG_PCI_DIRECT=y
 CONFIG_PCI_LEGACY_PROC=y
 # CONFIG_PCI_NAMES is not set
+# CONFIG_PCI_DEBUG is not set
+CONFIG_ISA_DMA_API=y
 CONFIG_ISA=y
 # CONFIG_EISA is not set
 # CONFIG_MCA is not set
@@ -148,11 +155,6 @@
 # CONFIG_PCCARD is not set

 #
-# PC-card bridges
-#
-CONFIG_PCMCIA_PROBE=y
-
-#
 # PCI Hotplug Support
 #
 # CONFIG_HOTPLUG_PCI is not set
@@ -160,12 +162,14 @@
 #
 # Kernel hacking
 #
+# CONFIG_PRINTK_TIME is not set
 CONFIG_DEBUG_KERNEL=y
 CONFIG_EARLY_PRINTK=y
 # CONFIG_DEBUG_STACKOVERFLOW is not set
 # CONFIG_DEBUG_STACK_USAGE is not set
 # CONFIG_DEBUG_SLAB is not set
 CONFIG_MAGIC_SYSRQ=y
+CONFIG_LOG_BUF_SHIFT=14
 # CONFIG_DEBUG_SPINLOCK is not set
 # CONFIG_DEBUG_PAGEALLOC is not set
 # CONFIG_DEBUG_INFO is not set
@@ -176,6 +180,7 @@
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_X86_BIOS_REBOOT=y
 CONFIG_PC=y
+CONFIG_SECCOMP=y

 #
 # Executable file formats
@@ -332,7 +337,7 @@
 #
 # SCSI Transport Attributes
 #
-# CONFIG_SCSI_SPI_ATTRS is not set
+CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_FC_ATTRS is not set
 # CONFIG_SCSI_ISCSI_ATTRS is not set
@@ -409,6 +414,7 @@
 # CONFIG_SCSI_QLA2300 is not set
 # CONFIG_SCSI_QLA2322 is not set
 # CONFIG_SCSI_QLA6312 is not set
+# CONFIG_SCSI_LPFC is not set
 # CONFIG_SCSI_SEAGATE is not set
 # CONFIG_SCSI_SYM53C416 is not set
 # CONFIG_SCSI_DC395x is not set
@@ -442,6 +448,7 @@
 CONFIG_DM_SNAPSHOT=y
 CONFIG_DM_MIRROR=y
 # CONFIG_DM_ZERO is not set
+# CONFIG_DM_MULTIPATH is not set

 #
 # Fusion MPT device support
@@ -470,7 +477,6 @@
 #
 CONFIG_PACKET=y
 # CONFIG_PACKET_MMAP is not set
-# CONFIG_NETLINK_DEV is not set
 CONFIG_UNIX=y
 # CONFIG_NET_KEY is not set
 CONFIG_INET=y
@@ -650,7 +656,6 @@
 # CONFIG_DGRS is not set
 # CONFIG_EEPRO100 is not set
 CONFIG_E100=y
-# CONFIG_E100_NAPI is not set
 # CONFIG_FEALNX is not set
 # CONFIG_NATSEMI is not set
 CONFIG_NE2K_PCI=y
@@ -683,6 +688,7 @@
 # CONFIG_SK98LIN is not set
 # CONFIG_VIA_VELOCITY is not set
 CONFIG_TIGON3=y
+# CONFIG_BNX2 is not set

 #
 # Ethernet (10000 Mbit)
@@ -738,19 +744,6 @@
 # CONFIG_INPUT_TSDEV is not set
 # CONFIG_INPUT_EVDEV is not set
 # CONFIG_INPUT_EVBUG is not set
-
-#
-# Input I/O drivers
-#
-# CONFIG_GAMEPORT is not set
-CONFIG_SOUND_GAMEPORT=y
-CONFIG_SERIO=y
-CONFIG_SERIO_I8042=y
-CONFIG_SERIO_SERPORT=y
-# CONFIG_SERIO_CT82C710 is not set
-# CONFIG_SERIO_PCIPS2 is not set
-CONFIG_SERIO_LIBPS2=y
-# CONFIG_SERIO_RAW is not set

 #
 # Input Device Drivers
@@ -773,6 +766,18 @@
 # CONFIG_INPUT_MISC is not set

 #
+# Hardware I/O ports
+#
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+CONFIG_SERIO_SERPORT=y
+# CONFIG_SERIO_CT82C710 is not set
+# CONFIG_SERIO_PCIPS2 is not set
+CONFIG_SERIO_LIBPS2=y
+# CONFIG_SERIO_RAW is not set
+# CONFIG_GAMEPORT is not set
+
+#
 # Character devices
 #
 CONFIG_VT=y
@@ -788,6 +793,7 @@
 #
 # Non-8250 serial port support
 #
+# CONFIG_SERIAL_JSM is not set
 CONFIG_UNIX98_PTYS=y
 CONFIG_LEGACY_PTYS=y
 CONFIG_LEGACY_PTY_COUNT=256
@@ -820,7 +826,6 @@
 CONFIG_AGP_AMD=m
 CONFIG_AGP_AMD64=m
 CONFIG_AGP_INTEL=m
-CONFIG_AGP_INTEL_MCH=m
 CONFIG_AGP_NVIDIA=m
 CONFIG_AGP_SIS=m
 CONFIG_AGP_SWORKS=m
@@ -841,6 +846,11 @@
 # CONFIG_HANGCHECK_TIMER is not set

 #
+# TPM devices
+#
+# CONFIG_TCG_TPM is not set
+
+#
 # I2C support
 #
 # CONFIG_I2C is not set
@@ -886,6 +896,8 @@
 #
 # USB support
 #
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB_ARCH_HAS_OHCI=y
 CONFIG_USB=y
 # CONFIG_USB_DEBUG is not set
@@ -896,14 +908,14 @@
 # CONFIG_USB_BANDWIDTH is not set
 # CONFIG_USB_DYNAMIC_MINORS is not set
 # CONFIG_USB_OTG is not set
-CONFIG_USB_ARCH_HAS_HCD=y
-CONFIG_USB_ARCH_HAS_OHCI=y

 #
 # USB Host Controller Drivers
 #
 # CONFIG_USB_EHCI_HCD is not set
 CONFIG_USB_OHCI_HCD=y
+# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+CONFIG_USB_OHCI_LITTLE_ENDIAN=y
 CONFIG_USB_UHCI_HCD=y
 # CONFIG_USB_SL811_HCD is not set
@@ -940,7 +952,6 @@
 #
 # CONFIG_USB_MDC800 is not set
 # CONFIG_USB_MICROTEK is not set
-# CONFIG_USB_HPUSBSCSI is not set

 #
 # USB Multimedia devices
@@ -959,6 +970,7 @@
 # CONFIG_USB_PEGASUS is not set
 # CONFIG_USB_RTL8150 is not set
 # CONFIG_USB_USBNET is not set
+CONFIG_USB_MON=y

 #
 # USB port drivers
@@ -1174,6 +1186,7 @@
 # CONFIG_CRYPTO_SHA256 is not set
 # CONFIG_CRYPTO_SHA512 is not set
 # CONFIG_CRYPTO_WP512 is not set
+# CONFIG_CRYPTO_TGR192 is not set
 CONFIG_CRYPTO_DES=m
 # CONFIG_CRYPTO_BLOWFISH is not set
 # CONFIG_CRYPTO_TWOFISH is not set
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig
--- a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig	Tue Aug 9 23:57:17 2005
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.11-xenU
-# Wed Apr 13 23:18:37 2005
+# Linux kernel version: 2.6.12-xenU
+# Sun Jul 10 17:32:04 2005
 #
 CONFIG_XEN=y
 CONFIG_ARCH_XEN=y
@@ -28,6 +28,7 @@
 CONFIG_CLEAN_COMPILE=y
 CONFIG_BROKEN_ON_SMP=y
 CONFIG_LOCK_KERNEL=y
+CONFIG_INIT_ENV_ARG_LIMIT=32

 #
 # General setup
@@ -39,23 +40,26 @@
 # CONFIG_BSD_PROCESS_ACCT is not set
 CONFIG_SYSCTL=y
 # CONFIG_AUDIT is not set
-CONFIG_LOG_BUF_SHIFT=14
 CONFIG_HOTPLUG=y
 CONFIG_KOBJECT_UEVENT=y
 # CONFIG_IKCONFIG is not set
+# CONFIG_CPUSETS is not set
 # CONFIG_EMBEDDED is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
 CONFIG_EPOLL=y
-# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SHMEM=y
 CONFIG_CC_ALIGN_FUNCTIONS=0
 CONFIG_CC_ALIGN_LABELS=0
 CONFIG_CC_ALIGN_LOOPS=0
 CONFIG_CC_ALIGN_JUMPS=0
 # CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=0

 #
 # Loadable module support
@@ -94,6 +98,7 @@
 # CONFIG_MWINCHIPC6 is not set
 # CONFIG_MWINCHIP2 is not set
 # CONFIG_MWINCHIP3D is not set
+# CONFIG_MGEODEGX1 is not set
 # CONFIG_MCYRIXIII is not set
 # CONFIG_MVIAC3_2 is not set
 # CONFIG_X86_GENERIC is not set
@@ -114,6 +119,7 @@
 # CONFIG_SMP is not set
 CONFIG_PREEMPT=y
 CONFIG_PREEMPT_BKL=y
+# CONFIG_X86_REBOOTFIXUPS is not set
 CONFIG_X86_CPUID=y

 #
@@ -144,6 +150,8 @@
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_X86_BIOS_REBOOT=y
 CONFIG_PC=y
+CONFIG_SECCOMP=y
+CONFIG_EARLY_PRINTK=y

 #
 # Executable file formats
@@ -239,7 +247,6 @@
 #
 CONFIG_PACKET=y
 # CONFIG_PACKET_MMAP is not set
-# CONFIG_NETLINK_DEV is not set
 CONFIG_UNIX=y
 # CONFIG_NET_KEY is not set
 CONFIG_INET=y
@@ -506,6 +513,7 @@
 # CONFIG_CRYPTO_SHA256 is not set
 # CONFIG_CRYPTO_SHA512 is not set
 # CONFIG_CRYPTO_WP512 is not set
+# CONFIG_CRYPTO_TGR192 is not set
 # CONFIG_CRYPTO_DES is not set
 # CONFIG_CRYPTO_BLOWFISH is not set
 # CONFIG_CRYPTO_TWOFISH is not set
@@ -534,3 +542,27 @@
 # CONFIG_CRC32 is not set
 CONFIG_LIBCRC32C=m
 CONFIG_ZLIB_INFLATE=y
+
+#
+# Kernel hacking
+#
+# CONFIG_PRINTK_TIME is not set
+CONFIG_DEBUG_KERNEL=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_LOG_BUF_SHIFT=14
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_PREEMPT is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_KOBJECT is not set
+# CONFIG_DEBUG_HIGHMEM is not set
+CONFIG_DEBUG_BUGVERBOSE=y
+# CONFIG_DEBUG_INFO is not set
+# CONFIG_DEBUG_FS is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_DEBUG_STACKOVERFLOW is not set
+# CONFIG_KPROBES is not set
+# CONFIG_DEBUG_STACK_USAGE is not set
+# CONFIG_DEBUG_PAGEALLOC is not set
+# CONFIG_4KSTACKS is not set
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/Kconfig
--- a/linux-2.6-xen-sparse/arch/xen/i386/Kconfig	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/Kconfig	Tue Aug 9 23:57:17 2005
@@ -65,6 +65,7 @@
	  - "Winchip-C6" for original IDT Winchip.
	  - "Winchip-2" for IDT Winchip 2.
	  - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
+	  - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
	  - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
	  - "VIA C3-2 for VIA C3-2 "Nehemiah" (model 9 and above).
@@ -191,6 +192,11 @@
	  and alignment reqirements.
	  Also enable out of order memory stores for this CPU, which
	  can increase performance of some operations.
+
+config MGEODEGX1
+	bool "GeodeGX1"
+	help
+	  Select this for a Geode GX1 (Cyrix MediaGX) chip.

 config MCYRIXIII
	bool "CyrixIII/VIA-C3"
@@ -240,7 +246,7 @@
	int
	default "7" if MPENTIUM4 || X86_GENERIC
	default "4" if X86_ELAN || M486 || M386
-	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2
+	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODEGX1
	default "6" if MK7 || MK8 || MPENTIUMM

 config RWSEM_GENERIC_SPINLOCK
@@ -259,7 +265,7 @@
 config X86_PPRO_FENCE
	bool
-	depends on M686 || M586MMX || M586TSC || M586 || M486 || M386
+	depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1
	default y

 config X86_F00F_BUG
@@ -289,7 +295,7 @@
 config X86_ALIGNMENT_16
	bool
-	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2
+	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
	default y

 config X86_GOOD_APIC
@@ -415,7 +421,7 @@
 #config X86_TSC
 #	bool
-#	depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2) && !X86_NUMAQ
+#	depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ
 #	default y

 #config X86_MCE
@@ -455,6 +461,24 @@
 #	  Enabling this feature will cause a message to be printed when the P4
 #	  enters thermal throttling.

+config X86_REBOOTFIXUPS
+	bool "Enable X86 board specific fixups for reboot"
+	depends on X86
+	default n
+	---help---
+	  This enables chipset and/or board specific fixups to be done
+	  in order to get reboot to work correctly. This is only needed on
+	  some combinations of hardware and BIOS. The symptom, for which
+	  this config is intended, is when reboot ends with a stalled/hung
+	  system.
+
+	  Currently, the only fixup is for the Geode GX1/CS5530A/TROM2.1.
+	  combination.
+
+	  Say Y if you want to enable the fixup. Currently, it's safe to
+	  enable this option even if you don't need it.
+	  Say N otherwise.
+
 config MICROCODE
	tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
	depends on XEN_PRIVILEGED_GUEST
@@ -578,6 +602,16 @@
 config HAVE_ARCH_BOOTMEM_NODE
	bool
	depends on NUMA
+	default y
+
+config HAVE_MEMORY_PRESENT
+	bool
+	depends on DISCONTIGMEM
+	default y
+
+config NEED_NODE_MEMMAP_SIZE
+	bool
+	depends on DISCONTIGMEM
	default y

 #config HIGHPTE
@@ -673,13 +707,18 @@
 config X86_LOCAL_APIC
	bool
-	depends on (X86_VISWS || SMP) && !X86_VOYAGER
+	depends on XEN_PRIVILEGED_GUEST && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER))
	default y

 config X86_IO_APIC
	bool
-	depends on SMP && !(X86_VISWS || X86_VOYAGER)
-	default y
+	depends on XEN_PRIVILEGED_GUEST && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)))
+	default y
+
+config X86_VISWS_APIC
+	bool
+	depends on X86_VISWS
+	default y

 config PCI
	bool "PCI support" if !X86_VISWS
@@ -748,6 +787,10 @@
 source "drivers/pci/Kconfig"

+config ISA_DMA_API
+	bool
+	default y
+
 config ISA
	bool "ISA support"
	depends on !(X86_VOYAGER || X86_VISWS)
@@ -777,17 +820,13 @@
 source "drivers/eisa/Kconfig"

 config MCA
-	bool "MCA support"
-	depends on !(X86_VISWS || X86_VOYAGER)
+	bool "MCA support" if !(X86_VISWS || X86_VOYAGER)
+	default y if X86_VOYAGER
	help
	  MicroChannel Architecture is found in some IBM PS/2 machines
	  and laptops. It is a bus system similar to PCI or ISA. See
	  <file:Documentation/mca.txt> (and especially the web page given
	  there) before attempting to build an MCA bus kernel.
-
-config MCA
-	depends on X86_VOYAGER
-	default y if X86_VOYAGER

 source "drivers/mca/Kconfig"
@@ -971,4 +1010,21 @@
	depends on X86 && !EMBEDDED
	default y

+config SECCOMP
+	bool "Enable seccomp to safely compute untrusted bytecode"
+	depends on PROC_FS
+	default y
+	help
+	  This kernel feature is useful for number crunching applications
+	  that may need to compute untrusted bytecode during their
+	  execution. By using pipes or other transports made available to
+	  the process as file descriptors supporting the read/write
+	  syscalls, it's possible to isolate those applications in
+	  their own address space using seccomp. Once seccomp is
+	  enabled via /proc/<pid>/seccomp, it cannot be disabled
+	  and the task is only allowed to execute a few safe syscalls
+	  defined by each seccomp mode.
+
+	  If unsure, say Y. Only embedded should say N here.
+
 endmenu
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/Makefile
--- a/linux-2.6-xen-sparse/arch/xen/i386/Makefile	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/Makefile	Tue Aug 9 23:57:17 2005
@@ -14,6 +14,8 @@
 # 19990713  Artur Skawina <skawina@xxxxxxxxxxxxx>
 #           Added '-march' and '-mpreferred-stack-boundary' support
 #
+# 20050320  Kianusch Sayah Karadji <kianusch@xxxxxxxxxxx>
+#           Added support for GEODE CPU

 XENARCH := $(subst ",,$(CONFIG_XENARCH))
@@ -55,6 +57,9 @@
 # AMD Elan support
 cflags-$(CONFIG_X86_ELAN) += -march=i486
+
+# Geode GX1 support
+cflags-$(CONFIG_MGEODEGX1) += $(call cc-option,-march=pentium-mmx,-march=i486)

 # -mregparm=3 works ok on gcc-3.0 and later
 #
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile	Tue Aug 9 23:57:17 2005
@@ -32,6 +32,7 @@
 c-obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 c-obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 c-obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
+c-obj-$(CONFIG_X86_REBOOTFIXUPS)+= reboot_fixups.o
 c-obj-$(CONFIG_X86_NUMAQ)	+= numaq.o
 c-obj-$(CONFIG_X86_SUMMIT_NUMA)	+= summit.o
 c-obj-$(CONFIG_MODULES)		+= module.o
@@ -51,11 +52,11 @@
 # Note: kbuild does not track this dependency due to usage of .incbin
 $(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so
 targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
-targets += vsyscall.lds
+targets += vsyscall-note.o vsyscall.lds

 # The DSO images are built using a special linker script.
 quiet_cmd_syscall = SYSCALL $@
-      cmd_syscall = $(CC) -nostdlib -m32 $(SYSCFLAGS_$(@F)) \
+      cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
		          -Wl,-T,$(filter-out FORCE,$^) -o $@

 export CPPFLAGS_vsyscall.lds += -P -C -U$(ARCH)
@@ -65,7 +66,8 @@
 SYSCFLAGS_vsyscall-int80.so = $(vsyscall-flags)

 $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
-$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
+$(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
+		      $(obj)/vsyscall-%.o FORCE
	$(call if_changed,syscall)

 # We also create a special relocatable object that should mirror the symbol
@@ -76,17 +78,20 @@
 $(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o

 SYSCFLAGS_vsyscall-syms.o = -r
-$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds $(obj)/vsyscall-sysenter.o FORCE
+$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
+			$(obj)/vsyscall-sysenter.o FORCE
	$(call if_changed,syscall)

 c-link := init_task.o
-s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o
+s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o syscall_table.o

 $(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)):
	@ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $@

 $(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S

+$(obj)/entry.o: $(src)/entry.S $(src)/syscall_table.S
+
 obj-y += $(c-obj-y) $(s-obj-y)

 clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c	Tue Aug 9 23:57:17 2005
@@ -22,6 +22,9 @@
 DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_table);
+DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
+EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
+
 static int cachesize_override __initdata = -1;
 static int disable_x86_fxsr __initdata = 0;
 static int disable_x86_serial_nr __initdata = 1;
@@ -202,7 +205,7 @@

 /* Probe for the CPUID instruction */
-int __init have_cpuid_p(void)
+static int __init have_cpuid_p(void)
 {
	return flag_is_changeable_p(X86_EFLAGS_ID);
 }
@@ -210,7 +213,7 @@
 /* Do minimum CPU detection early.
    Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
    The others are not touched to avoid unwanted side effects. */
-void __init early_cpu_detect(void)
+static void __init early_cpu_detect(void)
 {
	struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -243,6 +246,10 @@
	}

	early_intel_workaround(c);
+
+#ifdef CONFIG_X86_HT
+	phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+#endif
 }

 void __init generic_identify(struct cpuinfo_x86 * c)
@@ -431,25 +438,15 @@
	mcheck_init(c);
 #endif
 }
-/*
- * Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c
- */
-
-void __init dodgy_tsc(void)
-{
-	if (( boot_cpu_data.x86_vendor == X86_VENDOR_CYRIX ) ||
-	    ( boot_cpu_data.x86_vendor == X86_VENDOR_NSC ))
-		cpu_devs[X86_VENDOR_CYRIX]->c_init(&boot_cpu_data);
-}

 #ifdef CONFIG_X86_HT
 void __init detect_ht(struct cpuinfo_x86 *c)
 {
	u32 eax, ebx, ecx, edx;
-	int index_lsb, index_msb, tmp;
+	int index_msb, tmp;
	int cpu = smp_processor_id();

-	if (!cpu_has(c, X86_FEATURE_HT))
+	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
		return;

	cpuid(1, &eax, &ebx, &ecx, &edx);
@@ -458,7 +455,6 @@
	if (smp_num_siblings == 1) {
		printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
	} else if (smp_num_siblings > 1 ) {
-		index_lsb = 0;
		index_msb = 31;

		if (smp_num_siblings > NR_CPUS) {
@@ -467,21 +463,34 @@
			return;
		}
		tmp = smp_num_siblings;
-		while ((tmp & 1) == 0) {
-			tmp >>=1 ;
-			index_lsb++;
-		}
-		tmp = smp_num_siblings;
		while ((tmp & 0x80000000 ) == 0) {
			tmp <<=1 ;
			index_msb--;
		}
-		if (index_lsb != index_msb )
+		if (smp_num_siblings & (smp_num_siblings - 1))
			index_msb++;

		phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);

		printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
		       phys_proc_id[cpu]);
+
+		smp_num_siblings = smp_num_siblings / c->x86_num_cores;
+
+		tmp = smp_num_siblings;
+		index_msb = 31;
+		while ((tmp & 0x80000000) == 0) {
+			tmp <<=1 ;
+			index_msb--;
+		}
+
+		if (smp_num_siblings & (smp_num_siblings - 1))
+			index_msb++;
+
+		cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+
+		if (c->x86_num_cores > 1)
+			printk(KERN_INFO "CPU: Processor Core ID: %d\n",
+			       cpu_core_id[cpu]);
	}
 }
 #endif
@@ -528,7 +537,6 @@
 extern int rise_init_cpu(void);
 extern int nexgen_init_cpu(void);
 extern int umc_init_cpu(void);
-void early_cpu_detect(void);

 void __init early_cpu_init(void)
 {
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c	Tue Aug 9 23:57:17 2005
@@ -31,7 +31,7 @@
 unsigned int num_var_ranges;
 unsigned int *usage_table;

-void __init set_num_var_ranges(void)
+static void __init set_num_var_ranges(void)
 {
	dom0_op_t op;

diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/entry.S	Tue Aug 9 23:57:17 2005
@@ -595,8 +595,6 @@
	xorl %edx,%edx		# error code 0
	movl %esp,%eax		# pt_regs pointer
	call do_debug
-	testl %eax,%eax
-	jnz restore_all
	jmp ret_from_exception

 #if 0 /* XEN */
@@ -651,8 +649,6 @@
	xorl %edx,%edx		# zero error code
	movl %esp,%eax		# pt_regs pointer
	call do_int3
-	testl %eax,%eax
-	jnz restore_all
	jmp ret_from_exception

 ENTRY(overflow)
@@ -736,296 +732,6 @@
	pushl $do_fixup_4gb_segment
	jmp error_code

-.data
-ENTRY(sys_call_table)
-	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */
-	.long sys_exit
-	.long sys_fork
-	.long sys_read
-	.long sys_write
-	.long sys_open		/* 5 */
-	.long sys_close
-	.long sys_waitpid
-	.long sys_creat
-	.long sys_link
-	.long sys_unlink	/* 10 */
-	.long sys_execve
-	.long sys_chdir
-	.long sys_time
-	.long sys_mknod
-	.long sys_chmod		/* 15 */
-	.long sys_lchown16
-	.long sys_ni_syscall	/* old break syscall holder */
-	.long sys_stat
-	.long sys_lseek
-	.long sys_getpid	/* 20 */
-	.long sys_mount
-	.long sys_oldumount
-	.long sys_setuid16
-	.long sys_getuid16
-	.long sys_stime		/* 25 */
-	.long sys_ptrace
-	.long sys_alarm
-	.long sys_fstat
-	.long sys_pause
-	.long sys_utime		/* 30 */
-	.long sys_ni_syscall	/* old stty syscall holder */
-	.long sys_ni_syscall	/* old gtty syscall holder */
-	.long sys_access
-	.long sys_nice
-	.long sys_ni_syscall	/* 35 - old ftime syscall holder */
-	.long sys_sync
-	.long sys_kill
-	.long sys_rename
-	.long sys_mkdir
-	.long sys_rmdir		/* 40 */
-	.long sys_dup
-	.long sys_pipe
-	.long sys_times
-	.long sys_ni_syscall	/* old prof syscall holder */
-	.long sys_brk		/* 45 */
-	.long sys_setgid16
-	.long sys_getgid16
-	.long sys_signal
-	.long sys_geteuid16
-	.long sys_getegid16	/* 50 */
-	.long sys_acct
-	.long sys_umount	/* recycled never used phys() */
-	.long sys_ni_syscall	/* old lock syscall holder */
-	.long sys_ioctl
-	.long sys_fcntl		/* 55 */
-	.long sys_ni_syscall	/* old mpx syscall holder */
-	.long sys_setpgid
-	.long sys_ni_syscall	/* old ulimit syscall holder */
-	.long sys_olduname
-	.long sys_umask		/* 60 */
-	.long sys_chroot
-	.long sys_ustat
-	.long sys_dup2
-	.long sys_getppid
-	.long sys_getpgrp	/* 65 */
-	.long sys_setsid
-	.long sys_sigaction
-	.long sys_sgetmask
-	.long sys_ssetmask
-	.long sys_setreuid16	/* 70 */
-	.long sys_setregid16
-	.long sys_sigsuspend
-	.long sys_sigpending
-	.long sys_sethostname
-	.long sys_setrlimit	/* 75 */
-	.long sys_old_getrlimit
-	.long sys_getrusage
-	.long sys_gettimeofday
-	.long sys_settimeofday
-	.long sys_getgroups16	/* 80 */
-	.long sys_setgroups16
-	.long old_select
-	.long sys_symlink
-	.long sys_lstat
-	.long sys_readlink	/* 85 */
-	.long sys_uselib
-	.long sys_swapon
-	.long sys_reboot
-	.long old_readdir
-	.long old_mmap		/* 90 */
-	.long sys_munmap
-	.long sys_truncate
-	.long sys_ftruncate
-	.long sys_fchmod
-	.long sys_fchown16	/* 95 */
-	.long sys_getpriority
-	.long sys_setpriority
-	.long sys_ni_syscall	/* old profil syscall holder */
-	.long sys_statfs
-	.long sys_fstatfs	/* 100 */
-	.long sys_ioperm
-	.long sys_socketcall
-	.long sys_syslog
-	.long sys_setitimer
-	.long sys_getitimer	/* 105 */
-	.long sys_newstat
-	.long sys_newlstat
-	.long sys_newfstat
-	.long sys_uname
-	.long sys_iopl		/* 110 */
-	.long sys_vhangup
-	.long sys_ni_syscall	/* old "idle" system call */
-	.long sys_vm86old
-	.long sys_wait4
-	.long sys_swapoff	/* 115 */
-	.long sys_sysinfo
-	.long sys_ipc
-	.long sys_fsync
-	.long sys_sigreturn
-	.long sys_clone		/* 120 */
-	.long sys_setdomainname
-	.long sys_newuname
-	.long sys_modify_ldt
-	.long sys_adjtimex
-	.long sys_mprotect	/* 125 */
-	.long sys_sigprocmask
-	.long sys_ni_syscall	/* old "create_module" */
-	.long sys_init_module
-	.long sys_delete_module
-	.long sys_ni_syscall	/* 130: old "get_kernel_syms" */
-	.long sys_quotactl
-	.long sys_getpgid
-	.long sys_fchdir
-	.long sys_bdflush
-	.long sys_sysfs		/* 135 */
-	.long sys_personality
-	.long sys_ni_syscall	/* reserved for afs_syscall */
-	.long sys_setfsuid16
-	.long sys_setfsgid16
-	.long sys_llseek	/* 140 */
-	.long sys_getdents
-	.long sys_select
-	.long sys_flock
-	.long sys_msync
-	.long sys_readv		/* 145 */
-	.long sys_writev
-	.long sys_getsid
-	.long sys_fdatasync
-	.long sys_sysctl
-	.long sys_mlock		/* 150 */
-	.long sys_munlock
-	.long sys_mlockall
-	.long sys_munlockall
-	.long sys_sched_setparam
-	.long sys_sched_getparam	/* 155 */
-	.long sys_sched_setscheduler
-	.long sys_sched_getscheduler
-	.long sys_sched_yield
-	.long sys_sched_get_priority_max
-	.long sys_sched_get_priority_min	/* 160 */
-	.long sys_sched_rr_get_interval
-	.long sys_nanosleep
-	.long sys_mremap
-	.long sys_setresuid16
-	.long sys_getresuid16	/* 165 */
-	.long sys_vm86
-	.long sys_ni_syscall	/* Old sys_query_module */
-	.long sys_poll
-	.long sys_nfsservctl
-	.long sys_setresgid16	/* 170 */
-	.long sys_getresgid16
-	.long sys_prctl
-	.long sys_rt_sigreturn
-	.long sys_rt_sigaction
-	.long sys_rt_sigprocmask	/* 175 */
-	.long sys_rt_sigpending
-	.long sys_rt_sigtimedwait
-	.long sys_rt_sigqueueinfo
-	.long sys_rt_sigsuspend
-	.long sys_pread64	/* 180 */
-	.long sys_pwrite64
-	.long sys_chown16
-	.long sys_getcwd
-	.long sys_capget
-	.long sys_capset	/* 185 */
-	.long sys_sigaltstack
-	.long sys_sendfile
-	.long sys_ni_syscall	/* reserved for streams1 */
-	.long sys_ni_syscall	/* reserved for streams2 */
-	.long sys_vfork		/* 190 */
-	.long sys_getrlimit
-	.long sys_mmap2
-	.long sys_truncate64
-	.long sys_ftruncate64
-	.long sys_stat64	/* 195 */
-	.long sys_lstat64
-	.long sys_fstat64
-	.long sys_lchown
-	.long sys_getuid
-	.long sys_getgid	/* 200 */
-	.long sys_geteuid
-	.long sys_getegid
-	.long sys_setreuid
-	.long sys_setregid
-	.long sys_getgroups	/* 205 */
-	.long sys_setgroups
-	.long sys_fchown
-	.long sys_setresuid
-	.long sys_getresuid
-	.long sys_setresgid	/* 210 */
-	.long sys_getresgid
-	.long sys_chown
-	.long sys_setuid
-	.long sys_setgid
-	.long sys_setfsuid	/* 215 */
-	.long sys_setfsgid
-	.long sys_pivot_root
-	.long sys_mincore
-	.long sys_madvise
-	.long sys_getdents64	/* 220 */
-	.long sys_fcntl64
-	.long sys_ni_syscall	/* reserved for TUX */
-	.long sys_ni_syscall
-	.long sys_gettid
-	.long sys_readahead	/* 225 */
-	.long sys_setxattr
-	.long sys_lsetxattr
-	.long sys_fsetxattr
-	.long sys_getxattr
-	.long sys_lgetxattr	/* 230 */
-	.long sys_fgetxattr
-	.long sys_listxattr
-	.long sys_llistxattr
-	.long sys_flistxattr
-	.long sys_removexattr	/* 235 */
-	.long sys_lremovexattr
-	.long sys_fremovexattr
-	.long sys_tkill
-	.long sys_sendfile64
-	.long sys_futex		/* 240 */
-	.long sys_sched_setaffinity
-	.long sys_sched_getaffinity
-	.long sys_set_thread_area
-	.long sys_get_thread_area
-	.long sys_io_setup	/* 245 */
-	.long sys_io_destroy
-	.long sys_io_getevents
-	.long sys_io_submit
-	.long sys_io_cancel
-	.long sys_fadvise64	/* 250 */
-	.long sys_ni_syscall
-	.long sys_exit_group
-	.long sys_lookup_dcookie
-	.long sys_epoll_create
-	.long sys_epoll_ctl	/* 255 */
-	.long sys_epoll_wait
-	.long sys_remap_file_pages
-	.long sys_set_tid_address
-	.long sys_timer_create
-	.long sys_timer_settime		/* 260 */
-	.long sys_timer_gettime
-	.long sys_timer_getoverrun
-	.long sys_timer_delete
-	.long sys_clock_settime
-	.long sys_clock_gettime		/* 265 */
-	.long sys_clock_getres
-	.long sys_clock_nanosleep
-	.long sys_statfs64
-	.long sys_fstatfs64
-	.long sys_tgkill	/* 270 */
-	.long sys_utimes
-	.long sys_fadvise64_64
-	.long sys_ni_syscall	/* sys_vserver */
-	.long sys_mbind
-	.long sys_get_mempolicy
-	.long sys_set_mempolicy
-	.long sys_mq_open
-	.long sys_mq_unlink
-	.long sys_mq_timedsend
-	.long sys_mq_timedreceive	/* 280 */
-	.long sys_mq_notify
-	.long sys_mq_getsetattr
-	.long sys_ni_syscall	/* reserved for kexec */
-	.long sys_waitid
-	.long sys_ni_syscall	/* 285 */ /* available */
-	.long sys_add_key
-	.long sys_request_key
-	.long sys_keyctl
+#include "syscall_table.S"

 syscall_table_size=(.-sys_call_table)
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c	Tue Aug 9 23:57:17 2005
@@ -99,6 +99,11 @@
 EXPORT_SYMBOL(__get_user_2);
 EXPORT_SYMBOL(__get_user_4);

+EXPORT_SYMBOL(__put_user_1);
+EXPORT_SYMBOL(__put_user_2);
+EXPORT_SYMBOL(__put_user_4);
+EXPORT_SYMBOL(__put_user_8);
+
 EXPORT_SYMBOL(strpbrk);
 EXPORT_SYMBOL(strstr);

@@ -114,7 +119,6 @@
 EXPORT_SYMBOL(dma_free_coherent);

 #ifdef CONFIG_PCI
-EXPORT_SYMBOL(pcibios_penalize_isa_irq);
 EXPORT_SYMBOL(pci_mem_start);
 #endif

@@ -146,7 +150,6 @@
 /* TLB flushing */
 EXPORT_SYMBOL(flush_tlb_page);
-EXPORT_SYMBOL_GPL(flush_tlb_all);
 #endif

 #ifdef CONFIG_X86_IO_APIC
@@ -168,10 +171,6 @@
 EXPORT_SYMBOL_GPL(set_nmi_callback);
 EXPORT_SYMBOL_GPL(unset_nmi_callback);

-#undef memcmp
-extern int memcmp(const void *,const void *,__kernel_size_t);
-EXPORT_SYMBOL(memcmp);
-
 EXPORT_SYMBOL(register_die_notifier);
 #ifdef CONFIG_HAVE_DEC_LOCK
 EXPORT_SYMBOL(_atomic_dec_and_lock);
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c	Tue Aug 9 23:57:17 2005
@@ -86,7 +86,7 @@
			   dma_addr_t *dma_handle)
 #else
 void *dma_alloc_coherent(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, int gfp)
+			   dma_addr_t *dma_handle, unsigned int __nocast gfp)
 #endif
 {
	void *ret;
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c	Tue Aug 9 23:57:17 2005
@@ -36,6 +36,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
+#include <linux/random.h>

 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -57,7 +58,7 @@

 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");

-int hlt_counter;
+static int hlt_counter;

 unsigned long boot_option_idle_override = 0;
 EXPORT_SYMBOL(boot_option_idle_override);
@@ -74,7 +75,7 @@
 /*
 * Powermanagement idle function, if any..
 */
 void (*pm_idle)(void);

-static cpumask_t cpu_idle_map;
+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

 void disable_hlt(void)
 {
@@ -120,11 +121,11 @@
	while (1) {
		while (!need_resched()) {
-			if (cpu_isset(cpu, cpu_idle_map))
-				cpu_clear(cpu, cpu_idle_map);
+			if (__get_cpu_var(cpu_idle_state))
+				__get_cpu_var(cpu_idle_state) = 0;
			rmb();
-			irq_stat[cpu].idle_timestamp = jiffies;
+			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
			xen_idle();
		}
		schedule();
@@ -133,16 +134,28 @@

 void cpu_idle_wait(void)
 {
-	int cpu;
+	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

-	for_each_online_cpu(cpu)
-		cpu_set(cpu, cpu_idle_map);
+	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+	put_cpu();
+
+	cpus_clear(map);
+	for_each_online_cpu(cpu) {
+		per_cpu(cpu_idle_state, cpu) = 1;
+		cpu_set(cpu, map);
+	}
+
+	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
-		cpus_and(map, cpu_idle_map, cpu_online_map);
+		for_each_online_cpu(cpu) {
+			if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
+				cpu_clear(cpu, map);
+		}
+		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
@@ -286,6 +299,17 @@
	unsigned long eflags;

	childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
+	/*
+	 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+	 * This is necessary to guarantee that the entire "struct pt_regs"
+	 * is accessable even if the CPU haven't stored the SS/ESP registers
+	 * on the stack (interrupt gate does not save these registers
+	 * when switching to the same priv ring).
+	 * Therefore beware: accessing the xss/esp fields of the
+	 * "struct pt_regs" is possible, but they may contain the
+	 * completely wrong values.
+	 */
+	childregs = (struct pt_regs *) ((unsigned long) childregs - 8);
	*childregs = *regs;
	childregs->eax = 0;
	childregs->esp = esp;
@@ -439,12 +463,6 @@
	 */
	tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
 }
-/*
- * This special macro can be used to load a debugging register
- */
-#define loaddebug(thread,register) \
-	HYPERVISOR_set_debugreg((register), \
-		(thread->debugreg[register]))

 /*
 * switch_to(x,yn) should switch tasks from x to y.
@@ -777,3 +795,9 @@

	return 0;
 }
+
+unsigned long arch_align_stack(unsigned long sp)
+{
+	if (randomize_va_space)
+		sp -= get_random_int() % 8192;
+	return sp & ~0xf;
+}
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c	Tue Aug 9 23:57:17 2005
@@ -40,6 +40,7 @@
 #include <linux/efi.h>
 #include <linux/init.h>
 #include <linux/edd.h>
+#include <linux/nodemask.h>
 #include <linux/kernel.h>
 #include <linux/notifier.h>
 #include <video/edid.h>
@@ -80,7 +81,6 @@
 struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 0, 1, 0, -1 };

 unsigned long mmu_cr4_features;
-EXPORT_SYMBOL_GPL(mmu_cr4_features);

 #ifdef CONFIG_ACPI_INTERPRETER
	int acpi_disabled = 0;
@@ -122,8 +122,6 @@
 struct edid_info edid_info;
 struct ist_info ist_info;
 struct e820map e820;
-
-unsigned char aux_device_present;

 extern void early_cpu_init(void);
 extern void dmi_scan_machine(void);
@@ -454,10 +452,10 @@
	struct e820entry *pbios; /* pointer to original bios entry */
	unsigned long long addr; /* address for this change point */
 };
-struct change_member change_point_list[2*E820MAX] __initdata;
-struct change_member *change_point[2*E820MAX] __initdata;
-struct e820entry *overlap_list[E820MAX] __initdata;
-struct e820entry new_bios[E820MAX] __initdata;
+static struct change_member change_point_list[2*E820MAX] __initdata;
+static struct change_member *change_point[2*E820MAX] __initdata;
+static struct e820entry *overlap_list[E820MAX] __initdata;
+static struct e820entry new_bios[E820MAX] __initdata;

 static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
 {
@@ -995,8 +993,6 @@
	return max_low_pfn;
 }

-#ifndef CONFIG_DISCONTIGMEM
-
 /*
 * Free all available memory for boot time allocation.  Used
 * as a callback function by efi_memory_walk()
 */
@@ -1070,15 +1066,16 @@
		reserve_bootmem(addr, PAGE_SIZE);
 }

+#ifndef CONFIG_DISCONTIGMEM
+void __init setup_bootmem_allocator(void);
 static unsigned long __init setup_memory(void)
 {
-	unsigned long bootmap_size, start_pfn, max_low_pfn;
	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
-	start_pfn = PFN_UP(__pa(xen_start_info.pt_base)) + xen_start_info.nr_pt_frames;
+	min_low_pfn = PFN_UP(__pa(xen_start_info.pt_base)) + xen_start_info.nr_pt_frames;

	find_max_pfn();
@@ -1094,10 +1091,43 @@
 #endif
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(max_low_pfn));
+
+	setup_bootmem_allocator();
+
+	return max_low_pfn;
+}
+
+void __init zone_sizes_init(void)
+{
+	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+	unsigned int max_dma, low;
+
+	max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+	low = max_low_pfn;
+
+	if (low < max_dma)
+		zones_size[ZONE_DMA] = low;
+	else {
+		zones_size[ZONE_DMA] = max_dma;
+		zones_size[ZONE_NORMAL] = low - max_dma;
+#ifdef CONFIG_HIGHMEM
+		zones_size[ZONE_HIGHMEM] = highend_pfn - low;
+#endif
+	}
+	free_area_init(zones_size);
+}
+#else
+extern unsigned long setup_memory(void);
+extern void zone_sizes_init(void);
+#endif /* !CONFIG_DISCONTIGMEM */
+
+void __init setup_bootmem_allocator(void)
+{
+	unsigned long bootmap_size;
	/*
	 * Initialize the boot-time allocator (with low memory only):
	 */
-	bootmap_size = init_bootmem(start_pfn, max_low_pfn);
+	bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);

	register_bootmem_low_pages(max_low_pfn);
@@ -1107,7 +1137,7 @@
	 * the (very unlikely) case of us accidentally initializing the
	 * bootmem allocator with an invalid RAM area.
	 */
-	reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
+	reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) +
			 bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));

	/* reserve EBDA region, it's a 4K region */
@@ -1160,12 +1190,25 @@
 #endif

	phys_to_machine_mapping = (unsigned int *)xen_start_info.mfn_list;
-
-	return max_low_pfn;
-}
-#else
-extern unsigned long setup_memory(void);
-#endif /* !CONFIG_DISCONTIGMEM */
+}
+
+/*
+ * The node 0 pgdat is initialized before all of these because
+ * it's needed for bootmem.  node>0 pgdats have their virtual
+ * space allocated before the pagetables are in place to access
+ * them, so they can't be cleared then.
+ *
+ * This should all compile down to nothing when NUMA is off.
+ */
+void __init remapped_pgdat_init(void)
+{
+	int nid;
+
+	for_each_online_node(nid) {
+		if (nid != 0)
+			memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+	}
+}

 /*
 * Request address space for all standard RAM and ROM resources
@@ -1440,7 +1483,6 @@
		machine_submodel_id = SYS_DESC_TABLE.table[1];
		BIOS_revision = SYS_DESC_TABLE.table[2];
	}
-	aux_device_present = AUX_DEVICE_INFO;
	bootloader_type = LOADER_TYPE;

 #ifdef CONFIG_XEN_PHYSDEV_ACCESS
@@ -1500,6 +1542,8 @@
	smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
 #endif
	paging_init();
+	remapped_pgdat_init();
+	zone_sizes_init();

	/* Make sure we have a correctly sized P->M table. */
	if (max_pfn != xen_start_info.nr_pages) {
@@ -1564,11 +1608,13 @@
	if (efi_enabled)
		efi_map_memmap();

+#ifdef CONFIG_ACPI_BOOT
	/*
	 * Parse the ACPI tables for possible boot-time SMP configuration.
	 */
	acpi_boot_table_init();
	acpi_boot_init();
+#endif

 #ifdef CONFIG_X86_LOCAL_APIC
	if (smp_found_config)
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/signal.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/signal.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/signal.c	Tue Aug 9 23:57:17 2005
@@ -93,7 +93,7 @@
	if (act) {
		old_sigset_t mask;
-		if (verify_area(VERIFY_READ, act, sizeof(*act)) ||
+		if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
			return -EFAULT;
@@ -105,7 +105,7 @@
	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);

	if (!ret && oact) {
-		if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) ||
+		if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
			return -EFAULT;
@@ -187,7 +187,7 @@
		struct _fpstate __user * buf;
		err |= __get_user(buf, &sc->fpstate);
		if (buf) {
-			if (verify_area(VERIFY_READ, buf, sizeof(*buf)))
+			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
				goto badframe;
			err |= restore_i387(buf);
		} else {
@@ -213,7 +213,7 @@
	sigset_t set;
	int eax;

-	if (verify_area(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
		goto badframe;
	if (__get_user(set.sig[0], &frame->sc.oldmask)
	    || (_NSIG_WORDS > 1
@@ -243,7 +243,7 @@
	sigset_t set;
	int eax;

-	if (verify_area(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
		goto badframe;
	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
		goto badframe;
@@ -557,6 +557,16 @@
		}
	}

+	/*
+	 * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
+	 * that register information in the sigcontext is correct.
+	 */
+	if (unlikely(regs->eflags & TF_MASK)
+	    && likely(current->ptrace & PT_DTRACE)) {
+		current->ptrace &= ~PT_DTRACE;
+		regs->eflags &= ~TF_MASK;
+	}
+
	/* Set up the stack frame */
	if (ka->sa.sa_flags & SA_SIGINFO)
		setup_rt_frame(sig, ka, info, oldset, regs);
@@ -608,8 +618,7 @@
	 * inside the kernel.
	 */
	if (unlikely(current->thread.debugreg[7])) {
-		HYPERVISOR_set_debugreg(7,
-					current->thread.debugreg[7]);
+		loaddebug(&current->thread, 7);
	}

	/* Whee! Actually deliver the signal. */
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c	Tue Aug 9 23:57:17 2005
@@ -176,6 +176,35 @@
 ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); })

 /*
+ * This is a special lock that is owned by the CPU and holds the index
+ * register we are working with.  It is required for NMI access to the
+ * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
+ */
+volatile unsigned long cmos_lock = 0;
+EXPORT_SYMBOL(cmos_lock);
+
+/* Routines for accessing the CMOS RAM/RTC. */
+unsigned char rtc_cmos_read(unsigned char addr)
+{
+	unsigned char val;
+	lock_cmos_prefix(addr);
+	outb_p(addr, RTC_PORT(0));
+	val = inb_p(RTC_PORT(1));
+	lock_cmos_suffix(addr);
+	return val;
+}
+EXPORT_SYMBOL(rtc_cmos_read);
+
+void rtc_cmos_write(unsigned char val, unsigned char addr)
+{
+	lock_cmos_prefix(addr);
+	outb_p(addr, RTC_PORT(0));
+	outb_p(val, RTC_PORT(1));
+	lock_cmos_suffix(addr);
+}
+EXPORT_SYMBOL(rtc_cmos_write);
+
+/*
 * This version of gettimeofday has microsecond resolution
 * and better than microsecond precision on fast x86 machines with TSC.
 */
@@ -335,15 +364,22 @@
 {
	int retval;

+	WARN_ON(irqs_disabled());
+
	/* gets recalled with irq locally disabled */
-	spin_lock(&rtc_lock);
+	spin_lock_irq(&rtc_lock);
	if (efi_enabled)
		retval = efi_set_rtc_mmss(nowtime);
	else
		retval = mach_set_rtc_mmss(nowtime);
-	spin_unlock(&rtc_lock);
+	spin_unlock_irq(&rtc_lock);

	return retval;
+}
+#else
+static int set_rtc_mmss(unsigned long nowtime)
+{
+	return 0;
 }
 #endif
@@ -476,29 +512,6 @@
			last_update_to_xen = xtime.tv_sec;
		}
-
-		/*
-		 * If we have an externally synchronized Linux clock, then update
-		 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-		 * called as close as possible to 500 ms before the new second starts.
-		 */
-		if ((time_status & STA_UNSYNC) == 0 &&
-		    xtime.tv_sec > last_rtc_update + 660 &&
-		    (xtime.tv_nsec / 1000)
-			>= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
-		    (xtime.tv_nsec / 1000)
-			<= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) {
-			/* horrible...FIXME */
-			if (efi_enabled) {
-				if (efi_set_rtc_mmss(xtime.tv_sec) == 0)
-					last_rtc_update = xtime.tv_sec;
-				else
-					last_rtc_update = xtime.tv_sec - 600;
-			} else if (set_rtc_mmss(xtime.tv_sec) == 0)
-				last_rtc_update = xtime.tv_sec;
-			else
-				last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */
-		}
 #endif
 }
@@ -538,10 +551,59 @@
	return retval;
 }

+static void sync_cmos_clock(unsigned long dummy);
+
+static struct timer_list sync_cmos_timer =
+					TIMER_INITIALIZER(sync_cmos_clock, 0, 0);
+
+static void sync_cmos_clock(unsigned long dummy)
+{
+	struct timeval now, next;
+	int fail = 1;
+
+	/*
+	 * If we have an externally synchronized Linux clock, then update
+	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+	 * called as close as possible to 500 ms before the new second starts.
+	 * This code is run on a timer.  If the clock is set, that timer
+	 * may not expire at the correct time.  Thus, we adjust...
+	 */
+	if ((time_status & STA_UNSYNC) != 0)
+		/*
+		 * Not synced, exit, do not restart a timer (if one is
+		 * running, let it run out).
+		 */
+		return;
+
+	do_gettimeofday(&now);
+	if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
+	    now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
+		fail = set_rtc_mmss(now.tv_sec);
+
+	next.tv_usec = USEC_AFTER - now.tv_usec;
+	if (next.tv_usec <= 0)
+		next.tv_usec += USEC_PER_SEC;
+
+	if (!fail)
+		next.tv_sec = 659;
+	else
+		next.tv_sec = 0;
+
+	if (next.tv_usec >= USEC_PER_SEC) {
+		next.tv_sec++;
+		next.tv_usec -= USEC_PER_SEC;
+	}
+	mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
+}
+
+void notify_arch_cmos_timer(void)
+{
+	mod_timer(&sync_cmos_timer, jiffies + 1);
+}

 static long clock_cmos_diff, sleep_start;

-static int timer_suspend(struct sys_device *dev, u32 state)
+static int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
	/*
	 * Estimate time zone so that set_time can update the clock
@@ -599,14 +661,14 @@
 #ifdef CONFIG_HPET_TIMER
 extern void (*late_time_init)(void);
 /* Duplicate of time_init() below, with hpet_enable part added */
-void __init hpet_time_init(void)
+static void __init hpet_time_init(void)
 {
	xtime.tv_sec = get_cmos_time();
	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
	set_normalized_timespec(&wall_to_monotonic,
		-xtime.tv_sec, -xtime.tv_nsec);

-	if (hpet_enable() >= 0) {
+	if ((hpet_enable() >= 0) && hpet_use_timer) {
		printk("Using HPET for base-timer\n");
	}
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c	Tue Aug 9 23:57:17 2005
@@ -342,8 +342,7 @@

	if (panic_on_oops) {
		printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(5 * HZ);
+		ssleep(5);
		panic("Fatal exception");
	}
	do_exit(SIGSEGV);
@@ -450,6 +449,7 @@
 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
 DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
 #ifdef CONFIG_X86_MCE
 DO_ERROR(18, SIGBUS, "machine check", machine_check)
 #endif
@@ -635,16 +635,15 @@
 }

 #ifdef CONFIG_KPROBES
-fastcall int do_int3(struct pt_regs *regs, long error_code)
+fastcall void do_int3(struct pt_regs *regs, long error_code)
 {
	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
			== NOTIFY_STOP)
-		return 1;
+		return;
	/* This is an interrupt gate, because kprobes wants interrupts
	disabled.  Normal trap handlers don't. */
	restore_interrupts(regs);
	do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
-	return 0;
 }
 #endif
@@ -701,8 +700,6 @@
	/*
	 * Single-stepping through TF: make sure we ignore any events in
	 * kernel space (but re-enable TF when returning to user mode).
-	 * And if the event was due to a debugger (PT_DTRACE), clear the
-	 * TF flag so that register information is correct.
	 */
	if (condition & DR_STEP) {
		/*
@@ -712,11 +709,6 @@
		 */
		if ((regs->xcs & 2) == 0)
			goto clear_TF_reenable;
-
-		if (likely(tsk->ptrace & PT_DTRACE)) {
-			tsk->ptrace &= ~PT_DTRACE;
-			regs->eflags &= ~TF_MASK;
-		}
	}

	/* Ok, finally something we can handle */
@@ -806,7 +798,7 @@
	math_error((void __user *)regs->eip);
 }

-void simd_math_error(void __user *eip)
+static void simd_math_error(void __user *eip)
 {
	struct task_struct * task;
	siginfo_t info;
@@ -876,6 +868,51 @@
	current->thread.error_code = error_code;
	force_sig(SIGSEGV, current);
 }
+
+fastcall void setup_x86_bogus_stack(unsigned char * stk)
+{
+	unsigned long *switch16_ptr, *switch32_ptr;
+	struct pt_regs *regs;
+	unsigned long stack_top, stack_bot;
+	unsigned short iret_frame16_off;
+	int cpu = smp_processor_id();
+	/* reserve the space on 32bit stack for the magic switch16 pointer */
+	memmove(stk, stk + 8, sizeof(struct pt_regs));
+	switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
+	regs = (struct pt_regs *)stk;
+	/* now the switch32 on 16bit stack */
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);
+	iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
+	/* copy iret frame on 16bit stack */
+	memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
+	/* fill in the switch pointers */
+	switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
+	switch16_ptr[1] = __ESPFIX_SS;
+	switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
+		8 - CPU_16BIT_STACK_SIZE;
+	switch32_ptr[1] = __KERNEL_DS;
+}
+
+fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
+{
+	unsigned long *switch32_ptr;
+	unsigned char *stack16, *stack32;
+	unsigned long stack_top, stack_bot;
+	int len;
+	int cpu = smp_processor_id();
+	stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+	stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
+	switch32_ptr = (unsigned long *)(stack_top - 8);
+	/* copy the data from 16bit stack to 32bit stack */
+	len = CPU_16BIT_STACK_SIZE - 8 - sp;
+	stack16 = (unsigned char *)(stack_bot + sp);
+	stack32 = (unsigned char *)
+		(switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
+	memcpy(stack32, stack16, len);
+	return stack32;
 }

 /*
@@ -978,3 +1015,10 @@
	 */
	cpu_init();
 }
+
+static int __init kstack_setup(char *s)
+{
+	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+	return 0;
+}
+__setup("kstack=", kstack_setup);
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c	Tue Aug 9 23:57:17 2005
@@ -77,7 +77,7 @@
	 * force other mappings to Oops if they'll try to access
	 * this pte without first remap it
	 */
-	pte_clear(kmap_pte-idx);
+	pte_clear(&init_mm, vaddr, kmap_pte-idx);
	__flush_tlb_one(vaddr);
 #endif
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/mm/init.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c	Tue Aug 9 23:57:17 2005
@@ -249,13 +249,10 @@
 pte_t *kmap_pte;
 pgprot_t kmap_prot;

-EXPORT_SYMBOL(kmap_prot);
-EXPORT_SYMBOL(kmap_pte);
-
 #define kmap_get_fixmap_pte(vaddr) \
	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr))

-void __init kmap_init(void)
+static void __init kmap_init(void)
 {
	unsigned long kmap_vstart;

@@ -266,7 +263,7 @@
	kmap_prot = PAGE_KERNEL;
 }

-void __init
-void __init permanent_kmaps_init(pgd_t *pgd_base)
+static void __init permanent_kmaps_init(pgd_t *pgd_base)
 {
	pgd_t *pgd;
	pud_t *pud;
@@ -298,7 +295,7 @@
 }

 #ifndef CONFIG_DISCONTIGMEM
-void __init set_highmem_pages_init(int bad_ppro)
+static void __init set_highmem_pages_init(int bad_ppro)
 {
	int pfn;
	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
@@ -426,38 +423,6 @@
	flush_tlb_all();
 }

-#ifndef CONFIG_DISCONTIGMEM
-void __init zone_sizes_init(void)
-{
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-	unsigned int /*max_dma,*/ high, low;
-
-	/*
-	 * XEN: Our notion of "DMA memory" is fake when running over Xen.
-	 * We simply put all RAM in the DMA zone so that those drivers which
-	 * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
-	 * Those drivers that *do* require lowmem are screwed anyway when
-	 * running over Xen!
-	 */
-	/*max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;*/
-	low = max_low_pfn;
-	high = highend_pfn;
-
-	/*if (low < max_dma)*/
-		zones_size[ZONE_DMA] = low;
-	/*else*/ {
-		/*zones_size[ZONE_DMA] = max_dma;*/
-		/*zones_size[ZONE_NORMAL] = low - max_dma;*/
-#ifdef CONFIG_HIGHMEM
-		zones_size[ZONE_HIGHMEM] = high - low;
-#endif
-	}
-	free_area_init(zones_size);
-}
-#else
-extern void zone_sizes_init(void);
-#endif /* !CONFIG_DISCONTIGMEM */
-
 static int disable_nx __initdata = 0;
 u64 __supported_pte_mask = ~_PAGE_NX;
@@ -560,7 +525,6 @@
	__flush_tlb_all();

	kmap_init();
-	zone_sizes_init();

	/* Switch to the real shared_info page, and clear the dummy page. */
	flush_page_update_queue();
@@ -586,7 +550,7 @@
 * but fortunately the switch to using exceptions got rid of all that.
 */

-void __init test_wp_bit(void)
+static void __init test_wp_bit(void)
 {
	printk("Checking if this processor honours the WP bit even in supervisor mode... ");

@@ -605,20 +569,17 @@
	}
 }

+static void __init set_max_mapnr_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+	num_physpages = highend_pfn;
+#else
+	num_physpages = max_low_pfn;
+#endif
 #ifndef CONFIG_DISCONTIGMEM
-static void __init set_max_mapnr_init(void)
-{
-#ifdef CONFIG_HIGHMEM
-	max_mapnr = num_physpages = highend_pfn;
-#else
-	max_mapnr = num_physpages = max_low_pfn;
-#endif
-}
-#define __free_all_bootmem() free_all_bootmem()
-#else
-#define __free_all_bootmem() free_all_bootmem_node(NODE_DATA(0))
-extern void set_max_mapnr_init(void);
-#endif /* !CONFIG_DISCONTIGMEM */
+	max_mapnr = num_physpages;
+#endif
+}

 static struct kcore_list kcore_mem, kcore_vmalloc;

@@ -650,16 +611,16 @@
	set_max_mapnr_init();

 #ifdef CONFIG_HIGHMEM
-	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE);
+	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif

	printk("vmalloc area: %lx-%lx, maxmem %lx\n",
	       VMALLOC_START,VMALLOC_END,MAXMEM);
	BUG_ON(VMALLOC_START > VMALLOC_END);

	/* this will put all low memory onto the freelists */
-	totalram_pages += __free_all_bootmem();
+	totalram_pages += free_all_bootmem();
	/* XEN: init and count low-mem pages outside initial allocation. */
	for (pfn = xen_start_info.nr_pages; pfn < max_low_pfn; pfn++) {
		ClearPageReserved(&mem_map[pfn]);
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c	Tue Aug 9 23:57:17 2005
@@ -368,7 +368,7 @@
	if (PTRS_PER_PMD > 1)
		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-	/* in the non-PAE case, clear_page_range() clears user pgd entries */
+	/* in the non-PAE case, free_pgtables() clears user pgd entries */
	kmem_cache_free(pgd_cache, pgd);
 }

diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/arch/xen/i386/pci/irq.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/pci/irq.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/pci/irq.c	Tue Aug 9 23:57:17 2005
@@ -114,11 +114,11 @@
	if (pin != 0) {
		if (dev->irq != 0)
			printk(KERN_INFO "PCI: Obtained IRQ %d for device %s\n",
-			       dev->irq, dev->slot_name);
+			       dev->irq, pci_name(dev));
		else
			printk(KERN_WARNING "PCI: No IRQ known for interrupt "
			       "pin %c of device %s.\n", 'A' + pin - 1,
-			       dev->slot_name);
+			       pci_name(dev));
	}

	return 0;
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/drivers/Makefile
--- a/linux-2.6-xen-sparse/drivers/Makefile	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/drivers/Makefile	Tue Aug 9 23:57:17 2005
@@ -48,8 +48,8 @@
 obj-$(CONFIG_TC)		+= tc/
 obj-$(CONFIG_USB)		+= usb/
 obj-$(CONFIG_USB_GADGET)	+= usb/gadget/
+obj-$(CONFIG_GAMEPORT)		+= input/gameport/
 obj-$(CONFIG_INPUT)		+= input/
-obj-$(CONFIG_GAMEPORT)		+= input/gameport/
 obj-$(CONFIG_I2O)		+= message/
 obj-$(CONFIG_I2C)		+= i2c/
 obj-$(CONFIG_W1)		+= w1/
@@ -62,5 +62,6 @@
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_MMC)		+= mmc/
 obj-$(CONFIG_INFINIBAND)	+= infiniband/
+obj-$(CONFIG_BLK_DEV_SGIIOC4)	+= sn/
 obj-y				+= firmware/
 obj-$(CONFIG_CRYPTO)		+= crypto/
diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/drivers/char/mem.c
--- a/linux-2.6-xen-sparse/drivers/char/mem.c	Tue Aug 9 15:17:45 2005
+++ b/linux-2.6-xen-sparse/drivers/char/mem.c	Tue Aug 9 23:57:17 2005
@@ -23,6 +23,7 @@
 #include <linux/devfs_fs_kernel.h>
 #include <linux/ptrace.h>
 #include <linux/device.h>
+#include <linux/backing-dev.h>

 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -76,14 +77,6 @@
	 * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases.
	 */
	return !(efi_mem_attributes(addr) & EFI_MEMORY_WB);
-#elif defined(CONFIG_PPC64)
-	/* On PPC64, we always do non-cacheable access to the IO hole and
-	 * cacheable elsewhere. Cache paradox can checkstop the CPU and
-	 * the high_memory heuristic below is wrong on machines with memory
-	 * above the IO hole... Ah, and of course, XFree86 doesn't pass
-	 * O_SYNC when mapping us to tap IO space. Surprised ?
-	 */
-	return !page_is_ram(addr >> PAGE_SHIFT);
 #else
	/*
	 * Accessing memory above the top the kernel knows about or through a file pointer
@@ -111,38 +104,6 @@
 }
 #endif

-static ssize_t do_write_mem(void *p, unsigned long realp,
-			    const char __user * buf, size_t count, loff_t *ppos)
-{
-	ssize_t written;
-	unsigned long copied;
-
-	written = 0;
-#if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU))
-	/* we don't have page 0 mapped on sparc and m68k.. */
-	if (realp < PAGE_SIZE) {
-		unsigned long sz = PAGE_SIZE-realp;
-		if (sz > count) sz = count;
-		/* Hmm. Do something? */
*/ - buf+=sz; - p+=sz; - count-=sz; - written+=sz; - } -#endif - copied = copy_from_user(p, buf, count); - if (copied) { - ssize_t ret = written + (count - copied); - - if (ret) - return ret; - return -EFAULT; - } - written += count; - *ppos += written; - return written; -} - #ifndef ARCH_HAS_DEV_MEM /* * This funcion reads the *physical* memory. The f_pos points directly to the @@ -152,15 +113,16 @@ size_t count, loff_t *ppos) { unsigned long p = *ppos; - ssize_t read; + ssize_t read, sz; + char *ptr; if (!valid_phys_addr_range(p, &count)) return -EFAULT; read = 0; -#if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) +#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED /* we don't have page 0 mapped on sparc and m68k.. */ if (p < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-p; + sz = PAGE_SIZE - p; if (sz > count) sz = count; if (sz > 0) { @@ -173,9 +135,33 @@ } } #endif - if (copy_to_user(buf, __va(p), count)) - return -EFAULT; - read += count; + + while (count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + /* + * On ia64 if a page has been mapped somewhere as + * uncached, then it must also be accessed uncached + * by the kernel or data corruption may occur + */ + ptr = xlate_dev_mem_ptr(p); + + if (copy_to_user(buf, ptr, sz)) + return -EFAULT; + buf += sz; + p += sz; + count -= sz; + read += sz; + } + *ppos += read; return read; } @@ -184,16 +170,76 @@ size_t count, loff_t *ppos) { unsigned long p = *ppos; + ssize_t written, sz; + unsigned long copied; + void *ptr; if (!valid_phys_addr_range(p, &count)) return -EFAULT; - return do_write_mem(__va(p), p, buf, count, ppos); + + written = 0; + +#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED + /* we don't have page 0 mapped on sparc and m68k.. */ + if (p < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE - p; + if (sz > count) + sz = count; + /* Hmm. Do something? */ + buf += sz; + p += sz; + count -= sz; + written += sz; + } +#endif + + while (count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + /* + * On ia64 if a page has been mapped somewhere as + * uncached, then it must also be accessed uncached + * by the kernel or data corruption may occur + */ + ptr = xlate_dev_mem_ptr(p); + + copied = copy_from_user(ptr, buf, sz); + if (copied) { + ssize_t ret; + + ret = written + (sz - copied); + if (ret) + return ret; + return -EFAULT; + } + buf += sz; + p += sz; + count -= sz; + written += sz; + } + + *ppos += written; + return written; } #endif static int mmap_kmem(struct file * file, struct vm_area_struct * vma) { -#ifdef pgprot_noncached +#if defined(__HAVE_PHYS_MEM_ACCESS_PROT) + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + + vma->vm_page_prot = phys_mem_access_prot(file, offset, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +#elif defined(pgprot_noncached) unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; int uncached; @@ -212,6 +258,25 @@ return 0; } +#if 0 +static int mmap_kmem(struct file * file, struct vm_area_struct * vma) +{ + unsigned long long val; + /* + * RED-PEN: on some architectures there is more mapped memory + * than available in mem_map which pfn_valid checks + * for. Perhaps should add a new macro here. + * + * RED-PEN: vmalloc is not supported right now. 
+ */ + if (!pfn_valid(vma->vm_pgoff)) + return -EIO; + val = (u64)vma->vm_pgoff << PAGE_SHIFT; + vma->vm_pgoff = __pa(val) >> PAGE_SHIFT; + return mmap_mem(file, vma); +} +#endif + extern long vread(char *buf, char *addr, unsigned long count); extern long vwrite(char *buf, char *addr, unsigned long count); @@ -222,33 +287,55 @@ size_t count, loff_t *ppos) { unsigned long p = *ppos; - ssize_t read = 0; - ssize_t virtr = 0; + ssize_t low_count, read, sz; char * kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ - + + read = 0; if (p < (unsigned long) high_memory) { - read = count; + low_count = count; if (count > (unsigned long) high_memory - p) - read = (unsigned long) high_memory - p; - -#if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) + low_count = (unsigned long) high_memory - p; + +#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE && read > 0) { + if (p < PAGE_SIZE && low_count > 0) { size_t tmp = PAGE_SIZE - p; - if (tmp > read) tmp = read; + if (tmp > low_count) tmp = low_count; if (clear_user(buf, tmp)) return -EFAULT; buf += tmp; p += tmp; - read -= tmp; + read += tmp; + low_count -= tmp; count -= tmp; } #endif - if (copy_to_user(buf, (char *)p, read)) - return -EFAULT; - p += read; - buf += read; - count -= read; + while (low_count > 0) { + /* + * Handle first page in case it's not aligned + */ + if (-p & (PAGE_SIZE - 1)) + sz = -p & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, low_count); + + /* + * On ia64 if a page has been mapped somewhere as + * uncached, then it must also be accessed uncached + * by the kernel or data corruption may occur + */ + kbuf = xlate_dev_kmem_ptr((char *)p); + + if (copy_to_user(buf, kbuf, sz)) + return -EFAULT; + buf += sz; + p += sz; + read += sz; + low_count -= sz; + count -= sz; + } } if (count > 0) { @@ -269,14 +356,78 @@ } count -= len; buf += len; - virtr += len; + read += len; p += len; } free_page((unsigned long)kbuf); } *ppos = p; - return virtr + read; -} + return read; +} + + +static inline ssize_t +do_write_kmem(void *p, unsigned long realp, const char __user * buf, + size_t count, loff_t *ppos) +{ + ssize_t written, sz; + unsigned long copied; + + written = 0; +#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED + /* we don't have page 0 mapped on sparc and m68k.. */ + if (realp < PAGE_SIZE) { + unsigned long sz = PAGE_SIZE - realp; + if (sz > count) + sz = count; + /* Hmm. Do something? */ + buf += sz; + p += sz; + realp += sz; + count -= sz; + written += sz; + } +#endif + + while (count > 0) { + char *ptr; + /* + * Handle first page in case it's not aligned + */ + if (-realp & (PAGE_SIZE - 1)) + sz = -realp & (PAGE_SIZE - 1); + else + sz = PAGE_SIZE; + + sz = min_t(unsigned long, sz, count); + + /* + * On ia64 if a page has been mapped somewhere as + * uncached, then it must also be accessed uncached + * by the kernel or data corruption may occur + */ + ptr = xlate_dev_kmem_ptr(p); + + copied = copy_from_user(ptr, buf, sz); + if (copied) { + ssize_t ret; + + ret = written + (sz - copied); + if (ret) + return ret; + return -EFAULT; + } + buf += sz; + p += sz; + realp += sz; + count -= sz; + written += sz; + } + + *ppos += written; + return written; +} + /* * This function writes to the *virtual* memory as seen by the kernel. 
@@ -296,7 +447,7 @@ if (count > (unsigned long) high_memory - p) wrote = (unsigned long) high_memory - p; - written = do_write_mem((void*)p, p, buf, wrote, ppos); + written = do_write_kmem((void*)p, p, buf, wrote, ppos); if (written != wrote) return written; wrote = written; @@ -344,7 +495,7 @@ unsigned long i = *ppos; char __user *tmp = buf; - if (verify_area(VERIFY_WRITE,buf,count)) + if (!access_ok(VERIFY_WRITE, buf, count)) return -EFAULT; while (count-- > 0 && i < 65536) { if (__put_user(inb(i),tmp) < 0) @@ -362,7 +513,7 @@ unsigned long i = *ppos; const char __user * tmp = buf; - if (verify_area(VERIFY_READ,buf,count)) + if (!access_ok(VERIFY_READ,buf,count)) return -EFAULT; while (count-- > 0 && i < 65536) { char c; @@ -568,7 +719,6 @@ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } -#define mmap_mem mmap_kmem #define zero_lseek null_lseek #define full_lseek null_lseek #define write_zero write_null @@ -581,7 +731,7 @@ .llseek = memory_lseek, .read = read_mem, .write = write_mem, - .mmap = mmap_mem, + .mmap = mmap_kmem, .open = open_mem, }; #else @@ -616,6 +766,10 @@ .read = read_zero, .write = write_zero, .mmap = mmap_zero, +}; + +static struct backing_dev_info zero_bdi = { + .capabilities = BDI_CAP_MAP_COPY, }; static struct file_operations full_fops = { @@ -664,6 +818,7 @@ break; #endif case 5: + filp->f_mapping->backing_dev_info = &zero_bdi; filp->f_op = &zero_fops; break; case 7: diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/drivers/char/tty_io.c --- a/linux-2.6-xen-sparse/drivers/char/tty_io.c Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/drivers/char/tty_io.c Tue Aug 9 23:57:17 2005 @@ -187,7 +187,7 @@ EXPORT_SYMBOL(tty_name); -inline int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, +int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine) { #ifdef TTY_PARANOIA_CHECK @@ -1791,7 +1791,6 @@ } #ifdef CONFIG_VT if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) { - extern int fg_console; extern struct tty_driver *console_driver; driver = console_driver; index = fg_console; @@ -2018,11 +2017,10 @@ return 0; #ifdef CONFIG_VT if (tty->driver->type == TTY_DRIVER_TYPE_CONSOLE) { - unsigned int currcons = tty->index; int rc; acquire_console_sem(); - rc = vc_resize(currcons, tmp_ws.ws_col, tmp_ws.ws_row); + rc = vc_resize(tty->driver_data, tmp_ws.ws_col, tmp_ws.ws_row); release_console_sem(); if (rc) return -ENXIO; @@ -2634,6 +2632,7 @@ tty->magic = TTY_MAGIC; tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); tty->pgrp = -1; + tty->overrun_time = jiffies; tty->flip.char_buf_ptr = tty->flip.char_buf; tty->flip.flag_buf_ptr = tty->flip.flag_buf; INIT_WORK(&tty->flip.work, flush_to_ldisc, tty); diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-generic/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-generic/pgtable.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-generic/pgtable.h Tue Aug 9 23:57:17 2005 @@ -16,7 +16,7 @@ #ifndef __HAVE_ARCH_SET_PTE_ATOMIC #define ptep_establish(__vma, __address, __ptep, __entry) \ do { \ - set_pte(__ptep, __entry); \ + set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ flush_tlb_page(__vma, __address); \ } while (0) #else /* __HAVE_ARCH_SET_PTE_ATOMIC */ @@ -37,7 +37,7 @@ */ #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ do { \ - set_pte(__ptep, __entry); \ + set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ flush_tlb_page(__vma, __address); \ } while (0) #endif @@ -53,20 +53,24 @@ #endif #ifndef
__HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_young(pte)) - return 0; - set_pte(ptep, pte_mkold(pte)); - return 1; -} +#define ptep_test_and_clear_young(__vma, __address, __ptep) \ +({ \ + pte_t __pte = *(__ptep); \ + int r = 1; \ + if (!pte_young(__pte)) \ + r = 0; \ + else \ + set_pte_at((__vma)->vm_mm, (__address), \ + (__ptep), pte_mkold(__pte)); \ + r; \ +}) #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH #define ptep_clear_flush_young(__vma, __address, __ptep) \ ({ \ - int __young = ptep_test_and_clear_young(__ptep); \ + int __young; \ + __young = ptep_test_and_clear_young(__vma, __address, __ptep); \ if (__young) \ flush_tlb_page(__vma, __address); \ __young; \ @@ -74,20 +78,24 @@ #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY -static inline int ptep_test_and_clear_dirty(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_dirty(pte)) - return 0; - set_pte(ptep, pte_mkclean(pte)); - return 1; -} +#define ptep_test_and_clear_dirty(__vma, __address, __ptep) \ +({ \ + pte_t __pte = *__ptep; \ + int r = 1; \ + if (!pte_dirty(__pte)) \ + r = 0; \ + else \ + set_pte_at((__vma)->vm_mm, (__address), (__ptep), \ + pte_mkclean(__pte)); \ + r; \ +}) #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH #define ptep_clear_flush_dirty(__vma, __address, __ptep) \ ({ \ - int __dirty = ptep_test_and_clear_dirty(__ptep); \ + int __dirty; \ + __dirty = ptep_test_and_clear_dirty(__vma, __address, __ptep); \ if (__dirty) \ flush_tlb_page(__vma, __address); \ __dirty; \ @@ -95,36 +103,29 @@ #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(pte_t *ptep) -{ - pte_t pte = *ptep; - pte_clear(ptep); - return pte; -} +#define ptep_get_and_clear(__mm, __address, __ptep) \ +({ \ + pte_t __pte = *(__ptep); \ + pte_clear((__mm), (__address), (__ptep)); \ + __pte; \ +}) #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH #define ptep_clear_flush(__vma, __address, __ptep) \ ({ \ - pte_t __pte = ptep_get_and_clear(__ptep); \ + pte_t __pte; \ + __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ flush_tlb_page(__vma, __address); \ __pte; \ }) #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(pte_t *ptep) +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = *ptep; - set_pte(ptep, pte_wrprotect(old_pte)); -} -#endif - -#ifndef __HAVE_ARCH_PTEP_MKDIRTY -static inline void ptep_mkdirty(pte_t *ptep) -{ - pte_t old_pte = *ptep; - set_pte(ptep, pte_mkdirty(old_pte)); + set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif @@ -144,4 +145,77 @@ #define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) #endif +#ifndef __HAVE_ARCH_LAZY_MMU_PROT_UPDATE +#define lazy_mmu_prot_update(pte) do { } while (0) +#endif + +/* + * When walking page tables, get the address of the next boundary, + * or the end address of the range if that comes earlier. Although no + * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. + */ + +#define pgd_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ + (__boundary - 1 < (end) - 1)? __boundary: (end); \ +}) + +#ifndef pud_addr_end +#define pud_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ + (__boundary - 1 < (end) - 1)? 
__boundary: (end); \ +}) +#endif + +#ifndef pmd_addr_end +#define pmd_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ + (__boundary - 1 < (end) - 1)? __boundary: (end); \ +}) +#endif + +#ifndef __ASSEMBLY__ +/* + * When walking page tables, we usually want to skip any p?d_none entries; + * and any p?d_bad entries - reporting the error before resetting to none. + * Do the tests inline, but report and clear the bad entry in mm/memory.c. + */ +void pgd_clear_bad(pgd_t *); +void pud_clear_bad(pud_t *); +void pmd_clear_bad(pmd_t *); + +static inline int pgd_none_or_clear_bad(pgd_t *pgd) +{ + if (pgd_none(*pgd)) + return 1; + if (unlikely(pgd_bad(*pgd))) { + pgd_clear_bad(pgd); + return 1; + } + return 0; +} + +static inline int pud_none_or_clear_bad(pud_t *pud) +{ + if (pud_none(*pud)) + return 1; + if (unlikely(pud_bad(*pud))) { + pud_clear_bad(pud); + return 1; + } + return 0; +} + +static inline int pmd_none_or_clear_bad(pmd_t *pmd) +{ + if (pmd_none(*pmd)) + return 1; + if (unlikely(pmd_bad(*pmd))) { + pmd_clear_bad(pmd); + return 1; + } + return 0; +} +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_GENERIC_PGTABLE_H */ diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h Tue Aug 9 23:57:17 2005 @@ -3,6 +3,8 @@ #include <asm/ldt.h> #include <asm/segment.h> + +#define CPU_16BIT_STACK_SIZE 1024 #ifndef __ASSEMBLY__ @@ -12,6 +14,8 @@ #include <asm/mmu.h> extern struct desc_struct cpu_gdt_table[NR_CPUS][GDT_ENTRIES]; + +DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); struct Xgt_desc_struct { unsigned short size; diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h Tue Aug 9 23:57:17 2005 @@ -11,7 +11,7 @@ #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag); + dma_addr_t *dma_handle, unsigned int __nocast flag); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/highmem.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/highmem.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/highmem.h Tue Aug 9 23:57:17 2005 @@ -32,8 +32,6 @@ extern pte_t *kmap_pte; extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; - -extern void kmap_init(void); /* * Right now we initialize only a single pte table. 
It can be extended diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/io.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/io.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/io.h Tue Aug 9 23:57:17 2005 @@ -49,6 +49,17 @@ #include <linux/vmalloc.h> #include <asm/fixmap.h> + +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +#define xlate_dev_mem_ptr(p) __va(p) + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p /** * virt_to_phys - map virtual addresses to physical diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h Tue Aug 9 23:57:17 2005 @@ -64,7 +64,7 @@ } #define deactivate_mm(tsk, mm) \ - asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) + asm("mov %0,%%fs ; mov %0,%%gs": :"r" (0)) #define activate_mm(prev, next) do { \ switch_mm((prev),(next),NULL); \ diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h Tue Aug 9 23:57:17 2005 @@ -2,7 +2,6 @@ #define _I386_PGALLOC_H #include <linux/config.h> -#include <asm/processor.h> #include <asm/fixmap.h> #include <linux/threads.h> #include <linux/mm.h> /* for struct page */ diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h Tue Aug 9 23:57:17 2005 @@ -16,6 +16,7 @@ #define set_pte_batched(pteptr, pteval) \ queue_l1_entry_update(pteptr, (pteval).pte_low) #define set_pte(pteptr, pteval) (*(pteptr) = pteval) +#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) #define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval)) @@ -28,14 +29,8 @@ * each domain will have separate page tables, with their own versions of * accessed & dirty state. 
*/ -static inline pte_t ptep_get_and_clear(pte_t *xp) -{ - pte_t pte = *xp; - if (pte.pte_low) - set_pte(xp, __pte_ma(0)); - return pte; -} +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) /* * We detect special mappings in one of two ways: diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h Tue Aug 9 23:57:17 2005 @@ -64,7 +64,7 @@ #define PGDIR_MASK (~(PGDIR_SIZE-1)) #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_PGD_NR 0 +#define FIRST_USER_ADDRESS 0 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) #define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) @@ -200,15 +200,15 @@ /* * Define this if things work differently on an i386 and an i486: * it will (on an i486) warn about kernel memory accesses that are - * done without a 'verify_area(VERIFY_WRITE,..)' - */ -#undef TEST_VERIFY_AREA + * done without a 'access_ok(VERIFY_WRITE,..)' + */ +#undef TEST_ACCESS_OK /* The boot page tables (all created as a single array) */ extern unsigned long pg0[]; #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0) +#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) #define pmd_none(x) (!pmd_val(x)) /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. @@ -252,7 +252,7 @@ # include <asm/pgtable-2level.h> #endif -static inline int ptep_test_and_clear_dirty(pte_t *ptep) +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; int ret = pte_dirty(pte); @@ -261,7 +261,7 @@ return ret; } -static inline int ptep_test_and_clear_young(pte_t *ptep) +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; int ret = pte_young(pte); @@ -270,18 +270,11 @@ return ret; } -static inline void ptep_set_wrprotect(pte_t *ptep) +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; if (pte_write(pte)) set_pte(ptep, pte_wrprotect(pte)); -} - -static inline void ptep_mkdirty(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_dirty(pte)) - xen_l1_entry_update(ptep, pte_mkdirty(pte).pte_low); } /* @@ -495,11 +488,14 @@ #define io_remap_pfn_range(vma,from,pfn,size,prot) \ direct_remap_area_pages(vma->vm_mm,from,pfn<<PAGE_SHIFT,size,prot,DOMID_IO) +#define MK_IOSPACE_PFN(space, pfn) (pfn) +#define GET_IOSPACE(pfn) 0 +#define GET_PFN(pfn) (pfn) + #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define __HAVE_ARCH_PTEP_SET_WRPROTECT -#define __HAVE_ARCH_PTEP_MKDIRTY #define __HAVE_ARCH_PTE_SAME #include <asm-generic/pgtable.h> diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/processor.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/processor.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/processor.h Tue Aug 9 23:57:17 2005 @@ -99,12 +99,12 @@ #endif extern int phys_proc_id[NR_CPUS]; +extern int cpu_core_id[NR_CPUS]; extern char ignore_fpu_irq; extern void identify_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int 
init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern void dodgy_tsc(void); #ifdef CONFIG_X86_HT extern void detect_ht(struct cpuinfo_x86 *c); @@ -138,7 +138,7 @@ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx * resulting in stale register contents being returned. */ -static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { __asm__("cpuid" : "=a" (*eax), @@ -146,6 +146,18 @@ "=c" (*ecx), "=d" (*edx) : "0" (op), "c"(0)); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, + int *edx) +{ + __asm__("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (op), "c" (count)); } /* @@ -501,6 +513,13 @@ regs->esp = new_esp; \ } while (0) +/* + * This special macro can be used to load a debugging register + */ +#define loaddebug(thread,register) \ + HYPERVISOR_set_debugreg((register), \ + ((thread)->debugreg[register])) + /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/segment.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/segment.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/segment.h Tue Aug 9 23:57:17 2005 @@ -38,7 +38,7 @@ * 24 - APM BIOS support * 25 - APM BIOS support * - * 26 - unused + * 26 - ESPFIX small SS * 27 - unused * 28 - unused * 29 - unused @@ -71,6 +71,9 @@ #define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) #define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) + #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/setup.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/setup.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/setup.h Tue Aug 9 23:57:17 2005 @@ -16,7 +16,7 @@ #define MAXMEM_PFN PFN_DOWN(MAXMEM) #define MAX_NONPAE_PFN (1 << 20) -#define PARAM_SIZE 2048 +#define PARAM_SIZE 4096 #define COMMAND_LINE_SIZE 256 #define OLD_CL_MAGIC_ADDR 0x90020 diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/system.h Tue Aug 9 23:57:17 2005 @@ -84,7 +84,7 @@ #define loadsegment(seg,value) \ asm volatile("\n" \ "1:\t" \ - "movl %0,%%" #seg "\n" \ + "mov %0,%%" #seg "\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3:\t" \ @@ -96,13 +96,13 @@ ".align 4\n\t" \ ".long 1b,3b\n" \ ".previous" \ - : :"m" (*(unsigned int *)&(value))) + : :"m" (value)) /* * Save a segment register away */ #define savesegment(seg, value) \ - asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value))) + asm volatile("mov %%" #seg ",%0":"=m" (value)) /* * Clear and set 'TS' bit respectively @@ -519,4 +519,6 @@ extern int es7000_plat; void cpu_idle_wait(void); +extern unsigned long arch_align_stack(unsigned long sp); + #endif diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/include/linux/gfp.h --- a/linux-2.6-xen-sparse/include/linux/gfp.h Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/include/linux/gfp.h Tue Aug 9 23:57:17 2005 @@ -26,26 +26,28 @@ * * __GFP_NORETRY: The VM implementation must not 
retry indefinitely. */ -#define __GFP_WAIT 0x10 /* Can wait and reschedule? */ -#define __GFP_HIGH 0x20 /* Should access emergency pools? */ -#define __GFP_IO 0x40 /* Can start physical IO? */ -#define __GFP_FS 0x80 /* Can call down to low-level FS? */ -#define __GFP_COLD 0x100 /* Cache-cold page required */ -#define __GFP_NOWARN 0x200 /* Suppress page allocation failure warning */ -#define __GFP_REPEAT 0x400 /* Retry the allocation. Might fail */ -#define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ -#define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ -#define __GFP_NO_GROW 0x2000 /* Slab internal usage */ -#define __GFP_COMP 0x4000 /* Add compound page metadata */ -#define __GFP_ZERO 0x8000 /* Return zeroed page on success */ +#define __GFP_WAIT 0x10u /* Can wait and reschedule? */ +#define __GFP_HIGH 0x20u /* Should access emergency pools? */ +#define __GFP_IO 0x40u /* Can start physical IO? */ +#define __GFP_FS 0x80u /* Can call down to low-level FS? */ +#define __GFP_COLD 0x100u /* Cache-cold page required */ +#define __GFP_NOWARN 0x200u /* Suppress page allocation failure warning */ +#define __GFP_REPEAT 0x400u /* Retry the allocation. Might fail */ +#define __GFP_NOFAIL 0x800u /* Retry for ever. Cannot fail */ +#define __GFP_NORETRY 0x1000u /* Do not retry. Might fail */ +#define __GFP_NO_GROW 0x2000u /* Slab internal usage */ +#define __GFP_COMP 0x4000u /* Add compound page metadata */ +#define __GFP_ZERO 0x8000u /* Return zeroed page on success */ +#define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */ -#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) /* if you forget to add the bitmask here kernel will crash, period */ #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ - __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP) + __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ + __GFP_NOMEMALLOC) #define GFP_ATOMIC (__GFP_HIGH) #define GFP_NOIO (__GFP_WAIT) @@ -86,7 +88,7 @@ extern struct page * FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *)); -static inline struct page *alloc_pages_node(int nid, unsigned int gfp_mask, +static inline struct page *alloc_pages_node(int nid, unsigned int __nocast gfp_mask, unsigned int order) { if (unlikely(order >= MAX_ORDER)) @@ -97,17 +99,17 @@ } #ifdef CONFIG_NUMA -extern struct page *alloc_pages_current(unsigned gfp_mask, unsigned order); +extern struct page *alloc_pages_current(unsigned int __nocast gfp_mask, unsigned order); static inline struct page * -alloc_pages(unsigned int gfp_mask, unsigned int order) +alloc_pages(unsigned int __nocast gfp_mask, unsigned int order) { if (unlikely(order >= MAX_ORDER)) return NULL; return alloc_pages_current(gfp_mask, order); } -extern struct page *alloc_page_vma(unsigned gfp_mask, +extern struct page *alloc_page_vma(unsigned __nocast gfp_mask, struct vm_area_struct *vma, unsigned long addr); #else #define alloc_pages(gfp_mask, order) \ @@ -116,8 +118,8 @@ #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); -extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +extern unsigned long FASTCALL(__get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)); +extern unsigned long FASTCALL(get_zeroed_page(unsigned int __nocast gfp_mask)); #define 
__get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/mm/highmem.c --- a/linux-2.6-xen-sparse/mm/highmem.c Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/mm/highmem.c Tue Aug 9 23:57:17 2005 @@ -30,9 +30,9 @@ static mempool_t *page_pool, *isa_page_pool; -static void *page_pool_alloc(int gfp_mask, void *data) -{ - int gfp = gfp_mask | (int) (long) data; +static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data) +{ + unsigned int gfp = gfp_mask | (unsigned int) (long) data; return alloc_page(gfp); } @@ -90,7 +90,8 @@ * So no dangers, even with speculative execution. */ page = pte_page(pkmap_page_table[i]); - pte_clear(&pkmap_page_table[i]); + pte_clear(&init_mm, (unsigned long)page_address(page), + &pkmap_page_table[i]); set_page_address(page, NULL); } @@ -138,7 +139,8 @@ } } vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); + set_pte_at(&init_mm, vaddr, + &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); pkmap_count[last_pkmap_nr] = 1; set_page_address(page, (void *)vaddr); @@ -332,6 +334,7 @@ continue; mempool_free(bvec->bv_page, pool); + dec_page_state(nr_bounce); } bio_endio(bio_orig, bio_orig->bi_size, err); @@ -412,6 +415,7 @@ to->bv_page = mempool_alloc(pool, q->bounce_gfp); to->bv_len = from->bv_len; to->bv_offset = from->bv_offset; + inc_page_state(nr_bounce); if (rw == WRITE) { char *vto, *vfrom; diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/mm/memory.c --- a/linux-2.6-xen-sparse/mm/memory.c Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/mm/memory.c Tue Aug 9 23:57:17 2005 @@ -46,7 +46,6 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/rmap.h> -#include <linux/acct.h> #include <linux/module.h> #include <linux/init.h> @@ -84,116 +83,205 @@ EXPORT_SYMBOL(vmalloc_earlyreserve); /* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + +/* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
*/ -static inline void clear_pmd_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long start, unsigned long end) -{ - struct page *page; - - if (pmd_none(*pmd)) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +{ + struct page *page = pmd_page(*pmd); + pmd_clear(pmd); + pte_free_tlb(tlb, page); + dec_page_state(nr_page_table_pages); + tlb->mm->nr_ptes--; +} + +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pmd_t *pmd; + unsigned long next; + unsigned long start; + + start = addr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + free_pte_range(tlb, pmd); + } while (pmd++, addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) return; - if (unlikely(pmd_bad(*pmd))) { - pmd_ERROR(*pmd); - pmd_clear(pmd); + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) return; - } - if (!((start | end) & ~PMD_MASK)) { - /* Only clear full, aligned ranges */ - page = pmd_page(*pmd); - pmd_clear(pmd); - dec_page_state(nr_page_table_pages); - tlb->mm->nr_ptes--; - pte_free_tlb(tlb, page); - } -} - -static inline void clear_pud_range(struct mmu_gather *tlb, pud_t *pud, unsigned long start, unsigned long end) -{ - unsigned long addr = start, next; - pmd_t *pmd, *__pmd; - - if (pud_none(*pud)) + + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd); +} + +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pud_t *pud; + unsigned long next; + unsigned long start; + + start = addr; + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + free_pmd_range(tlb, pud, addr, next, floor, ceiling); + } while (pud++, addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) return; - if (unlikely(pud_bad(*pud))) { - pud_ERROR(*pud); - pud_clear(pud); + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) return; - } - - pmd = __pmd = pmd_offset(pud, start); + + pud = pud_offset(pgd, start); + pgd_clear(pgd); + pud_free_tlb(tlb, pud); +} + +/* + * This function frees user-level page tables of a process. + * + * Must be called with pagetable lock held. + */ +void free_pgd_range(struct mmu_gather **tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pgd_t *pgd; + unsigned long next; + unsigned long start; + + /* + * The next few lines have given us lots of grief... + * + * Why are we testing PMD* at this top level? Because often + * there will be no work to do at all, and we'd prefer not to + * go all the way down to the bottom just to discover that. + * + * Why all these "- 1"s? Because 0 represents both the bottom + * of the address space and the top of it (using -1 for the + * top wouldn't help much: the masks would do the wrong thing). + * The rule is that addr 0 and floor 0 refer to the bottom of + * the address space, but end 0 and ceiling 0 refer to the top + * Comparisons need to use "end - 1" and "ceiling - 1" (though + * that end 0 case should be mythical). + * + * Wherever addr is brought up or ceiling brought down, we must + * be careful to reject "the opposite 0" before it confuses the + * subsequent tests. 
But what about where end is brought down + * by PMD_SIZE below? no, end can't go down to 0 there. + * + * Whereas we round start (addr) and ceiling down, by different + * masks at different levels, in order to test whether a table + * now has no other vmas using it, so can be freed, we don't + * bother to round floor or end up - the tests don't need that. + */ + + addr &= PMD_MASK; + if (addr < floor) { + addr += PMD_SIZE; + if (!addr) + return; + } + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= PMD_SIZE; + if (addr > end - 1) + return; + + start = addr; + pgd = pgd_offset((*tlb)->mm, addr); do { - next = (addr + PMD_SIZE) & PMD_MASK; - if (next > end || next <= addr) - next = end; - - clear_pmd_range(tlb, pmd, addr, next); - pmd++; - addr = next; - } while (addr && (addr < end)); - - if (!((start | end) & ~PUD_MASK)) { - /* Only clear full, aligned ranges */ - pud_clear(pud); - pmd_free_tlb(tlb, __pmd); - } -} - - -static inline void clear_pgd_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long start, unsigned long end) -{ - unsigned long addr = start, next; - pud_t *pud, *__pud; - - if (pgd_none(*pgd)) - return; - if (unlikely(pgd_bad(*pgd))) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - - pud = __pud = pud_offset(pgd, start); - do { - next = (addr + PUD_SIZE) & PUD_MASK; - if (next > end || next <= addr) - next = end; - - clear_pud_range(tlb, pud, addr, next); - pud++; - addr = next; - } while (addr && (addr < end)); - - if (!((start | end) & ~PGDIR_MASK)) { - /* Only clear full, aligned ranges */ - pgd_clear(pgd); - pud_free_tlb(tlb, __pud); - } -} - -/* - * This function clears user-level page tables of a process. - * - * Must be called with pagetable lock held. - */ -void clear_page_range(struct mmu_gather *tlb, unsigned long start, unsigned long end) -{ - unsigned long addr = start, next; - pgd_t * pgd = pgd_offset(tlb->mm, start); - unsigned long i; - - for (i = pgd_index(start); i <= pgd_index(end-1); i++) { - next = (addr + PGDIR_SIZE) & PGDIR_MASK; - if (next > end || next <= addr) - next = end; - - clear_pgd_range(tlb, pgd, addr, next); - pgd++; - addr = next; - } -} - -pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + } while (pgd++, addr = next, addr != end); + + if (!tlb_is_full_mm(*tlb)) + flush_tlb_pgtables((*tlb)->mm, start, end); +} + +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, + unsigned long floor, unsigned long ceiling) +{ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + unsigned long addr = vma->vm_start; + + if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } else { + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE + && !is_hugepage_only_range(vma->vm_mm, next->vm_start, + HPAGE_SIZE)) { + vma = next; + next = vma->vm_next; + } + free_pgd_range(tlb, addr, vma->vm_end, + floor, next? 
next->vm_start: ceiling); + } + vma = next; + } +} + +pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { if (!pmd_present(*pmd)) { struct page *new; @@ -254,20 +342,7 @@ */ static inline void -copy_swap_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t pte) -{ - if (pte_file(pte)) - return; - swap_duplicate(pte_to_swp_entry(pte)); - if (list_empty(&dst_mm->mmlist)) { - spin_lock(&mmlist_lock); - list_add(&dst_mm->mmlist, &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } -} - -static inline void -copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, +copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, unsigned long addr) { @@ -275,12 +350,21 @@ struct page *page; unsigned long pfn; - /* pte contains position in swap, so copy. */ - if (!pte_present(pte)) { - copy_swap_pte(dst_mm, src_mm, pte); - set_pte(dst_pte, pte); + /* pte contains position in swap or file, so copy. */ + if (unlikely(!pte_present(pte))) { + if (!pte_file(pte)) { + swap_duplicate(pte_to_swp_entry(pte)); + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + list_add(&dst_mm->mmlist, &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + } + set_pte_at(dst_mm, addr, dst_pte, pte); return; } + pfn = pte_pfn(pte); /* the pte points outside of valid memory, the * mapping is assumed to be good, meaningful @@ -292,7 +376,7 @@ page = pfn_to_page(pfn); if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); + set_pte_at(dst_mm, addr, dst_pte, pte); return; } @@ -301,7 +385,7 @@ * in the parent and the child */ if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { - ptep_set_wrprotect(src_pte); + ptep_set_wrprotect(src_mm, addr, src_pte); pte = *src_pte; } @@ -313,172 +397,137 @@ pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - dst_mm->rss++; + inc_mm_counter(dst_mm, rss); if (PageAnon(page)) - dst_mm->anon_rss++; - set_pte(dst_pte, pte); + inc_mm_counter(dst_mm, anon_rss); + set_pte_at(dst_mm, addr, dst_pte, pte); page_dup_rmap(page); } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; - pte_t *s, *d; unsigned long vm_flags = vma->vm_flags; - - d = dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); + int progress; + +again: + dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); if (!dst_pte) return -ENOMEM; - + src_pte = pte_offset_map_nested(src_pmd, addr); + + progress = 0; spin_lock(&src_mm->page_table_lock); - s = src_pte = pte_offset_map_nested(src_pmd, addr); - for (; addr < end; addr += PAGE_SIZE, s++, d++) { - if (pte_none(*s)) + do { + /* + * We are holding two locks at this point - either of them + * could generate latencies in another task on another CPU. 
+ */ + if (progress >= 32 && (need_resched() || + need_lockbreak(&src_mm->page_table_lock) || + need_lockbreak(&dst_mm->page_table_lock))) + break; + if (pte_none(*src_pte)) { + progress++; continue; - copy_one_pte(dst_mm, src_mm, d, s, vm_flags, addr); - } - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); + } + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); spin_unlock(&src_mm->page_table_lock); + + pte_unmap_nested(src_pte - 1); + pte_unmap(dst_pte - 1); cond_resched_lock(&dst_mm->page_table_lock); + if (addr != end) + goto again; return 0; } -static int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; - int err = 0; unsigned long next; - src_pmd = pmd_offset(src_pud, addr); dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; - - for (; addr < end; addr = next, src_pmd++, dst_pmd++) { - next = (addr + PMD_SIZE) & PMD_MASK; - if (next > end || next <= addr) - next = end; - if (pmd_none(*src_pmd)) + src_pmd = pmd_offset(src_pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(src_pmd)) continue; - if (pmd_bad(*src_pmd)) { - pmd_ERROR(*src_pmd); - pmd_clear(src_pmd); - continue; - } - err = copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next); - if (err) - break; - } - return err; -} - -static int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; +} + +static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; - int err = 0; unsigned long next; - src_pud = pud_offset(src_pgd, addr); dst_pud = pud_alloc(dst_mm, dst_pgd, addr); if (!dst_pud) return -ENOMEM; - - for (; addr < end; addr = next, src_pud++, dst_pud++) { - next = (addr + PUD_SIZE) & PUD_MASK; - if (next > end || next <= addr) - next = end; - if (pud_none(*src_pud)) + src_pud = pud_offset(src_pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(src_pud)) continue; - if (pud_bad(*src_pud)) { - pud_ERROR(*src_pud); - pud_clear(src_pud); + if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; +} + +int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + struct vm_area_struct *vma) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long next; + unsigned long addr = vma->vm_start; + unsigned long end = vma->vm_end; + + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, vma); + + dst_pgd = pgd_offset(dst_mm, addr); + src_pgd = pgd_offset(src_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(src_pgd)) continue; - } - err = copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next); - if (err) - break; - } - return err; -} - -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pgd_t *src_pgd, *dst_pgd; - unsigned long addr, start, end, next; - int err = 0; - - if 
(is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst, src, vma); - - start = vma->vm_start; - src_pgd = pgd_offset(src, start); - dst_pgd = pgd_offset(dst, start); - - end = vma->vm_end; - addr = start; - while (addr && (addr < end-1)) { - next = (addr + PGDIR_SIZE) & PGDIR_MASK; - if (next > end || next <= addr) - next = end; - if (pgd_none(*src_pgd)) - goto next_pgd; - if (pgd_bad(*src_pgd)) { - pgd_ERROR(*src_pgd); - pgd_clear(src_pgd); - goto next_pgd; - } - err = copy_pud_range(dst, src, dst_pgd, src_pgd, - vma, addr, next); - if (err) - break; - -next_pgd: - src_pgd++; - dst_pgd++; - addr = next; - } - - return err; -} - -static void zap_pte_range(struct mmu_gather *tlb, - pmd_t *pmd, unsigned long address, - unsigned long size, struct zap_details *details) -{ - unsigned long offset; - pte_t *ptep; - - if (pmd_none(*pmd)) - return; - if (unlikely(pmd_bad(*pmd))) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - ptep = pte_offset_map(pmd, address); - offset = address & ~PMD_MASK; - if (offset + size > PMD_SIZE) - size = PMD_SIZE - offset; - size &= PAGE_MASK; - if (details && !details->check_mapping && !details->nonlinear_vma) - details = NULL; - for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { - pte_t pte = *ptep; - if (pte_none(pte)) + if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next)) + return -ENOMEM; + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + return 0; +} + +static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pte_t *pte; + + pte = pte_offset_map(pmd, addr); + do { + pte_t ptent = *pte; + if (pte_none(ptent)) continue; - if (pte_present(pte)) { + if (pte_present(ptent)) { struct page *page = NULL; - unsigned long pfn = pte_pfn(pte); + unsigned long pfn = pte_pfn(ptent); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); if (PageReserved(page)) @@ -502,19 +551,20 @@ page->index > details->last_index)) continue; } - pte = ptep_get_and_clear(ptep); - tlb_remove_tlb_entry(tlb, ptep, address+offset); + ptent = ptep_get_and_clear(tlb->mm, addr, pte); + tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, - address+offset) != page->index) - set_pte(ptep, pgoff_to_pte(page->index)); - if (pte_dirty(pte)) + addr) != page->index) + set_pte_at(tlb->mm, addr, pte, + pgoff_to_pte(page->index)); + if (pte_dirty(ptent)) set_page_dirty(page); if (PageAnon(page)) - tlb->mm->anon_rss--; - else if (pte_young(pte)) + dec_mm_counter(tlb->mm, anon_rss); + else if (pte_young(ptent)) mark_page_accessed(page); tlb->freed++; page_remove_rmap(page); @@ -527,78 +577,64 @@ */ if (unlikely(details)) continue; - if (!pte_file(pte)) - free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); - } - pte_unmap(ptep-1); -} - -static void zap_pmd_range(struct mmu_gather *tlb, - pud_t *pud, unsigned long address, - unsigned long size, struct zap_details *details) -{ - pmd_t * pmd; - unsigned long end; - - if (pud_none(*pud)) - return; - if (unlikely(pud_bad(*pud))) { - pud_ERROR(*pud); - pud_clear(pud); - return; - } - pmd = pmd_offset(pud, address); - end = address + size; - if (end > ((address + PUD_SIZE) & PUD_MASK)) - end = ((address + PUD_SIZE) & PUD_MASK); + if (!pte_file(ptent)) + free_swap_and_cache(pte_to_swp_entry(ptent)); + pte_clear(tlb->mm, addr, pte); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); +} + +static inline 
void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); do { - zap_pte_range(tlb, pmd, address, end - address, details); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); -} - -static void zap_pud_range(struct mmu_gather *tlb, - pgd_t * pgd, unsigned long address, - unsigned long end, struct zap_details *details) -{ - pud_t * pud; - - if (pgd_none(*pgd)) - return; - if (unlikely(pgd_bad(*pgd))) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - pud = pud_offset(pgd, address); + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + zap_pte_range(tlb, pmd, addr, next, details); + } while (pmd++, addr = next, addr != end); +} + +static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(pgd, addr); do { - zap_pmd_range(tlb, pud, address, end - address, details); - address = (address + PUD_SIZE) & PUD_MASK; - pud++; - } while (address && (address < end)); -} - -static void unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, unsigned long address, - unsigned long end, struct zap_details *details) -{ + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + zap_pmd_range(tlb, pud, addr, next, details); + } while (pud++, addr = next, addr != end); +} + +static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pgd_t *pgd; unsigned long next; - pgd_t *pgd; - int i; - - BUG_ON(address >= end); - pgd = pgd_offset(vma->vm_mm, address); + + if (details && !details->check_mapping && !details->nonlinear_vma) + details = NULL; + + BUG_ON(addr >= end); tlb_start_vma(tlb, vma); - for (i = pgd_index(address); i <= pgd_index(end-1); i++) { - next = (address + PGDIR_SIZE) & PGDIR_MASK; - if (next <= address || next > end) - next = end; - zap_pud_range(tlb, pgd, address, next, details); - address = next; - pgd++; - } + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + zap_pud_range(tlb, pgd, addr, next, details); + } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } @@ -619,7 +655,7 @@ * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here * @details: details of nonlinear truncation or shared cache invalidation * - * Returns the number of vma's which were covered by the unmapping. + * Returns the end address of the unmapping (restart addr if interrupted). * * Unmap all pages in the vma list. Called under page_table_lock. * @@ -636,7 +672,7 @@ * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, +unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) @@ -644,12 +680,11 @@ unsigned long zap_bytes = ZAP_BLOCK_SIZE; unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; - int ret = 0; + unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? 
details->i_mmap_lock: NULL; int fullmm = tlb_is_full_mm(*tlbp); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { - unsigned long start; unsigned long end; start = max(vma->vm_start, start_addr); @@ -662,7 +697,6 @@ if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; - ret++; while (start != end) { unsigned long block; @@ -693,7 +727,6 @@ if (i_mmap_lock) { /* must reset count of rss freed */ *tlbp = tlb_gather_mmu(mm, fullmm); - details->break_addr = start; goto out; } spin_unlock(&mm->page_table_lock); @@ -707,7 +740,7 @@ } } out: - return ret; + return start; /* which is now the end (or restart) address */ } /** @@ -717,7 +750,7 @@ * @size: number of bytes to zap * @details: details of nonlinear truncation or shared cache invalidation */ -void zap_page_range(struct vm_area_struct *vma, unsigned long address, +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; @@ -727,16 +760,16 @@ if (is_vm_hugetlb_page(vma)) { zap_hugepage_range(vma, address, size); - return; + return end; } lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); - unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); + end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); tlb_finish_mmu(tlb, address, end); - acct_update_integrals(); spin_unlock(&mm->page_table_lock); + return end; } /* @@ -987,111 +1020,78 @@ EXPORT_SYMBOL(get_user_pages); -static void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; +static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t prot) +{ + pte_t *pte; + + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + return -ENOMEM; do { - pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); BUG_ON(!pte_none(*pte)); - set_pte(pte, zero_pte); - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, - unsigned long address, unsigned long size, pgprot_t prot) -{ - unsigned long base, end; - - base = address & PUD_MASK; - address &= ~PUD_MASK; - end = address + size; - if (end > PUD_SIZE) - end = PUD_SIZE; + set_pte_at(mm, addr, pte, zero_pte); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); + return 0; +} + +static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); - if (!pte) + next = pmd_addr_end(addr, end); + if (zeromap_pte_range(mm, pmd, addr, next, prot)) return -ENOMEM; - zeromap_pte_range(pte, base + address, end - address, prot); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); + } while (pmd++, addr = next, addr != end); return 0; } -static inline int zeromap_pud_range(struct mm_struct *mm, pud_t * pud, - unsigned long address, - unsigned long size, pgprot_t prot) -{ - unsigned long base, end; - int error = 0; - - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > 
PGDIR_SIZE) - end = PGDIR_SIZE; +static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; do { - pmd_t * pmd = pmd_alloc(mm, pud, base + address); - error = -ENOMEM; - if (!pmd) + next = pud_addr_end(addr, end); + if (zeromap_pmd_range(mm, pud, addr, next, prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +int zeromap_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgprot_t prot) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + struct mm_struct *mm = vma->vm_mm; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + spin_lock(&mm->page_table_lock); + do { + next = pgd_addr_end(addr, end); + err = zeromap_pud_range(mm, pgd, addr, next, prot); + if (err) break; - error = zeromap_pmd_range(mm, pmd, base + address, - end - address, prot); - if (error) - break; - address = (address + PUD_SIZE) & PUD_MASK; - pud++; - } while (address && (address < end)); - return 0; -} - -int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, pgprot_t prot) -{ - int i; - int error = 0; - pgd_t * pgd; - unsigned long beg = address; - unsigned long end = address + size; - unsigned long next; - struct mm_struct *mm = vma->vm_mm; - - pgd = pgd_offset(mm, address); - flush_cache_range(vma, beg, end); - BUG_ON(address >= end); - BUG_ON(end > vma->vm_end); - - spin_lock(&mm->page_table_lock); - for (i = pgd_index(address); i <= pgd_index(end-1); i++) { - pud_t *pud = pud_alloc(mm, pgd, address); - error = -ENOMEM; - if (!pud) - break; - next = (address + PGDIR_SIZE) & PGDIR_MASK; - if (next <= beg || next > end) - next = end; - error = zeromap_pud_range(mm, pud, address, - next - address, prot); - if (error) - break; - address = next; - pgd++; - } - /* - * Why flush? zeromap_pte_range has a BUG_ON for !pte_none() - */ - flush_tlb_range(vma, beg, end); + } while (pgd++, addr = next, addr != end); spin_unlock(&mm->page_table_lock); - return error; + return err; } /* @@ -1099,95 +1099,74 @@ * mappings are removed. 
any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ -static inline void -remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long pfn, pgprot_t prot) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pte_t *pte; + + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) + return -ENOMEM; do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte(pte, pfn_pte(pfn, prot)); - address += PAGE_SIZE; + set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; - pte++; - } while (address && (address < end)); -} - -static inline int -remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, unsigned long pfn, pgprot_t prot) -{ - unsigned long base, end; - - base = address & PUD_MASK; - address &= ~PUD_MASK; - end = address + size; - if (end > PUD_SIZE) - end = PUD_SIZE; - pfn -= (address >> PAGE_SHIFT); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap(pte - 1); + return 0; +} + +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); - if (!pte) + next = pmd_addr_end(addr, end); + if (remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) return -ENOMEM; - remap_pte_range(pte, base + address, end - address, - (address >> PAGE_SHIFT) + pfn, prot); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); + } while (pmd++, addr = next, addr != end); return 0; } -static inline int remap_pud_range(struct mm_struct *mm, pud_t * pud, - unsigned long address, unsigned long size, - unsigned long pfn, pgprot_t prot) -{ - unsigned long base, end; - int error; - - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - pfn -= address >> PAGE_SHIFT; +static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + + pfn -= addr >> PAGE_SHIFT; + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; do { - pmd_t *pmd = pmd_alloc(mm, pud, base+address); - error = -ENOMEM; - if (!pmd) - break; - error = remap_pmd_range(mm, pmd, base + address, end - address, - (address >> PAGE_SHIFT) + pfn, prot); - if (error) - break; - address = (address + PUD_SIZE) & PUD_MASK; - pud++; - } while (address && (address < end)); - return error; + next = pud_addr_end(addr, end); + if (remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; } /* Note: this is only safe if the mm semaphore is held when called. 
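The pfn arithmetic in remap_pmd_range() and remap_pud_range() above deserves a second look: each level first subtracts addr >> PAGE_SHIFT from pfn, establishing the invariant that pfn + (a >> PAGE_SHIFT) names the frame mapping any address a in the range, no matter where a nested sub-walk begins. A tiny self-check with made-up numbers (4 KB pages assumed):

    #include <assert.h>

    #define PAGE_SHIFT 12                   /* assumed 4 KB pages */

    int main(void)
    {
            unsigned long addr = 0x400000;  /* hypothetical mapping start */
            unsigned long pfn  = 0x800;     /* frame that should map addr */

            pfn -= addr >> PAGE_SHIFT;      /* pre-bias, as the walkers do */

            /* A nested walk starting 5 pages in recovers frame 0x805. */
            unsigned long sub = addr + (5UL << PAGE_SHIFT);
            assert(pfn + (sub >> PAGE_SHIFT) == 0x805);

            return 0;
    }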
*/ -int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { - int error = 0; pgd_t *pgd; - unsigned long beg = from; - unsigned long end = from + size; unsigned long next; + unsigned long end = addr + size; struct mm_struct *mm = vma->vm_mm; - int i; - - pfn -= from >> PAGE_SHIFT; - pgd = pgd_offset(mm, from); - flush_cache_range(vma, beg, end); - BUG_ON(from >= end); + int err; /* * Physically remapped pages are special. Tell the @@ -1199,31 +1178,21 @@ */ vma->vm_flags |= VM_IO | VM_RESERVED; + BUG_ON(addr >= end); + pfn -= addr >> PAGE_SHIFT; + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); spin_lock(&mm->page_table_lock); - for (i = pgd_index(beg); i <= pgd_index(end-1); i++) { - pud_t *pud = pud_alloc(mm, pgd, from); - error = -ENOMEM; - if (!pud) + do { + next = pgd_addr_end(addr, end); + err = remap_pud_range(mm, pgd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) break; - next = (from + PGDIR_SIZE) & PGDIR_MASK; - if (next > end || next <= from) - next = end; - error = remap_pud_range(mm, pud, from, end - from, - pfn + (from >> PAGE_SHIFT), prot); - if (error) - break; - from = next; - pgd++; - } - /* - * Why flush? remap_pte_range has a BUG_ON for !pte_none() - */ - flush_tlb_range(vma, beg, end); + } while (pgd++, addr = next, addr != end); spin_unlock(&mm->page_table_lock); - - return error; -} - + return err; +} EXPORT_SYMBOL(remap_pfn_range); /* @@ -1247,11 +1216,11 @@ { pte_t entry; - flush_cache_page(vma, address); entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), vma); ptep_establish(vma, address, page_table, entry); update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); } /* @@ -1299,11 +1268,12 @@ int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { - flush_cache_page(vma, address); + flush_cache_page(vma, address, pfn); entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), vma); ptep_set_access_flags(vma, address, page_table, entry, 1); update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); return VM_FAULT_MINOR; @@ -1337,13 +1307,12 @@ page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, pte))) { if (PageAnon(old_page)) - mm->anon_rss--; - if (PageReserved(old_page)) { - ++mm->rss; - acct_update_integrals(); - update_mem_hiwater(); - } else + dec_mm_counter(mm, anon_rss); + if (PageReserved(old_page)) + inc_mm_counter(mm, rss); + else page_remove_rmap(old_page); + flush_cache_page(vma, address, pfn); break_cow(vma, new_page, address, page_table); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); @@ -1387,7 +1356,7 @@ * i_mmap_lock. * * In order to make forward progress despite repeatedly restarting some - * large vma, note the break_addr set by unmap_vmas when it breaks out: + * large vma, note the restart_addr from unmap_vmas when it breaks out: * and restart from that address when we reach that vma again. It might * have been split or merged, shrunk or extended, but never shifted: so * restart_addr remains valid so long as it remains in the vma's range. 
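The comment above states the new contract plainly: unmap_vmas() hands back the address it stopped at, zap_page_range() propagates it, and the truncation path resumes from there on its next pass. A stand-alone sketch of that resume loop, where zap_range() is a stand-in that retires a fixed chunk per call rather than the real function:

    #include <stdio.h>

    #define CHUNK (16UL << 12)              /* pretend 16 pages per pass */

    /* Stand-in for zap_page_range(): does partial work, then returns
     * where it stopped -- the "restart" address. */
    static unsigned long zap_range(unsigned long start, unsigned long end)
    {
            unsigned long stop = start + CHUNK;
            return stop < end ? stop : end;
    }

    int main(void)
    {
            unsigned long start = 0x100000, end = 0x180000;

            for (;;) {
                    unsigned long restart = zap_range(start, end);

                    printf("zapped [%#lx, %#lx)\n", start, restart);
                    if (restart >= end)
                            break;          /* this vma is now complete */
                    start = restart;        /* resume after the lock break */
            }
            return 0;
    }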
@@ -1425,8 +1394,8 @@ } } - details->break_addr = end_addr; - zap_page_range(vma, start_addr, end_addr - start_addr, details); + restart_addr = zap_page_range(vma, start_addr, + end_addr - start_addr, details); /* * We cannot rely on the break test in unmap_vmas: @@ -1437,14 +1406,14 @@ need_break = need_resched() || need_lockbreak(details->i_mmap_lock); - if (details->break_addr >= end_addr) { + if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ vma->vm_truncate_count = details->truncate_count; if (!need_break) return 0; } else { /* Note restart_addr in vma's truncate_count field */ - vma->vm_truncate_count = details->break_addr; + vma->vm_truncate_count = restart_addr; if (!need_break) goto again; } @@ -1732,12 +1701,13 @@ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (unlikely(!pte_same(*page_table, orig_pte))) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); ret = VM_FAULT_MINOR; - goto out; + goto out_nomap; + } + + if (unlikely(!PageUptodate(page))) { + ret = VM_FAULT_SIGBUS; + goto out_nomap; } /* The page isn't present yet, go ahead with the fault. */ @@ -1746,10 +1716,7 @@ if (vm_swap_full()) remove_exclusive_swap_page(page); - mm->rss++; - acct_update_integrals(); - update_mem_hiwater(); - + inc_mm_counter(mm, rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -1758,7 +1725,7 @@ unlock_page(page); flush_icache_page(vma, page); - set_pte(page_table, pte); + set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); if (write_access) { @@ -1770,10 +1737,17 @@ /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + lazy_mmu_prot_update(pte); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: return ret; +out_nomap: + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + goto out; } /* @@ -1813,9 +1787,7 @@ spin_unlock(&mm->page_table_lock); goto out; } - mm->rss++; - acct_update_integrals(); - update_mem_hiwater(); + inc_mm_counter(mm, rss); entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)), vma); @@ -1824,11 +1796,12 @@ page_add_anon_rmap(page, vma, addr); } - ptep_establish_new(vma, addr, page_table, entry); + set_pte_at(mm, addr, page_table, entry); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); + lazy_mmu_prot_update(entry); spin_unlock(&mm->page_table_lock); out: return VM_FAULT_MINOR; @@ -1931,15 +1904,13 @@ /* Only go through if we didn't race with anybody else... 
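The do_swap_page() hunk that follows is also a single-exit cleanup: the pte-changed race and the new !PageUptodate check share one out_nomap label instead of duplicating four teardown calls inline. A compilable miniature of the shape, with the kernel calls reduced to comments:

    /* VM_FAULT_* stand-ins for the sketch */
    enum { FAULT_MINOR = 0, FAULT_SIGBUS = 1 };

    static int swap_fault_demo(int pte_changed, int uptodate)
    {
            int ret = FAULT_MINOR;

            if (pte_changed)
                    goto out_nomap;         /* raced: back out quietly */
            if (!uptodate) {
                    ret = FAULT_SIGBUS;     /* read failed under us */
                    goto out_nomap;
            }
            /* ... install the pte, add rmap, etc. ... */
            return ret;

    out_nomap:
            /* pte_unmap(); spin_unlock(); unlock_page(); page_cache_release(); */
            return ret;
    }

    int main(void) { return swap_fault_demo(0, 1); }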
*/ if (pte_none(*page_table)) { if (!PageReserved(new_page)) - ++mm->rss; - acct_update_integrals(); - update_mem_hiwater(); + inc_mm_counter(mm, rss); flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_establish_new(vma, address, page_table, entry); + set_pte_at(mm, address, page_table, entry); if (anon) { lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); @@ -1956,6 +1927,7 @@ /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); spin_unlock(&mm->page_table_lock); out: return ret; @@ -1983,7 +1955,7 @@ */ if (!vma->vm_ops || !vma->vm_ops->populate || (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(pte); + pte_clear(mm, address, pte); return do_no_page(mm, vma, address, write_access, pte, pmd); } @@ -2050,6 +2022,7 @@ entry = pte_mkyoung(entry); ptep_set_access_flags(vma, address, pte, entry, write_access); update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); pte_unmap(pte); spin_unlock(&mm->page_table_lock); return VM_FAULT_MINOR; @@ -2099,15 +2072,12 @@ return VM_FAULT_OOM; } -#ifndef __ARCH_HAS_4LEVEL_HACK +#ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. * * We've already handled the fast-path in-line, and we own the * page table lock. - * - * On a two-level or three-level page table, this ends up actually being - * entirely optimized away. */ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { @@ -2131,15 +2101,14 @@ out: return pud_offset(pgd, address); } - +#endif /* __PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. * * We've already handled the fast-path in-line, and we own the * page table lock. - * - * On a two-level page table, this ends up actually being entirely - * optimized away. */ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { @@ -2155,38 +2124,24 @@ * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ +#ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) { pmd_free(new); goto out; } pud_populate(mm, pud, new); - out: - return pmd_offset(pud, address); -} #else -pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - pmd_t *new; - - spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. 
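Across these hunks, set_pte(ptep, pte) and ptep_establish_new() give way to set_pte_at(mm, addr, ptep, pte): the mm and virtual address now travel with the store, which is what lets batching or paravirtualized ports (the Xen tree being the obvious customer) intercept pte writes. On architectures that need neither, one would expect it to collapse to a plain set_pte(); a sketch of that assumed fallback, not a quote of any particular header:

    /* Assumed shape of a trivial fallback; real arches define their own. */
    #ifndef set_pte_at
    #define set_pte_at(mm, addr, ptep, pteval)              \
            do {                                            \
                    (void)(mm);     /* unused here */       \
                    (void)(addr);   /* unused here */       \
                    set_pte(ptep, pteval);                  \
            } while (0)
    #endif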
- */ if (pgd_present(*pud)) { pmd_free(new); goto out; } pgd_populate(mm, pud, new); -out: +#endif /* __ARCH_HAS_4LEVEL_HACK */ + + out: return pmd_offset(pud, address); } -#endif +#endif /* __PAGETABLE_PMD_FOLDED */ int make_pages_present(unsigned long addr, unsigned long end) { @@ -2253,13 +2208,13 @@ * update_mem_hiwater * - update per process rss and vm high water data */ -void update_mem_hiwater(void) -{ - struct task_struct *tsk = current; - +void update_mem_hiwater(struct task_struct *tsk) +{ if (tsk->mm) { - if (tsk->mm->hiwater_rss < tsk->mm->rss) - tsk->mm->hiwater_rss = tsk->mm->rss; + unsigned long rss = get_mm_counter(tsk->mm, rss); + + if (tsk->mm->hiwater_rss < rss) + tsk->mm->hiwater_rss = rss; if (tsk->mm->hiwater_vm < tsk->mm->total_vm) tsk->mm->hiwater_vm = tsk->mm->total_vm; } diff -r fa660d79f695 -r de310533c483 linux-2.6-xen-sparse/mm/page_alloc.c --- a/linux-2.6-xen-sparse/mm/page_alloc.c Tue Aug 9 15:17:45 2005 +++ b/linux-2.6-xen-sparse/mm/page_alloc.c Tue Aug 9 23:57:17 2005 @@ -31,19 +31,26 @@ #include <linux/topology.h> #include <linux/sysctl.h> #include <linux/cpu.h> +#include <linux/cpuset.h> #include <linux/nodemask.h> #include <linux/vmalloc.h> #include <asm/tlbflush.h> #include "internal.h" -/* MCD - HACK: Find somewhere to initialize this EARLY, or make this initializer cleaner */ +/* + * MCD - HACK: Find somewhere to initialize this EARLY, or make this + * initializer cleaner + */ nodemask_t node_online_map = { { [0] = 1UL } }; +EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map = NODE_MASK_ALL; +EXPORT_SYMBOL(node_possible_map); struct pglist_data *pgdat_list; unsigned long totalram_pages; unsigned long totalhigh_pages; long nr_swap_pages; + /* * results with 256, 32 in the lowmem_reserve sysctl: * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) @@ -188,6 +195,37 @@ { __ClearPagePrivate(page); page->private = 0; +} + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). 
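The __pud_alloc()/__pmd_alloc() bodies above preserve the long-standing drop-lock/re-check dance: the allocation may sleep, so page_table_lock is released around it, and the slot must be re-tested afterwards because another thread can populate it in the window. A user-space sketch of the same pattern built on a pthread mutex:

    #include <pthread.h>
    #include <stdlib.h>

    /* Allocate and publish *slot exactly once even if callers race here;
     * mirrors the re-check in __pmd_alloc() above. */
    static void *alloc_once(void **slot, pthread_mutex_t *lock)
    {
            void *new;

            pthread_mutex_unlock(lock);     /* allocation may sleep */
            new = malloc(64);               /* stand-in for pmd_alloc_one() */
            pthread_mutex_lock(lock);
            if (!new)
                    return NULL;

            if (*slot) {                    /* somebody beat us to it */
                    free(new);
                    return *slot;
            }
            *slot = new;                    /* publish while locked */
            return new;
    }

    int main(void)
    {
            static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
            void *slot = NULL;

            pthread_mutex_lock(&lock);
            alloc_once(&slot, &lock);
            pthread_mutex_unlock(&lock);
            free(slot);
            return 0;
    }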
+ * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contigious at least up to MAX_ORDER + */ +static inline struct page * +__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) +{ + unsigned long buddy_idx = page_idx ^ (1 << order); + + return page + (buddy_idx - page_idx); +} + +static inline unsigned long +__find_combined_index(unsigned long page_idx, unsigned int order) +{ + return (page_idx & ~(1 << order)); } /* @@ -233,50 +271,49 @@ * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, +static inline void __free_pages_bulk (struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; - struct page *coalesced; int order_size = 1 << order; if (unlikely(order)) destroy_compound_page(page, order); - page_idx = page - base; + page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); BUG_ON(page_idx & (order_size - 1)); BUG_ON(bad_range(zone, page)); zone->free_pages += order_size; while (order < MAX_ORDER-1) { + unsigned long combined_idx; struct free_area *area; struct page *buddy; - int buddy_idx; - - buddy_idx = (page_idx ^ (1 << order)); - buddy = base + buddy_idx; + + combined_idx = __find_combined_index(page_idx, order); + buddy = __page_find_buddy(page, page_idx, order); + if (bad_range(zone, buddy)) break; if (!page_is_buddy(buddy, order)) - break; - /* Move the buddy up one level. */ + break; /* Move the buddy up one level. */ list_del(&buddy->lru); area = zone->free_area + order; area->nr_free--; rmv_page_order(buddy); - page_idx &= buddy_idx; + page = page + (combined_idx - page_idx); + page_idx = combined_idx; order++; } - coalesced = base + page_idx; - set_page_order(coalesced, order); - list_add(&coalesced->lru, &zone->free_area[order].free_list); + set_page_order(page, order); + list_add(&page->lru, &zone->free_area[order].free_list); zone->free_area[order].nr_free++; } static inline void free_pages_check(const char *function, struct page *page) { - if ( page_mapped(page) || + if ( page_mapcount(page) || page->mapping != NULL || page_count(page) != 0 || (page->flags & ( @@ -309,10 +346,9 @@ struct list_head *list, unsigned int order) { unsigned long flags; - struct page *base, *page = NULL; + struct page *page = NULL; int ret = 0; - base = zone->zone_mem_map; spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; @@ -320,7 +356,7 @@ page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_pages_bulk list manipulates */ list_del(&page->lru); - __free_pages_bulk(page, base, zone, order); + __free_pages_bulk(page, zone, order); ret++; } spin_unlock_irqrestore(&zone->lock, flags); @@ -405,7 +441,7 @@ */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapped(page) || + if (page->mapping || page_mapcount(page) || (page->flags & ( 1 << PG_private | 1 << PG_locked | @@ -601,7 +637,7 @@ free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, int gfp_flags) +static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags) { int i; @@ -616,7 +652,7 @@ * or two. 
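The XOR/AND identities documented above are worth checking by hand, since the rewritten __free_pages_bulk() now leans on them instead of zone_mem_map-relative indexing. A small self-test of the buddy arithmetic:

    #include <assert.h>
    #include <stdio.h>

    /* Order-O buddy of index B is B ^ (1 << O); the combined parent
     * index is B & ~(1 << O), per the comment above. */
    static unsigned long buddy_idx(unsigned long idx, unsigned int order)
    {
            return idx ^ (1UL << order);
    }

    static unsigned long combined_idx(unsigned long idx, unsigned int order)
    {
            return idx & ~(1UL << order);
    }

    int main(void)
    {
            /* The example from the comment: page #8 at order 1 pairs with #10. */
            assert(buddy_idx(8, 1) == 10);
            assert(buddy_idx(10, 1) == 8);          /* the relation is symmetric */
            assert(combined_idx(8, 1) == 8);        /* parent of the order-2 block */
            assert(combined_idx(10, 1) == 8);
            printf("buddy math checks out\n");
            return 0;
    }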
*/ static struct page * -buffered_rmqueue(struct zone *zone, int order, int gfp_flags) +buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) { unsigned long flags; struct page *page = NULL; @@ -694,7 +730,7 @@ * This is the 'heart' of the zoned buddy allocator. */ struct page * fastcall -__alloc_pages(unsigned int gfp_mask, unsigned int order, +__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, struct zonelist *zonelist) { const int wait = gfp_mask & __GFP_WAIT; @@ -734,6 +770,9 @@ classzone_idx, 0, 0)) continue; + if (!cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; @@ -745,6 +784,9 @@ /* * Go through the zonelist again. Let __GFP_HIGH and allocations * coming from realtime tasks to go deeper into reserves + * + * This is the last chance, in general, before the goto nopage. + * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. */ for (i = 0; (z = zones[i]) != NULL; i++) { if (!zone_watermark_ok(z, order, z->pages_min, @@ -752,18 +794,27 @@ gfp_mask & __GFP_HIGH)) continue; + if (wait && !cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; } /* This allocation should allow future memory freeing. */ - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { - /* go through the zonelist yet again, ignoring mins */ - for (i = 0; (z = zones[i]) != NULL; i++) { - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; + + if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) + && !in_interrupt()) { + if (!(gfp_mask & __GFP_NOMEMALLOC)) { + /* go through the zonelist yet again, ignoring mins */ + for (i = 0; (z = zones[i]) != NULL; i++) { + if (!cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } } goto nopage; } @@ -798,6 +849,9 @@ if (!zone_watermark_ok(z, order, z->pages_min, classzone_idx, can_try_harder, gfp_mask & __GFP_HIGH)) + continue; + + if (!cpuset_zone_allowed(z)) continue; page = buffered_rmqueue(z, order, gfp_mask); @@ -816,6 +870,9 @@ classzone_idx, 0, 0)) continue; + if (!cpuset_zone_allowed(z)) + continue; + page = buffered_rmqueue(z, order, gfp_mask); if (page) goto got_pg; @@ -862,7 +919,7 @@ /* * Common helper functions. 
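Each allocator pass above gains the same two-part gate: a zone is skipped when its watermark fails or when cpuset_zone_allowed() says the calling task may not use it. A toy model of that filtered scan, with zones reduced to a struct, the watermark test simplified, and the cpuset check reduced to a flag:

    #include <stdio.h>

    struct zone_demo {
            const char *name;
            int free_pages;
            int min;        /* simplified watermark */
            int allowed;    /* stand-in for cpuset_zone_allowed() */
    };

    static struct zone_demo *pick_zone(struct zone_demo *z, int n, int pages)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (z[i].free_pages - pages < z[i].min)
                            continue;       /* watermark check failed */
                    if (!z[i].allowed)
                            continue;       /* outside this task's cpuset */
                    return &z[i];
            }
            return NULL;
    }

    int main(void)
    {
            struct zone_demo zones[] = {
                    { "HIGHMEM",  10, 32, 1 },      /* below watermark */
                    { "NORMAL",  400, 32, 0 },      /* outside cpuset */
                    { "DMA",     200, 16, 1 },      /* usable */
            };
            struct zone_demo *z = pick_zone(zones, 3, 4);

            printf("allocating from %s\n", z ? z->name : "nowhere");
            return 0;
    }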
*/ -fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) +fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order) { struct page * page; page = alloc_pages(gfp_mask, order); @@ -873,7 +930,7 @@ EXPORT_SYMBOL(__get_free_pages); -fastcall unsigned long get_zeroed_page(unsigned int gfp_mask) +fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask) { struct page * page; @@ -1302,8 +1359,7 @@ #define MAX_NODE_LOAD (num_online_nodes()) static int __initdata node_load[MAX_NUMNODES]; /** - * find_next_best_node - find the next node that should appear in a given - * node's fallback list + * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending * @used_node_mask: nodemask_t of already used nodes * @@ -1372,7 +1428,6 @@ /* initialize zonelists */ for (i = 0; i < GFP_ZONETYPES; i++) { zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); zonelist->zones[0] = NULL; } @@ -1419,7 +1474,6 @@ struct zonelist *zonelist; zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); j = 0; k = ZONE_NORMAL; @@ -1461,6 +1515,7 @@ for_each_online_node(i) build_zonelists(NODE_DATA(i)); printk("Built %i zonelists\n", num_online_nodes()); + cpuset_init_current_mems_allowed(); } /* @@ -1622,6 +1677,18 @@ batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. 
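The clamp that follows, batch = (1 << fls(batch + batch/2)) - 1, rounds the per-cpu batch to the nearest 2^n - 1 value (1, 3, 7, 15, ...), which is what avoids the page-coloring pathology described above. A quick demonstration, using a portable stand-in for the kernel's 1-based fls():

    #include <stdio.h>

    /* Stand-in for the kernel's fls(): 1-based index of the most
     * significant set bit, 0 for an input of 0. */
    static int fls_demo(unsigned int x)
    {
            int r = 0;

            while (x) {
                    r++;
                    x >>= 1;
            }
            return r;
    }

    int main(void)
    {
            int batch;

            for (batch = 1; batch <= 64; batch *= 2)
                    printf("batch %2d -> %3d\n", batch,
                           (1 << fls_demo(batch + batch / 2)) - 1);
            return 0;
    }

Every power-of-two input lands on a 2^n - 1 output (1 -> 1, 2 -> 3, 4 -> 7, 8 -> 15, ...), so two tasks alternately draining batches no longer slice the free lists along page-color boundaries.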
+ */ + batch = (1 << fls(batch + batch/2)) - 1; for (cpu = 0; cpu < NR_CPUS; cpu++) { struct per_cpu_pages *pcp; @@ -1681,14 +1748,25 @@ } } -void __init node_alloc_mem_map(struct pglist_data *pgdat) +static void __init alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long size; - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); - pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); + /* Skip empty nodes */ + if (!pgdat->node_spanned_pages) + return; + + /* ia64 gets its own node_mem_map, before this, without bootmem */ + if (!pgdat->node_mem_map) { + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); + } #ifndef CONFIG_DISCONTIGMEM - mem_map = contig_page_data.node_mem_map; + /* + * With no DISCONTIG, the global mem_map is just set as node 0's + */ + if (pgdat == NODE_DATA(0)) + mem_map = NODE_DATA(0)->node_mem_map; #endif } @@ -1700,8 +1778,7 @@ pgdat->node_start_pfn = node_start_pfn; calculate_zone_totalpages(pgdat, zones_size, zholes_size); - if (!pfn_to_page(node_start_pfn)) - node_alloc_mem_map(pgdat); + alloc_node_mem_map(pgdat); free_area_init_core(pgdat, zones_size, zholes_size); } @@ -1823,6 +1900,7 @@ "allocstall", "pgrotated", + "nr_bounce", }; static void *vmstat_start(struct seq_file *m, loff_t *pos) @@ -1926,15 +2004,20 @@ for_each_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone * zone = pgdat->node_zones + j; + struct zone *zone = pgdat->node_zones + j; unsigned long present_pages = zone->present_pages; zone->lowmem_reserve[j] = 0; for (idx = j-1; idx >= 0; idx--) { - struct zone * lower_zone = pgdat->node_zones + idx; - - lower_zone->lowmem_reserve[j] = present_pages / sysctl_lowmem_reserve_ratio[idx]; + struct zone *lower_zone; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) + sysctl_lowmem_reserve_ratio[idx] = 1; + + lower_zone = pgdat->node_zones + idx; + lower_zone->lowmem_reserve[j] = present_pages / + sysctl_lowmem_reserve_ratio[idx]; present_pages += lower_zone->present_pages; } } @@ -2041,7 +2124,7 @@ * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, file, buffer, length, ppos); setup_per_zone_pages_min(); @@ -2058,7 +2141,7 @@ * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, file, buffer, length, ppos); setup_per_zone_lowmem_reserve(); _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog