[Xen-changelog] Update to Linux 2.6.15.
# HG changeset patch # User cl349@xxxxxxxxxxxxxxxxxxxx # Node ID 5a63f675107cd84970e299a291485420d97bc139 # Parent d609de73b9faca3da11509e04f09e092e065ffcd Update to Linux 2.6.15. Signed-off-by: Christian Limpach <Christian.Limpach@xxxxxxxxxxxx> diff -r d609de73b9fa -r 5a63f675107c buildconfigs/linux-defconfig_xen0_x86_32 --- a/buildconfigs/linux-defconfig_xen0_x86_32 Wed Feb 1 17:06:16 2006 +++ b/buildconfigs/linux-defconfig_xen0_x86_32 Wed Feb 1 18:00:19 2006 @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.14-xen0 -# Tue Jan 31 18:56:38 2006 -# +# Linux kernel version: 2.6.15-xen0 +# Wed Feb 1 15:54:13 2006 +# +CONFIG_X86_32=y +CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_X86=y -CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_UID16=y CONFIG_GENERIC_ISA_DMA=y @@ -35,6 +36,7 @@ CONFIG_KOBJECT_UEVENT=y # CONFIG_IKCONFIG is not set CONFIG_INITRAMFS_SOURCE="" +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set @@ -62,6 +64,24 @@ # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_KMOD=y + +# +# Block layer +# +# CONFIG_LBD is not set + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features @@ -108,8 +128,10 @@ CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y +CONFIG_X86_CMPXCHG64=y CONFIG_X86_GOOD_APIC=y CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_TSC=y # CONFIG_SMP is not set CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set @@ -142,6 +164,7 @@ CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPLIT_PTLOCK_CPUS=4096 CONFIG_MTRR=y # CONFIG_REGPARM is not set CONFIG_SECCOMP=y @@ -257,6 +280,10 @@ CONFIG_NETFILTER=y # CONFIG_NETFILTER_DEBUG is not set CONFIG_BRIDGE_NETFILTER=y + +# +# Core Netfilter Configuration +# # CONFIG_NETFILTER_NETLINK is not set # @@ -346,8 +373,11 @@ # CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set + +# +# QoS and/or fair queueing +# # CONFIG_NET_SCHED is not set -# CONFIG_NET_CLS_ROUTE is not set # # Network testing @@ -409,16 +439,7 @@ CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_INITRD=y -# CONFIG_LBD is not set # CONFIG_CDROM_PKTCDVD is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y # CONFIG_ATA_OVER_ETH is not set # @@ -464,6 +485,7 @@ # CONFIG_BLK_DEV_CY82C693 is not set # CONFIG_BLK_DEV_CS5520 is not set # CONFIG_BLK_DEV_CS5530 is not set +# CONFIG_BLK_DEV_CS5535 is not set # CONFIG_BLK_DEV_HPT34X is not set # CONFIG_BLK_DEV_HPT366 is not set # CONFIG_BLK_DEV_SC1200 is not set @@ -519,6 +541,7 @@ # # SCSI low-level drivers # +# CONFIG_ISCSI_TCP is not set CONFIG_BLK_DEV_3W_XXXX_RAID=y # CONFIG_SCSI_3W_9XXX is not set # CONFIG_SCSI_ACARD is not set @@ -548,16 +571,17 @@ CONFIG_SCSI_ATA_PIIX=y # CONFIG_SCSI_SATA_MV is not set # CONFIG_SCSI_SATA_NV is not set +# CONFIG_SCSI_PDC_ADMA is not set +# CONFIG_SCSI_SATA_QSTOR is not set CONFIG_SCSI_SATA_PROMISE=y -# CONFIG_SCSI_SATA_QSTOR is not set CONFIG_SCSI_SATA_SX4=y CONFIG_SCSI_SATA_SIL=y +CONFIG_SCSI_SATA_SIL24=y # CONFIG_SCSI_SATA_SIS is not set # CONFIG_SCSI_SATA_ULI is not set # CONFIG_SCSI_SATA_VIA is not set # 
CONFIG_SCSI_SATA_VITESSE is not set CONFIG_SCSI_SATA_INTEL_COMBINED=y -# CONFIG_SCSI_CPQFCTS is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_EATA_PIO is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -566,7 +590,6 @@ # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_ISP is not set # CONFIG_SCSI_QLOGIC_FC is not set # CONFIG_SCSI_QLOGIC_1280 is not set CONFIG_SCSI_QLA2XXX=y @@ -807,7 +830,6 @@ # # Serial drivers # -# CONFIG_SERIAL_8250 is not set # # Non-8250 serial port support @@ -870,6 +892,7 @@ # TPM devices # # CONFIG_TCG_TPM is not set +# CONFIG_TELCLOCK is not set # # I2C support @@ -954,12 +977,15 @@ # # USB Device Class drivers # -# CONFIG_USB_BLUETOOTH_TTY is not set # CONFIG_USB_ACM is not set # CONFIG_USB_PRINTER is not set # -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' +# + +# +# may also be needed; see USB_STORAGE Help for more information # # CONFIG_USB_STORAGE is not set @@ -1212,6 +1238,11 @@ # CONFIG_NLS_UTF8 is not set # +# Instrumentation Support +# +# CONFIG_KPROBES is not set + +# # Kernel hacking # # CONFIG_PRINTK_TIME is not set @@ -1228,10 +1259,11 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_VM is not set CONFIG_FRAME_POINTER=y +# CONFIG_RCU_TORTURE_TEST is not set CONFIG_EARLY_PRINTK=y # CONFIG_DEBUG_STACKOVERFLOW is not set -# CONFIG_KPROBES is not set # CONFIG_DEBUG_STACK_USAGE is not set # CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_4KSTACKS is not set @@ -1312,4 +1344,3 @@ CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_X86_BIOS_REBOOT=y -CONFIG_PC=y diff -r d609de73b9fa -r 5a63f675107c buildconfigs/linux-defconfig_xen0_x86_64 --- a/buildconfigs/linux-defconfig_xen0_x86_64 Wed Feb 1 17:06:16 2006 +++ b/buildconfigs/linux-defconfig_xen0_x86_64 Wed Feb 1 18:00:19 2006 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.14-xen0 -# Tue Jan 31 16:21:00 2006 +# Linux kernel version: 2.6.15-xen0 +# Wed Feb 1 15:50:08 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -40,6 +40,7 @@ CONFIG_KOBJECT_UEVENT=y # CONFIG_IKCONFIG is not set CONFIG_INITRAMFS_SOURCE="" +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set @@ -67,6 +68,24 @@ # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_KMOD=y + +# +# Block layer +# +# CONFIG_LBD is not set + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features @@ -88,7 +107,6 @@ CONFIG_PREEMPT_NONE=y # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set -# CONFIG_NUMA is not set CONFIG_ARCH_FLATMEM_ENABLE=y CONFIG_SELECT_MEMORY_MODEL=y CONFIG_FLATMEM_MANUAL=y @@ -97,6 +115,7 @@ CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPLIT_PTLOCK_CPUS=4096 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_SWIOTLB=y CONFIG_DUMMY_IOMMU=y @@ -196,6 +215,10 @@ CONFIG_NETFILTER=y # CONFIG_NETFILTER_DEBUG is not set CONFIG_BRIDGE_NETFILTER=y + +# +# Core Netfilter Configuration +# # CONFIG_NETFILTER_NETLINK is not set # @@ -285,8 
+308,11 @@ # CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set + +# +# QoS and/or fair queueing +# # CONFIG_NET_SCHED is not set -# CONFIG_NET_CLS_ROUTE is not set # # Network testing @@ -348,16 +374,7 @@ CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_INITRD=y -# CONFIG_LBD is not set # CONFIG_CDROM_PKTCDVD is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y # CONFIG_ATA_OVER_ETH is not set # @@ -458,6 +475,7 @@ # # SCSI low-level drivers # +# CONFIG_ISCSI_TCP is not set CONFIG_BLK_DEV_3W_XXXX_RAID=y # CONFIG_SCSI_3W_9XXX is not set # CONFIG_SCSI_ACARD is not set @@ -488,10 +506,12 @@ CONFIG_SCSI_ATA_PIIX=y # CONFIG_SCSI_SATA_MV is not set # CONFIG_SCSI_SATA_NV is not set +# CONFIG_SCSI_PDC_ADMA is not set +# CONFIG_SCSI_SATA_QSTOR is not set CONFIG_SCSI_SATA_PROMISE=y -# CONFIG_SCSI_SATA_QSTOR is not set CONFIG_SCSI_SATA_SX4=y CONFIG_SCSI_SATA_SIL=y +CONFIG_SCSI_SATA_SIL24=y # CONFIG_SCSI_SATA_SIS is not set # CONFIG_SCSI_SATA_ULI is not set # CONFIG_SCSI_SATA_VIA is not set @@ -499,7 +519,6 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y CONFIG_SCSI_BUSLOGIC=y # CONFIG_SCSI_OMIT_FLASHPOINT is not set -# CONFIG_SCSI_CPQFCTS is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_EATA is not set # CONFIG_SCSI_EATA_PIO is not set @@ -510,7 +529,6 @@ # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_ISP is not set # CONFIG_SCSI_QLOGIC_FC is not set # CONFIG_SCSI_QLOGIC_1280 is not set CONFIG_SCSI_QLA2XXX=y @@ -750,7 +768,6 @@ # # Serial drivers # -# CONFIG_SERIAL_8250 is not set # # Non-8250 serial port support @@ -800,6 +817,7 @@ # TPM devices # # CONFIG_TCG_TPM is not set +# CONFIG_TELCLOCK is not set # # I2C support @@ -884,12 +902,15 @@ # # USB Device Class drivers # -# CONFIG_USB_BLUETOOTH_TTY is not set # CONFIG_USB_ACM is not set # CONFIG_USB_PRINTER is not set # -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' +# + +# +# may also be needed; see USB_STORAGE Help for more information # # CONFIG_USB_STORAGE is not set @@ -989,6 +1010,7 @@ CONFIG_INFINIBAND_IPOIB=y CONFIG_INFINIBAND_IPOIB_DEBUG=y CONFIG_INFINIBAND_IPOIB_DEBUG_DATA=y +CONFIG_INFINIBAND_SRP=y # # SN Devices @@ -1155,9 +1177,10 @@ # CONFIG_NLS_UTF8 is not set # -# Profiling support +# Instrumentation Support # # CONFIG_PROFILING is not set +# CONFIG_KPROBES is not set # # Kernel hacking @@ -1173,10 +1196,10 @@ # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_VM is not set CONFIG_FRAME_POINTER=y -# CONFIG_CHECKING is not set +# CONFIG_RCU_TORTURE_TEST is not set # CONFIG_INIT_DEBUG is not set -# CONFIG_KPROBES is not set # # Security options diff -r d609de73b9fa -r 5a63f675107c buildconfigs/linux-defconfig_xenU_x86_32 --- a/buildconfigs/linux-defconfig_xenU_x86_32 Wed Feb 1 17:06:16 2006 +++ b/buildconfigs/linux-defconfig_xenU_x86_32 Wed Feb 1 18:00:19 2006 @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.14-xenU -# Tue Jan 31 18:57:16 2006 -# +# Linux kernel version: 2.6.15-xenU +# Wed Feb 1 17:28:35 2006 +# +CONFIG_X86_32=y +CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_X86=y -CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_UID16=y CONFIG_GENERIC_ISA_DMA=y @@ -35,6 +36,7 @@ # 
CONFIG_IKCONFIG is not set # CONFIG_CPUSETS is not set CONFIG_INITRAMFS_SOURCE="" +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set @@ -63,6 +65,24 @@ # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_KMOD=y CONFIG_STOP_MACHINE=y + +# +# Block layer +# +# CONFIG_LBD is not set + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features @@ -109,8 +129,10 @@ CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y +CONFIG_X86_CMPXCHG64=y CONFIG_X86_GOOD_APIC=y CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_TSC=y CONFIG_SMP=y CONFIG_SMP_ALTERNATIVES=y CONFIG_NR_CPUS=8 @@ -141,6 +163,7 @@ CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPLIT_PTLOCK_CPUS=4096 # CONFIG_REGPARM is not set CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set @@ -212,8 +235,11 @@ # CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set + +# +# QoS and/or fair queueing +# # CONFIG_NET_SCHED is not set -# CONFIG_NET_CLS_ROUTE is not set # # Network testing @@ -248,16 +274,7 @@ CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_INITRD=y -# CONFIG_LBD is not set # CONFIG_CDROM_PKTCDVD is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y # CONFIG_ATA_OVER_ETH is not set # @@ -295,6 +312,7 @@ # # SCSI low-level drivers # +# CONFIG_ISCSI_TCP is not set # CONFIG_SCSI_SATA is not set # CONFIG_SCSI_DEBUG is not set @@ -503,6 +521,11 @@ # CONFIG_NLS_UTF8 is not set # +# Instrumentation Support +# +# CONFIG_KPROBES is not set + +# # Kernel hacking # # CONFIG_PRINTK_TIME is not set @@ -519,10 +542,11 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_VM is not set CONFIG_FRAME_POINTER=y +# CONFIG_RCU_TORTURE_TEST is not set CONFIG_EARLY_PRINTK=y # CONFIG_DEBUG_STACKOVERFLOW is not set -# CONFIG_KPROBES is not set # CONFIG_DEBUG_STACK_USAGE is not set # CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_4KSTACKS is not set @@ -598,4 +622,3 @@ CONFIG_X86_SMP=y CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y -CONFIG_PC=y diff -r d609de73b9fa -r 5a63f675107c buildconfigs/linux-defconfig_xenU_x86_64 --- a/buildconfigs/linux-defconfig_xenU_x86_64 Wed Feb 1 17:06:16 2006 +++ b/buildconfigs/linux-defconfig_xenU_x86_64 Wed Feb 1 18:00:19 2006 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.14-xenU -# Tue Jan 31 19:51:18 2006 +# Linux kernel version: 2.6.15-xenU +# Wed Feb 1 15:49:27 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -42,6 +42,7 @@ # CONFIG_IKCONFIG is not set # CONFIG_CPUSETS is not set CONFIG_INITRAMFS_SOURCE="" +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set @@ -72,6 +73,24 @@ CONFIG_STOP_MACHINE=y # +# Block layer +# +CONFIG_LBD=y + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" + +# # Processor type and features # # CONFIG_MK8 is not set @@ -90,7 
+109,6 @@ # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set CONFIG_PREEMPT_BKL=y -# CONFIG_NUMA is not set CONFIG_ARCH_FLATMEM_ENABLE=y CONFIG_SELECT_MEMORY_MODEL=y CONFIG_FLATMEM_MANUAL=y @@ -99,6 +117,7 @@ CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPLIT_PTLOCK_CPUS=4096 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_NR_CPUS=8 # CONFIG_HOTPLUG_CPU is not set @@ -219,6 +238,10 @@ CONFIG_NETFILTER=y # CONFIG_NETFILTER_DEBUG is not set CONFIG_BRIDGE_NETFILTER=y + +# +# Core Netfilter Configuration +# # CONFIG_NETFILTER_NETLINK is not set # @@ -384,10 +407,18 @@ CONFIG_NET_DIVERT=y # CONFIG_ECONET is not set CONFIG_WAN_ROUTER=m + +# +# QoS and/or fair queueing +# CONFIG_NET_SCHED=y CONFIG_NET_SCH_CLK_JIFFIES=y # CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set # CONFIG_NET_SCH_CLK_CPU is not set + +# +# Queueing/Scheduling +# CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_HFSC=m @@ -401,8 +432,10 @@ CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_QOS=y -CONFIG_NET_ESTIMATOR=y + +# +# Classification +# CONFIG_NET_CLS=y # CONFIG_NET_CLS_BASIC is not set CONFIG_NET_CLS_TCINDEX=m @@ -411,13 +444,14 @@ CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m CONFIG_CLS_U32_PERF=y -CONFIG_NET_CLS_IND=y # CONFIG_CLS_U32_MARK is not set CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m # CONFIG_NET_EMATCH is not set # CONFIG_NET_CLS_ACT is not set CONFIG_NET_CLS_POLICE=y +CONFIG_NET_CLS_IND=y +CONFIG_NET_ESTIMATOR=y # # Network testing @@ -496,7 +530,6 @@ CONFIG_BT_HCIUART=m CONFIG_BT_HCIUART_H4=y CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_BCSP_TXCRC=y CONFIG_BT_HCIVHCI=m # CONFIG_IEEE80211 is not set @@ -524,16 +557,7 @@ CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_INITRD=y -CONFIG_LBD=y # CONFIG_CDROM_PKTCDVD is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y # CONFIG_ATA_OVER_ETH is not set # @@ -572,6 +596,7 @@ # # SCSI low-level drivers # +# CONFIG_ISCSI_TCP is not set CONFIG_SCSI_SATA=m # CONFIG_SCSI_DEBUG is not set @@ -647,6 +672,7 @@ # # ATM drivers # +# CONFIG_ATM_DUMMY is not set CONFIG_ATM_TCP=m CONFIG_PPP=m CONFIG_PPP_MULTILINK=y @@ -655,6 +681,7 @@ CONFIG_PPP_SYNC_TTY=m CONFIG_PPP_DEFLATE=m # CONFIG_PPP_BSDCOMP is not set +# CONFIG_PPP_MPPE is not set CONFIG_PPPOE=m CONFIG_PPPOATM=m # CONFIG_SLIP is not set @@ -705,7 +732,7 @@ CONFIG_FS_POSIX_ACL=y CONFIG_XFS_FS=m CONFIG_XFS_EXPORT=y -CONFIG_XFS_QUOTA=m +# CONFIG_XFS_QUOTA is not set CONFIG_XFS_SECURITY=y CONFIG_XFS_POSIX_ACL=y # CONFIG_XFS_RT is not set @@ -877,9 +904,10 @@ CONFIG_NLS_UTF8=m # -# Profiling support +# Instrumentation Support # # CONFIG_PROFILING is not set +# CONFIG_KPROBES is not set # # Kernel hacking @@ -895,9 +923,10 @@ # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_VM is not set CONFIG_FRAME_POINTER=y +# CONFIG_RCU_TORTURE_TEST is not set # CONFIG_INIT_DEBUG is not set -# CONFIG_KPROBES is not set # # Security options diff -r d609de73b9fa -r 5a63f675107c buildconfigs/linux-defconfig_xen_x86_32 --- a/buildconfigs/linux-defconfig_xen_x86_32 Wed Feb 1 17:06:16 2006 +++ b/buildconfigs/linux-defconfig_xen_x86_32 Wed Feb 1 18:00:19 2006 @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.14-xen -# Tue Jan 31 19:01:58 2006 -# +# Linux kernel version: 2.6.15-xen +# Wed Feb 1 17:28:24 2006 +# +CONFIG_X86_32=y 
+CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_X86=y -CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_UID16=y CONFIG_GENERIC_ISA_DMA=y @@ -38,6 +39,7 @@ # CONFIG_IKCONFIG is not set # CONFIG_CPUSETS is not set CONFIG_INITRAMFS_SOURCE="" +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_EMBEDDED=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set @@ -47,7 +49,6 @@ CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SHMEM=y CONFIG_CC_ALIGN_FUNCTIONS=0 CONFIG_CC_ALIGN_LABELS=0 @@ -67,6 +68,24 @@ # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_KMOD=y CONFIG_STOP_MACHINE=y + +# +# Block layer +# +CONFIG_LBD=y + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features @@ -113,8 +132,10 @@ CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y +CONFIG_X86_CMPXCHG64=y CONFIG_X86_GOOD_APIC=y CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_TSC=y CONFIG_SMP=y CONFIG_SMP_ALTERNATIVES=y CONFIG_NR_CPUS=8 @@ -148,6 +169,7 @@ CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPLIT_PTLOCK_CPUS=4096 CONFIG_MTRR=y # CONFIG_REGPARM is not set CONFIG_SECCOMP=y @@ -156,8 +178,8 @@ # CONFIG_HZ_1000 is not set CONFIG_HZ=250 CONFIG_PHYSICAL_START=0x100000 +# CONFIG_CRASH_DUMP is not set CONFIG_HOTPLUG_CPU=y -# CONFIG_CRASH_DUMP is not set # # Power management options (ACPI, APM) @@ -175,6 +197,7 @@ CONFIG_ACPI_HOTKEY=m CONFIG_ACPI_FAN=m CONFIG_ACPI_PROCESSOR=m +CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=m CONFIG_ACPI_ASUS=m CONFIG_ACPI_IBM=m @@ -185,7 +208,7 @@ CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y # CONFIG_X86_PM_TIMER is not set -# CONFIG_ACPI_CONTAINER is not set +CONFIG_ACPI_CONTAINER=m # # CPU Frequency scaling @@ -206,7 +229,6 @@ # CONFIG_PCI_LEGACY_PROC is not set # CONFIG_PCI_DEBUG is not set CONFIG_SCx200=m -# CONFIG_HOTPLUG_CPU is not set # # PCCARD (PCMCIA/CardBus) support @@ -341,6 +363,10 @@ CONFIG_NETFILTER=y # CONFIG_NETFILTER_DEBUG is not set CONFIG_BRIDGE_NETFILTER=y + +# +# Core Netfilter Configuration +# CONFIG_NETFILTER_NETLINK=m CONFIG_NETFILTER_NETLINK_QUEUE=m CONFIG_NETFILTER_NETLINK_LOG=m @@ -534,10 +560,18 @@ CONFIG_ECONET_AUNUDP=y CONFIG_ECONET_NATIVE=y CONFIG_WAN_ROUTER=m + +# +# QoS and/or fair queueing +# CONFIG_NET_SCHED=y CONFIG_NET_SCH_CLK_JIFFIES=y # CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set # CONFIG_NET_SCH_CLK_CPU is not set + +# +# Queueing/Scheduling +# CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_HFSC=m @@ -551,8 +585,10 @@ CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_QOS=y -CONFIG_NET_ESTIMATOR=y + +# +# Classification +# CONFIG_NET_CLS=y CONFIG_NET_CLS_BASIC=m CONFIG_NET_CLS_TCINDEX=m @@ -561,7 +597,6 @@ CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m # CONFIG_CLS_U32_PERF is not set -# CONFIG_NET_CLS_IND is not set # CONFIG_CLS_U32_MARK is not set CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m @@ -574,6 +609,8 @@ CONFIG_NET_EMATCH_TEXT=m # CONFIG_NET_CLS_ACT is not set CONFIG_NET_CLS_POLICE=y +# CONFIG_NET_CLS_IND is not set +CONFIG_NET_ESTIMATOR=y # # Network testing @@ -676,7 +713,6 @@ CONFIG_BT_HCIUART=m CONFIG_BT_HCIUART_H4=y CONFIG_BT_HCIUART_BCSP=y -# CONFIG_BT_HCIUART_BCSP_TXCRC is not set CONFIG_BT_HCIBCM203X=m # CONFIG_BT_HCIBPA10X is not set CONFIG_BT_HCIBFUSB=m @@ -731,6 +767,7 @@ 
CONFIG_NFTL=m CONFIG_NFTL_RW=y CONFIG_INFTL=m +CONFIG_RFD_FTL=m # # RAM/ROM/Flash chip drivers @@ -821,6 +858,12 @@ CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 # CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE is not set # CONFIG_MTD_NAND_NANDSIM is not set + +# +# OneNAND Flash Device Drivers +# +CONFIG_MTD_ONENAND=m +# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set # # Parallel port support @@ -896,18 +939,9 @@ CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_INITRD=y -CONFIG_LBD=y CONFIG_CDROM_PKTCDVD=m CONFIG_CDROM_PKTCDVD_BUFFERS=8 # CONFIG_CDROM_PKTCDVD_WCACHE is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y CONFIG_ATA_OVER_ETH=m # @@ -957,6 +991,7 @@ CONFIG_BLK_DEV_CY82C693=y CONFIG_BLK_DEV_CS5520=y CONFIG_BLK_DEV_CS5530=y +CONFIG_BLK_DEV_CS5535=m CONFIG_BLK_DEV_HPT34X=y # CONFIG_HPT34X_AUTODMA is not set CONFIG_BLK_DEV_HPT366=y @@ -1010,12 +1045,13 @@ # CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m -# CONFIG_SCSI_ISCSI_ATTRS is not set +CONFIG_SCSI_ISCSI_ATTRS=m CONFIG_SCSI_SAS_ATTRS=m # # SCSI low-level drivers # +CONFIG_ISCSI_TCP=m CONFIG_BLK_DEV_3W_XXXX_RAID=m CONFIG_SCSI_3W_9XXX=m CONFIG_SCSI_ACARD=m @@ -1046,16 +1082,17 @@ CONFIG_SCSI_ATA_PIIX=m CONFIG_SCSI_SATA_MV=m CONFIG_SCSI_SATA_NV=m +CONFIG_SCSI_PDC_ADMA=m +# CONFIG_SCSI_SATA_QSTOR is not set CONFIG_SCSI_SATA_PROMISE=m -# CONFIG_SCSI_SATA_QSTOR is not set CONFIG_SCSI_SATA_SX4=m CONFIG_SCSI_SATA_SIL=m +CONFIG_SCSI_SATA_SIL24=m CONFIG_SCSI_SATA_SIS=m CONFIG_SCSI_SATA_ULI=m CONFIG_SCSI_SATA_VIA=m CONFIG_SCSI_SATA_VITESSE=m CONFIG_SCSI_SATA_INTEL_COMBINED=y -# CONFIG_SCSI_CPQFCTS is not set CONFIG_SCSI_DMX3191D=m CONFIG_SCSI_EATA_PIO=m CONFIG_SCSI_FUTURE_DOMAIN=m @@ -1074,11 +1111,9 @@ CONFIG_SCSI_IPR=m # CONFIG_SCSI_IPR_TRACE is not set # CONFIG_SCSI_IPR_DUMP is not set -CONFIG_SCSI_QLOGIC_ISP=m CONFIG_SCSI_QLOGIC_FC=m CONFIG_SCSI_QLOGIC_FC_FIRMWARE=y CONFIG_SCSI_QLOGIC_1280=m -CONFIG_SCSI_QLOGIC_1280_1040=y CONFIG_SCSI_QLA2XXX=m CONFIG_SCSI_QLA21XX=m CONFIG_SCSI_QLA22XX=m @@ -1206,7 +1241,6 @@ # PHY device support # CONFIG_PHYLIB=m -CONFIG_PHYCONTROL=y # # MII PHY device drivers @@ -1302,7 +1336,6 @@ # CONFIG_IXGB_NAPI is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set -# CONFIG_2BUFF_MODE is not set # # Token Ring devices @@ -1418,6 +1451,7 @@ # # ATM drivers # +CONFIG_ATM_DUMMY=m CONFIG_ATM_TCP=m CONFIG_ATM_LANAI=m CONFIG_ATM_ENI=m @@ -1462,6 +1496,7 @@ CONFIG_PPP_SYNC_TTY=m CONFIG_PPP_DEFLATE=m CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_MPPE=m CONFIG_PPPOE=m CONFIG_PPPOATM=m CONFIG_SLIP=m @@ -1674,6 +1709,7 @@ CONFIG_TOUCHSCREEN_MK712=m CONFIG_INPUT_MISC=y CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_WISTRON_BTNS=m CONFIG_INPUT_UINPUT=m # @@ -1816,17 +1852,20 @@ # PCMCIA character devices # CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m CONFIG_MWAVE=m CONFIG_SCx200_GPIO=m CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 # CONFIG_HPET is not set -CONFIG_MAX_RAW_DEVS=256 CONFIG_HANGCHECK_TIMER=m # # TPM devices # # CONFIG_TCG_TPM is not set +CONFIG_TELCLOCK=m # # I2C support @@ -1883,6 +1922,7 @@ CONFIG_SENSORS_PCF8591=m CONFIG_SENSORS_RTC8564=m CONFIG_SENSORS_MAX6875=m +CONFIG_RTC_X1205_I2C=m # CONFIG_I2C_DEBUG_CORE is not set # CONFIG_I2C_DEBUG_ALGO is not set # CONFIG_I2C_DEBUG_BUS is not set @@ -1964,6 +2004,7 @@ # Video Adapters # CONFIG_VIDEO_BT848=m +# CONFIG_VIDEO_BT848_DVB is not set CONFIG_VIDEO_SAA6588=m CONFIG_VIDEO_BWQCAM=m CONFIG_VIDEO_CQCAM=m @@ -1990,7 +2031,10 @@ CONFIG_VIDEO_HEXIUM_GEMINI=m CONFIG_VIDEO_CX88=m # 
CONFIG_VIDEO_CX88_DVB is not set +CONFIG_VIDEO_EM28XX=m CONFIG_VIDEO_OVCAMCHIP=m +CONFIG_VIDEO_AUDIO_DECODER=m +CONFIG_VIDEO_DECODER=m # # Radio Adapters @@ -2098,6 +2142,7 @@ # ATSC (North American/Korean Terresterial DTV) frontends # CONFIG_DVB_NXT2002=m +CONFIG_DVB_NXT200X=m CONFIG_DVB_OR51211=m CONFIG_DVB_OR51132=m CONFIG_DVB_BCM3510=m @@ -2118,7 +2163,6 @@ CONFIG_FB_CFB_FILLRECT=m CONFIG_FB_CFB_COPYAREA=m CONFIG_FB_CFB_IMAGEBLIT=m -CONFIG_FB_SOFT_CURSOR=m # CONFIG_FB_MACMODES is not set CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y @@ -2134,6 +2178,7 @@ CONFIG_VIDEO_SELECT=y CONFIG_FB_HGA=m # CONFIG_FB_HGA_ACCEL is not set +CONFIG_FB_S1D13XXX=m CONFIG_FB_NVIDIA=m CONFIG_FB_NVIDIA_I2C=y CONFIG_FB_RIVA=m @@ -2176,7 +2221,6 @@ # CONFIG_FB_PM3 is not set CONFIG_FB_GEODE=y CONFIG_FB_GEODE_GX1=m -CONFIG_FB_S1D13XXX=m CONFIG_FB_VIRTUAL=m # @@ -2185,6 +2229,7 @@ CONFIG_VGA_CONSOLE=y CONFIG_DUMMY_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE=m +# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set # CONFIG_FONTS is not set CONFIG_FONT_8x8=y CONFIG_FONT_8x16=y @@ -2204,6 +2249,8 @@ # Advanced Linux Sound Architecture # CONFIG_SND=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_AC97_BUS=m CONFIG_SND_TIMER=m CONFIG_SND_PCM=m CONFIG_SND_HWDEP=m @@ -2231,8 +2278,6 @@ CONFIG_SND_MTPAV=m CONFIG_SND_SERIAL_U16550=m CONFIG_SND_MPU401=m -CONFIG_SND_AC97_CODEC=m -CONFIG_SND_AC97_BUS=m # # PCI devices @@ -2295,30 +2340,13 @@ # Open Sound System # CONFIG_SOUND_PRIME=m -CONFIG_SOUND_BT878=m -CONFIG_SOUND_CMPCI=m -# CONFIG_SOUND_CMPCI_FM is not set -# CONFIG_SOUND_CMPCI_MIDI is not set -CONFIG_SOUND_CMPCI_JOYSTICK=y -CONFIG_SOUND_EMU10K1=m +# CONFIG_OBSOLETE_OSS_DRIVER is not set CONFIG_SOUND_FUSION=m -CONFIG_SOUND_CS4281=m -CONFIG_SOUND_ES1370=m -CONFIG_SOUND_ES1371=m -CONFIG_SOUND_ESSSOLO1=m -CONFIG_SOUND_MAESTRO=m -CONFIG_SOUND_MAESTRO3=m CONFIG_SOUND_ICH=m -CONFIG_SOUND_SONICVIBES=m CONFIG_SOUND_TRIDENT=m # CONFIG_SOUND_MSNDCLAS is not set # CONFIG_SOUND_MSNDPIN is not set -CONFIG_SOUND_VIA82CXXX=m CONFIG_SOUND_TVMIXER=m -CONFIG_SOUND_ALI5455=m -CONFIG_SOUND_FORTE=m -CONFIG_SOUND_RME96XX=m -CONFIG_SOUND_AD1980=m # # USB support @@ -2355,15 +2383,15 @@ # USB Device Class drivers # # CONFIG_OBSOLETE_OSS_USB_DRIVER is not set - -# -# USB Bluetooth TTY can only be used with disabled Bluetooth subsystem -# CONFIG_USB_ACM=m CONFIG_USB_PRINTER=m # -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' +# + +# +# may also be needed; see USB_STORAGE Help for more information # CONFIG_USB_STORAGE=m # CONFIG_USB_STORAGE_DEBUG is not set @@ -2375,7 +2403,6 @@ CONFIG_USB_STORAGE_SDDR09=y CONFIG_USB_STORAGE_SDDR55=y CONFIG_USB_STORAGE_JUMPSHOT=y -CONFIG_USB_STORAGE_ONETOUCH=y # # USB Input Devices @@ -2460,6 +2487,7 @@ CONFIG_USB_SERIAL=m CONFIG_USB_SERIAL_GENERIC=y CONFIG_USB_SERIAL_AIRPRIME=m +CONFIG_USB_SERIAL_ANYDATA=m CONFIG_USB_SERIAL_BELKIN=m CONFIG_USB_SERIAL_WHITEHEAT=m CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m @@ -2593,7 +2621,7 @@ CONFIG_FS_POSIX_ACL=y CONFIG_XFS_FS=m CONFIG_XFS_EXPORT=y -CONFIG_XFS_QUOTA=m +# CONFIG_XFS_QUOTA is not set CONFIG_XFS_SECURITY=y CONFIG_XFS_POSIX_ACL=y CONFIG_XFS_RT=y @@ -2660,6 +2688,7 @@ CONFIG_JFFS2_FS=m CONFIG_JFFS2_FS_DEBUG=0 CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_SUMMARY is not set # CONFIG_JFFS2_COMPRESSION_OPTIONS is not set CONFIG_JFFS2_ZLIB=y CONFIG_JFFS2_RTIME=y @@ -2787,6 +2816,11 @@ CONFIG_NLS_UTF8=m # +# Instrumentation Support +# +# CONFIG_KPROBES is not set 
+ +# # Kernel hacking # # CONFIG_PRINTK_TIME is not set @@ -2803,10 +2837,11 @@ # CONFIG_DEBUG_BUGVERBOSE is not set # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_VM is not set # CONFIG_FRAME_POINTER is not set +# CONFIG_RCU_TORTURE_TEST is not set # CONFIG_EARLY_PRINTK is not set # CONFIG_DEBUG_STACKOVERFLOW is not set -# CONFIG_KPROBES is not set # CONFIG_DEBUG_STACK_USAGE is not set # CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_4KSTACKS is not set diff -r d609de73b9fa -r 5a63f675107c buildconfigs/linux-defconfig_xen_x86_64 --- a/buildconfigs/linux-defconfig_xen_x86_64 Wed Feb 1 17:06:16 2006 +++ b/buildconfigs/linux-defconfig_xen_x86_64 Wed Feb 1 18:00:19 2006 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.14-xen -# Tue Jan 31 18:19:07 2006 +# Linux kernel version: 2.6.15-xen +# Wed Feb 1 15:51:35 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -42,6 +42,7 @@ # CONFIG_IKCONFIG is not set # CONFIG_CPUSETS is not set CONFIG_INITRAMFS_SOURCE="" +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set @@ -70,6 +71,24 @@ CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_KMOD=y CONFIG_STOP_MACHINE=y + +# +# Block layer +# +CONFIG_LBD=y + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features @@ -92,7 +111,6 @@ # CONFIG_PREEMPT_VOLUNTARY is not set # CONFIG_PREEMPT is not set CONFIG_PREEMPT_BKL=y -# CONFIG_NUMA is not set CONFIG_ARCH_FLATMEM_ENABLE=y CONFIG_SELECT_MEMORY_MODEL=y CONFIG_FLATMEM_MANUAL=y @@ -101,6 +119,7 @@ CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +CONFIG_SPLIT_PTLOCK_CPUS=4096 CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_NR_CPUS=8 # CONFIG_HOTPLUG_CPU is not set @@ -259,6 +278,10 @@ CONFIG_NETFILTER=y # CONFIG_NETFILTER_DEBUG is not set CONFIG_BRIDGE_NETFILTER=y + +# +# Core Netfilter Configuration +# CONFIG_NETFILTER_NETLINK=m CONFIG_NETFILTER_NETLINK_QUEUE=m CONFIG_NETFILTER_NETLINK_LOG=m @@ -444,10 +467,18 @@ CONFIG_NET_DIVERT=y # CONFIG_ECONET is not set CONFIG_WAN_ROUTER=m + +# +# QoS and/or fair queueing +# CONFIG_NET_SCHED=y CONFIG_NET_SCH_CLK_JIFFIES=y # CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set # CONFIG_NET_SCH_CLK_CPU is not set + +# +# Queueing/Scheduling +# CONFIG_NET_SCH_CBQ=m CONFIG_NET_SCH_HTB=m CONFIG_NET_SCH_HFSC=m @@ -461,8 +492,10 @@ CONFIG_NET_SCH_DSMARK=m CONFIG_NET_SCH_NETEM=m CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_QOS=y -CONFIG_NET_ESTIMATOR=y + +# +# Classification +# CONFIG_NET_CLS=y CONFIG_NET_CLS_BASIC=m CONFIG_NET_CLS_TCINDEX=m @@ -471,7 +504,6 @@ CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m CONFIG_CLS_U32_PERF=y -CONFIG_NET_CLS_IND=y CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m @@ -484,6 +516,8 @@ CONFIG_NET_EMATCH_TEXT=m # CONFIG_NET_CLS_ACT is not set CONFIG_NET_CLS_POLICE=y +CONFIG_NET_CLS_IND=y +CONFIG_NET_ESTIMATOR=y # # Network testing @@ -570,7 +604,6 @@ CONFIG_BT_HCIUART=m CONFIG_BT_HCIUART_H4=y CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_BCSP_TXCRC=y CONFIG_BT_HCIBCM203X=m CONFIG_BT_HCIBPA10X=m CONFIG_BT_HCIBFUSB=m @@ -621,6 +654,7 @@ CONFIG_NFTL=m CONFIG_NFTL_RW=y CONFIG_INFTL=m +CONFIG_RFD_FTL=m # # RAM/ROM/Flash chip drivers @@ -701,6 +735,12 @@ CONFIG_MTD_NAND_IDS=m # CONFIG_MTD_NAND_DISKONCHIP is not set # 
CONFIG_MTD_NAND_NANDSIM is not set + +# +# OneNAND Flash Device Drivers +# +CONFIG_MTD_ONENAND=m +# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set # # Parallel port support @@ -767,18 +807,9 @@ CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_INITRD=y -CONFIG_LBD=y CONFIG_CDROM_PKTCDVD=m CONFIG_CDROM_PKTCDVD_BUFFERS=8 # CONFIG_CDROM_PKTCDVD_WCACHE is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y CONFIG_ATA_OVER_ETH=m # @@ -885,6 +916,7 @@ # # SCSI low-level drivers # +CONFIG_ISCSI_TCP=m CONFIG_BLK_DEV_3W_XXXX_RAID=m CONFIG_SCSI_3W_9XXX=m CONFIG_SCSI_ACARD=m @@ -914,10 +946,12 @@ CONFIG_SCSI_ATA_PIIX=y CONFIG_SCSI_SATA_MV=m CONFIG_SCSI_SATA_NV=m +CONFIG_SCSI_PDC_ADMA=m +CONFIG_SCSI_SATA_QSTOR=m CONFIG_SCSI_SATA_PROMISE=m -CONFIG_SCSI_SATA_QSTOR=m CONFIG_SCSI_SATA_SX4=m CONFIG_SCSI_SATA_SIL=m +CONFIG_SCSI_SATA_SIL24=m CONFIG_SCSI_SATA_SIS=m CONFIG_SCSI_SATA_ULI=m CONFIG_SCSI_SATA_VIA=m @@ -925,7 +959,6 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y CONFIG_SCSI_BUSLOGIC=m # CONFIG_SCSI_OMIT_FLASHPOINT is not set -# CONFIG_SCSI_CPQFCTS is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_EATA is not set # CONFIG_SCSI_EATA_PIO is not set @@ -944,10 +977,8 @@ CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 # CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set # CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_ISP is not set # CONFIG_SCSI_QLOGIC_FC is not set CONFIG_SCSI_QLOGIC_1280=m -CONFIG_SCSI_QLOGIC_1280_1040=y CONFIG_SCSI_QLA2XXX=y CONFIG_SCSI_QLA21XX=m CONFIG_SCSI_QLA22XX=m @@ -1055,7 +1086,6 @@ # PHY device support # CONFIG_PHYLIB=m -CONFIG_PHYCONTROL=y # # MII PHY device drivers @@ -1152,7 +1182,6 @@ CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y -# CONFIG_2BUFF_MODE is not set # # Token Ring devices @@ -1208,6 +1237,7 @@ # # ATM drivers # +CONFIG_ATM_DUMMY=m CONFIG_ATM_TCP=m CONFIG_ATM_LANAI=m CONFIG_ATM_ENI=m @@ -1239,6 +1269,7 @@ CONFIG_PPP_SYNC_TTY=m CONFIG_PPP_DEFLATE=m # CONFIG_PPP_BSDCOMP is not set +CONFIG_PPP_MPPE=m CONFIG_PPPOE=m CONFIG_PPPOATM=m CONFIG_SLIP=m @@ -1466,7 +1497,6 @@ # # Serial drivers # -# CONFIG_SERIAL_8250 is not set # # Non-8250 serial port support @@ -1558,6 +1588,7 @@ # TPM devices # # CONFIG_TCG_TPM is not set +CONFIG_TELCLOCK=m # # I2C support @@ -1611,6 +1642,7 @@ CONFIG_SENSORS_PCF8591=m CONFIG_SENSORS_RTC8564=m CONFIG_SENSORS_MAX6875=m +CONFIG_RTC_X1205_I2C=m # CONFIG_I2C_DEBUG_CORE is not set # CONFIG_I2C_DEBUG_ALGO is not set # CONFIG_I2C_DEBUG_BUS is not set @@ -1692,6 +1724,7 @@ # Video Adapters # CONFIG_VIDEO_BT848=m +# CONFIG_VIDEO_BT848_DVB is not set CONFIG_VIDEO_SAA6588=m CONFIG_VIDEO_BWQCAM=m CONFIG_VIDEO_CQCAM=m @@ -1711,14 +1744,20 @@ CONFIG_VIDEO_ZORAN_LML33R10=m # CONFIG_VIDEO_ZR36120 is not set CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_DVB_ALL_FRONTENDS=y CONFIG_VIDEO_MXB=m CONFIG_VIDEO_DPC=m CONFIG_VIDEO_HEXIUM_ORION=m CONFIG_VIDEO_HEXIUM_GEMINI=m CONFIG_VIDEO_CX88=m CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_DVB_ALL_FRONTENDS=y +CONFIG_VIDEO_EM28XX=m CONFIG_VIDEO_OVCAMCHIP=m +CONFIG_VIDEO_AUDIO_DECODER=m +CONFIG_VIDEO_DECODER=m # # Radio Adapters @@ -1831,6 +1870,7 @@ # ATSC (North American/Korean Terresterial DTV) frontends # CONFIG_DVB_NXT2002=m +CONFIG_DVB_NXT200X=m CONFIG_DVB_OR51211=m CONFIG_DVB_OR51132=m CONFIG_DVB_BCM3510=m @@ -1852,7 +1892,6 @@ CONFIG_FB_CFB_FILLRECT=y CONFIG_FB_CFB_COPYAREA=y CONFIG_FB_CFB_IMAGEBLIT=y -CONFIG_FB_SOFT_CURSOR=y # CONFIG_FB_MACMODES is not set 
CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y @@ -1866,6 +1905,7 @@ CONFIG_FB_VESA=y CONFIG_VIDEO_SELECT=y # CONFIG_FB_HGA is not set +# CONFIG_FB_S1D13XXX is not set # CONFIG_FB_NVIDIA is not set CONFIG_FB_RIVA=m # CONFIG_FB_RIVA_I2C is not set @@ -1901,7 +1941,6 @@ CONFIG_FB_TRIDENT_ACCEL=y # CONFIG_FB_PM3 is not set # CONFIG_FB_GEODE is not set -# CONFIG_FB_S1D13XXX is not set # CONFIG_FB_VIRTUAL is not set # @@ -1910,6 +1949,7 @@ CONFIG_VGA_CONSOLE=y CONFIG_DUMMY_CONSOLE=y CONFIG_FRAMEBUFFER_CONSOLE=y +# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set # CONFIG_FONTS is not set CONFIG_FONT_8x8=y CONFIG_FONT_8x16=y @@ -1936,6 +1976,8 @@ # Advanced Linux Sound Architecture # CONFIG_SND=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_AC97_BUS=m CONFIG_SND_TIMER=m CONFIG_SND_PCM=m CONFIG_SND_HWDEP=m @@ -1963,8 +2005,6 @@ CONFIG_SND_MTPAV=m # CONFIG_SND_SERIAL_U16550 is not set CONFIG_SND_MPU401=m -CONFIG_SND_AC97_CODEC=m -CONFIG_SND_AC97_BUS=m # # PCI devices @@ -2059,15 +2099,15 @@ # USB Device Class drivers # # CONFIG_OBSOLETE_OSS_USB_DRIVER is not set - -# -# USB Bluetooth TTY can only be used with disabled Bluetooth subsystem -# CONFIG_USB_ACM=m CONFIG_USB_PRINTER=m # -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' +# + +# +# may also be needed; see USB_STORAGE Help for more information # CONFIG_USB_STORAGE=m # CONFIG_USB_STORAGE_DEBUG is not set @@ -2079,7 +2119,6 @@ CONFIG_USB_STORAGE_SDDR09=y CONFIG_USB_STORAGE_SDDR55=y CONFIG_USB_STORAGE_JUMPSHOT=y -CONFIG_USB_STORAGE_ONETOUCH=y # # USB Input Devices @@ -2161,6 +2200,7 @@ CONFIG_USB_SERIAL=m CONFIG_USB_SERIAL_GENERIC=y CONFIG_USB_SERIAL_AIRPRIME=m +CONFIG_USB_SERIAL_ANYDATA=m CONFIG_USB_SERIAL_BELKIN=m CONFIG_USB_SERIAL_WHITEHEAT=m CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m @@ -2252,6 +2292,7 @@ # CONFIG_INFINIBAND_MTHCA_DEBUG is not set CONFIG_INFINIBAND_IPOIB=m # CONFIG_INFINIBAND_IPOIB_DEBUG is not set +CONFIG_INFINIBAND_SRP=m # # SN Devices @@ -2293,7 +2334,7 @@ CONFIG_FS_POSIX_ACL=y CONFIG_XFS_FS=m CONFIG_XFS_EXPORT=y -CONFIG_XFS_QUOTA=m +# CONFIG_XFS_QUOTA is not set CONFIG_XFS_SECURITY=y CONFIG_XFS_POSIX_ACL=y # CONFIG_XFS_RT is not set @@ -2355,6 +2396,7 @@ CONFIG_JFFS2_FS=m CONFIG_JFFS2_FS_DEBUG=0 CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_SUMMARY is not set # CONFIG_JFFS2_COMPRESSION_OPTIONS is not set CONFIG_JFFS2_ZLIB=y CONFIG_JFFS2_RTIME=y @@ -2474,9 +2516,10 @@ CONFIG_NLS_UTF8=m # -# Profiling support +# Instrumentation Support # # CONFIG_PROFILING is not set +# CONFIG_KPROBES is not set # # Kernel hacking @@ -2492,9 +2535,10 @@ # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_VM is not set # CONFIG_FRAME_POINTER is not set +# CONFIG_RCU_TORTURE_TEST is not set # CONFIG_INIT_DEBUG is not set -# CONFIG_KPROBES is not set # # Security options diff -r d609de73b9fa -r 5a63f675107c buildconfigs/mk.linux-2.6-xen --- a/buildconfigs/mk.linux-2.6-xen Wed Feb 1 17:06:16 2006 +++ b/buildconfigs/mk.linux-2.6-xen Wed Feb 1 18:00:19 2006 @@ -2,7 +2,7 @@ OS = linux LINUX_SERIES = 2.6 -LINUX_VER = 2.6.14 +LINUX_VER = 2.6.15 EXTRAVERSION ?= xen diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/Kconfig --- a/linux-2.6-xen-sparse/arch/i386/Kconfig Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/Kconfig Wed Feb 1 18:00:19 2006 @@ -5,7 +5,7 @@ mainmenu "Linux Kernel Configuration" -config X86 +config X86_32 bool default 
y help @@ -15,6 +15,10 @@ AMD, Cyrix, and others. config SEMAPHORE_SLEEPERS + bool + default y + +config X86 bool default y @@ -160,304 +164,7 @@ default y depends on SMP && X86_ES7000 && MPENTIUMIII -if !X86_ELAN - -choice - prompt "Processor family" - default M686 - -config M386 - bool "386" - ---help--- - This is the processor type of your CPU. This information is used for - optimizing purposes. In order to compile a kernel that can run on - all x86 CPU types (albeit not optimally fast), you can specify - "386" here. - - The kernel will not necessarily run on earlier architectures than - the one you have chosen, e.g. a Pentium optimized kernel will run on - a PPro, but not necessarily on a i486. - - Here are the settings recommended for greatest speed: - - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI - 486DLC/DLC2, UMC 486SX-S and NexGen Nx586. Only "386" kernels - will run on a 386 class machine. - - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or - SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. - - "586" for generic Pentium CPUs lacking the TSC - (time stamp counter) register. - - "Pentium-Classic" for the Intel Pentium. - - "Pentium-MMX" for the Intel Pentium MMX. - - "Pentium-Pro" for the Intel Pentium Pro. - - "Pentium-II" for the Intel Pentium II or pre-Coppermine Celeron. - - "Pentium-III" for the Intel Pentium III or Coppermine Celeron. - - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. - - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). - - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). - - "Crusoe" for the Transmeta Crusoe series. - - "Efficeon" for the Transmeta Efficeon series. - - "Winchip-C6" for original IDT Winchip. - - "Winchip-2" for IDT Winchip 2. - - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. - - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). - - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - - "VIA C3-2 for VIA C3-2 "Nehemiah" (model 9 and above). - - If you don't know what to do, choose "386". - -config M486 - bool "486" - help - Select this for a 486 series processor, either Intel or one of the - compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, - DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or - U5S. - -config M586 - bool "586/K5/5x86/6x86/6x86MX" - help - Select this for an 586 or 686 series processor such as the AMD K5, - the Cyrix 5x86, 6x86 and 6x86MX. This choice does not - assume the RDTSC (Read Time Stamp Counter) instruction. - -config M586TSC - bool "Pentium-Classic" - help - Select this for a Pentium Classic processor with the RDTSC (Read - Time Stamp Counter) instruction for benchmarking. - -config M586MMX - bool "Pentium-MMX" - help - Select this for a Pentium with the MMX graphics/multimedia - extended instructions. - -config M686 - bool "Pentium-Pro" - help - Select this for Intel Pentium Pro chips. This enables the use of - Pentium Pro extended instructions, and disables the init-time guard - against the f00f bug found in earlier Pentiums. - -config MPENTIUMII - bool "Pentium-II/Celeron(pre-Coppermine)" - help - Select this for Intel chips based on the Pentium-II and - pre-Coppermine Celeron core. This option enables an unaligned - copy optimization, compiles the kernel with optimization flags - tailored for the chip, and applies any applicable Pentium Pro - optimizations. - -config MPENTIUMIII - bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon" - help - Select this for Intel chips based on the Pentium-III and - Celeron-Coppermine core. 
This option enables use of some - extended prefetch instructions in addition to the Pentium II - extensions. - -config MPENTIUMM - bool "Pentium M" - help - Select this for Intel Pentium M (not Pentium-4 M) - notebook chips. - -config MPENTIUM4 - bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon" - help - Select this for Intel Pentium 4 chips. This includes the - Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M - (not Pentium M) chips. This option enables compile flags - optimized for the chip, uses the correct cache shift, and - applies any applicable Pentium III optimizations. - -config MK6 - bool "K6/K6-II/K6-III" - help - Select this for an AMD K6-family processor. Enables use of - some extended instructions, and passes appropriate optimization - flags to GCC. - -config MK7 - bool "Athlon/Duron/K7" - help - Select this for an AMD Athlon K7-family processor. Enables use of - some extended instructions, and passes appropriate optimization - flags to GCC. - -config MK8 - bool "Opteron/Athlon64/Hammer/K8" - help - Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables - use of some extended instructions, and passes appropriate optimization - flags to GCC. - -config MCRUSOE - bool "Crusoe" - help - Select this for a Transmeta Crusoe processor. Treats the processor - like a 586 with TSC, and sets some GCC optimization flags (like a - Pentium Pro with no alignment requirements). - -config MEFFICEON - bool "Efficeon" - help - Select this for a Transmeta Efficeon processor. - -config MWINCHIPC6 - bool "Winchip-C6" - help - Select this for an IDT Winchip C6 chip. Linux and GCC - treat this chip as a 586TSC with some extended instructions - and alignment requirements. - -config MWINCHIP2 - bool "Winchip-2" - help - Select this for an IDT Winchip-2. Linux and GCC - treat this chip as a 586TSC with some extended instructions - and alignment requirements. - -config MWINCHIP3D - bool "Winchip-2A/Winchip-3" - help - Select this for an IDT Winchip-2A or 3. Linux and GCC - treat this chip as a 586TSC with some extended instructions - and alignment reqirements. Also enable out of order memory - stores for this CPU, which can increase performance of some - operations. - -config MGEODEGX1 - bool "GeodeGX1" - help - Select this for a Geode GX1 (Cyrix MediaGX) chip. - -config MCYRIXIII - bool "CyrixIII/VIA-C3" - help - Select this for a Cyrix III or C3 chip. Presently Linux and GCC - treat this chip as a generic 586. Whilst the CPU is 686 class, - it lacks the cmov extension which gcc assumes is present when - generating 686 code. - Note that Nehemiah (Model 9) and above will not boot with this - kernel due to them lacking the 3DNow! instructions used in earlier - incarnations of the CPU. - -config MVIAC3_2 - bool "VIA C3-2 (Nehemiah)" - help - Select this for a VIA C3 "Nehemiah". Selecting this enables usage - of SSE and tells gcc to treat the CPU as a 686. - Note, this kernel will not boot on older (pre model 9) C3s. - -endchoice - -config X86_GENERIC - bool "Generic x86 support" - help - Instead of just including optimizations for the selected - x86 variant (e.g. PII, Crusoe or Athlon), include some more - generic optimizations as well. This will make the kernel - perform better on x86 CPUs other than that selected. - - This is really intended for distributors who need more - generic optimizations. 
- -endif - -# -# Define implied options from the CPU selection here -# -config X86_CMPXCHG - bool - depends on !M386 - default y - -config X86_XADD - bool - depends on !M386 - default y - -config X86_L1_CACHE_SHIFT - int - default "7" if MPENTIUM4 || X86_GENERIC - default "4" if X86_ELAN || M486 || M386 - default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODEGX1 - default "6" if MK7 || MK8 || MPENTIUMM - -config RWSEM_GENERIC_SPINLOCK - bool - depends on M386 - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - -config GENERIC_CALIBRATE_DELAY - bool - default y - -config X86_PPRO_FENCE - bool - depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 - default y - -config X86_F00F_BUG - bool - depends on M586MMX || M586TSC || M586 || M486 || M386 - default y - -config X86_WP_WORKS_OK - bool - depends on !M386 - default y - -config X86_INVLPG - bool - depends on !M386 - default y - -config X86_BSWAP - bool - depends on !M386 - default y - -config X86_POPAD_OK - bool - depends on !M386 - default y - -config X86_ALIGNMENT_16 - bool - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 - default y - -config X86_GOOD_APIC - bool - depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON - default y - -config X86_INTEL_USERCOPY - bool - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON - default y - -config X86_USE_PPRO_CHECKSUM - bool - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON - default y - -config X86_USE_3DNOW - bool - depends on MCYRIXIII || MK7 - default y - -config X86_OOSTORE - bool - depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR - default y +source "arch/i386/Kconfig.cpu" config HPET_TIMER bool "HPET Timer Support" @@ -1041,7 +748,7 @@ config APM tristate "APM (Advanced Power Management) BIOS support" - depends on PM + depends on PM && PM_LEGACY ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -1333,9 +1040,22 @@ source "fs/Kconfig" +menu "Instrumentation Support" + depends on EXPERIMENTAL + if !X86_XEN source "arch/i386/oprofile/Kconfig" endif + +config KPROBES + bool "Kprobes (EXPERIMENTAL)" + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. register_kprobe() establishes + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". 
+endmenu source "arch/i386/Kconfig.debug" @@ -1382,8 +1102,3 @@ bool depends on X86_SMP || (X86_VOYAGER && SMP) default y - -config PC - bool - depends on X86 && !EMBEDDED - default y diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/Makefile --- a/linux-2.6-xen-sparse/arch/i386/Makefile Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/Makefile Wed Feb 1 18:00:19 2006 @@ -34,35 +34,8 @@ # prevent gcc from keeping the stack 16 byte aligned CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) -align := $(cc-option-align) -cflags-$(CONFIG_M386) += -march=i386 -cflags-$(CONFIG_M486) += -march=i486 -cflags-$(CONFIG_M586) += -march=i586 -cflags-$(CONFIG_M586TSC) += -march=i586 -cflags-$(CONFIG_M586MMX) += $(call cc-option,-march=pentium-mmx,-march=i586) -cflags-$(CONFIG_M686) += -march=i686 -cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call cc-option,-mtune=pentium2) -cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call cc-option,-mtune=pentium3) -cflags-$(CONFIG_MPENTIUMM) += -march=i686 $(call cc-option,-mtune=pentium3) -cflags-$(CONFIG_MPENTIUM4) += -march=i686 $(call cc-option,-mtune=pentium4) -cflags-$(CONFIG_MK6) += -march=k6 -# Please note, that patches that add -march=athlon-xp and friends are pointless. -# They make zero difference whatsosever to performance at this time. -cflags-$(CONFIG_MK7) += $(call cc-option,-march=athlon,-march=i686 $(align)-functions=4) -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,$(call cc-option,-march=athlon,-march=i686 $(align)-functions=4)) -cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 -cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call cc-option,-mtune=pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 -cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) -cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) -cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) -cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 -cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) - -# AMD Elan support -cflags-$(CONFIG_X86_ELAN) += -march=i486 - -# Geode GX1 support -cflags-$(CONFIG_MGEODEGX1) += $(call cc-option,-march=pentium-mmx,-march=i486) +# CPU-specific tuning. Anything which can be shared with UML should go here. 
+include $(srctree)/arch/i386/Makefile.cpu # -mregparm=3 works ok on gcc-3.0 and later # diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/Makefile --- a/linux-2.6-xen-sparse/arch/i386/kernel/Makefile Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/Makefile Wed Feb 1 18:00:19 2006 @@ -86,6 +86,7 @@ include $(srctree)/scripts/Makefile.xen obj-y += fixup.o +microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o n-obj-xen := i8259.o doublefault.o timers/ reboot.o smpboot.o trampoline.o obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot-xen.c Wed Feb 1 18:00:19 2006 @@ -36,22 +36,16 @@ #include <asm/apic.h> #include <asm/io.h> #include <asm/mpspec.h> -#ifdef CONFIG_XEN -#include <asm/fixmap.h> -#endif #ifdef CONFIG_X86_64 -static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ -} extern void __init clustered_apic_check(void); -static inline int ioapic_setup_disabled(void) -{ - return 0; -} - + +extern int gsi_irq_sharing(int gsi); #include <asm/proto.h> + +static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; } + #else /* X86 */ @@ -59,6 +53,8 @@ #include <mach_apic.h> #include <mach_mpparse.h> #endif /* CONFIG_X86_LOCAL_APIC */ + +static inline int gsi_irq_sharing(int gsi) { return gsi; } #endif /* X86 */ @@ -138,7 +134,7 @@ int idx; #ifndef CONFIG_XEN - if (phys + size < 8 * 1024 * 1024) + if (phys + size < 8 * 1024 * 1024) return __va(phys); #endif @@ -254,9 +250,7 @@ acpi_table_print_madt_entry(header); - /* no utility in registering a disabled processor */ - if (processor->flags.enabled == 0) - return 0; + /* Register even disabled CPUs for cpu hotplug */ x86_acpiid_to_apicid[processor->acpi_id] = processor->id; @@ -464,7 +458,7 @@ *irq = IO_APIC_VECTOR(gsi); else #endif - *irq = gsi; + *irq = gsi_irq_sharing(gsi); return 0; } @@ -538,7 +532,7 @@ EXPORT_SYMBOL(acpi_unregister_ioapic); static unsigned long __init -acpi_scan_rsdp (unsigned long start, unsigned long length) +acpi_scan_rsdp(unsigned long start, unsigned long length) { unsigned long offset = 0; unsigned long sig_len = sizeof("RSD PTR ") - 1; @@ -647,6 +641,13 @@ return 0; pmtmr_ioport = fadt->xpm_tmr_blk.address; + /* + * "X" fields are optional extensions to the original V1.0 + * fields, so we must selectively expand V1.0 fields if the + * corresponding X field is zero. + */ + if (!pmtmr_ioport) + pmtmr_ioport = fadt->V1_pm_tmr_blk; } else { /* FADT rev. 
1 */ pmtmr_ioport = fadt->V1_pm_tmr_blk; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/cpu/common-xen.c Wed Feb 1 18:00:19 2006 @@ -33,8 +33,6 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; -extern void mcheck_init(struct cpuinfo_x86 *c); - extern void machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c); extern int disable_pse; @@ -238,10 +236,10 @@ cpuid(0x00000001, &tfms, &misc, &junk, &cap0); c->x86 = (tfms >> 8) & 15; c->x86_model = (tfms >> 4) & 15; - if (c->x86 == 0xf) { + if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - } c->x86_mask = tfms & 15; if (cap0 & (1<<19)) c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; @@ -340,7 +338,7 @@ c->x86_model = c->x86_mask = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_num_cores = 1; + c->x86_max_cores = 1; memset(&c->x86_capability, 0, sizeof c->x86_capability); if (!have_cpuid_p()) { @@ -436,9 +434,8 @@ } /* Init Machine Check Exception if available. */ -#ifdef CONFIG_X86_MCE mcheck_init(c); -#endif + if (c == &boot_cpu_data) sysenter_setup(); enable_sep_cpu(); @@ -453,52 +450,44 @@ void __devinit detect_ht(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - int index_msb, tmp; + int index_msb, core_bits; int cpu = smp_processor_id(); + + cpuid(1, &eax, &ebx, &ecx, &edx); + + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; - cpuid(1, &eax, &ebx, &ecx, &edx); smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); } else if (smp_num_siblings > 1 ) { - index_msb = 31; if (smp_num_siblings > NR_CPUS) { printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } - tmp = smp_num_siblings; - while ((tmp & 0x80000000 ) == 0) { - tmp <<=1 ; - index_msb--; - } - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; + + index_msb = get_count_order(smp_num_siblings); phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); printk(KERN_INFO "CPU: Physical Processor ID: %d\n", phys_proc_id[cpu]); - smp_num_siblings = smp_num_siblings / c->x86_num_cores; - - tmp = smp_num_siblings; - index_msb = 31; - while ((tmp & 0x80000000) == 0) { - tmp <<=1 ; - index_msb--; - } - - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; - - cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); - - if (c->x86_num_cores > 1) + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings) ; + + core_bits = get_count_order(c->x86_max_cores); + + cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & + ((1 << core_bits) - 1); + + if (c->x86_max_cores > 1) printk(KERN_INFO "CPU: Processor Core ID: %d\n", cpu_core_id[cpu]); } @@ -615,12 +604,6 @@ set_in_cr4(X86_CR4_TSD); } - /* - * Set up the per-thread TLS descriptor cache: - */ - memcpy(thread->tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN], - GDT_ENTRY_TLS_ENTRIES * 8); - cpu_gdt_init(&cpu_gdt_descr[cpu]); /* diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/entry-xen.S --- a/linux-2.6-xen-sparse/arch/i386/kernel/entry-xen.S Wed Feb 1 17:06:16 2006 +++ 
b/linux-2.6-xen-sparse/arch/i386/kernel/entry-xen.S Wed Feb 1 18:00:19 2006 @@ -746,11 +746,10 @@ nmi_debug_stack_check: cmpw $__KERNEL_CS,16(%esp) jne nmi_stack_correct - cmpl $debug - 1,(%esp) - jle nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct cmpl $debug_esp_fix_insn,(%esp) - jle nmi_debug_stack_fixup -nmi_debug_stack_fixup: + ja nmi_stack_correct FIX_STACK(24,nmi_stack_correct, 1) jmp nmi_stack_correct diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/io_apic-xen.c Wed Feb 1 18:00:19 2006 @@ -88,6 +88,9 @@ int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + static DEFINE_SPINLOCK(ioapic_lock); /* @@ -784,10 +787,11 @@ } #ifndef CONFIG_XEN +#endif /* * Find the pin to which IRQ[irq] (ISA) is connected */ -static int find_isa_irq_pin(int irq, int type) +static int __init find_isa_irq_pin(int irq, int type) { int i; @@ -806,7 +810,33 @@ } return -1; } -#endif + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA || + mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + ) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + break; + } + if (i < mp_irq_entries) { + int apic; + for(apic = 0; apic < nr_ioapics; apic++) { + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + return apic; + } + } + + return -1; +} /* * Find a specific PCI IRQ entry. 
@@ -1306,7 +1336,7 @@ * Set up the 8259A-master output pin: */ #ifndef CONFIG_XEN -static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; unsigned long flags; @@ -1340,8 +1370,8 @@ * Add it to the IO-APIC irq-routing table: */ spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); spin_unlock_irqrestore(&ioapic_lock, flags); enable_8259A_irq(0); @@ -1647,7 +1677,8 @@ static void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; - int i; + int i8259_apic, i8259_pin; + int i, apic; unsigned long flags; for (i = 0; i < PIN_MAP_SIZE; i++) { @@ -1661,11 +1692,52 @@ /* * The number of IO-APIC IRQ registers (== #pins): */ - for (i = 0; i < nr_ioapics; i++) { + for (apic = 0; apic < nr_ioapics; apic++) { spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(i, 1); + reg_01.raw = io_apic_read(apic, 1); spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[i] = reg_01.bits.entries+1; + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for(pin = 0; pin < nr_ioapic_registers[i]; pin++) { + struct IO_APIC_route_entry entry; + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); } /* @@ -1679,9 +1751,6 @@ */ void disable_IO_APIC(void) { -#ifndef CONFIG_XEN - int pin; -#endif /* * Clear the IO-APIC before rebooting: */ @@ -1693,8 +1762,7 @@ * Put that IOAPIC in virtual wire mode * so legacy interrupts can be delivered. 
*/ - pin = find_isa_irq_pin(0, mp_ExtINT); - if (pin != -1) { + if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; unsigned long flags; @@ -1705,7 +1773,7 @@ entry.polarity = 0; /* High */ entry.delivery_status = 0; entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = 7; /* ExtInt */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; entry.dest.physical.physical_dest = 0; @@ -1714,11 +1782,13 @@ * Add it to the IO-APIC irq-routing table: */ spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, + *(((int *)&entry)+1)); + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, + *(((int *)&entry)+0)); spin_unlock_irqrestore(&ioapic_lock, flags); } - disconnect_bsp_APIC(pin != -1); + disconnect_bsp_APIC(ioapic_i8259.pin != -1); #endif } @@ -1994,7 +2064,7 @@ { int irq = vector_to_irq(vector); - move_irq(vector); + move_native_irq(vector); ack_edge_ioapic_irq(irq); } @@ -2009,7 +2079,7 @@ { int irq = vector_to_irq(vector); - move_irq(vector); + move_native_irq(vector); end_level_ioapic_irq(irq); } @@ -2174,20 +2244,21 @@ */ static inline void unlock_ExtINT_logic(void) { - int pin, i; + int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; unsigned long flags; - pin = find_isa_irq_pin(8, mp_INT); + pin = find_isa_irq_pin(8, mp_INT); + apic = find_isa_irq_apic(8, mp_INT); if (pin == -1) return; spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); spin_unlock_irqrestore(&ioapic_lock, flags); - clear_IO_APIC_pin(0, pin); + clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -2200,8 +2271,8 @@ entry1.vector = 0; spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); spin_unlock_irqrestore(&ioapic_lock, flags); save_control = CMOS_READ(RTC_CONTROL); @@ -2219,11 +2290,11 @@ CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(0, pin); + clear_IO_APIC_pin(apic, pin); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -2235,7 +2306,7 @@ */ static inline void check_timer(void) { - int pin1, pin2; + int apic1, pin1, apic2, pin2; int vector; /* @@ -2257,10 +2328,13 @@ timer_ack = 1; enable_8259A_irq(0); - pin1 = find_isa_irq_pin(0, mp_INT); - pin2 = find_isa_irq_pin(0, mp_ExtINT); - - printk(KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", + vector, apic1, pin1, apic2, pin2); if (pin1 != -1) { /* @@ -2277,8 
+2351,9 @@ clear_IO_APIC_pin(0, pin1); return; } - clear_IO_APIC_pin(0, pin1); - printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); + clear_IO_APIC_pin(apic1, pin1); + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " + "IO-APIC\n"); } printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); @@ -2287,13 +2362,13 @@ /* * legacy devices should be connected to IO APIC #0 */ - setup_ExtINT_IRQ0_pin(pin2, vector); + setup_ExtINT_IRQ0_pin(apic2, pin2, vector); if (timer_irq_works()) { printk("works.\n"); if (pin1 != -1) - replace_pin_at_irq(0, 0, pin1, 0, pin2); + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); else - add_pin_to_irq(0, 0, pin2); + add_pin_to_irq(0, apic2, pin2); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } @@ -2302,7 +2377,7 @@ /* * Cleanup, just in case ... */ - clear_IO_APIC_pin(0, pin2); + clear_IO_APIC_pin(apic2, pin2); } printk(" failed.\n"); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/irq-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/irq-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/irq-xen.c Wed Feb 1 18:00:19 2006 @@ -218,7 +218,7 @@ if (i == 0) { seq_printf(p, " "); - for_each_cpu(j) + for_each_online_cpu(j) seq_printf(p, "CPU%d ",j); seq_putc(p, '\n'); } @@ -232,7 +232,7 @@ #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else - for_each_cpu(j) + for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif seq_printf(p, " %14s", irq_desc[i].handler->typename); @@ -246,12 +246,12 @@ spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); - for_each_cpu(j) + for_each_online_cpu(j) seq_printf(p, "%10u ", nmi_count(j)); seq_putc(p, '\n'); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); - for_each_cpu(j) + for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).apic_timer_irqs); seq_putc(p, '\n'); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/ldt-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/ldt-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/ldt-xen.c Wed Feb 1 18:00:19 2006 @@ -18,6 +18,7 @@ #include <asm/system.h> #include <asm/ldt.h> #include <asm/desc.h> +#include <asm/mmu_context.h> #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/mpparse-xen.c Wed Feb 1 18:00:19 2006 @@ -69,7 +69,7 @@ /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -static unsigned int __initdata num_processors; +static unsigned int __devinitdata num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; @@ -120,7 +120,7 @@ #endif #ifndef CONFIG_XEN -static void __init MP_processor_info (struct mpc_config_processor *m) +static void __devinit MP_processor_info (struct mpc_config_processor *m) { int ver, apicid; physid_mask_t phys_cpu; @@ -183,17 +183,6 @@ boot_cpu_physical_apicid = m->mpc_apicid; } - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." 
- " Processor ignored.\n", maxcpus); - return; - } ver = m->mpc_apicver; if (!MP_valid_apicid(apicid, ver)) { @@ -201,11 +190,6 @@ m->mpc_apicid, MAX_APICS); return; } - - cpu_set(num_processors, cpu_possible_map); - num_processors++; - phys_cpu = apicid_to_cpu_present(apicid); - physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); /* * Validate version @@ -217,9 +201,29 @@ ver = 0x10; } apic_version[m->mpc_apicid] = ver; + + phys_cpu = apicid_to_cpu_present(apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); + + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + if (num_processors >= maxcpus) { + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." + " Processor ignored.\n", maxcpus); + return; + } + + cpu_set(num_processors, cpu_possible_map); + num_processors++; + if ((num_processors > 8) && - APIC_XAPIC(ver) && - (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) + ((APIC_XAPIC(ver) && + (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)) || + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))) def_to_bigsmp = 1; else def_to_bigsmp = 0; @@ -850,7 +854,7 @@ } -void __init mp_register_lapic ( +void __devinit mp_register_lapic ( u8 id, u8 enabled) { diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/process-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/process-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/process-xen.c Wed Feb 1 18:00:19 2006 @@ -108,15 +108,31 @@ if (need_resched()) { local_irq_enable(); } else { + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); stop_hz_timer(); /* Blocking includes an implicit local_irq_enable(). */ HYPERVISOR_sched_op(SCHEDOP_block, 0); start_hz_timer(); + set_thread_flag(TIF_POLLING_NRFLAG); } } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(default_idle); #endif + +#ifdef CONFIG_HOTPLUG_CPU +static inline void play_dead(void) +{ + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + local_irq_enable(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ /* * The idle thread. 
There's no useful work to be @@ -126,9 +142,9 @@ */ void cpu_idle(void) { -#if defined(CONFIG_HOTPLUG_CPU) - int cpu = raw_smp_processor_id(); -#endif + int cpu = smp_processor_id(); + + set_thread_flag(TIF_POLLING_NRFLAG); /* endless idle loop with no priority at all */ while (1) { @@ -139,17 +155,15 @@ rmb(); -#if defined(CONFIG_HOTPLUG_CPU) - if (cpu_is_offline(cpu)) { - HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); - local_irq_enable(); - } -#endif + if (cpu_is_offline(cpu)) + play_dead(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; xen_idle(); } + preempt_enable_no_resched(); schedule(); + preempt_disable(); } } @@ -187,6 +201,8 @@ void show_regs(struct pt_regs * regs) { + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + printk("\n"); printk("Pid: %d, comm: %20s\n", current->pid, current->comm); printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); @@ -203,6 +219,13 @@ printk(" DS: %04x ES: %04x\n", 0xffff & regs->xds,0xffff & regs->xes); + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + if (current_cpu_data.x86 > 4) { + cr4 = read_cr4(); + } + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); show_trace(NULL, ®s->esp); } @@ -274,13 +297,6 @@ void flush_thread(void) { struct task_struct *tsk = current; - - /* - * Remove function-return probe instances associated with this task - * and put them back on the free list. Do not insert an exit probe for - * this function, it will be disabled by kprobe_flush_task if you do. - */ - kprobe_flush_task(tsk); memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); @@ -445,7 +461,9 @@ struct pt_regs ptregs; ptregs = *(struct pt_regs *) - ((unsigned long)tsk->thread_info+THREAD_SIZE - sizeof(ptregs)); + ((unsigned long)tsk->thread_info + + /* see comments in copy_thread() about -8 */ + THREAD_SIZE - sizeof(ptregs) - 8); ptregs.xcs &= 0xffff; ptregs.xds &= 0xffff; ptregs.xes &= 0xffff; @@ -453,7 +471,6 @@ elf_core_copy_regs(regs, &ptregs); - boot_option_idle_override = 1; return 1; } diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/setup-xen.c Wed Feb 1 18:00:19 2006 @@ -142,9 +142,7 @@ EXPORT_SYMBOL(drive_info); #endif struct screen_info screen_info; -#ifdef CONFIG_VT EXPORT_SYMBOL(screen_info); -#endif struct apm_info apm_info; EXPORT_SYMBOL(apm_info); struct sys_desc_table_struct { @@ -425,14 +423,24 @@ } } for (i = 0; i < e820.nr_map; i++) { - if (e820.map[i].type == E820_RAM) { - current_addr = e820.map[i].addr + e820.map[i].size; - if (current_addr >= size) { - e820.map[i].size -= current_addr-size; - e820.nr_map = i + 1; - return; - } - } + current_addr = e820.map[i].addr + e820.map[i].size; + if (current_addr < size) + continue; + + if (e820.map[i].type != E820_RAM) + continue; + + if (e820.map[i].addr >= size) { + /* + * This region starts past the end of the + * requested size, skip it completely. 
+ */ + e820.nr_map = i; + } else { + e820.nr_map = i + 1; + e820.map[i].size -= current_addr - size; + } + return; } } diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c Wed Feb 1 18:00:19 2006 @@ -68,15 +68,15 @@ /* Package ID of each logical CPU */ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; -EXPORT_SYMBOL(phys_proc_id); /* Core ID of each logical CPU */ int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; -EXPORT_SYMBOL(cpu_core_id); - + +/* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_sibling_map); +/* representing HT and core siblings of each logical CPU */ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); @@ -87,7 +87,11 @@ cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; EXPORT_SYMBOL(cpu_callout_map); +#ifdef CONFIG_HOTPLUG_CPU +cpumask_t cpu_possible_map = CPU_MASK_ALL; +#else cpumask_t cpu_possible_map; +#endif EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; @@ -440,35 +444,60 @@ static int cpucount; +/* representing cpus for which sibling maps can be computed */ +static cpumask_t cpu_sibling_setup_map; + static inline void set_cpu_sibling_map(int cpu) { int i; + struct cpuinfo_x86 *c = cpu_data; + + cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (cpu_core_id[cpu] == cpu_core_id[i]) { + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (phys_proc_id[cpu] == phys_proc_id[i] && + cpu_core_id[cpu] == cpu_core_id[i]) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); - } - } - } else { - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (current_cpu_data.x86_num_cores > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (phys_proc_id[cpu] == phys_proc_id[i]) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); } } } else { + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; + c[cpu].booted_cores = 1; + return; + } + + for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + /* + * Does this new cpu bringup a new core? + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (first_cpu(cpu_sibling_map[i]) == i) + c[cpu].booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + c[i].booted_cores++; + } else if (i != cpu && !c[cpu].booted_cores) + c[cpu].booted_cores = c[i].booted_cores; + } } } @@ -483,6 +512,7 @@ * things done here to the most necessary things. */ cpu_init(); + preempt_disable(); smp_callin(); while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) rep_nop(); @@ -608,7 +638,7 @@ printk("Inquiring remote APIC #%d...\n", apicid); - for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { + for (i = 0; i < ARRAY_SIZE(regs); i++) { printk("... 
APIC #%d %s: ", apicid, names[i]); /* @@ -1092,11 +1122,8 @@ current_thread_info()->cpu = 0; smp_tune_scheduling(); - cpus_clear(cpu_sibling_map[0]); - cpu_set(0, cpu_sibling_map[0]); - - cpus_clear(cpu_core_map[0]); - cpu_set(0, cpu_core_map[0]); + + set_cpu_sibling_map(0); /* * If we couldn't find an SMP configuration at boot time, @@ -1280,15 +1307,24 @@ remove_siblinginfo(int cpu) { int sibling; - + struct cpuinfo_x86 *c = cpu_data; + + for_each_cpu_mask(sibling, cpu_core_map[cpu]) { + cpu_clear(cpu, cpu_core_map[sibling]); + /* + * last thread sibling in this cpu core going down + */ + if (cpus_weight(cpu_sibling_map[cpu]) == 1) + c[sibling].booted_cores--; + } + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) cpu_clear(cpu, cpu_sibling_map[sibling]); - for_each_cpu_mask(sibling, cpu_core_map[cpu]) - cpu_clear(cpu, cpu_core_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); phys_proc_id[cpu] = BAD_APICID; cpu_core_id[cpu] = BAD_APICID; + cpu_clear(cpu, cpu_sibling_setup_map); } int __cpu_disable(void) @@ -1307,8 +1343,7 @@ if (cpu == 0) return -EBUSY; - /* We enable the timer again on the exit path of the death loop */ - disable_APIC_timer(); + clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ local_irq_enable(); mdelay(1); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/time-xen.c Wed Feb 1 18:00:19 2006 @@ -69,8 +69,6 @@ #include <asm/arch_hooks.h> -#include "io_ports.h" - #include <xen/evtchn.h> #if defined (__i386__) @@ -78,10 +76,6 @@ #endif int pit_latch_buggy; /* extern */ - -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); #if defined(__x86_64__) unsigned long vxtime_hz = PIT_TICK_RATE; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/traps-xen.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/traps-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/traps-xen.c Wed Feb 1 18:00:19 2006 @@ -626,13 +626,6 @@ nmi_enter(); cpu = smp_processor_id(); - -#ifdef CONFIG_HOTPLUG_CPU - if (!cpu_online(cpu)) { - nmi_exit(); - return; - } -#endif ++nmi_count(cpu); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/traps.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/traps.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/kernel/traps.c Wed Feb 1 18:00:19 2006 @@ -488,6 +488,7 @@ tss->io_bitmap_max - thread->io_bitmap_max); tss->io_bitmap_max = thread->io_bitmap_max; tss->io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; put_cpu(); return; } @@ -641,13 +642,6 @@ nmi_enter(); cpu = smp_processor_id(); - -#ifdef CONFIG_HOTPLUG_CPU - if (!cpu_online(cpu)) { - nmi_exit(); - return; - } -#endif ++nmi_count(cpu); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile --- a/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/mach-xen/Makefile Wed Feb 1 18:00:19 2006 @@ -3,5 +3,5 @@ # obj-y := setup.o topology.o - + topology-y := ../mach-default/topology.o diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c --- a/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c Wed Feb 1 18:00:19 2006 @@ -22,7 +22,6 @@ #include <linux/highmem.h> #include <linux/module.h> #include <linux/kprobes.h> 
-#include <linux/percpu.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -30,8 +29,6 @@ #include <asm/kdebug.h> extern void die(const char *,struct pt_regs *,long); - -DEFINE_PER_CPU(pgd_t *, cur_pgd); /* * Unlock any spinlocks which will prevent us from getting the @@ -111,7 +108,7 @@ desc = (void *)desc + (seg & ~7); } else { /* Must disable preemption while reading the GDT. */ - desc = (u32 *)get_cpu_gdt_table(get_cpu()); + desc = (u32 *)get_cpu_gdt_table(get_cpu()); desc = (void *)desc + (seg & ~7); } @@ -223,10 +220,7 @@ unsigned long *p, page; unsigned long mfn; - preempt_disable(); - page = __pa(per_cpu(cur_pgd, smp_processor_id())); - preempt_enable(); - + page = read_cr3(); p = (unsigned long *)__va(page); p += (address >> 30) * 2; printk(KERN_ALERT "%08lx -> *pde = %08lx:%08lx\n", page, p[1], p[0]); @@ -256,13 +250,8 @@ { unsigned long page; - preempt_disable(); - page = ((unsigned long *) per_cpu(cur_pgd, smp_processor_id())) - [address >> 22]; - preempt_enable(); - - page = ((unsigned long *) per_cpu(cur_pgd, get_cpu())) - [address >> 22]; + page = read_cr3(); + page = ((unsigned long *) __va(page))[address >> 22]; printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, machine_to_phys(page)); /* @@ -304,8 +293,8 @@ unsigned long address; int write, si_code; - address = HYPERVISOR_shared_info->vcpu_info[ - smp_processor_id()].arch.cr2; + /* get the address */ + address = read_cr2(); /* Set the "privileged fault" bit to something sane. */ error_code &= ~4; @@ -582,14 +571,14 @@ * an interrupt in the middle of a task switch.. */ int index = pgd_index(address); + unsigned long pgd_paddr; pgd_t *pgd, *pgd_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; pte_t *pte_k; - preempt_disable(); - pgd = index + per_cpu(cur_pgd, smp_processor_id()); - preempt_enable(); + pgd_paddr = read_cr3(); + pgd = index + (pgd_t *)__va(pgd_paddr); pgd_k = init_mm.pgd + index; if (!pgd_present(*pgd_k)) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/mm/init-xen.c --- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c Wed Feb 1 18:00:19 2006 @@ -27,6 +27,8 @@ #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/efi.h> +#include <linux/memory_hotplug.h> +#include <linux/initrd.h> #include <asm/processor.h> #include <asm/system.h> @@ -313,18 +315,47 @@ pkmap_page_table = pte; } -void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __devinit free_new_highpage(struct page *page, int pfn) +{ + set_page_count(page, 1); + if (pfn < xen_start_info->nr_pages) + __free_page(page); + totalhigh_pages++; +} + +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - set_page_count(page, 1); - if (pfn < xen_start_info->nr_pages) - __free_page(page); - totalhigh_pages++; + free_new_highpage(page, pfn); } else SetPageReserved(page); } +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) +{ + free_new_highpage(page, pfn); + totalram_pages++; +#ifdef CONFIG_FLATMEM + max_mapnr = max(pfn, max_mapnr); +#endif + num_physpages++; + return 0; +} + +/* + * Not currently handling the NUMA case. 
+ * Assuming single node and all memory that + * has been added dynamically that would be + * onlined here is in HIGHMEM + */ +void online_page(struct page *page) +{ + ClearPageReserved(page); + add_one_highpage_hotplug(page, page_to_pfn(page)); +} + + #ifdef CONFIG_NUMA extern void set_highmem_pages_init(int); #else @@ -332,7 +363,7 @@ { int pfn; for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } #endif /* CONFIG_FLATMEM */ @@ -359,12 +390,9 @@ { unsigned long vaddr; pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base; - int i; swapper_pg_dir = pgd_base; init_mm.pgd = pgd_base; - for (i = 0; i < NR_CPUS; i++) - per_cpu(cur_pgd, i) = pgd_base; /* Enable PSE if available */ if (cpu_has_pse) { @@ -693,6 +721,28 @@ set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags); } + +/* + * this is for the non-NUMA, single node SMP system case. + * Specifically, in the case of x86, we will always add + * memory to the highmem for now. + */ +#ifndef CONFIG_NEED_MULTIPLE_NODES +int add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = &contig_page_data; + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(zone, start_pfn, nr_pages); +} + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +#endif kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c --- a/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c Wed Feb 1 18:00:19 2006 @@ -323,9 +323,15 @@ } EXPORT_SYMBOL(ioremap_nocache); +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ void iounmap(volatile void __iomem *addr) { - struct vm_struct *p; + struct vm_struct *p, *o; if ((void __force *)addr <= high_memory) return; @@ -338,14 +344,27 @@ if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) return; - write_lock(&vmlist_lock); - p = __remove_vm_area((void *)(PAGE_MASK & (unsigned long __force)addr)); - if (!p) { - printk(KERN_WARNING "iounmap: bad address %p\n", addr); + addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { + printk("iounmap: bad address %p\n", addr); dump_stack(); - goto out_unlock; - } - + return; + } + + /* Reset the direct mapping. 
Can block */ if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { /* p->size includes the guard page, but cpa doesn't like that */ change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), @@ -353,8 +372,10 @@ PAGE_KERNEL); global_flush_tlb(); } -out_unlock: - write_unlock(&vmlist_lock); + + /* Finally remove it */ + o = remove_vm_area((void *)addr); + BUG_ON(p != o || o == NULL); kfree(p); } EXPORT_SYMBOL(iounmap); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c --- a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c Wed Feb 1 18:00:19 2006 @@ -39,11 +39,13 @@ pg_data_t *pgdat; unsigned long i; struct page_state ps; + unsigned long flags; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -56,6 +58,7 @@ else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); @@ -267,19 +270,19 @@ struct page *page = virt_to_page(pgd); page->index = (unsigned long)pgd_list; if (pgd_list) - pgd_list->private = (unsigned long)&page->index; + set_page_private(pgd_list, (unsigned long)&page->index); pgd_list = page; - page->private = (unsigned long)&pgd_list; + set_page_private(page, (unsigned long)&pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct page *next, **pprev, *page = virt_to_page(pgd); next = (struct page *)page->index; - pprev = (struct page **)page->private; + pprev = (struct page **)page_private(page); *pprev = next; if (next) - next->private = (unsigned long)pprev; + set_page_private(next, (unsigned long)pprev); } void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/pci/Makefile --- a/linux-2.6-xen-sparse/arch/i386/pci/Makefile Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/pci/Makefile Wed Feb 1 18:00:19 2006 @@ -1,7 +1,7 @@ obj-y := i386.o obj-$(CONFIG_PCI_BIOS) += pcbios.o -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o obj-$(CONFIG_PCI_DIRECT) += direct.o pci-y := fixup.o diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/pci/irq-xen.c --- a/linux-2.6-xen-sparse/arch/i386/pci/irq-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/pci/irq-xen.c Wed Feb 1 18:00:19 2006 @@ -550,31 +550,48 @@ return 0; } -static __init int via_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int via_router_probe(struct irq_router *r, + struct pci_dev *router, u16 device) { /* FIXME: We should move some of the quirk fixup stuff here */ - if (router->device == PCI_DEVICE_ID_VIA_82C686 && - device == PCI_DEVICE_ID_VIA_82C586_0) { - /* Asus k7m bios wrongly reports 82C686A as 586-compatible */ - device = PCI_DEVICE_ID_VIA_82C686; - } - - switch(device) - { - case PCI_DEVICE_ID_VIA_82C586_0: - r->name = "VIA"; - r->get = pirq_via586_get; - r->set = pirq_via586_set; - return 1; - case PCI_DEVICE_ID_VIA_82C596: + /* + * work arounds for some buggy BIOSes + */ + if (device == PCI_DEVICE_ID_VIA_82C586_0) { + switch(router->device) { case PCI_DEVICE_ID_VIA_82C686: - case PCI_DEVICE_ID_VIA_8231: + /* + * Asus k7m bios 
wrongly reports 82C686A + * as 586-compatible + */ + device = PCI_DEVICE_ID_VIA_82C686; + break; + case PCI_DEVICE_ID_VIA_8235: + /** + * Asus a7v-x bios wrongly reports 8235 + * as 586-compatible + */ + device = PCI_DEVICE_ID_VIA_8235; + break; + } + } + + switch(device) { + case PCI_DEVICE_ID_VIA_82C586_0: + r->name = "VIA"; + r->get = pirq_via586_get; + r->set = pirq_via586_set; + return 1; + case PCI_DEVICE_ID_VIA_82C596: + case PCI_DEVICE_ID_VIA_82C686: + case PCI_DEVICE_ID_VIA_8231: + case PCI_DEVICE_ID_VIA_8235: /* FIXME: add new ones for 8233/5 */ - r->name = "VIA"; - r->get = pirq_via_get; - r->set = pirq_via_set; - return 1; + r->name = "VIA"; + r->get = pirq_via_get; + r->set = pirq_via_set; + return 1; } return 0; } diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/um/kernel/physmem.c --- a/linux-2.6-xen-sparse/arch/um/kernel/physmem.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/um/kernel/physmem.c Wed Feb 1 18:00:19 2006 @@ -248,7 +248,7 @@ /* Changed during early boot */ unsigned long high_physmem; -extern unsigned long physmem_size; +extern unsigned long long physmem_size; int init_maps(unsigned long physmem, unsigned long iomem, unsigned long highmem) { @@ -323,7 +323,7 @@ extern int __syscall_stub_start, __binary_start; void setup_physmem(unsigned long start, unsigned long reserve_end, - unsigned long len, unsigned long highmem) + unsigned long len, unsigned long long highmem) { unsigned long reserve = reserve_end - start; int pfn = PFN_UP(__pa(reserve_end)); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/Kconfig --- a/linux-2.6-xen-sparse/arch/x86_64/Kconfig Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/Kconfig Wed Feb 1 18:00:19 2006 @@ -242,22 +242,42 @@ source "kernel/Kconfig.preempt" -config K8_NUMA - bool "K8 NUMA support" - select NUMA +config NUMA + bool "Non Uniform Memory Access (NUMA) Support" depends on SMP && !X86_64_XEN help - Enable NUMA (Non Unified Memory Architecture) support for - AMD Opteron Multiprocessor systems. The kernel will try to allocate - memory used by a CPU on the local memory controller of the CPU - and add some more NUMA awareness to the kernel. - This code is recommended on all multiprocessor Opteron systems - and normally doesn't hurt on others. + Enable NUMA (Non Uniform Memory Access) support. The kernel + will try to allocate memory used by a CPU on the local memory + controller of the CPU and add some more NUMA awareness to the kernel. + This code is recommended on all multiprocessor Opteron systems. + If the system is EM64T, you should say N unless your system is EM64T + NUMA. + +config K8_NUMA + bool "Old style AMD Opteron NUMA detection" + depends on NUMA + default y + help + Enable K8 NUMA node topology detection. You should say Y here if + you have a multi processor AMD K8 system. This uses an old + method to read the NUMA configurtion directly from the builtin + Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA + instead, which also takes priority if both are compiled in. + +# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig. + +config X86_64_ACPI_NUMA + bool "ACPI NUMA detection" + depends on NUMA + select ACPI + select ACPI_NUMA + default y + help + Enable ACPI SRAT based node topology detection. config NUMA_EMU - bool "NUMA emulation support" - select NUMA - depends on SMP && !X86_64_XEN + bool "NUMA emulation" + depends on NUMA help Enable NUMA emulation. 
A flat machine will be split into virtual nodes when booted with "numa=fake=N", where N is the @@ -268,9 +288,6 @@ depends on NUMA default y -config NUMA - bool - default n config ARCH_DISCONTIGMEM_ENABLE def_bool y @@ -393,6 +410,14 @@ Additional support for intel specific MCE features such as the thermal monitor. +config X86_MCE_AMD + bool "AMD MCE features" + depends on X86_MCE && X86_LOCAL_APIC + default y + help + Additional support for AMD specific MCE features such as + the DRAM Error Threshold. + config PHYSICAL_START hex "Physical address where the kernel is loaded" if EMBEDDED default "0x100000" @@ -528,7 +553,7 @@ left. config IA32_AOUT - bool "IA32 a.out support" + tristate "IA32 a.out support" depends on IA32_EMULATION help Support old a.out binaries in the 32bit emulation. @@ -558,8 +583,21 @@ source fs/Kconfig +menu "Instrumentation Support" + depends on EXPERIMENTAL + source "arch/x86_64/oprofile/Kconfig" +config KPROBES + bool "Kprobes (EXPERIMENTAL)" + help + Kprobes allows you to trap at almost any kernel address and + execute a callback function. register_kprobe() establishes + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". +endmenu + source "arch/x86_64/Kconfig.debug" source "security/Kconfig" diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/Makefile Wed Feb 1 18:00:19 2006 @@ -11,6 +11,7 @@ obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o +obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ obj-$(CONFIG_ACPI) += acpi/ obj-$(CONFIG_X86_MSR) += msr.o @@ -22,13 +23,13 @@ obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o -obj-$(CONFIG_PM) += suspend.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o +obj-$(CONFIG_ACPI_SLEEP) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o -obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o @@ -42,7 +43,6 @@ bootflag-y += ../../i386/kernel/bootflag.o cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o topology-y += ../../i386/mach-default/topology.o -swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o @@ -51,6 +51,7 @@ ifdef CONFIG_XEN time-y += ../../i386/kernel/time-xen.o +obj-$(CONFIG_SWIOTLB) += swiotlb.o swiotlb-$(CONFIG_SWIOTLB) := ../../i386/kernel/swiotlb.o pci-dma-y += ../../i386/kernel/pci-dma-xen.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) := ../../i386/kernel/microcode-xen.o diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/apic-xen.c Wed Feb 1 18:00:19 2006 @@ -33,8 +33,6 @@ #include <asm/desc.h> #include <asm/arch_hooks.h> #include <asm/hpet.h> - -#include "io_ports.h" /* * Debug level diff -r 
d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/e820-xen.c Wed Feb 1 18:00:19 2006 @@ -23,6 +23,7 @@ #include <asm/e820.h> #include <asm/proto.h> #include <asm/bootsetup.h> +#include <asm/sections.h> #include <xen/interface/memory.h> unsigned long pci_mem_start = 0xaeedbabe; @@ -54,7 +55,6 @@ } #ifndef CONFIG_XEN -extern char _end[]; /* * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/entry-xen.S Wed Feb 1 18:00:19 2006 @@ -609,7 +609,14 @@ CFI_ENDPROC .endm -#if 0 +#ifndef CONFIG_XEN +ENTRY(thermal_interrupt) + apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt + +ENTRY(threshold_interrupt) + apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt + +#ifdef CONFIG_SMP ENTRY(reschedule_interrupt) apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt @@ -630,6 +637,7 @@ ENTRY(call_function_interrupt) apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt #endif +#endif /* !CONFIG_XEN */ #ifdef CONFIG_X86_LOCAL_APIC ENTRY(apic_timer_interrupt) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/head-xen.S --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/head-xen.S Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/head-xen.S Wed Feb 1 18:00:19 2006 @@ -25,6 +25,7 @@ #include <linux/threads.h> +#include <linux/init.h> #include <asm/desc.h> #include <asm/segment.h> #include <asm/page.h> @@ -243,6 +244,26 @@ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ #endif +#ifndef CONFIG_XEN +#ifndef CONFIG_HOTPLUG_CPU + __INITDATA +#endif + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. 
(always using + * 2Mbyte large pages provided by PAE mode) + */ + .align PAGE_SIZE +ENTRY(boot_level4_pgt) + .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */ + .fill 255,8,0 + .quad 0x000000000000a007 + __PHYSICAL_START + .fill 254,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */ +#endif + .data .align 16 diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/head64-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/head64-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/head64-xen.c Wed Feb 1 18:00:19 2006 @@ -22,6 +22,8 @@ #include <asm/bootsetup.h> #include <asm/setup.h> #include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/sections.h> unsigned long start_pfn; @@ -30,9 +32,8 @@ #if 0 static void __init clear_bss(void) { - extern char __bss_start[], __bss_end[]; memset(__bss_start, 0, - (unsigned long) __bss_end - (unsigned long) __bss_start); + (unsigned long) __bss_stop - (unsigned long) __bss_start); } #endif @@ -84,8 +85,6 @@ boot_cpu_data.x86_mask = eax & 0xf; } -extern char _end[]; - void __init x86_64_start_kernel(char * real_mode_data) { int i; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/io_apic-xen.c Wed Feb 1 18:00:19 2006 @@ -57,7 +57,7 @@ * Rough estimation of how many shared IRQs there are, can * be changed anytime. */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) /* @@ -162,6 +162,7 @@ static int first_free_entry = NR_IRQS; struct irq_pin_list *entry = irq_2_pin + irq; + BUG_ON(irq >= NR_IRQS); while (entry->next) entry = irq_2_pin + entry->next; @@ -169,7 +170,7 @@ entry->next = first_free_entry; entry = irq_2_pin + entry->next; if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: whoops"); + panic("io_apic.c: ran out of irq_2_pin entries!"); } entry->apic = apic; entry->pin = pin; @@ -182,6 +183,7 @@ int pin; \ struct irq_pin_list *entry = irq_2_pin + irq; \ \ + BUG_ON(irq >= NR_IRQS); \ for (;;) { \ unsigned int reg; \ pin = entry->pin; \ @@ -258,6 +260,8 @@ } #endif /* !CONFIG_XEN */ + +static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF }; /* * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to @@ -470,6 +474,7 @@ best_guess = irq; } } + BUG_ON(best_guess >= NR_IRQS); return best_guess; } @@ -660,6 +665,64 @@ return MPBIOS_trigger(idx); } +static int next_irq = 16; + +/* + * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number + * from ACPI, which can reach 800 in large boxen. + * + * Compact the sparse GSI space into a sequential IRQ series and reuse + * vectors if possible. + */ +int gsi_irq_sharing(int gsi) +{ + int i, tries, vector; + + BUG_ON(gsi >= NR_IRQ_VECTORS); + + if (platform_legacy_irq(gsi)) + return gsi; + + if (gsi_2_irq[gsi] != 0xFF) + return (int)gsi_2_irq[gsi]; + + tries = NR_IRQS; + try_again: + vector = assign_irq_vector(gsi); + + /* + * Sharing vectors means sharing IRQs, so scan irq_vectors for previous + * use of vector and if found, return that IRQ. However, we never want + * to share legacy IRQs, which usually have a different trigger mode + * than PCI. 
+ */ + for (i = 0; i < NR_IRQS; i++) + if (IO_APIC_VECTOR(i) == vector) + break; + if (platform_legacy_irq(i)) { + if (--tries >= 0) { + IO_APIC_VECTOR(i) = 0; + goto try_again; + } + panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); + } + if (i < NR_IRQS) { + gsi_2_irq[gsi] = i; + printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", + gsi, vector, i); + return i; + } + + i = next_irq++; + BUG_ON(i >= NR_IRQS); + gsi_2_irq[gsi] = i; + IO_APIC_VECTOR(i) = vector; + printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", + gsi, vector, i); + return i; +} + static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -689,6 +752,7 @@ while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; + irq = gsi_irq_sharing(irq); break; } default: @@ -698,6 +762,7 @@ break; } } + BUG_ON(irq >= NR_IRQS); /* * PCI IRQ command line redirection. Yes, limits are hardcoded. @@ -713,6 +778,7 @@ } } } + BUG_ON(irq >= NR_IRQS); return irq; } @@ -741,7 +807,7 @@ static int current_vector = FIRST_DEVICE_VECTOR; physdev_op_t op; - BUG_ON(irq >= NR_IRQ_VECTORS); + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) return IO_APIC_VECTOR(irq); @@ -1990,6 +2056,7 @@ entry.polarity = active_high_low; entry.mask = 1; /* Disabled (masked) */ + irq = gsi_irq_sharing(irq); /* * IRQs < 16 are already in the irq_2_pin[] map */ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/mpparse-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/mpparse-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/mpparse-xen.c Wed Feb 1 18:00:19 2006 @@ -42,7 +42,7 @@ * Various Linux-internal data structures created from the * MP-table. */ -int apic_version [MAX_APICS]; +unsigned char apic_version [MAX_APICS]; unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; @@ -65,7 +65,9 @@ /* Processor that is doing the boot up */ unsigned int boot_cpu_id = -1U; /* Internal processor count */ -static unsigned int num_processors = 0; +unsigned int num_processors __initdata = 0; + +unsigned disabled_cpus __initdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; @@ -107,11 +109,14 @@ #ifndef CONFIG_XEN static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver, cpu; + int cpu; + unsigned char ver; static int found_bsp=0; - if (!(m->mpc_cpuflag & CPU_ENABLED)) + if (!(m->mpc_cpuflag & CPU_ENABLED)) { + disabled_cpus++; return; + } printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", m->mpc_apicid, @@ -130,12 +135,14 @@ } cpu = num_processors++; - - if (m->mpc_apicid > MAX_APICS) { + +#if MAX_APICS < 255 + if ((int)m->mpc_apicid > MAX_APICS) { printk(KERN_ERR "Processor #%d INVALID. 
(Max ID: %d).\n", m->mpc_apicid, MAX_APICS); return; } +#endif ver = m->mpc_apicver; physid_set(m->mpc_apicid, phys_cpu_present_map); @@ -225,7 +232,7 @@ m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - if (++mp_irq_entries == MAX_IRQ_SOURCES) + if (++mp_irq_entries >= MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!!\n"); } diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/pci-nommu-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-nommu-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/pci-nommu-xen.c Wed Feb 1 18:00:19 2006 @@ -25,7 +25,7 @@ */ void *dma_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, unsigned gfp) + dma_addr_t *dma_handle, gfp_t gfp) { void *ret; u64 mask; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/process-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/process-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/process-xen.c Wed Feb 1 18:00:19 2006 @@ -96,10 +96,13 @@ if (need_resched()) { local_irq_enable(); } else { + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb__after_clear_bit(); stop_hz_timer(); /* Blocking includes an implicit local_irq_enable(). */ HYPERVISOR_sched_op(SCHEDOP_block, 0); start_hz_timer(); + set_thread_flag(TIF_POLLING_NRFLAG); } } @@ -143,6 +146,8 @@ */ void cpu_idle (void) { + set_thread_flag(TIF_POLLING_NRFLAG); + /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -156,7 +161,9 @@ xen_idle(); } + preempt_enable_no_resched(); schedule(); + preempt_disable(); } } @@ -180,7 +187,8 @@ do { ssleep(1); for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + if (cpu_isset(cpu, map) && + !per_cpu(cpu_idle_state, cpu)) cpu_clear(cpu, map); } cpus_and(map, map, cpu_online_map); @@ -208,7 +216,8 @@ system_utsname.version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); + printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->rax, regs->rbx, regs->rcx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", @@ -281,13 +290,6 @@ { struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); - - /* - * Remove function-return probe instances associated with this task - * and put them back on the free list. Do not insert an exit probe for - * this function, it will be disabled by kprobe_flush_task if you do. 
- */ - kprobe_flush_task(tsk); if (t->flags & _TIF_ABI_PENDING) t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); @@ -361,15 +363,14 @@ struct pt_regs * childregs; struct task_struct *me = current; - childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; - + childregs = ((struct pt_regs *) + (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; *childregs = *regs; childregs->rax = 0; childregs->rsp = rsp; - if (rsp == ~0UL) { + if (rsp == ~0UL) childregs->rsp = (unsigned long)childregs; - } p->thread.rsp = (unsigned long) childregs; p->thread.rsp0 = (unsigned long) (childregs+1); @@ -391,7 +392,8 @@ p->thread.io_bitmap_max = 0; return -ENOMEM; } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES); } /* @@ -432,7 +434,8 @@ * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced */ -struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -530,7 +533,8 @@ prev->userrsp = read_pda(oldrsp); write_pda(oldrsp, next->userrsp); write_pda(pcurrent, next_p); - write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); + write_pda(kernelstack, + (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); /* * Now maybe reload the debug registers @@ -591,7 +595,9 @@ return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); } -asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +asmlinkage long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) newsp = regs->rsp; @@ -627,7 +633,8 @@ return 0; fp = *(u64 *)(p->thread.rsp); do { - if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) + if (fp < (unsigned long)stack || + fp > (unsigned long)stack+THREAD_SIZE) return 0; rip = *(u64 *)(fp+8); if (!in_sched_functions(rip)) @@ -662,8 +669,9 @@ task->thread.gsindex = 0; task->thread.gs = addr; if (doit) { - load_gs_index(0); - ret = HYPERVISOR_set_segment_base(SEGBASE_GS_USER, addr); + load_gs_index(0); + ret = HYPERVISOR_set_segment_base( + SEGBASE_GS_USER, addr); } } put_cpu(); @@ -680,7 +688,7 @@ set_32bit_tls(task, FS_TLS, addr); if (doit) { load_TLS(&task->thread, cpu); - asm volatile("mov %0,%%fs" :: "r" (FS_TLS_SEL)); + asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; @@ -690,9 +698,9 @@ if (doit) { /* set the selector to 0 to not confuse __switch_to */ - asm volatile("mov %0,%%fs" :: "r" (0)); - ret = HYPERVISOR_set_segment_base(SEGBASE_FS, addr); - + asm volatile("movl %0,%%fs" :: "r" (0)); + ret = HYPERVISOR_set_segment_base(SEGBASE_FS, + addr); } } put_cpu(); @@ -701,9 +709,9 @@ unsigned long base; if (task->thread.fsindex == FS_TLS_SEL) base = read_32bit_tls(task, FS_TLS); - else if (doit) { + else if (doit) rdmsrl(MSR_FS_BASE, base); - } else + else base = task->thread.fs; ret = put_user(base, (unsigned long __user *)addr); break; @@ -712,9 +720,9 @@ unsigned long base; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); - else if (doit) { + else if (doit) rdmsrl(MSR_KERNEL_GS_BASE, base); - } else + else 
base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); break; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup-xen.c Wed Feb 1 18:00:19 2006 @@ -60,6 +60,7 @@ #include <asm/setup.h> #include <asm/mach_apic.h> #include <asm/numa.h> +#include <asm/sections.h> #ifdef CONFIG_XEN #include <linux/percpu.h> #include <xen/interface/physdev.h> @@ -139,7 +140,6 @@ struct e820map e820; extern int root_mountflags; -extern char _text, _etext, _edata, _end; char command_line[COMMAND_LINE_SIZE]; @@ -473,7 +473,6 @@ { unsigned long bootmap_size, bootmap; - memory_present(0, start_pfn, end_pfn); bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); if (bootmap == -1L) @@ -773,8 +772,6 @@ } #endif - sparse_init(); - paging_init(); #ifdef CONFIG_X86_LOCAL_APIC /* @@ -855,6 +852,8 @@ check_ioapic(); #endif + zap_low_mappings(0); + #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). @@ -1032,7 +1031,7 @@ #endif bits = 0; - while ((1 << bits) < c->x86_num_cores) + while ((1 << bits) < c->x86_max_cores) bits++; /* Low order bits define the core id (index of core in socket) */ @@ -1062,10 +1061,10 @@ if (!node_online(node)) node = nearby_node(apicid); } - cpu_to_node[cpu] = node; + numa_set_node(cpu, node); printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n", - cpu, c->x86_num_cores, node, cpu_core_id[cpu]); + cpu, c->x86_max_cores, node, cpu_core_id[cpu]); #endif #endif } @@ -1114,9 +1113,9 @@ display_cacheinfo(c); if (c->extended_cpuid_level >= 0x80000008) { - c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - if (c->x86_num_cores & (c->x86_num_cores - 1)) - c->x86_num_cores = 1; + c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + if (c->x86_max_cores & (c->x86_max_cores - 1)) + c->x86_max_cores = 1; amd_detect_cmp(c); } @@ -1128,54 +1127,44 @@ { #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; - int index_msb, tmp; + int index_msb, core_bits; int cpu = smp_processor_id(); - + + cpuid(1, &eax, &ebx, &ecx, &edx); + + c->apicid = phys_pkg_id(0); + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; - cpuid(1, &eax, &ebx, &ecx, &edx); smp_num_siblings = (ebx & 0xff0000) >> 16; - + if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - index_msb = 31; - /* - * At this point we only support two siblings per - * processor package. 
- */ + } else if (smp_num_siblings > 1 ) { + if (smp_num_siblings > NR_CPUS) { printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } - tmp = smp_num_siblings; - while ((tmp & 0x80000000 ) == 0) { - tmp <<=1 ; - index_msb--; - } - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; + + index_msb = get_count_order(smp_num_siblings); phys_proc_id[cpu] = phys_pkg_id(index_msb); - + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", phys_proc_id[cpu]); - smp_num_siblings = smp_num_siblings / c->x86_num_cores; - - tmp = smp_num_siblings; - index_msb = 31; - while ((tmp & 0x80000000) == 0) { - tmp <<=1 ; - index_msb--; - } - if (smp_num_siblings & (smp_num_siblings - 1)) - index_msb++; - - cpu_core_id[cpu] = phys_pkg_id(index_msb); - - if (c->x86_num_cores > 1) + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings) ; + + core_bits = get_count_order(c->x86_max_cores); + + cpu_core_id[cpu] = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); + + if (c->x86_max_cores > 1) printk(KERN_INFO "CPU: Processor Core ID: %d\n", cpu_core_id[cpu]); } @@ -1214,7 +1203,7 @@ node = apicid_to_node[hard_smp_processor_id()]; if (node == NUMA_NO_NODE) node = 0; - cpu_to_node[cpu] = node; + numa_set_node(cpu, node); if (acpi_numa > 0) printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node); @@ -1232,13 +1221,18 @@ unsigned eax = cpuid_eax(0x80000008); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; + /* CPUID workaround for Intel 0F34 CPU */ + if (c->x86_vendor == X86_VENDOR_INTEL && + c->x86 == 0xF && c->x86_model == 0x3 && + c->x86_mask == 0x4) + c->x86_phys_bits = 36; } if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 >= 15) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - c->x86_num_cores = intel_num_cpu_cores(c); + c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); } @@ -1276,7 +1270,7 @@ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; - c->x86_num_cores = 1; + c->x86_max_cores = 1; c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -1299,10 +1293,10 @@ c->x86 = (tfms >> 8) & 0xf; c->x86_model = (tfms >> 4) & 0xf; c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) { + if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - } if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { @@ -1452,7 +1446,7 @@ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est", + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", NULL, "est", "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1510,13 +1504,12 @@ seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); #ifdef CONFIG_SMP - if (smp_num_siblings * c->x86_num_cores > 1) { + if (smp_num_siblings * c->x86_max_cores > 1) { int cpu = c - cpu_data; seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]); - seq_printf(m, "siblings\t: %d\n", - c->x86_num_cores * smp_num_siblings); + seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]); - seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } #endif 
diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/setup64-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/setup64-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/setup64-xen.c Wed Feb 1 18:00:19 2006 @@ -255,6 +255,7 @@ /* CPU 0 is initialised in head64.c */ if (cpu != 0) { pda_init(cpu); + zap_low_mappings(cpu); } else estacks = boot_exception_stacks; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/smp-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/smp-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/smp-xen.c Wed Feb 1 18:00:19 2006 @@ -70,8 +70,6 @@ want false sharing in the per cpu data segment. */ static DEFINE_PER_CPU(union smp_flush_state, flush_state); #endif - -#define __cpuinit __init /* * We cannot call mmdrop() because we are in interrupt context, @@ -478,15 +476,16 @@ void smp_stop_cpu(void) { + unsigned long flags; /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); + local_irq_save(flags); #ifndef CONFIG_XEN disable_local_APIC(); #endif - local_irq_enable(); + local_irq_restore(flags); } static void smp_really_stop_cpu(void *dummy) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/traps-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/traps-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/traps-xen.c Wed Feb 1 18:00:19 2006 @@ -430,19 +430,6 @@ { conditional_sti(regs); -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda, - regs->rip); - } - } -#endif - if (user_mode(regs)) { struct task_struct *tsk = current; @@ -514,20 +501,6 @@ long error_code) { conditional_sti(regs); - -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - oops_in_progress++; - printk("general protection handler: wrong gs %lx expected %p\n", gs, pda); - oops_in_progress--; - } - } -#endif if (user_mode(regs)) { struct task_struct *tsk = current; @@ -670,19 +643,6 @@ unsigned long condition; struct task_struct *tsk = current; siginfo_t info; - -#ifdef CONFIG_CHECKING - { - /* RED-PEN interaction with debugger - could destroy gs */ - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("debug handler: wrong gs %lx expected %p\n", gs, pda); - } - } -#endif get_debugreg(condition, 6); @@ -896,6 +856,10 @@ } #endif +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} + /* * 'math_state_restore()' saves the current math information in the * old math state array, and gets the new ones from the current task diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/kernel/x8664_ksyms-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/kernel/x8664_ksyms-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/kernel/x8664_ksyms-xen.c Wed Feb 1 18:00:19 2006 @@ -204,3 +204,6 @@ #endif EXPORT_SYMBOL(cpu_khz); + +EXPORT_SYMBOL(load_gs_index); + diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c Wed Feb 1 
17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c Wed Feb 1 18:00:19 2006 @@ -24,7 +24,6 @@ #include <linux/compiler.h> #include <linux/module.h> #include <linux/kprobes.h> -#include <linux/percpu.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -35,8 +34,6 @@ #include <asm/kdebug.h> #include <asm-generic/sections.h> #include <asm/kdebug.h> - -DEFINE_PER_CPU(pgd_t *, cur_pgd); void bust_spinlocks(int yes) { @@ -149,9 +146,10 @@ pmd_t *pmd; pte_t *pte; - preempt_disable(); - pgd = (pgd_t *)per_cpu(cur_pgd, smp_processor_id()); - preempt_enable(); + asm("movq %%cr3,%0" : "=r" (pgd)); + pgd = (pgd_t *)machine_to_phys((maddr_t)pgd); + + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); pgd += pgd_index(address); printk("PGD %lx ", pgd_val(*pgd)); if (bad_address(pgd)) goto bad; @@ -252,9 +250,9 @@ /* On Xen the line below does not always work. Needs investigating! */ /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ - preempt_disable(); - pgd = (pgd_t *)per_cpu(cur_pgd, smp_processor_id()); - preempt_enable(); + asm("movq %%cr3,%0" : "=r" (pgd)); + pgd = (pgd_t *)machine_to_phys((maddr_t)pgd); + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); pgd += pgd_index(address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) @@ -330,22 +328,9 @@ if (!user_mode(regs)) error_code &= ~4; /* means kernel */ -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("page_fault: wrong gs %lx expected %p\n", gs, pda); - } - } -#endif - /* get the address */ address = HYPERVISOR_shared_info->vcpu_info[ smp_processor_id()].arch.cr2; - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; @@ -595,3 +580,10 @@ force_sig_info(SIGBUS, &info, tsk); return; } + +static int __init enable_pagefaulttrace(char *str) +{ + page_fault_trace = 1; + return 0; +} +__setup("pagefaulttrace", enable_pagefaulttrace); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c --- a/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c Wed Feb 1 18:00:19 2006 @@ -25,6 +25,7 @@ #include <linux/pagemap.h> #include <linux/bootmem.h> #include <linux/proc_fs.h> +#include <linux/pci.h> #include <asm/processor.h> #include <asm/system.h> @@ -39,6 +40,7 @@ #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/smp.h> +#include <asm/sections.h> #include <xen/features.h> @@ -52,7 +54,7 @@ extern void swiotlb_init(void); #endif -extern char _stext[]; +static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); extern unsigned long start_pfn; @@ -193,9 +195,6 @@ } /* References to section boundaries */ - -extern char _text, _etext, _edata, __bss_start, _end[]; -extern char __init_begin, __init_end; int after_bootmem; @@ -492,10 +491,6 @@ void __init xen_init_pt(void) { unsigned long addr, *page; - int i; - - for (i = 0; i < NR_CPUS; i++) - per_cpu(cur_pgd, i) = init_mm.pgd; memset((void *)init_level4_pgt, 0, PAGE_SIZE); memset((void *)level3_kernel_pgt, 0, PAGE_SIZE); @@ -628,9 +623,7 @@ __flush_tlb_all(); } -extern struct x8664_pda cpu_pda[NR_CPUS]; - -void zap_low_mappings(void) +void __cpuinit zap_low_mappings(int cpu) { /* this is not required for Xen */ #if 0 @@ -638,32 +631,74 @@ #endif } +/* Compute zone sizes for the DMA and DMA32 zones in a node. 
*/ +__init void +size_zones(unsigned long *z, unsigned long *h, + unsigned long start_pfn, unsigned long end_pfn) +{ + int i; +#ifndef CONFIG_XEN + unsigned long w; +#endif + + for (i = 0; i < MAX_NR_ZONES; i++) + z[i] = 0; + +#ifndef CONFIG_XEN + if (start_pfn < MAX_DMA_PFN) + z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; + if (start_pfn < MAX_DMA32_PFN) { + unsigned long dma32_pfn = MAX_DMA32_PFN; + if (dma32_pfn > end_pfn) + dma32_pfn = end_pfn; + z[ZONE_DMA32] = dma32_pfn - start_pfn; + } + z[ZONE_NORMAL] = end_pfn - start_pfn; + + /* Remove lower zones from higher ones. */ + w = 0; + for (i = 0; i < MAX_NR_ZONES; i++) { + if (z[i]) + z[i] -= w; + w += z[i]; + } + + /* Compute holes */ + w = start_pfn; + for (i = 0; i < MAX_NR_ZONES; i++) { + unsigned long s = w; + w += z[i]; + h[i] = e820_hole_size(s, w); + } + + /* Add the space pace needed for mem_map to the holes too. */ + for (i = 0; i < MAX_NR_ZONES; i++) + h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE; + + /* The 16MB DMA zone has the kernel and other misc mappings. + Account them too */ + if (h[ZONE_DMA]) { + h[ZONE_DMA] += dma_reserve; + if (h[ZONE_DMA] >= z[ZONE_DMA]) { + printk(KERN_WARNING + "Kernel too large and filling up ZONE_DMA?\n"); + h[ZONE_DMA] = z[ZONE_DMA]; + } + } +#else + z[ZONE_DMA] = end_pfn; + for (i = 0; i < MAX_NR_ZONES; i++) + h[i] = 0; +#endif +} + #ifndef CONFIG_NUMA void __init paging_init(void) { - { - unsigned long zones_size[MAX_NR_ZONES]; - unsigned long holes[MAX_NR_ZONES]; - /* unsigned int max_dma; */ - - memset(zones_size, 0, sizeof(zones_size)); - memset(holes, 0, sizeof(holes)); - - /* max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; */ - /* if (end_pfn < max_dma) { */ - zones_size[ZONE_DMA] = end_pfn; -#if 0 - holes[ZONE_DMA] = e820_hole_size(0, end_pfn); - } else { - zones_size[ZONE_DMA] = max_dma; - holes[ZONE_DMA] = e820_hole_size(0, max_dma); - zones_size[ZONE_NORMAL] = end_pfn - max_dma; - holes[ZONE_NORMAL] = e820_hole_size(max_dma, end_pfn); - } -#endif - free_area_init_node(0, NODE_DATA(0), zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); - } + unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; + size_zones(zones, holes, 0, end_pfn); + free_area_init_node(0, NODE_DATA(0), zones, + __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); @@ -780,18 +815,17 @@ datasize >> 10, initsize >> 10); +#ifndef CONFIG_XEN +#ifdef CONFIG_SMP /* - * Subtle. SMP is doing its boot stuff late (because it has to - * fork idle threads) - but it also needs low mappings for the - * protected-mode entry to work. We zap these entries only after - * the WP-bit has been tested. + * Sync boot_level4_pgt mappings with the init_level4_pgt + * except for the low identity mappings which are already zapped + * in init_level4_pgt. 
This sync-up is essential for AP's bringup */ -#ifndef CONFIG_SMP - zap_low_mappings(); -#endif -} - -extern char __initdata_begin[], __initdata_end[]; + memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); +#endif +#endif +} void free_initmem(void) { @@ -819,7 +853,7 @@ totalram_pages++; } memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); - printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10); + printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10); #endif } @@ -847,6 +881,8 @@ #else reserve_bootmem(phys, len); #endif + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) + dma_reserve += len / PAGE_SIZE; } int kern_addr_valid(unsigned long addr) @@ -888,10 +924,6 @@ static ctl_table debug_table2[] = { { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, proc_dointvec }, -#ifdef CONFIG_CHECKING - { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL, - proc_dointvec }, -#endif { 0, } }; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/x86_64/pci/Makefile --- a/linux-2.6-xen-sparse/arch/x86_64/pci/Makefile Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/x86_64/pci/Makefile Wed Feb 1 18:00:19 2006 @@ -11,7 +11,7 @@ obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o common.o # mmconfig has a 64bit special -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o +obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o obj-$(CONFIG_NUMA) += k8-bus.o diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/Makefile --- a/linux-2.6-xen-sparse/drivers/Makefile Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/Makefile Wed Feb 1 18:00:19 2006 @@ -7,6 +7,7 @@ obj-$(CONFIG_PCI) += pci/ obj-$(CONFIG_PARISC) += parisc/ +obj-$(CONFIG_RAPIDIO) += rapidio/ obj-y += video/ obj-$(CONFIG_ACPI) += acpi/ # PnP must come after ACPI since it will eventually need to check if acpi @@ -49,6 +50,7 @@ obj-$(CONFIG_PARIDE) += block/paride/ obj-$(CONFIG_TC) += tc/ obj-$(CONFIG_USB) += usb/ +obj-$(CONFIG_PCI) += usb/ obj-$(CONFIG_USB_GADGET) += usb/gadget/ obj-$(CONFIG_GAMEPORT) += input/gameport/ obj-$(CONFIG_INPUT) += input/ @@ -68,3 +70,4 @@ obj-$(CONFIG_SGI_IOC4) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ +obj-$(CONFIG_SUPERH) += sh/ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/acpi/Kconfig --- a/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/acpi/Kconfig Wed Feb 1 18:00:19 2006 @@ -197,7 +197,6 @@ config ACPI_IBM tristate "IBM ThinkPad Laptop Extras" depends on X86 - default y ---help--- This is a Linux ACPI driver for the IBM ThinkPad laptops. 
It adds support for Fn-Fx key combinations, Bluetooth control, video diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/char/mem.c --- a/linux-2.6-xen-sparse/drivers/char/mem.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/char/mem.c Wed Feb 1 18:00:19 2006 @@ -233,9 +233,7 @@ static int mmap_mem(struct file * file, struct vm_area_struct * vma) { #if defined(__HAVE_PHYS_MEM_ACCESS_PROT) - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - - vma->vm_page_prot = phys_mem_access_prot(file, offset, + vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot); #elif defined(pgprot_noncached) @@ -926,7 +924,8 @@ mem_class = class_create(THIS_MODULE, "mem"); for (i = 0; i < ARRAY_SIZE(devlist); i++) { - class_device_create(mem_class, MKDEV(MEM_MAJOR, devlist[i].minor), + class_device_create(mem_class, NULL, + MKDEV(MEM_MAJOR, devlist[i].minor), NULL, devlist[i].name); devfs_mk_cdev(MKDEV(MEM_MAJOR, devlist[i].minor), S_IFCHR | devlist[i].mode, devlist[i].name); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/char/tpm/Kconfig --- a/linux-2.6-xen-sparse/drivers/char/tpm/Kconfig Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/char/tpm/Kconfig Wed Feb 1 18:00:19 2006 @@ -6,7 +6,7 @@ config TCG_TPM tristate "TPM Hardware Support" - depends on EXPERIMENTAL && (PCI || XEN) + depends on EXPERIMENTAL ---help--- If you have a TPM security chip in your system, which implements the Trusted Computing Group's specification, diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/char/tpm/tpm.c --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/char/tpm/tpm.c Wed Feb 1 18:00:19 2006 @@ -47,6 +47,13 @@ { struct tpm_chip *chip = (struct tpm_chip *) ptr; + schedule_work(&chip->work); +} + +static void timeout_work(void * ptr) +{ + struct tpm_chip *chip = ptr; + down(&chip->buffer_mutex); atomic_set(&chip->data_pending, 0); memset(chip->data_buffer, 0, chip->vendor->buffersize); @@ -146,8 +153,7 @@ __be32 index; char *str = buf; - struct tpm_chip *chip = - pci_get_drvdata(to_pci_dev(dev)); + struct tpm_chip *chip = dev_get_drvdata(dev); if (chip == NULL) return -ENODEV; @@ -170,7 +176,8 @@ < READ_PCR_RESULT_SIZE){ dev_dbg(chip->dev, "A TPM error (%d) occurred" " attempting to read PCR %d of %d\n", - be32_to_cpu(*((__be32 *) (data + 6))), i, num_pcrs); + be32_to_cpu(*((__be32 *) (data + 6))), + i, num_pcrs); goto out; } str += sprintf(str, "PCR-%02d: ", i); @@ -198,17 +205,15 @@ int i, rc; char *str = buf; - struct tpm_chip *chip = - pci_get_drvdata(to_pci_dev(dev)); + struct tpm_chip *chip = dev_get_drvdata(dev); if (chip == NULL) return -ENODEV; - data = kmalloc(READ_PUBEK_RESULT_SIZE, GFP_KERNEL); + data = kzalloc(READ_PUBEK_RESULT_SIZE, GFP_KERNEL); if (!data) return -ENOMEM; memcpy(data, readpubek, sizeof(readpubek)); - memset(data + sizeof(readpubek), 0, 20); /* zero nonce */ if ((len = tpm_transmit(chip, data, READ_PUBEK_RESULT_SIZE)) < READ_PUBEK_RESULT_SIZE) { @@ -252,7 +257,6 @@ kfree(data); return rc; } - EXPORT_SYMBOL_GPL(tpm_show_pubek); #define CAP_VER_RESULT_SIZE 18 @@ -281,8 +285,7 @@ ssize_t len; char *str = buf; - struct tpm_chip *chip = - pci_get_drvdata(to_pci_dev(dev)); + struct tpm_chip *chip = dev_get_drvdata(dev); if (chip == NULL) return -ENODEV; @@ -321,7 +324,6 @@ return count; } EXPORT_SYMBOL_GPL(tpm_store_cancel); - /* * Device file system interface to the TPM @@ -346,8 +348,7 @@ } if (chip->num_opens) { - 
dev_dbg(chip->dev, - "Another process owns this TPM\n"); + dev_dbg(chip->dev, "Another process owns this TPM\n"); rc = -EBUSY; goto err_out; } @@ -373,7 +374,6 @@ spin_unlock(&driver_lock); return rc; } - EXPORT_SYMBOL_GPL(tpm_open); int tpm_release(struct inode *inode, struct file *file) @@ -384,16 +384,16 @@ file->private_data = NULL; chip->num_opens--; del_singleshot_timer_sync(&chip->user_read_timer); + flush_scheduled_work(); atomic_set(&chip->data_pending, 0); put_device(chip->dev); kfree(chip->data_buffer); spin_unlock(&driver_lock); return 0; } - EXPORT_SYMBOL_GPL(tpm_release); -ssize_t tpm_write(struct file * file, const char __user * buf, +ssize_t tpm_write(struct file *file, const char __user *buf, size_t size, loff_t * off) { struct tpm_chip *chip = file->private_data; @@ -431,13 +431,14 @@ EXPORT_SYMBOL_GPL(tpm_write); -ssize_t tpm_read(struct file * file, char __user * buf, +ssize_t tpm_read(struct file * file, char __user *buf, size_t size, loff_t * off) { struct tpm_chip *chip = file->private_data; int ret_size; int pos, pending = 0; + flush_scheduled_work(); ret_size = atomic_read(&chip->data_pending); if (ret_size > 0) { /* relay data */ if (size < ret_size) @@ -446,8 +447,7 @@ pos = atomic_read(&chip->data_position); down(&chip->buffer_mutex); - if (copy_to_user - ((void __user *) buf, &chip->data_buffer[pos], ret_size)) { + if (copy_to_user(buf, &chip->data_buffer[pos], ret_size)) { ret_size = -EFAULT; } else { pending = atomic_read(&chip->data_pending) - ret_size; @@ -466,7 +466,6 @@ return ret_size; } - EXPORT_SYMBOL_GPL(tpm_read); void tpm_remove_hardware(struct device *dev) @@ -490,13 +489,13 @@ sysfs_remove_group(&dev->kobj, chip->vendor->attr_group); - dev_mask[chip->dev_num / TPM_NUM_MASK_ENTRIES ] &= !(1 << (chip->dev_num % TPM_NUM_MASK_ENTRIES)); + dev_mask[chip->dev_num / TPM_NUM_MASK_ENTRIES ] &= + ~(1 << (chip->dev_num % TPM_NUM_MASK_ENTRIES)); kfree(chip); put_device(dev); } - EXPORT_SYMBOL_GPL(tpm_remove_hardware); static u8 savestate[] = { @@ -509,32 +508,30 @@ * We are about to suspend. Save the TPM state * so that it can be restored. */ -int tpm_pm_suspend(struct pci_dev *pci_dev, pm_message_t pm_state) -{ - struct tpm_chip *chip = pci_get_drvdata(pci_dev); +int tpm_pm_suspend(struct device *dev, pm_message_t pm_state) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); if (chip == NULL) return -ENODEV; tpm_transmit(chip, savestate, sizeof(savestate)); return 0; } - EXPORT_SYMBOL_GPL(tpm_pm_suspend); /* * Resume from a power safe. The BIOS already restored * the TPM state. 
*/ -int tpm_pm_resume(struct pci_dev *pci_dev) -{ - struct tpm_chip *chip = pci_get_drvdata(pci_dev); +int tpm_pm_resume(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); if (chip == NULL) return -ENODEV; return 0; } - EXPORT_SYMBOL_GPL(tpm_pm_resume); /* @@ -544,8 +541,7 @@ * upon errant exit from this function specific probe function should call * pci_disable_device */ -int tpm_register_hardware(struct device *dev, - struct tpm_vendor_specific *entry) +int tpm_register_hardware(struct device *dev, struct tpm_vendor_specific *entry) { #define DEVNAME_SIZE 7 @@ -554,15 +550,15 @@ int i, j; /* Driver specific per-device data */ - chip = kmalloc(sizeof(*chip), GFP_KERNEL); + chip = kzalloc(sizeof(*chip), GFP_KERNEL); if (chip == NULL) return -ENOMEM; - - memset(chip, 0, sizeof(struct tpm_chip)); init_MUTEX(&chip->buffer_mutex); init_MUTEX(&chip->tpm_mutex); INIT_LIST_HEAD(&chip->list); + + INIT_WORK(&chip->work, timeout_work, chip); init_timer(&chip->user_read_timer); chip->user_read_timer.function = user_reader_timeout; @@ -589,8 +585,7 @@ dev_num_search_complete: if (chip->dev_num < 0) { - dev_err(dev, - "No available tpm device numbers\n"); + dev_err(dev, "No available tpm device numbers\n"); kfree(chip); return -ENODEV; } else if (chip->dev_num == 0) @@ -628,7 +623,6 @@ return 0; } - EXPORT_SYMBOL_GPL(tpm_register_hardware); MODULE_AUTHOR("Leendert van Doorn (leendert@xxxxxxxxxxxxxx)"); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/char/tpm/tpm.h --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/char/tpm/tpm.h Wed Feb 1 18:00:19 2006 @@ -19,11 +19,11 @@ * */ #include <linux/module.h> -#include <linux/version.h> #include <linux/pci.h> #include <linux/delay.h> #include <linux/fs.h> #include <linux/miscdevice.h> +#include <linux/platform_device.h> enum tpm_timeout { TPM_TIMEOUT = 5, /* msecs */ @@ -76,6 +76,7 @@ struct semaphore buffer_mutex; struct timer_list user_read_timer; /* user needs to claim result */ + struct work_struct work; struct semaphore tpm_mutex; /* tpm is processing */ struct tpm_vendor_specific *vendor; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/char/tpm/tpm_atmel.c --- a/linux-2.6-xen-sparse/drivers/char/tpm/tpm_atmel.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/char/tpm/tpm_atmel.c Wed Feb 1 18:00:19 2006 @@ -20,12 +20,7 @@ */ #include "tpm.h" - -/* Atmel definitions */ -enum tpm_atmel_addr { - TPM_ATMEL_BASE_ADDR_LO = 0x08, - TPM_ATMEL_BASE_ADDR_HI = 0x09 -}; +#include "tpm_atmel.h" /* write status bits */ enum tpm_atmel_write_status { @@ -40,7 +35,7 @@ ATML_STATUS_READY = 0x08 }; -static int tpm_atml_recv(struct tpm_chip *chip, u8 * buf, size_t count) +static int tpm_atml_recv(struct tpm_chip *chip, u8 *buf, size_t count) { u8 status, *hdr = buf; u32 size; @@ -52,13 +47,12 @@ return -EIO; for (i = 0; i < 6; i++) { - status = inb(chip->vendor->base + 1); + status = atmel_getb(chip, 1); if ((status & ATML_STATUS_DATA_AVAIL) == 0) { - dev_err(chip->dev, - "error reading header\n"); + dev_err(chip->dev, "error reading header\n"); return -EIO; } - *buf++ = inb(chip->vendor->base); + *buf++ = atmel_getb(chip, 0); } /* size of the data received */ @@ -69,10 +63,9 @@ dev_err(chip->dev, "Recv size(%d) less than available space\n", size); for (; i < size; i++) { /* clear the waiting data anyway */ - status = inb(chip->vendor->base + 1); + status = atmel_getb(chip, 1); if ((status & ATML_STATUS_DATA_AVAIL) == 0) { - dev_err(chip->dev, - 
"error reading data\n"); + dev_err(chip->dev, "error reading data\n"); return -EIO; } } @@ -81,17 +74,16 @@ /* read all the data available */ for (; i < size; i++) { - status = inb(chip->vendor->base + 1); + status = atmel_getb(chip, 1); if ((status & ATML_STATUS_DATA_AVAIL) == 0) { - dev_err(chip->dev, - "error reading data\n"); + dev_err(chip->dev, "error reading data\n"); return -EIO; } - *buf++ = inb(chip->vendor->base); + *buf++ = atmel_getb(chip, 0); } /* make sure data available is gone */ - status = inb(chip->vendor->base + 1); + status = atmel_getb(chip, 1); if (status & ATML_STATUS_DATA_AVAIL) { dev_err(chip->dev, "data available is stuck\n"); return -EIO; @@ -100,14 +92,14 @@ return size; } -static int tpm_atml_send(struct tpm_chip *chip, u8 * buf, size_t count) +static int tpm_atml_send(struct tpm_chip *chip, u8 *buf, size_t count) { int i; dev_dbg(chip->dev, "tpm_atml_send:\n"); for (i = 0; i < count; i++) { dev_dbg(chip->dev, "%d 0x%x(%d)\n", i, buf[i], buf[i]); - outb(buf[i], chip->vendor->base); + atmel_putb(buf[i], chip, 0); } return count; @@ -115,12 +107,12 @@ static void tpm_atml_cancel(struct tpm_chip *chip) { - outb(ATML_STATUS_ABORT, chip->vendor->base + 1); + atmel_putb(ATML_STATUS_ABORT, chip, 1); } static u8 tpm_atml_status(struct tpm_chip *chip) { - return inb(chip->vendor->base + 1); + return atmel_getb(chip, 1); } static struct file_operations atmel_ops = { @@ -142,7 +134,7 @@ &dev_attr_pcrs.attr, &dev_attr_caps.attr, &dev_attr_cancel.attr, - 0, + NULL, }; static struct attribute_group atmel_attr_grp = { .attrs = atmel_attrs }; @@ -159,27 +151,39 @@ .miscdev = { .fops = &atmel_ops, }, }; -static int __devinit tpm_atml_init(struct pci_dev *pci_dev, - const struct pci_device_id *pci_id) -{ - u8 version[4]; +static struct platform_device *pdev; + +static void atml_plat_remove(void) +{ + struct tpm_chip *chip = dev_get_drvdata(&pdev->dev); + + if (chip) { + if (chip->vendor->have_region) + atmel_release_region(chip->vendor->base, + chip->vendor->region_size); + atmel_put_base_addr(chip->vendor); + tpm_remove_hardware(chip->dev); + platform_device_unregister(pdev); + } +} + +static struct device_driver atml_drv = { + .name = "tpm_atmel", + .bus = &platform_bus_type, + .owner = THIS_MODULE, + .suspend = tpm_pm_suspend, + .resume = tpm_pm_resume, +}; + +static int __init init_atmel(void) +{ int rc = 0; - int lo, hi; - - if (pci_enable_device(pci_dev)) - return -EIO; - - lo = tpm_read_index(TPM_ADDR, TPM_ATMEL_BASE_ADDR_LO); - hi = tpm_read_index(TPM_ADDR, TPM_ATMEL_BASE_ADDR_HI); - - tpm_atmel.base = (hi<<8)|lo; - dev_dbg( &pci_dev->dev, "Operating with base: 0x%x\n", tpm_atmel.base); - - /* verify that it is an Atmel part */ - if (tpm_read_index(TPM_ADDR, 4) != 'A' || tpm_read_index(TPM_ADDR, 5) != 'T' - || tpm_read_index(TPM_ADDR, 6) != 'M' || tpm_read_index(TPM_ADDR, 7) != 'L') { + + driver_register(&atml_drv); + + if ((tpm_atmel.iobase = atmel_get_base_addr(&tpm_atmel)) == NULL) { rc = -ENODEV; - goto out_err; + goto err_unreg_drv; } /* query chip for its version number */ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/char/tty_io.c --- a/linux-2.6-xen-sparse/drivers/char/tty_io.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/char/tty_io.c Wed Feb 1 18:00:19 2006 @@ -811,7 +811,7 @@ check_tty_count(tty, "do_tty_hangup"); file_list_lock(); /* This breaks for file handles being sent over AF_UNIX sockets ? 
*/ - list_for_each_entry(filp, &tty->tty_files, f_list) { + list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) { if (filp->f_op->write == redirected_tty_write) cons_filp = filp; if (filp->f_op->write != tty_write) @@ -1418,14 +1418,11 @@ /* Release locally allocated memory ... nothing placed in slots */ free_mem_out: - if (o_tp) - kfree(o_tp); + kfree(o_tp); if (o_tty) free_tty_struct(o_tty); - if (ltp) - kfree(ltp); - if (tp) - kfree(tp); + kfree(ltp); + kfree(tp); free_tty_struct(tty); fail_no_mem: @@ -2730,7 +2727,7 @@ pty_line_name(driver, index, name); else tty_line_name(driver, index, name); - class_device_create(tty_class, dev, device, name); + class_device_create(tty_class, NULL, dev, device, "%s", name); } /** @@ -2985,14 +2982,14 @@ register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0) panic("Couldn't register /dev/tty driver\n"); devfs_mk_cdev(MKDEV(TTYAUX_MAJOR, 0), S_IFCHR|S_IRUGO|S_IWUGO, "tty"); - class_device_create(tty_class, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty"); + class_device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty"); cdev_init(&console_cdev, &console_fops); if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0) panic("Couldn't register /dev/console driver\n"); devfs_mk_cdev(MKDEV(TTYAUX_MAJOR, 1), S_IFCHR|S_IRUSR|S_IWUSR, "console"); - class_device_create(tty_class, MKDEV(TTYAUX_MAJOR, 1), NULL, "console"); + class_device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 1), NULL, "console"); #ifdef CONFIG_UNIX98_PTYS cdev_init(&ptmx_cdev, &ptmx_fops); @@ -3000,7 +2997,7 @@ register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0) panic("Couldn't register /dev/ptmx driver\n"); devfs_mk_cdev(MKDEV(TTYAUX_MAJOR, 2), S_IFCHR|S_IRUGO|S_IWUGO, "ptmx"); - class_device_create(tty_class, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); + class_device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); #endif #ifdef CONFIG_VT @@ -3011,7 +3008,7 @@ register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0) panic("Couldn't register /dev/tty0 driver\n"); devfs_mk_cdev(MKDEV(TTY_MAJOR, 0), S_IFCHR|S_IRUSR|S_IWUSR, "vc/0"); - class_device_create(tty_class, MKDEV(TTY_MAJOR, 0), NULL, "tty0"); + class_device_create(tty_class, NULL, MKDEV(TTY_MAJOR, 0), NULL, "tty0"); vty_init(); out_vt: diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/firmware/Kconfig --- a/linux-2.6-xen-sparse/drivers/firmware/Kconfig Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/firmware/Kconfig Wed Feb 1 18:00:19 2006 @@ -60,6 +60,7 @@ config DELL_RBU tristate "BIOS update support for DELL systems via sysfs" + depends on X86 select FW_LOADER help Say m if you want to have the option of updating the BIOS for your @@ -70,8 +71,7 @@ config DCDBAS tristate "Dell Systems Management Base Driver" - depends on X86 || X86_64 - default m + depends on X86 help The Dell Systems Management Base Driver provides a sysfs interface for systems management software to perform System Management diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/serial/Kconfig --- a/linux-2.6-xen-sparse/drivers/serial/Kconfig Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/serial/Kconfig Wed Feb 1 18:00:19 2006 @@ -10,7 +10,8 @@ # The new 8250/16550 serial drivers config SERIAL_8250 tristate "8250/16550 and compatible serial support" - depends on (BROKEN || !(SPARC64 || SPARC32 || XEN_DISABLE_SERIAL)) + depends on (BROKEN || !SPARC) + depends on !XEN_DISABLE_SERIAL select 
SERIAL_CORE ---help--- This selects whether you want to include the driver for the standard @@ -207,6 +208,14 @@ system, say Y to this option. The driver can handle 1, 2, or 3 port cards. If unsure, say N. +config SERIAL_8250_AU1X00 + bool "AU1X00 serial port support" + depends on SERIAL_8250 != n && SOC_AU1X00 + help + If you have an Au1x00 board and want to use the serial port, say Y + to this option. The driver can handle 1 or 2 serial ports. + If unsure, say N. + comment "Non-8250 serial port support" config SERIAL_AMBA_PL010 @@ -461,14 +470,14 @@ config SERIAL_SUNCORE bool - depends on SPARC32 || SPARC64 + depends on SPARC select SERIAL_CORE select SERIAL_CORE_CONSOLE default y config SERIAL_SUNZILOG tristate "Sun Zilog8530 serial support" - depends on SPARC32 || SPARC64 + depends on SPARC help This driver supports the Zilog8530 serial ports found on many Sparc systems. Say Y or M if you want to be able to these serial ports. @@ -483,7 +492,7 @@ config SERIAL_SUNSU tristate "Sun SU serial support" - depends on (SPARC32 || SPARC64) && PCI + depends on SPARC && PCI help This driver supports the 8250 serial ports that run the keyboard and mouse on (PCI) UltraSPARC systems. Say Y or M if you want to be able @@ -499,7 +508,7 @@ config SERIAL_MUX tristate "Serial MUX support" - depends on PARISC + depends on GSC select SERIAL_CORE default y ---help--- @@ -539,7 +548,7 @@ config SERIAL_SUNSAB tristate "Sun Siemens SAB82532 serial support" - depends on (SPARC32 || SPARC64) && PCI + depends on SPARC && PCI help This driver supports the Siemens SAB82532 DUSCC serial ports on newer (PCI) UltraSPARC systems. Say Y or M if you want to be able to these diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c --- a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Wed Feb 1 18:00:19 2006 @@ -92,14 +92,14 @@ static struct timer_list balloon_timer; /* Use the private and mapping fields of struct page as a list. */ -#define PAGE_TO_LIST(p) ((struct list_head *)&p->private) +#define PAGE_TO_LIST(p) ((struct list_head *)&p->u.private) #define LIST_TO_PAGE(l) \ - (list_entry(((unsigned long *)l), struct page, private)) + (list_entry(((unsigned long *)l), struct page, u.private)) #define UNLIST_PAGE(p) \ do { \ list_del(PAGE_TO_LIST(p)); \ p->mapping = NULL; \ - p->private = 0; \ + p->u.private = 0; \ } while(0) #define IPRINTK(fmt, args...) 
\ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/drivers/xen/core/smpboot.c --- a/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/drivers/xen/core/smpboot.c Wed Feb 1 18:00:19 2006 @@ -126,8 +126,10 @@ static void cpu_bringup(void) { - if (!cpu_isset(smp_processor_id(), cpu_initialized)) + if (!cpu_isset(smp_processor_id(), cpu_initialized)) { cpu_init(); + preempt_disable(); + } local_irq_enable(); cpu_idle(); } diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/fs/Kconfig --- a/linux-2.6-xen-sparse/fs/Kconfig Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/fs/Kconfig Wed Feb 1 18:00:19 2006 @@ -810,7 +810,7 @@ config HUGETLBFS bool "HugeTLB file system support" - depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN + depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN depends !XEN config HUGETLB_PAGE @@ -899,6 +899,7 @@ config HFS_FS tristate "Apple Macintosh file system support (EXPERIMENTAL)" depends on EXPERIMENTAL + select NLS help If you say Y here, you will be able to mount Macintosh-formatted floppy disks and hard drive partitions with full read-write access. @@ -1051,6 +1052,19 @@ - NOR flash with transparent ECC - DataFlash +config JFFS2_SUMMARY + bool "JFFS2 summary support (EXPERIMENTAL)" + depends on JFFS2_FS && EXPERIMENTAL + default n + help + This feature makes it possible to use summary information + for faster filesystem mount. + + The summary information can be inserted into a filesystem image + by the utility 'sumtool'. + + If unsure, say 'N'. + config JFFS2_COMPRESSION_OPTIONS bool "Advanced compression options for JFFS2" depends on JFFS2_FS @@ -1072,10 +1086,10 @@ default y help Zlib is designed to be a free, general-purpose, legally unencumbered, - lossless data-compression library for use on virtually any computer + lossless data-compression library for use on virtually any computer hardware and operating system. See <http://www.gzip.org/zlib/> for further information. - + Say 'Y' if unsure. config JFFS2_RTIME @@ -1097,7 +1111,7 @@ default JFFS2_CMODE_PRIORITY depends on JFFS2_FS help - You can set here the default compression mode of JFFS2 from + You can set here the default compression mode of JFFS2 from the available compression modes. Don't touch if unsure. config JFFS2_CMODE_NONE @@ -1108,13 +1122,13 @@ config JFFS2_CMODE_PRIORITY bool "priority" help - Tries the compressors in a predefinied order and chooses the first + Tries the compressors in a predefinied order and chooses the first successful one. config JFFS2_CMODE_SIZE bool "size (EXPERIMENTAL)" help - Tries all compressors and chooses the one which has the smallest + Tries all compressors and chooses the one which has the smallest result. endchoice @@ -1588,9 +1602,10 @@ PC operating systems. The CIFS protocol is fully supported by file servers such as Windows 2000 (including Windows 2003, NT 4 and Windows XP) as well by Samba (which provides excellent CIFS - server support for Linux and many other operating systems). Currently - you must use the smbfs client filesystem to access older SMB servers - such as Windows 9x and OS/2. + server support for Linux and many other operating systems). Limited + support for Windows ME and similar servers is provided as well. + You must use the smbfs client filesystem to access older SMB servers + such as OS/2 and DOS. 
The intent of the cifs module is to provide an advanced network file system client for mounting to CIFS compliant servers, @@ -1601,7 +1616,7 @@ cifs if running only a (Samba) server. It is possible to enable both smbfs and cifs (e.g. if you are using CIFS for accessing Windows 2003 and Samba 3 servers, and smbfs for accessing old servers). If you need - to mount to Samba or Windows 2003 servers from this machine, say Y. + to mount to Samba or Windows from this machine, say Y. config CIFS_STATS bool "CIFS statistics" @@ -1610,8 +1625,22 @@ Enabling this option will cause statistics for each server share mounted by the cifs client to be displayed in /proc/fs/cifs/Stats +config CIFS_STATS2 + bool "CIFS extended statistics" + depends on CIFS_STATS + help + Enabling this option will allow more detailed statistics on SMB + request timing to be displayed in /proc/fs/cifs/DebugData and also + allow optional logging of slow responses to dmesg (depending on the + value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details). + These additional statistics may have a minor effect on performance + and memory utilization. + + Unless you are a developer or are doing network performance analysis + or tuning, say N. + config CIFS_XATTR - bool "CIFS extended attributes (EXPERIMENTAL)" + bool "CIFS extended attributes" depends on CIFS help Extended attributes are name:value pairs associated with inodes by @@ -1623,11 +1652,11 @@ prefaced by the user namespace prefix. The system namespace (used by some filesystems to store ACLs) is not supported at this time. - + If unsure, say N. config CIFS_POSIX - bool "CIFS POSIX Extensions (EXPERIMENTAL)" + bool "CIFS POSIX Extensions" depends on CIFS_XATTR help Enabling this option will cause the cifs client to attempt to @@ -1640,10 +1669,28 @@ config CIFS_EXPERIMENTAL bool "CIFS Experimental Features (EXPERIMENTAL)" - depends on CIFS + depends on CIFS && EXPERIMENTAL help - Enables cifs features under testing. These features - are highly experimental. If unsure, say N. + Enables cifs features under testing. These features are + experimental and currently include support for writepages + (multipage writebehind performance improvements) and directory + change notification ie fcntl(F_DNOTIFY) as well as some security + improvements. Some also depend on setting at runtime the + pseudo-file /proc/fs/cifs/Experimental (which is disabled by + default). See the file fs/cifs/README for more details. + + If unsure, say N. + +config CIFS_UPCALL + bool "CIFS Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" + depends on CIFS_EXPERIMENTAL + select CONNECTOR + help + Enables an upcall mechanism for CIFS which will be used to contact + userspace helper utilities to provide SPNEGO packaged Kerberos + tickets which are needed to mount to certain secure servers + (for which more secure Kerberos authentication is required). If + unsure, say N. config NCP_FS tristate "NCP file system support (to mount NetWare volumes)" diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/atomic.h --- a/linux-2.6-xen-sparse/include/asm-i386/atomic.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/atomic.h Wed Feb 1 18:00:19 2006 @@ -210,6 +210,27 @@ return atomic_add_return(-i,v); } +#define atomic_cmpxchg(v, old, new) ((int)cmpxchg(&((v)->counter), old, new)) + +/** + * atomic_add_unless - add unless the number is a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. 
+ * + * Atomically adds @a to @v, so long as it was not @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +#define atomic_add_unless(v, a, u) \ +({ \ + int c, old; \ + c = atomic_read(v); \ + while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \ + c = old; \ + c != (u); \ +}) +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + #define atomic_inc_return(v) (atomic_add_return(1,v)) #define atomic_dec_return(v) (atomic_sub_return(1,v)) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/desc.h Wed Feb 1 18:00:19 2006 @@ -14,6 +14,8 @@ #include <asm/mmu.h> extern struct desc_struct cpu_gdt_table[NR_CPUS][GDT_ENTRIES]; + +#define get_cpu_gdt_table(_cpu) ((struct desc_struct *)cpu_gdt_descr[(_cpu)].address) DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); @@ -37,8 +39,6 @@ #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) #define store_tr(tr) __asm__ ("str %0":"=mr" (tr)) #define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt)) - -#define get_cpu_gdt_table(_cpu) ((struct desc_struct *)cpu_gdt_descr[(_cpu)].address) /* * This is the ldt that every process will get unless we need @@ -68,8 +68,7 @@ static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) { - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], - (int)addr, ((size << 3)-1), 0x82); + _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82); } #define LDT_entry_a(info) \ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/mmu_context.h Wed Feb 1 18:00:19 2006 @@ -63,7 +63,6 @@ cpu_set(cpu, next->cpu_vm_mask); /* Re-load page tables: load_cr3(next->pgd) */ - per_cpu(cur_pgd, cpu) = next->pgd; op->cmd = MMUEXT_NEW_BASEPTR; op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); op++; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/param.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/param.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/param.h Wed Feb 1 18:00:19 2006 @@ -1,9 +1,8 @@ -#include <linux/config.h> - #ifndef _ASMi386_PARAM_H #define _ASMi386_PARAM_H #ifdef __KERNEL__ +# include <linux/config.h> # define HZ CONFIG_HZ /* Internal kernel timer frequency */ # define USER_HZ 100 /* .. 
some user interfaces are in "ticks" */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-2level.h Wed Feb 1 18:00:19 2006 @@ -76,11 +76,6 @@ #define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) - -#define pmd_page_kernel(pmd) \ -((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) - /* * All present user pages are user-executable: */ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable-3level.h Wed Feb 1 18:00:19 2006 @@ -103,11 +103,6 @@ */ static inline void pud_clear (pud_t * pud) { } -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) - -#define pmd_page_kernel(pmd) \ -((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) - #define pud_page(pud) \ ((struct page *) __va(pud_val(pud) & PAGE_MASK)) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/pgtable.h Wed Feb 1 18:00:19 2006 @@ -25,6 +25,9 @@ #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> + +struct mm_struct; +struct vm_area_struct; /* * ZERO_PAGE is a global shared page that is always zero: used @@ -204,7 +207,8 @@ #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) -#define pmd_none(x) (!pmd_val(x)) +/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ +#define pmd_none(x) (!(unsigned long)pmd_val(x)) /* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. can temporarily clear it. 
*/ #define pmd_present(x) (pmd_val(x)) @@ -326,8 +330,6 @@ return pte; } -#define page_pte(page) page_pte_prot(page, __pgprot(0)) - #define pmd_large(pmd) \ ((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) @@ -371,6 +373,11 @@ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) + +#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) + +#define pmd_page_kernel(pmd) \ + ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) /* * Helper function that returns the kernel pagetable entry controlling diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/processor.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/processor.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/processor.h Wed Feb 1 18:00:19 2006 @@ -66,7 +66,9 @@ int f00f_bug; int coma_bug; unsigned long loops_per_jiffy; - unsigned char x86_num_cores; + unsigned char x86_max_cores; /* cpuid returned max cores value */ + unsigned char booted_cores; /* number of cores as seen by OS */ + unsigned char apicid; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -89,7 +91,6 @@ extern struct cpuinfo_x86 new_cpu_data; extern struct tss_struct doublefault_tss; DECLARE_PER_CPU(struct tss_struct, init_tss); -DECLARE_PER_CPU(pgd_t *, cur_pgd); #ifdef CONFIG_SMP extern struct cpuinfo_x86 cpu_data[]; @@ -724,4 +725,10 @@ #define mtrr_bp_init() do {} while (0) #endif +#ifdef CONFIG_X86_MCE +extern void mcheck_init(struct cpuinfo_x86 *c); +#else +#define mcheck_init(c) do {} while(0) +#endif + #endif /* __ASM_I386_PROCESSOR_H */ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/smp.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/smp.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/smp.h Wed Feb 1 18:00:19 2006 @@ -45,6 +45,8 @@ #define MAX_APICID 256 extern u8 x86_cpu_to_apicid[]; +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] + #ifdef CONFIG_HOTPLUG_CPU extern void cpu_exit_clear(void); extern void cpu_uninit(void); @@ -91,6 +93,10 @@ extern void __cpu_die(unsigned int cpu); #endif /* !__ASSEMBLY__ */ +#else /* CONFIG_SMP */ + +#define cpu_physical_id(cpu) boot_cpu_physical_apicid + #define NO_PROC_ID 0xFF /* No processor magic marker */ #endif diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/system.h --- a/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/system.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/mach-xen/asm/system.h Wed Feb 1 18:00:19 2006 @@ -122,21 +122,22 @@ #define write_cr0(x) \ __asm__ __volatile__("movl %0,%%cr0": :"r" (x)); -#define read_cr2() ({ \ - unsigned int __dummy; \ - __asm__ __volatile__( \ - "movl %%cr2,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy; \ -}) +#define read_cr2() \ + (HYPERVISOR_shared_info->vcpu_info[smp_processor_id()].arch.cr2) #define write_cr2(x) \ __asm__ __volatile__("movl %0,%%cr2": :"r" (x)); -#define read_cr3() per_cpu(cur_pgd, smp_processor_id()) -#define write_cr3(x) do { \ - xen_pt_switch((x)); \ - per_cpu(cur_pgd, smp_processor_id()) = (x); \ -} while (/* CONSTCOND */0) +#define read_cr3() ({ \ + unsigned int __dummy; \ + __asm__ ( \ + "movl %%cr3,%0\n\t" \ + :"=r" (__dummy)); \ + machine_to_phys(__dummy); \ +}) +#define write_cr3(x) ({ \ + maddr_t __dummy = phys_to_machine(x); \ + __asm__ __volatile__("movl %0,%%cr3": 
:"r" (__dummy)); \ +}) #define read_cr4() ({ \ unsigned int __dummy; \ @@ -147,7 +148,6 @@ }) #define write_cr4(x) \ __asm__ __volatile__("movl %0,%%cr4": :"r" (x)); - #define stts() (HYPERVISOR_fpu_taskswitch(1)) #endif /* __KERNEL__ */ @@ -172,6 +172,8 @@ struct __xchg_dummy { unsigned long a[100]; }; #define __xg(x) ((struct __xchg_dummy *)(x)) + +#ifdef CONFIG_X86_CMPXCHG64 /* * The semantics of XCHGCMP8B are a bit strange, this is why @@ -226,6 +228,8 @@ (__builtin_constant_p(value) ? \ __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \ __set_64bit(ptr, ll_low(value), ll_high(value)) ) + +#endif /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway @@ -265,6 +269,9 @@ #ifdef CONFIG_X86_CMPXCHG #define __HAVE_ARCH_CMPXCHG 1 +#define cmpxchg(ptr,o,n)\ + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ + (unsigned long)(n),sizeof(*(ptr)))) #endif static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, @@ -281,22 +288,78 @@ case 2: __asm__ __volatile__(LOCK "cmpxchgw %w1,%2" : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "r"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); return prev; case 4: __asm__ __volatile__(LOCK "cmpxchgl %1,%2" : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "r"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); return prev; } return old; } -#define cmpxchg(ptr,o,n)\ - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ - (unsigned long)(n),sizeof(*(ptr)))) +#ifndef CONFIG_X86_CMPXCHG +/* + * Building a kernel capable running on 80386. It may be necessary to + * simulate the cmpxchg on the 80386 CPU. For that purpose we define + * a function for each of the sizes we support. + */ + +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8); +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16); +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32); + +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + switch (size) { + case 1: + return cmpxchg_386_u8(ptr, old, new); + case 2: + return cmpxchg_386_u16(ptr, old, new); + case 4: + return cmpxchg_386_u32(ptr, old, new); + } + return old; +} + +#define cmpxchg(ptr,o,n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + if (likely(boot_cpu_data.x86 > 3)) \ + __ret = __cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + else \ + __ret = cmpxchg_386((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + __ret; \ +}) +#endif + +#ifdef CONFIG_X86_CMPXCHG64 + +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old, + unsigned long long new) +{ + unsigned long long prev; + __asm__ __volatile__(LOCK "cmpxchg8b %3" + : "=A"(prev) + : "b"((unsigned long)new), + "c"((unsigned long)(new >> 32)), + "m"(*__xg(ptr)), + "0"(old) + : "memory"); + return prev; +} + +#define cmpxchg64(ptr,o,n)\ + ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\ + (unsigned long long)(n))) + +#endif #ifdef __KERNEL__ struct alt_instr { diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-i386/rwsem.h --- a/linux-2.6-xen-sparse/include/asm-i386/rwsem.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/rwsem.h Wed Feb 1 18:00:19 2006 @@ -285,5 +285,10 @@ return tmp+delta; } +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + #endif /* __KERNEL__ */ #endif /* _I386_RWSEM_H */ diff -r d609de73b9fa -r 5a63f675107c 
linux-2.6-xen-sparse/include/asm-i386/system.h --- a/linux-2.6-xen-sparse/include/asm-i386/system.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-i386/system.h Wed Feb 1 18:00:19 2006 @@ -167,6 +167,8 @@ #define __xg(x) ((struct __xchg_dummy *)(x)) +#ifdef CONFIG_X86_CMPXCHG64 + /* * The semantics of XCHGCMP8B are a bit strange, this is why * there is a loop and the loading of %%eax and %%edx has to @@ -220,6 +222,8 @@ (__builtin_constant_p(value) ? \ __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \ __set_64bit(ptr, ll_low(value), ll_high(value)) ) + +#endif /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway @@ -259,6 +263,9 @@ #ifdef CONFIG_X86_CMPXCHG #define __HAVE_ARCH_CMPXCHG 1 +#define cmpxchg(ptr,o,n)\ + ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ + (unsigned long)(n),sizeof(*(ptr)))) #endif static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, @@ -275,22 +282,78 @@ case 2: __asm__ __volatile__(LOCK "cmpxchgw %w1,%2" : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "r"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); return prev; case 4: __asm__ __volatile__(LOCK "cmpxchgl %1,%2" : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) + : "r"(new), "m"(*__xg(ptr)), "0"(old) : "memory"); return prev; } return old; } -#define cmpxchg(ptr,o,n)\ - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ - (unsigned long)(n),sizeof(*(ptr)))) +#ifndef CONFIG_X86_CMPXCHG +/* + * Building a kernel capable running on 80386. It may be necessary to + * simulate the cmpxchg on the 80386 CPU. For that purpose we define + * a function for each of the sizes we support. + */ + +extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8); +extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16); +extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32); + +static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, + unsigned long new, int size) +{ + switch (size) { + case 1: + return cmpxchg_386_u8(ptr, old, new); + case 2: + return cmpxchg_386_u16(ptr, old, new); + case 4: + return cmpxchg_386_u32(ptr, old, new); + } + return old; +} + +#define cmpxchg(ptr,o,n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + if (likely(boot_cpu_data.x86 > 3)) \ + __ret = __cmpxchg((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + else \ + __ret = cmpxchg_386((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr))); \ + __ret; \ +}) +#endif + +#ifdef CONFIG_X86_CMPXCHG64 + +static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old, + unsigned long long new) +{ + unsigned long long prev; + __asm__ __volatile__(LOCK "cmpxchg8b %3" + : "=A"(prev) + : "b"((unsigned long)new), + "c"((unsigned long)(new >> 32)), + "m"(*__xg(ptr)), + "0"(old) + : "memory"); + return prev; +} + +#define cmpxchg64(ptr,o,n)\ + ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\ + (unsigned long long)(n))) + +#endif #ifdef __KERNEL__ struct alt_instr { diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-um/page.h --- a/linux-2.6-xen-sparse/include/asm-um/page.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-um/page.h Wed Feb 1 18:00:19 2006 @@ -115,7 +115,7 @@ #define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) -extern struct page *arch_validate(struct page *page, int mask, int order); +extern struct page *arch_validate(struct page *page, gfp_t mask, int 
order); #define HAVE_ARCH_VALIDATE extern int arch_free_page(struct page *page, int order); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/desc.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/desc.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/desc.h Wed Feb 1 18:00:19 2006 @@ -114,16 +114,19 @@ static inline void set_intr_gate(int nr, void *func) { + BUG_ON((unsigned)nr > 0xFF); _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); } static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) { + BUG_ON((unsigned)nr > 0xFF); _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); } static inline void set_system_gate(int nr, void *func) { + BUG_ON((unsigned)nr > 0xFF); _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); } @@ -145,10 +148,16 @@ static inline void set_tss_desc(unsigned cpu, void *addr) { + /* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. See tss_struct definition in processor.h + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ set_tssldt_descriptor((struct ldttss_desc *)&get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS], - (unsigned long)addr, - DESC_TSS, - sizeof(struct tss_struct) - 1); + (unsigned long)addr, DESC_TSS, + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1); } static inline void set_ldt_desc(unsigned cpu, void *addr, int size) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hw_irq.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hw_irq.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/hw_irq.h Wed Feb 1 18:00:19 2006 @@ -56,7 +56,7 @@ #define CALL_FUNCTION_VECTOR 0xfc #define KDB_VECTOR 0xfb /* reserved for KDB */ #define THERMAL_APIC_VECTOR 0xfa -/* 0xf9 free */ +#define THRESHOLD_APIC_VECTOR 0xf9 #define INVALIDATE_TLB_VECTOR_END 0xf8 #define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f8 used for TLB flush */ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/mmu_context.h Wed Feb 1 18:00:19 2006 @@ -60,6 +60,12 @@ extern void mm_unpin(struct mm_struct *mm); void mm_pin_all(void); +static inline void load_cr3(pgd_t *pgd) +{ + asm volatile("movq %0,%%cr3" :: "r" (phys_to_machine(__pa(pgd))) : + "memory"); +} + static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -79,7 +85,6 @@ set_bit(cpu, &next->cpu_vm_mask); /* load_cr3(next->pgd) */ - per_cpu(cur_pgd, smp_processor_id()) = next->pgd; op->cmd = MMUEXT_NEW_BASEPTR; op->arg1.mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); op++; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/page.h Wed Feb 1 18:00:19 2006 @@ -31,7 +31,7 @@ #define PAGE_SIZE (1UL << PAGE_SHIFT) #endif #define PAGE_MASK (~(PAGE_SIZE-1)) -#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & (__PHYSICAL_MASK << PAGE_SHIFT)) +#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK) #define THREAD_ORDER 1 #ifdef __ASSEMBLY__ diff -r d609de73b9fa -r 5a63f675107c 
linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/param.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/param.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/param.h Wed Feb 1 18:00:19 2006 @@ -1,9 +1,8 @@ -#include <linux/config.h> - #ifndef _ASMx86_64_PARAM_H #define _ASMx86_64_PARAM_H #ifdef __KERNEL__ +# include <linux/config.h> # define HZ CONFIG_HZ /* Internal kernel timer frequency */ # define USER_HZ 100 /* .. some user interfaces are in "ticks */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */ diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h Wed Feb 1 18:00:19 2006 @@ -39,6 +39,7 @@ extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; +extern pgd_t boot_level4_pgt[]; extern unsigned long __supported_pte_mask; #define swapper_pg_dir init_level4_pgt @@ -142,6 +143,8 @@ return pte; } #endif + +struct mm_struct; static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) { @@ -405,8 +408,6 @@ * and a page entry and page directory to the page they refer to. */ -#define page_pte(page) page_pte_prot(page, __pgprot(0)) - /* * Level 4 access. * Never use these in the common code. @@ -450,7 +451,7 @@ #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) -#define pmd_pfn(x) ((pmd_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK) +#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT) #define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) #define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE }) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/processor.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/processor.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/processor.h Wed Feb 1 18:00:19 2006 @@ -61,10 +61,12 @@ int x86_cache_alignment; int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/ __u8 x86_virt_bits, x86_phys_bits; - __u8 x86_num_cores; + __u8 x86_max_cores; /* cpuid returned max cores value */ __u32 x86_power; __u32 extended_cpuid_level; /* Max extended CPUID function supported */ unsigned long loops_per_jiffy; + __u8 apicid; + __u8 booted_cores; /* number of cores as seen by OS */ } ____cacheline_aligned; #define X86_VENDOR_INTEL 0 @@ -167,11 +169,6 @@ #endif } - -#define load_cr3(pgdir) do { \ - xen_pt_switch(__pa(pgdir)); \ - per_cpu(cur_pgd, smp_processor_id()) = pgdir; \ -} while (/* CONSTCOND */0) /* * Bus types @@ -247,7 +244,6 @@ extern struct cpuinfo_x86 boot_cpu_data; DECLARE_PER_CPU(struct tss_struct,init_tss); -DECLARE_PER_CPU(pgd_t *, cur_pgd); #define ARCH_MIN_TASKALIGN 16 diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/smp.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/smp.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/smp.h Wed Feb 1 18:00:19 2006 @@ -46,7 +46,6 @@ extern void unlock_ipi_call_lock(void); extern int smp_num_siblings; extern void smp_send_reschedule(int cpu); 
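The asm-x86_64 param.h hunk at the start of this group only moves the <linux/config.h> include under __KERNEL__; the tick model is unchanged: HZ follows CONFIG_HZ while USER_HZ (and CLOCKS_PER_SEC) stay at 100, so tick counts exported to user space are rescaled by USER_HZ/HZ. A minimal userspace sketch of that scaling, assuming HZ = 250 purely as an example value (jiffies_to_user_ticks() is an illustrative name, not the kernel helper):

#include <stdio.h>

#define HZ      250     /* assumed CONFIG_HZ for this example */
#define USER_HZ 100     /* fixed user-visible tick rate       */

/*
 * Scale internal ticks (jiffies) to the USER_HZ-based values that
 * interfaces such as times() report.
 */
static unsigned long long jiffies_to_user_ticks(unsigned long long jiffies)
{
        return jiffies * USER_HZ / HZ;
}

int main(void)
{
        /* 500 jiffies at HZ=250 is two seconds, i.e. 200 USER_HZ ticks. */
        printf("%llu\n", jiffies_to_user_ticks(500));
        return 0;
}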
-extern void zap_low_mappings(void); void smp_stop_cpu(void); extern int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, int retry, int wait); @@ -83,6 +82,8 @@ extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern void prefill_possible_map(void); +extern unsigned num_processors; +extern unsigned disabled_cpus; #endif /* !ASSEMBLY */ @@ -139,5 +140,11 @@ #endif #endif +#ifdef CONFIG_SMP +#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] +#else +#define cpu_physical_id(cpu) boot_cpu_id #endif +#endif + diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/system.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/system.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/system.h Wed Feb 1 18:00:19 2006 @@ -9,6 +9,12 @@ #include <xen/interface/arch-x86_64.h> #ifdef __KERNEL__ + +#ifdef CONFIG_SMP +#define __vcpu_id smp_processor_id() +#else +#define __vcpu_id 0 +#endif #ifdef CONFIG_SMP #define LOCK_PREFIX "lock ; " @@ -325,7 +331,7 @@ do { \ vcpu_info_t *_vcpu; \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \ _vcpu->evtchn_upcall_mask = 1; \ preempt_enable_no_resched(); \ barrier(); \ @@ -336,7 +342,7 @@ vcpu_info_t *_vcpu; \ barrier(); \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \ _vcpu->evtchn_upcall_mask = 0; \ barrier(); /* unmask then check (avoid races) */ \ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ @@ -348,7 +354,7 @@ do { \ vcpu_info_t *_vcpu; \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \ (x) = _vcpu->evtchn_upcall_mask; \ preempt_enable(); \ } while (0) @@ -358,7 +364,7 @@ vcpu_info_t *_vcpu; \ barrier(); \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \ if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ barrier(); /* unmask then check (avoid races) */ \ if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ @@ -374,7 +380,7 @@ do { \ vcpu_info_t *_vcpu; \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \ (x) = _vcpu->evtchn_upcall_mask; \ _vcpu->evtchn_upcall_mask = 1; \ preempt_enable_no_resched(); \ @@ -394,7 +400,7 @@ ({ int ___x; \ vcpu_info_t *_vcpu; \ preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]; \ + _vcpu = &HYPERVISOR_shared_info->vcpu_info[__vcpu_id]; \ ___x = (_vcpu->evtchn_upcall_mask != 0); \ preempt_enable_no_resched(); \ ___x; }) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/linux/gfp.h --- a/linux-2.6-xen-sparse/include/linux/gfp.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/linux/gfp.h Wed Feb 1 18:00:19 2006 @@ -11,9 +11,16 @@ /* * GFP bitmasks.. 
*/ -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */ -#define __GFP_DMA 0x01u -#define __GFP_HIGHMEM 0x02u +/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */ +#define __GFP_DMA ((__force gfp_t)0x01u) +#define __GFP_HIGHMEM ((__force gfp_t)0x02u) +#ifdef CONFIG_DMA_IS_DMA32 +#define __GFP_DMA32 ((__force gfp_t)0x01) /* ZONE_DMA is ZONE_DMA32 */ +#elif BITS_PER_LONG < 64 +#define __GFP_DMA32 ((__force gfp_t)0x00) /* ZONE_NORMAL is ZONE_DMA32 */ +#else +#define __GFP_DMA32 ((__force gfp_t)0x04) /* Has own ZONE_DMA32 */ +#endif /* * Action modifiers - doesn't change the zoning @@ -26,30 +33,29 @@ * * __GFP_NORETRY: The VM implementation must not retry indefinitely. */ -#define __GFP_WAIT 0x10u /* Can wait and reschedule? */ -#define __GFP_HIGH 0x20u /* Should access emergency pools? */ -#define __GFP_IO 0x40u /* Can start physical IO? */ -#define __GFP_FS 0x80u /* Can call down to low-level FS? */ -#define __GFP_COLD 0x100u /* Cache-cold page required */ -#define __GFP_NOWARN 0x200u /* Suppress page allocation failure warning */ -#define __GFP_REPEAT 0x400u /* Retry the allocation. Might fail */ -#define __GFP_NOFAIL 0x800u /* Retry for ever. Cannot fail */ -#define __GFP_NORETRY 0x1000u /* Do not retry. Might fail */ -#define __GFP_NO_GROW 0x2000u /* Slab internal usage */ -#define __GFP_COMP 0x4000u /* Add compound page metadata */ -#define __GFP_ZERO 0x8000u /* Return zeroed page on success */ -#define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */ -#define __GFP_NORECLAIM 0x20000u /* No realy zone reclaim during allocation */ -#define __GFP_HARDWALL 0x40000u /* Enforce hardwall cpuset memory allocs */ +#define __GFP_WAIT ((__force gfp_t)0x10u) /* Can wait and reschedule? */ +#define __GFP_HIGH ((__force gfp_t)0x20u) /* Should access emergency pools? */ +#define __GFP_IO ((__force gfp_t)0x40u) /* Can start physical IO? */ +#define __GFP_FS ((__force gfp_t)0x80u) /* Can call down to low-level FS? */ +#define __GFP_COLD ((__force gfp_t)0x100u) /* Cache-cold page required */ +#define __GFP_NOWARN ((__force gfp_t)0x200u) /* Suppress page allocation failure warning */ +#define __GFP_REPEAT ((__force gfp_t)0x400u) /* Retry the allocation. Might fail */ +#define __GFP_NOFAIL ((__force gfp_t)0x800u) /* Retry for ever. Cannot fail */ +#define __GFP_NORETRY ((__force gfp_t)0x1000u)/* Do not retry. 
Might fail */ +#define __GFP_NO_GROW ((__force gfp_t)0x2000u)/* Slab internal usage */ +#define __GFP_COMP ((__force gfp_t)0x4000u)/* Add compound page metadata */ +#define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ +#define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */ +#define __GFP_HARDWALL ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ -#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) +#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* if you forget to add the bitmask here kernel will crash, period */ #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \ - __GFP_NOMEMALLOC|__GFP_NORECLAIM|__GFP_HARDWALL) + __GFP_NOMEMALLOC|__GFP_HARDWALL) #define GFP_ATOMIC (__GFP_HIGH) #define GFP_NOIO (__GFP_WAIT) @@ -64,6 +70,16 @@ #define GFP_DMA __GFP_DMA +/* 4GB DMA on some platforms */ +#define GFP_DMA32 __GFP_DMA32 + + +static inline int gfp_zone(gfp_t gfp) +{ + int zone = GFP_ZONEMASK & (__force int) gfp; + BUG_ON(zone >= GFP_ZONETYPES); + return zone; +} /* * There is only one page-allocator function, and two main namespaces to @@ -98,7 +114,7 @@ return NULL; return __alloc_pages(gfp_mask, order, - NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); + NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask)); } #ifdef CONFIG_NUMA diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/linux/irq.h --- a/linux-2.6-xen-sparse/include/linux/irq.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/linux/irq.h Wed Feb 1 18:00:19 2006 @@ -10,6 +10,7 @@ */ #include <linux/config.h> +#include <linux/smp.h> #if !defined(CONFIG_ARCH_S390) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/linux/mm.h --- a/linux-2.6-xen-sparse/include/linux/mm.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/linux/mm.h Wed Feb 1 18:00:19 2006 @@ -144,7 +144,8 @@ #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_GROWSUP 0x00000200 -#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ +#define VM_SHM 0x00000000 /* Means nothing: delete it later */ +#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. 
*/ #define VM_EXECUTABLE 0x00001000 @@ -157,13 +158,14 @@ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ +#define VM_RESERVED 0x00080000 /* Count as reserved_vm like IO */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #ifdef CONFIG_XEN -#define VM_FOREIGN 0x02000000 /* Has pages belonging to another VM */ +#define VM_FOREIGN 0x04000000 /* Has pages belonging to another VM */ #endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ @@ -209,12 +211,6 @@ struct mmu_gather; struct inode; -#ifdef ARCH_HAS_ATOMIC_UNSIGNED -typedef unsigned page_flags_t; -#else -typedef unsigned long page_flags_t; -#endif - /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -222,20 +218,25 @@ * a page. */ struct page { - page_flags_t flags; /* Atomic flags, some possibly + unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ atomic_t _count; /* Usage count, see below. */ atomic_t _mapcount; /* Count of ptes mapped in mms, * to show when page is mapped * & limit reverse map searches. */ - unsigned long private; /* Mapping-private opaque data: + union { + unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads * if PagePrivate set; used for * swp_entry_t if PageSwapCache * When page is free, this indicates * order in the buddy system. */ +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + spinlock_t ptl; +#endif + } u; struct address_space *mapping; /* If low bit clear, points to * inode address_space, or NULL. * If page mapped as anonymous @@ -263,6 +264,9 @@ #endif /* WANT_PAGE_VIRTUAL */ }; +#define page_private(page) ((page)->u.private) +#define set_page_private(page, v) ((page)->u.private = (v)) + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) @@ -312,40 +316,21 @@ extern void FASTCALL(__page_cache_release(struct page *)); -#ifdef CONFIG_HUGETLB_PAGE - -static inline int page_count(struct page *p) -{ - if (PageCompound(p)) - p = (struct page *)p->private; - return atomic_read(&(p)->_count) + 1; +static inline int page_count(struct page *page) +{ + if (PageCompound(page)) + page = (struct page *)page_private(page); + return atomic_read(&page->_count) + 1; } static inline void get_page(struct page *page) { if (unlikely(PageCompound(page))) - page = (struct page *)page->private; + page = (struct page *)page_private(page); atomic_inc(&page->_count); } void put_page(struct page *page); - -#else /* CONFIG_HUGETLB_PAGE */ - -#define page_count(p) (atomic_read(&(p)->_count) + 1) - -static inline void get_page(struct page *page) -{ - atomic_inc(&page->_count); -} - -static inline void put_page(struct page *page) -{ - if (!PageReserved(page) && put_page_testzero(page)) - __page_cache_release(page); -} - -#endif /* CONFIG_HUGETLB_PAGE */ /* * Multiple processes may "see" the same page. E.g. for untouched @@ -430,7 +415,7 @@ #endif /* Page flags: | [SECTION] | [NODE] | ZONE | ... 
| FLAGS | */ -#define SECTIONS_PGOFF ((sizeof(page_flags_t)*8) - SECTIONS_WIDTH) +#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) @@ -590,7 +575,7 @@ static inline pgoff_t page_index(struct page *page) { if (unlikely(PageSwapCache(page))) - return page->private; + return page_private(page); return page->index; } @@ -683,9 +668,10 @@ unsigned long truncate_count; /* Compare vm_truncate_count */ }; +struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t); unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm, +unsigned long unmap_vmas(struct mmu_gather **tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); @@ -707,10 +693,6 @@ } extern int vmtruncate(struct inode * inode, loff_t offset); -extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); -extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); @@ -726,6 +708,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); @@ -750,7 +733,7 @@ * The callback will be passed nr_to_scan == 0 when the VM is querying the * cache size, so a fastpath for that case is appropriate. */ -typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask); +typedef int (*shrinker_t)(int nr_to_scan, gfp_t gfp_mask); /* * Add an aging callback. The int is the number of 'seeks' it takes @@ -762,38 +745,85 @@ extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); -/* - * On a two-level or three-level page table, this ends up being trivial. Thus - * the inlining and the symmetry break with pte_alloc_map() that does all - * of this out-of-line. - */ +extern pte_t *FASTCALL(get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)); + +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); + /* * The following ifdef needed to get the 4level-fixup.h header to work. * Remove it when 4level-fixup.h has been removed. 
*/ -#ifdef CONFIG_MMU -#ifndef __ARCH_HAS_4LEVEL_HACK +#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { - if (pgd_none(*pgd)) - return __pud_alloc(mm, pgd, address); - return pud_offset(pgd, address); + return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? + NULL: pud_offset(pgd, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { - if (pud_none(*pud)) - return __pmd_alloc(mm, pud, address); - return pmd_offset(pud, address); -} -#endif -#endif /* CONFIG_MMU */ + return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? + NULL: pmd_offset(pud, address); +} +#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ + +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +/* + * We tuck a spinlock to guard each pagetable page into its struct page, + * at page->private, with BUILD_BUG_ON to make sure that this will not + * overflow into the next struct page (as it might with DEBUG_SPINLOCK). + * When freeing, reset page->mapping so free_pages_check won't complain. + */ +#define __pte_lockptr(page) &((page)->u.ptl) +#define pte_lock_init(_page) do { \ + spin_lock_init(__pte_lockptr(_page)); \ +} while (0) +#define pte_lock_deinit(page) ((page)->mapping = NULL) +#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) +#else +/* + * We use mm->page_table_lock to guard all pagetable pages of the mm. + */ +#define pte_lock_init(page) do {} while (0) +#define pte_lock_deinit(page) do {} while (0) +#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ + +#define pte_offset_map_lock(mm, pmd, address, ptlp) \ +({ \ + spinlock_t *__ptl = pte_lockptr(mm, pmd); \ + pte_t *__pte = pte_offset_map(pmd, address); \ + *(ptlp) = __ptl; \ + spin_lock(__ptl); \ + __pte; \ +}) + +#define pte_unmap_unlock(pte, ptl) do { \ + spin_unlock(ptl); \ + pte_unmap(pte); \ +} while (0) + +#define pte_alloc_map(mm, pmd, address) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + NULL: pte_offset_map(pmd, address)) + +#define pte_alloc_map_lock(mm, pmd, address, ptlp) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) + +#define pte_alloc_kernel(pmd, address) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? 
\ + NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); +extern void setup_per_zone_pages_min(void); extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); @@ -837,6 +867,7 @@ extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); +extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff); extern void exit_mmap(struct mm_struct *); @@ -884,20 +915,23 @@ * turning readahead off */ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - unsigned long offset, unsigned long nr_to_read); + pgoff_t offset, unsigned long nr_to_read); int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - unsigned long offset, unsigned long nr_to_read); -unsigned long page_cache_readahead(struct address_space *mapping, + pgoff_t offset, unsigned long nr_to_read); +unsigned long page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - unsigned long offset, + pgoff_t offset, unsigned long size); void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra, pgoff_t offset); unsigned long max_sane_readahead(unsigned long nr); /* Do stack extension */ -extern int expand_stack(struct vm_area_struct * vma, unsigned long address); +extern int expand_stack(struct vm_area_struct *vma, unsigned long address); +#ifdef CONFIG_IA64 +extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); +#endif /* Look up the first VMA which satisfies addr < vm_end, NULL if none. 
*/ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); @@ -920,15 +954,19 @@ return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } -extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); - -extern struct page * vmalloc_to_page(void *addr); -extern unsigned long vmalloc_to_pfn(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, - int write); -extern int check_user_page_readable(struct mm_struct *mm, unsigned long address); -int remap_pfn_range(struct vm_area_struct *, unsigned long, - unsigned long, unsigned long, pgprot_t); +struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct page *vmalloc_to_page(void *addr); +unsigned long vmalloc_to_pfn(void *addr); +int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); +int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); + +struct page *follow_page(struct vm_area_struct *, unsigned long address, + unsigned int foll_flags); +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ +#define FOLL_GET 0x04 /* do get_page on page */ +#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ #ifdef CONFIG_XEN typedef int (*pte_fn_t)(pte_t *pte, struct page *pte_page, unsigned long addr, @@ -938,28 +976,13 @@ #endif #ifdef CONFIG_PROC_FS -void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); +void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); #else -static inline void __vm_stat_account(struct mm_struct *mm, +static inline void vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { } #endif /* CONFIG_PROC_FS */ - -static inline void vm_stat_account(struct vm_area_struct *vma) -{ - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - vma_pages(vma)); -} - -static inline void vm_stat_unaccount(struct vm_area_struct *vma) -{ - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - -vma_pages(vma)); -} - -/* update per process rss and vm hiwater data */ -extern void update_mem_hiwater(struct task_struct *tsk); #ifndef CONFIG_DEBUG_PAGEALLOC static inline void diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/linux/skbuff.h --- a/linux-2.6-xen-sparse/include/linux/skbuff.h Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/include/linux/skbuff.h Wed Feb 1 18:00:19 2006 @@ -137,6 +137,8 @@ unsigned int nr_frags; unsigned short tso_size; unsigned short tso_segs; + unsigned short ufo_size; + unsigned int ip6_frag_id; struct sk_buff *frag_list; skb_frag_t frags[MAX_SKB_FRAGS]; }; @@ -171,7 +173,6 @@ * struct sk_buff - socket buffer * @next: Next buffer in list * @prev: Previous buffer in list - * @list: List we are on * @sk: Socket we are owned by * @tstamp: Time we arrived * @dev: Device we arrived on/are leaving by @@ -192,6 +193,7 @@ * @proto_csum_valid: Protocol csum validated since arriving at localhost * @proto_csum_blank: Protocol csum must be added before leaving localhost * @pkt_type: Packet class + * @fclone: skbuff clone status * @ip_summed: Driver fed us an IP checksum * @priority: Packet queueing priority * @users: User count - see {datagram,tcp}.c @@ -204,7 +206,9 @@ * @destructor: Destruct function * @nfmark: Can be used for communication between hooks * @nfct: Associated connection, if any + * @ipvs_property: skbuff is owned by ipvs * @nfctinfo: Relationship of 
this skb to the connection + * @nfct_reasm: netfilter conntrack re-assembly pointer * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @tc_index: Traffic control index * @tc_verd: traffic control verdict @@ -263,13 +267,13 @@ nohdr:1, nfctinfo:3; __u8 pkt_type:3, + fclone:2, #ifndef CONFIG_XEN - fclone:2; + ipvs_property:1; #else - fclone:2, + ipvs_property:1, proto_csum_valid:1, proto_csum_blank:1; - /* 1 bit spare */ #endif __be16 protocol; @@ -277,8 +281,8 @@ #ifdef CONFIG_NETFILTER __u32 nfmark; struct nf_conntrack *nfct; -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) - __u8 ipvs_property:1; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + struct sk_buff *nfct_reasm; #endif #ifdef CONFIG_BRIDGE_NETFILTER struct nf_bridge_info *nf_bridge; @@ -350,6 +354,11 @@ extern void skb_under_panic(struct sk_buff *skb, int len, void *here); +extern int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, + int getfrag(void *from, char *to, int offset, + int len,int odd, struct sk_buff *skb), + void *from, int length); + struct skb_seq_state { __u32 lower_offset; @@ -605,27 +614,44 @@ */ /** - * __skb_queue_head - queue a buffer at the list head + * __skb_queue_after - queue a buffer at the list head * @list: list to use + * @prev: place after this buffer * @newsk: buffer to queue * - * Queue a buffer at the start of a list. This function takes no locks + * Queue a buffer int the middle of a list. This function takes no locks * and you must therefore hold required locks before calling it. * * A buffer cannot be placed on two lists at the same time. */ -extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); -static inline void __skb_queue_head(struct sk_buff_head *list, - struct sk_buff *newsk) -{ - struct sk_buff *prev, *next; - +static inline void __skb_queue_after(struct sk_buff_head *list, + struct sk_buff *prev, + struct sk_buff *newsk) +{ + struct sk_buff *next; list->qlen++; - prev = (struct sk_buff *)list; + next = prev->next; newsk->next = next; newsk->prev = prev; next->prev = prev->next = newsk; +} + +/** + * __skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of a list. This function takes no locks + * and you must therefore hold required locks before calling it. + * + * A buffer cannot be placed on two lists at the same time. 
+ */ +extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); +static inline void __skb_queue_head(struct sk_buff_head *list, + struct sk_buff *newsk) +{ + __skb_queue_after(list, (struct sk_buff *)list, newsk); } /** @@ -1205,6 +1231,11 @@ prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \ skb = skb->next) +#define skb_queue_reverse_walk(queue, skb) \ + for (skb = (queue)->prev; \ + prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \ + skb = skb->prev) + extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); @@ -1213,8 +1244,7 @@ extern int skb_copy_datagram_iovec(const struct sk_buff *from, int offset, struct iovec *to, int size); -extern int skb_copy_and_csum_datagram_iovec(const - struct sk_buff *skb, +extern int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov); extern void skb_free_datagram(struct sock *sk, struct sk_buff *skb); @@ -1282,6 +1312,30 @@ extern void __net_timestamp(struct sk_buff *skb); +extern unsigned int __skb_checksum_complete(struct sk_buff *skb); + +/** + * skb_checksum_complete - Calculate checksum of an entire packet + * @skb: packet to process + * + * This function calculates the checksum over the entire packet plus + * the value of skb->csum. The latter can be used to supply the + * checksum of a pseudo header as used by TCP/UDP. It returns the + * checksum. + * + * For protocols that contain complete checksums such as ICMP/TCP/UDP, + * this function can be used to verify that checksum on received + * packets. In that case the function should return zero if the + * checksum is correct. In particular, this function will return zero + * if skb->ip_summed is CHECKSUM_UNNECESSARY which indicates that the + * hardware has already verified the correctness of the checksum. + */ +static inline unsigned int skb_checksum_complete(struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __skb_checksum_complete(skb); +} + #ifdef CONFIG_NETFILTER static inline void nf_conntrack_put(struct nf_conntrack *nfct) { @@ -1293,10 +1347,26 @@ if (nfct) atomic_inc(&nfct->use); } +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +static inline void nf_conntrack_get_reasm(struct sk_buff *skb) +{ + if (skb) + atomic_inc(&skb->users); +} +static inline void nf_conntrack_put_reasm(struct sk_buff *skb) +{ + if (skb) + kfree_skb(skb); +} +#endif static inline void nf_reset(struct sk_buff *skb) { nf_conntrack_put(skb->nfct); skb->nfct = NULL; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put_reasm(skb->nfct_reasm); + skb->nfct_reasm = NULL; +#endif } #ifdef CONFIG_BRIDGE_NETFILTER diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/kernel/irq/manage.c --- a/linux-2.6-xen-sparse/kernel/irq/manage.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/kernel/irq/manage.c Wed Feb 1 18:00:19 2006 @@ -24,6 +24,7 @@ /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for * * This function waits for any pending IRQ handlers for this interrupt * to complete before returning. 
If you use this function while @@ -34,6 +35,9 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; + + if (irq >= NR_IRQS) + return; while (desc->status & IRQ_INPROGRESS) cpu_relax(); @@ -58,6 +62,9 @@ { irq_desc_t *desc = irq_desc + irq; unsigned long flags; + + if (irq >= NR_IRQS) + return; spin_lock_irqsave(&desc->lock, flags); if (!desc->depth++) { @@ -85,6 +92,9 @@ { irq_desc_t *desc = irq_desc + irq; + if (irq >= NR_IRQS) + return; + disable_irq_nosync(irq); if (desc->action) synchronize_irq(irq); @@ -106,6 +116,9 @@ { irq_desc_t *desc = irq_desc + irq; unsigned long flags; + + if (irq >= NR_IRQS) + return; spin_lock_irqsave(&desc->lock, flags); switch (desc->depth) { @@ -166,6 +179,9 @@ struct irqaction *old, **p; unsigned long flags; int shared = 0; + + if (irq >= NR_IRQS) + return -EINVAL; if (desc->handler == &no_irq_type) return -ENOSYS; diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/lib/Kconfig.debug --- a/linux-2.6-xen-sparse/lib/Kconfig.debug Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/lib/Kconfig.debug Wed Feb 1 18:00:19 2006 @@ -128,7 +128,7 @@ config DEBUG_BUGVERBOSE bool "Verbose BUG() reporting (adds 70K)" if DEBUG_KERNEL && EMBEDDED depends on BUG - depends on ARM || ARM26 || M32R || M68K || SPARC32 || SPARC64 || (X86 && !X86_64) || FRV + depends on ARM || ARM26 || M32R || M68K || SPARC32 || SPARC64 || X86_32 || FRV default !EMBEDDED help Say Y here to make BUG() panics output the file name and line number @@ -168,13 +168,34 @@ If unsure, say N. +config DEBUG_VM + bool "Debug VM" + depends on DEBUG_KERNEL + help + Enable this to debug the virtual-memory system. + + If unsure, say N. + config FRAME_POINTER bool "Compile the kernel with frame pointers" depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML) default y if DEBUG_INFO && UML help If you say Y here the resulting kernel image will be slightly larger - and slower, but it might give very useful debugging information - on some architectures or you use external debuggers. + and slower, but it might give very useful debugging information on + some architectures or if you use external debuggers. If you don't debug the kernel, you can say N. +config RCU_TORTURE_TEST + tristate "torture tests for RCU" + depends on DEBUG_KERNEL + default n + help + This option provides a kernel module that runs torture tests + on the RCU infrastructure. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want RCU torture tests to start automatically + at boot time (you probably don't). + Say M if you want the RCU torture tests to build as a module. + Say N if you are unsure. diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/mm/highmem.c --- a/linux-2.6-xen-sparse/mm/highmem.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/mm/highmem.c Wed Feb 1 18:00:19 2006 @@ -30,11 +30,9 @@ static mempool_t *page_pool, *isa_page_pool; -static void *page_pool_alloc(gfp_t gfp_mask, void *data) -{ - unsigned int gfp = gfp_mask | (unsigned int) (long) data; - - return alloc_page(gfp); +static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data) +{ + return alloc_page(gfp_mask | GFP_DMA); } static void page_pool_free(void *page, void *data) @@ -51,6 +49,12 @@ * n means that there are (n-1) current users of it. 
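The kernel/irq/manage.c hunks above add an irq >= NR_IRQS range guard to synchronize_irq(), disable_irq_nosync(), disable_irq(), enable_irq() and setup_irq(); the existing desc->depth counter still makes disable/enable pairs nest, so a line is only unmasked again once the last outstanding disable has been undone. A userspace sketch of that nesting discipline under the same range guard (fake_disable_irq(), fake_enable_irq() and the masked flag are illustrative only, not the kernel implementation):

#include <stdio.h>

#define NR_IRQS 16

static struct { int depth; int masked; } irqs[NR_IRQS];

static void fake_disable_irq(unsigned int irq)
{
        if (irq >= NR_IRQS)             /* same range guard the patch adds */
                return;
        if (!irqs[irq].depth++)         /* first disable actually masks    */
                irqs[irq].masked = 1;
}

static void fake_enable_irq(unsigned int irq)
{
        if (irq >= NR_IRQS)
                return;
        if (irqs[irq].depth && !--irqs[irq].depth)  /* last enable unmasks */
                irqs[irq].masked = 0;
}

int main(void)
{
        fake_disable_irq(3);
        fake_disable_irq(3);
        fake_enable_irq(3);
        printf("masked=%d\n", irqs[3].masked);  /* still 1: one disable left */
        fake_enable_irq(3);
        printf("masked=%d\n", irqs[3].masked);  /* 0: fully re-enabled       */
        return 0;
}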
*/ #ifdef CONFIG_HIGHMEM + +static void *page_pool_alloc(gfp_t gfp_mask, void *data) +{ + return alloc_page(gfp_mask); +} + static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); @@ -278,7 +282,7 @@ if (isa_page_pool) return 0; - isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA); + isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL); if (!isa_page_pool) BUG(); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/mm/memory.c --- a/linux-2.6-xen-sparse/mm/memory.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/mm/memory.c Wed Feb 1 18:00:19 2006 @@ -114,6 +114,7 @@ { struct page *page = pmd_page(*pmd); pmd_clear(pmd); + pte_lock_deinit(page); pte_free_tlb(tlb, page); dec_page_state(nr_page_table_pages); tlb->mm->nr_ptes--; @@ -249,7 +250,7 @@ free_pud_range(*tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); - if (!tlb_is_full_mm(*tlb)) + if (!(*tlb)->fullmm) flush_tlb_pgtables((*tlb)->mm, start, end); } @@ -259,6 +260,12 @@ while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; + + /* + * Hide vma from rmap and vmtruncate before freeing pgtables + */ + anon_vma_unlink(vma); + unlink_file_vma(vma); if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, @@ -272,6 +279,8 @@ HPAGE_SIZE)) { vma = next; next = vma->vm_next; + anon_vma_unlink(vma); + unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); @@ -280,75 +289,141 @@ } } -pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) -{ - if (!pmd_present(*pmd)) { - struct page *new; - - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. 
- */ - if (pmd_present(*pmd)) { - pte_free(new); - goto out; - } +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + struct page *new = pte_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + pte_lock_init(new); + spin_lock(&mm->page_table_lock); + if (pmd_present(*pmd)) { /* Another has populated it */ + pte_lock_deinit(new); + pte_free(new); + } else { mm->nr_ptes++; inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } -out: - return pte_offset_map(pmd, address); -} - -pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) -{ - if (!pmd_present(*pmd)) { - pte_t *new; - - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one_kernel(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) + spin_unlock(&mm->page_table_lock); + return 0; +} + +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +{ + pte_t *new = pte_alloc_one_kernel(&init_mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&init_mm.page_table_lock); + if (pmd_present(*pmd)) /* Another has populated it */ + pte_free_kernel(new); + else + pmd_populate_kernel(&init_mm, pmd, new); + spin_unlock(&init_mm.page_table_lock); + return 0; +} + +static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +{ + if (file_rss) + add_mm_counter(mm, file_rss, file_rss); + if (anon_rss) + add_mm_counter(mm, anon_rss, anon_rss); +} + +/* + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. + * + * The calling function must still handle the error. + */ +void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) +{ + printk(KERN_ERR "Bad pte = %08llx, process = %s, " + "vm_flags = %lx, vaddr = %lx\n", + (long long)pte_val(pte), + (vma->vm_mm == current->mm ? current->comm : "???"), + vma->vm_flags, vaddr); + dump_stack(); +} + +static inline int is_cow_mapping(unsigned int flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + +/* + * This function gets the "struct page" associated with a pte. + * + * NOTE! Some mappings do not have "struct pages". A raw PFN mapping + * will have each page table entry just pointing to a raw page frame + * number, and as far as the VM layer is concerned, those do not have + * pages associated with them - even if the PFN might point to memory + * that otherwise is perfectly fine and has a "struct page". + * + * The way we recognize those mappings is through the rules set up + * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, + * and the vm_pgoff will point to the first PFN mapped: thus every + * page that is a raw mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * and if that isn't true, the page has been COW'ed (in which case it + * _does_ have a "struct page" associated with it even if it is in a + * VM_PFNMAP range). + */ +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + if (vma->vm_flags & VM_PFNMAP) { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. 
- */ - if (pmd_present(*pmd)) { - pte_free_kernel(new); - goto out; - } - pmd_populate_kernel(mm, pmd, new); - } -out: - return pte_offset_kernel(pmd, address); + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + * + * Remove this test eventually! + */ + if (unlikely(!pfn_valid(pfn))) { + if (!vma->vm_flags & VM_RESERVED) + print_bad_pte(vma, pte, addr); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page + * tables. + * + * The PAGE_ZERO() pages and various VDSO mappings can + * cause them to exist. + */ + return pfn_to_page(pfn); } /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. - * - * dst->page_table_lock is held on entry and exit, - * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). */ static inline void copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, - unsigned long addr) -{ + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr, int *rss) +{ + unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; - unsigned long pfn; /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { @@ -357,34 +432,20 @@ /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); - list_add(&dst_mm->mmlist, &src_mm->mmlist); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); spin_unlock(&mmlist_lock); } } - set_pte_at(dst_mm, addr, dst_pte, pte); - return; - } - - pfn = pte_pfn(pte); - /* the pte points outside of valid memory, the - * mapping is assumed to be good, meaningful - * and not mapped via rmap - duplicate the - * mapping as is. 
- */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - if (!page || PageReserved(page)) { - set_pte_at(dst_mm, addr, dst_pte, pte); - return; + goto out_set_pte; } /* * If it's a COW mapping, write protect it both * in the parent and the child */ - if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { + if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); pte = *src_pte; } @@ -396,12 +457,16 @@ if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); - get_page(page); - inc_mm_counter(dst_mm, rss); - if (PageAnon(page)) - inc_mm_counter(dst_mm, anon_rss); + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page); + rss[!!PageAnon(page)]++; + } + +out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); - page_dup_rmap(page); } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -409,38 +474,44 @@ unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; - unsigned long vm_flags = vma->vm_flags; - int progress; + spinlock_t *src_ptl, *dst_ptl; + int progress = 0; + int rss[2]; again: - dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); + rss[1] = rss[0] = 0; + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); - - progress = 0; - spin_lock(&src_mm->page_table_lock); + src_ptl = pte_lockptr(src_mm, src_pmd); + spin_lock(src_ptl); + do { /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. */ - if (progress >= 32 && (need_resched() || - need_lockbreak(&src_mm->page_table_lock) || - need_lockbreak(&dst_mm->page_table_lock))) - break; + if (progress >= 32) { + progress = 0; + if (need_resched() || + need_lockbreak(src_ptl) || + need_lockbreak(dst_ptl)) + break; + } if (pte_none(*src_pte)) { progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); - spin_unlock(&src_mm->page_table_lock); - + + spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); - pte_unmap(dst_pte - 1); - cond_resched_lock(&dst_mm->page_table_lock); + add_mm_rss(dst_mm, rss[0], rss[1]); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); if (addr != end) goto again; return 0; @@ -504,7 +575,7 @@ * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. 
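The new vm_normal_page() above encodes the VM_PFNMAP rule spelled out in its comment: in a remap_pfn_range() area a pte is a raw frame number, recognizable because it still equals vma->vm_pgoff plus the page offset of the address within the vma; any pte that breaks that linearity has been COW'ed and does have a struct page behind it. A small userspace sketch of just that arithmetic (struct fake_vma, is_raw_pfnmap_pte() and the constants are illustrative, not kernel code):

#include <stdio.h>

#define PAGE_SHIFT 12

/* Illustrative stand-ins for the fields vm_normal_page() consults. */
struct fake_vma {
        unsigned long vm_start;   /* first user address of the mapping    */
        unsigned long vm_pgoff;   /* pfn mapped at vm_start under PFNMAP  */
};

/*
 * Returns 1 if, under the VM_PFNMAP rule quoted above, the pte at 'addr'
 * still points at the raw frame remap_pfn_range() installed (so no
 * struct page should be assumed), 0 if it has been COW'ed.
 */
static int is_raw_pfnmap_pte(struct fake_vma *vma, unsigned long addr,
                             unsigned long pte_pfn)
{
        unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
        return pte_pfn == vma->vm_pgoff + off;
}

int main(void)
{
        struct fake_vma vma = { .vm_start = 0x40000000, .vm_pgoff = 0x800 };

        /* Third page of the mapping, still pointing at pfn 0x802: raw.  */
        printf("%d\n", is_raw_pfnmap_pte(&vma, 0x40002000, 0x802));  /* 1 */
        /* Same address now pointing at a different pfn: it was COW'ed.  */
        printf("%d\n", is_raw_pfnmap_pte(&vma, 0x40002000, 0x1234)); /* 0 */
        return 0;
}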
*/ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) { + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { if (!vma->anon_vma) return 0; } @@ -525,25 +596,30 @@ return 0; } -static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, +static unsigned long zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - struct zap_details *details) -{ + long *zap_work, struct zap_details *details) +{ + struct mm_struct *mm = tlb->mm; pte_t *pte; - - pte = pte_offset_map(pmd, addr); + spinlock_t *ptl; + int file_rss = 0; + int anon_rss = 0; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { pte_t ptent = *pte; - if (pte_none(ptent)) + if (pte_none(ptent)) { + (*zap_work)--; continue; + } if (pte_present(ptent)) { - struct page *page = NULL; - unsigned long pfn = pte_pfn(ptent); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; - } + struct page *page; + + (*zap_work) -= PAGE_SIZE; + + page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -562,7 +638,7 @@ page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) @@ -570,15 +646,17 @@ if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, addr) != page->index) - set_pte_at(tlb->mm, addr, pte, + set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); - if (pte_dirty(ptent)) - set_page_dirty(page); if (PageAnon(page)) - dec_mm_counter(tlb->mm, anon_rss); - else if (pte_young(ptent)) - mark_page_accessed(page); - tlb->freed++; + anon_rss--; + else { + if (pte_dirty(ptent)) + set_page_dirty(page); + if (pte_young(ptent)) + mark_page_accessed(page); + file_rss--; + } page_remove_rmap(page); tlb_remove_page(tlb, page); continue; @@ -591,14 +669,19 @@ continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); -} - -static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, + pte_clear_full(mm, addr, pte, tlb->fullmm); + } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); + + add_mm_rss(mm, file_rss, anon_rss); + pte_unmap_unlock(pte - 1, ptl); + + return addr; +} + +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - struct zap_details *details) + long *zap_work, struct zap_details *details) { pmd_t *pmd; unsigned long next; @@ -606,15 +689,21 @@ pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none_or_clear_bad(pmd)) { + (*zap_work)--; continue; - zap_pte_range(tlb, pmd, addr, next, details); - } while (pmd++, addr = next, addr != end); -} - -static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + } + next = zap_pte_range(tlb, vma, pmd, addr, next, + zap_work, details); + } while (pmd++, addr = next, (addr != end && *zap_work > 0)); + + return addr; +} + +static inline unsigned long zap_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - struct zap_details *details) + long *zap_work, struct zap_details *details) { pud_t *pud; unsigned long 
next; @@ -622,15 +711,21 @@ pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) + if (pud_none_or_clear_bad(pud)) { + (*zap_work)--; continue; - zap_pmd_range(tlb, pud, addr, next, details); - } while (pud++, addr = next, addr != end); -} - -static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + } + next = zap_pmd_range(tlb, vma, pud, addr, next, + zap_work, details); + } while (pud++, addr = next, (addr != end && *zap_work > 0)); + + return addr; +} + +static unsigned long unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, unsigned long end, - struct zap_details *details) + long *zap_work, struct zap_details *details) { pgd_t *pgd; unsigned long next; @@ -643,11 +738,16 @@ pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) + if (pgd_none_or_clear_bad(pgd)) { + (*zap_work)--; continue; - zap_pud_range(tlb, pgd, addr, next, details); - } while (pgd++, addr = next, addr != end); + } + next = zap_pud_range(tlb, vma, pgd, addr, next, + zap_work, details); + } while (pgd++, addr = next, (addr != end && *zap_work > 0)); tlb_end_vma(tlb, vma); + + return addr; } #ifdef CONFIG_PREEMPT @@ -660,7 +760,6 @@ /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlbp: address of the caller's struct mmu_gather - * @mm: the controlling mm_struct * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -669,10 +768,10 @@ * * Returns the end address of the unmapping (restart addr if interrupted). * - * Unmap all pages in the vma list. Called under page_table_lock. - * - * We aim to not hold page_table_lock for too long (for scheduling latency - * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to + * Unmap all pages in the vma list. + * + * We aim to not hold locks for too long (for scheduling latency reasons). + * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to * return the ending mmu_gather to the caller. * * Only addresses between `start' and `end' will be unmapped. @@ -684,17 +783,17 @@ * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, +unsigned long unmap_vmas(struct mmu_gather **tlbp, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { - unsigned long zap_bytes = ZAP_BLOCK_SIZE; + long zap_work = ZAP_BLOCK_SIZE; unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? 
details->i_mmap_lock: NULL; - int fullmm = tlb_is_full_mm(*tlbp); + int fullmm = (*tlbp)->fullmm; for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -710,45 +809,39 @@ *nr_accounted += (end - start) >> PAGE_SHIFT; while (start != end) { - unsigned long block; - if (!tlb_start_valid) { tlb_start = start; tlb_start_valid = 1; } - if (is_vm_hugetlb_page(vma)) { - block = end - start; + if (unlikely(is_vm_hugetlb_page(vma))) { unmap_hugepage_range(vma, start, end); - } else { - block = min(zap_bytes, end - start); - unmap_page_range(*tlbp, vma, start, - start + block, details); + zap_work -= (end - start) / + (HPAGE_SIZE / PAGE_SIZE); + start = end; + } else + start = unmap_page_range(*tlbp, vma, + start, end, &zap_work, details); + + if (zap_work > 0) { + BUG_ON(start != end); + break; } - start += block; - zap_bytes -= block; - if ((long)zap_bytes > 0) - continue; - tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || - need_lockbreak(&mm->page_table_lock) || (i_mmap_lock && need_lockbreak(i_mmap_lock))) { if (i_mmap_lock) { - /* must reset count of rss freed */ - *tlbp = tlb_gather_mmu(mm, fullmm); + *tlbp = NULL; goto out; } - spin_unlock(&mm->page_table_lock); cond_resched(); - spin_lock(&mm->page_table_lock); } - *tlbp = tlb_gather_mmu(mm, fullmm); + *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); tlb_start_valid = 0; - zap_bytes = ZAP_BLOCK_SIZE; + zap_work = ZAP_BLOCK_SIZE; } } out: @@ -770,123 +863,92 @@ unsigned long end = address + size; unsigned long nr_accounted = 0; - if (is_vm_hugetlb_page(vma)) { - zap_hugepage_range(vma, address, size); - return end; - } - lru_add_drain(); - spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); - end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); - tlb_finish_mmu(tlb, address, end); - spin_unlock(&mm->page_table_lock); + update_hiwater_rss(mm); + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + if (tlb) + tlb_finish_mmu(tlb, address, end); return end; } /* * Do a quick page-table lookup for a single page. - * mm->page_table_lock must be held. - */ -static struct page *__follow_page(struct mm_struct *mm, unsigned long address, - int read, int write, int accessed) + */ +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, + unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; - unsigned long pfn; + spinlock_t *ptl; struct page *page; - - page = follow_huge_addr(mm, address, write); - if (! 
IS_ERR(page)) - return page; - + struct mm_struct *mm = vma->vm_mm; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + + page = NULL; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto out; + goto no_page_table; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto out; + goto no_page_table; pmd = pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto no_page_table; + + if (pmd_huge(*pmd)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); - - ptep = pte_offset_map(pmd, address); + } + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) goto out; pte = *ptep; - pte_unmap(ptep); - if (pte_present(pte)) { - if (write && !pte_write(pte)) - goto out; - if (read && !pte_read(pte)) - goto out; - pfn = pte_pfn(pte); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (accessed) { - if (write && !pte_dirty(pte) &&!PageDirty(page)) - set_page_dirty(page); - mark_page_accessed(page); - } - return page; - } - } - + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) + goto unlock; + + if (flags & FOLL_GET) + get_page(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); + } +unlock: + pte_unmap_unlock(ptep, ptl); out: - return NULL; -} - -inline struct page * -follow_page(struct mm_struct *mm, unsigned long address, int write) -{ - return __follow_page(mm, address, 0, write, 1); -} - -/* - * check_user_page_readable() can be called frm niterrupt context by oprofile, - * so we need to avoid taking any non-irq-safe locks - */ -int check_user_page_readable(struct mm_struct *mm, unsigned long address) -{ - return __follow_page(mm, address, 1, 0, 0) != NULL; -} -EXPORT_SYMBOL(check_user_page_readable); - -static inline int -untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - /* Check if the vma is for an anonymous mapping. */ - if (vma->vm_ops && vma->vm_ops->nopage) - return 0; - - /* Check if page directory entry exists. */ - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return 1; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - return 1; - - /* Check if page middle directory entry exists. */ - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - return 1; - - /* There is a pte slot for 'address' in 'mm'. */ - return 0; + return page; + +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate page tables. + */ + if (flags & FOLL_ANON) { + page = ZERO_PAGE(address); + if (flags & FOLL_GET) + get_page(page); + BUG_ON(flags & FOLL_WRITE); + } + return page; } int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, @@ -894,18 +956,19 @@ struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int flags; + unsigned int vm_flags; /* * Require read or write permissions. * If 'force' is set, we only require the "MAY" flags. */ - flags = write ? 
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { - struct vm_area_struct * vma; + struct vm_area_struct *vma; + unsigned int foll_flags; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { @@ -933,8 +996,10 @@ return i ? : -EFAULT; } if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + struct page *page = vm_normal_page(gate_vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); } pte_unmap(pte); if (vmas) @@ -962,8 +1027,8 @@ } } #endif - if (!vma || (vma->vm_flags & VM_IO) - || !(flags & vma->vm_flags)) + if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) + || !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -971,29 +1036,25 @@ &start, &len, i); continue; } - spin_lock(&mm->page_table_lock); + + foll_flags = FOLL_TOUCH; + if (pages) + foll_flags |= FOLL_GET; + if (!write && !(vma->vm_flags & VM_LOCKED) && + (!vma->vm_ops || !vma->vm_ops->nopage)) + foll_flags |= FOLL_ANON; + do { - int write_access = write; struct page *page; - cond_resched_lock(&mm->page_table_lock); - while (!(page = follow_page(mm, start, write_access))) { + if (write) + foll_flags |= FOLL_WRITE; + + cond_resched(); + while (!(page = follow_page(vma, start, foll_flags))) { int ret; - - /* - * Shortcut for anonymous pages. We don't want - * to force the creation of pages tables for - * insanely big anonymously mapped areas that - * nobody touched so far. This is important - * for doing a core dump for these mappings. - */ - if (!write && untouched_anonymous_page(mm,vma,start)) { - page = ZERO_PAGE(start); - break; - } - spin_unlock(&mm->page_table_lock); - ret = __handle_mm_fault(mm, vma, start, write_access); - + ret = __handle_mm_fault(mm, vma, start, + foll_flags & FOLL_WRITE); /* * The VM_FAULT_WRITE bit tells us that do_wp_page has * broken COW when necessary, even if maybe_mkwrite @@ -1001,7 +1062,7 @@ * subsequent page lookups as if they were reads. 
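As a side note, the flag selection that the reworked get_user_pages() performs before calling follow_page() can be summarised in a small standalone sketch (plain C, not kernel code). The FOLL_* values below are assumed to mirror this era's mm.h and are illustrative only; the selection logic follows the hunk above.

#include <stdio.h>

#define FOLL_WRITE 0x01  /* caller wants write access */
#define FOLL_TOUCH 0x02  /* mark the page accessed/dirty */
#define FOLL_GET   0x04  /* take a reference on the page */
#define FOLL_ANON  0x08  /* allow ZERO_PAGE for untouched anon areas */

static unsigned int pick_foll_flags(int want_pages, int write,
                                    int vm_locked, int has_nopage)
{
    unsigned int flags = FOLL_TOUCH;

    if (want_pages)
        flags |= FOLL_GET;
    if (write)
        flags |= FOLL_WRITE;
    /* read faults on untouched anonymous areas may use the zero page
     * instead of allocating page tables (important for core dumps) */
    if (!write && !vm_locked && !has_nopage)
        flags |= FOLL_ANON;
    return flags;
}

int main(void)
{
    printf("read of anon vma  -> flags 0x%x\n", pick_foll_flags(1, 0, 0, 0));
    printf("write of file vma -> flags 0x%x\n", pick_foll_flags(1, 1, 0, 1));
    return 0;
}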
*/ if (ret & VM_FAULT_WRITE) - write_access = 0; + foll_flags &= ~FOLL_WRITE; switch (ret & ~VM_FAULT_WRITE) { case VM_FAULT_MINOR: @@ -1017,13 +1078,10 @@ default: BUG(); } - spin_lock(&mm->page_table_lock); } if (pages) { pages[i] = page; flush_dcache_page(page); - if (!PageReserved(page)) - page_cache_get(page); } if (vmas) vmas[i] = vma; @@ -1031,7 +1089,6 @@ start += PAGE_SIZE; len--; } while (len && start < vma->vm_end); - spin_unlock(&mm->page_table_lock); } while (len); return i; } @@ -1041,16 +1098,21 @@ unsigned long addr, unsigned long end, pgprot_t prot) { pte_t *pte; - - pte = pte_alloc_map(mm, pmd, addr); + spinlock_t *ptl; + + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; do { - pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); + struct page *page = ZERO_PAGE(addr); + pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); + page_cache_get(page); + page_add_file_rmap(page); + inc_mm_counter(mm, file_rss); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1100,16 +1162,94 @@ BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = zeromap_pud_range(mm, pgd, addr, next, prot); if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); return err; } + +pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) +{ + pgd_t * pgd = pgd_offset(mm, addr); + pud_t * pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd_t * pmd = pmd_alloc(mm, pud, addr); + if (pmd) + return pte_alloc_map_lock(mm, pmd, addr, ptl); + } + return NULL; +} + +/* + * This is the old fallback for page remapping. + * + * For historical reasons, it only allows reserved pages. Only + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway. + */ +static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) +{ + int retval; + pte_t *pte; + spinlock_t *ptl; + + retval = -EINVAL; + if (PageAnon(page)) + goto out; + retval = -ENOMEM; + flush_dcache_page(page); + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = -EBUSY; + if (!pte_none(*pte)) + goto out_unlock; + + /* Ok, finally just insert the thing.. */ + get_page(page); + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + + retval = 0; +out_unlock: + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +/* + * This allows drivers to insert individual pages they've allocated + * into a user vma. + * + * The page has to be a nice clean _individual_ kernel allocation. + * If you allocate a compound page, you need to have marked it as + * such (__GFP_COMP), or manually just split the page up yourself + * (which is mainly an issue of doing "set_page_count(page, 1)" for + * each sub-page, and then freeing them one by one when you free + * them rather than freeing it as a compound page). + * + * NOTE! Traditionally this was done with "remap_pfn_range()" which + * took an arbitrary page protection parameter. This doesn't allow + * that. Your vma protection will have to be set up correctly, which + * means that if you want a shared writable mapping, you'd better + * ask for a shared writable mapping! + * + * The page does not need to be reserved. 
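For context, the comment above describes how a driver is expected to use the new helper. A minimal sketch of such an ->mmap handler follows; the demo_* names and the single pre-allocated page are assumptions for illustration, not part of the patch.

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mm.h>

static struct page *demo_page;	/* assumed: allocated with alloc_page() at init */

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
		return -EINVAL;
	/*
	 * vma->vm_page_prot must already express the sharing/writability
	 * the caller asked for; unlike remap_pfn_range(), no protection
	 * argument is taken here.
	 */
	return vm_insert_page(vma, vma->vm_start, demo_page);
}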
+ */ +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!page_count(page)) + return -EINVAL; + vma->vm_flags |= VM_INSERTPAGE; + return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_page); /* * maps a range of physical memory into the requested pages. the old @@ -1121,17 +1261,17 @@ unsigned long pfn, pgprot_t prot) { pte_t *pte; - - pte = pte_alloc_map(mm, pmd, addr); + spinlock_t *ptl; + + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; do { BUG_ON(!pte_none(*pte)); - if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1190,16 +1330,31 @@ * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED tells swapout not to try to touch - * this region. + * VM_RESERVED is specified all over the place, because + * in 2.4 it kept swapout's vma scan off this vma; but + * in 2.6 the LRU scan won't even find its pages, so this + * flag means no more than count its pages in reserved_vm, + * and omit it from core dump, even when VM_IO turned off. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". */ - vma->vm_flags |= VM_IO | VM_RESERVED; + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; + vma->vm_pgoff = pfn; + } + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = remap_pud_range(mm, pgd, addr, next, @@ -1207,7 +1362,6 @@ if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); return err; } EXPORT_SYMBOL(remap_pfn_range); @@ -1224,7 +1378,7 @@ struct page *pte_page; pte = (mm == &init_mm) ? - pte_alloc_kernel(mm, pmd, addr) : + pte_alloc_kernel(pmd, addr) : pte_alloc_map(mm, pmd, addr); if (!pte) return -ENOMEM; @@ -1300,17 +1454,40 @@ BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = generic_pud_range(mm, pgd, addr, next, fn, data); if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); return err; } #endif + +/* + * handle_pte_fault chooses page fault handler according to an entry + * which was read non-atomically. Before making any commitment, on + * those architectures or configurations (e.g. i386 with PAE) which + * might give a mix of unmatched parts, do_swap_page and do_file_page + * must check under lock before unmapping the pte and proceeding + * (but do_wp_page is only called after already making such a check; + * and do_anonymous_page and do_no_page can safely check later on). 
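The rule that the comment above captures — a pte wider than a word may be read torn when sampled without the lock, so it must be re-checked under the lock before anything irreversible is done — can be illustrated with a standalone sketch (plain C + pthreads, not kernel code):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
static uint64_t entry;			/* stands in for a 64-bit PAE pte */

static int snapshot_still_valid(uint64_t snapshot)
{
    int same;

    pthread_mutex_lock(&ptl);
    same = (entry == snapshot);		/* pte_same() analogue */
    pthread_mutex_unlock(&ptl);
    return same;
}

int main(void)
{
    uint64_t snap = entry;		/* lockless read, possibly torn */

    if (!snapshot_still_valid(snap)) {
        puts("entry changed (or was torn): back out and retry");
        return 0;
    }
    puts("entry unchanged: safe to proceed with the fault");
    return 0;
}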
+ */ +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, + pte_t *page_table, pte_t orig_pte) +{ + int same = 1; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) + if (sizeof(pte_t) > sizeof(unsigned long)) { + spinlock_t *ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + same = pte_same(*page_table, orig_pte); + spin_unlock(ptl); + } +#endif + pte_unmap(page_table); + return same; +} /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when @@ -1325,28 +1502,37 @@ return pte; } -/* - * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock - */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) -{ - pte_t entry; - - entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), - vma); - ptep_establish(vma, address, page_table, entry); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + void __user *uaddr = (void __user *)(va & PAGE_MASK); + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + return; + + } + copy_user_highpage(dst, src, va); } /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. - * - * Goto-purists beware: the only reason for goto's here is that it results - * in better assembly code.. The "default" path will see no jumps at all. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). @@ -1357,105 +1543,76 @@ * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. * - * We hold the mm semaphore and the page_table_lock on entry and exit - * with the page_table_lock released. - */ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), with pte both mapped and locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + spinlock_t *ptl, pte_t orig_pte) { struct page *old_page, *new_page; - unsigned long pfn = pte_pfn(pte); pte_t entry; - int ret; -#ifdef CONFIG_XEN - struct page invalid_page; -#endif - - if (unlikely(!pfn_valid(pfn))) { -#ifdef CONFIG_XEN - /* This can happen with /dev/mem (PROT_WRITE, MAP_PRIVATE). 
*/ - invalid_page.flags = (1<<PG_reserved) | (1<<PG_locked); - old_page = &invalid_page; - } else { - old_page = pfn_to_page(pfn); -#else - /* - * This should really halt the system so it can be debugged or - * at least the kernel stops what it's doing before it corrupts - * data, but for the moment just pretend this is OOM. - */ - pte_unmap(page_table); - printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", - address); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; -#endif - } -#ifndef CONFIG_XEN - old_page = pfn_to_page(pfn); -#endif + int ret = VM_FAULT_MINOR; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) + goto gotten; if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { - flush_cache_page(vma, address, pfn); - entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), - vma); + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); ptep_set_access_flags(vma, address, page_table, entry, 1); update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_MINOR|VM_FAULT_WRITE; + ret |= VM_FAULT_WRITE; + goto unlock; } } - pte_unmap(page_table); /* * Ok, we need to copy. Oh, well.. */ - if (!PageReserved(old_page)) - page_cache_get(old_page); - spin_unlock(&mm->page_table_lock); + page_cache_get(old_page); +gotten: + pte_unmap_unlock(page_table, ptl); if (unlikely(anon_vma_prepare(vma))) - goto no_new_page; + goto oom; if (old_page == ZERO_PAGE(address)) { new_page = alloc_zeroed_user_highpage(vma, address); if (!new_page) - goto no_new_page; + goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) - goto no_new_page; -#ifndef CONFIG_XEN - copy_user_highpage(new_page, old_page, address); -#else - if (old_page == &invalid_page) { - char *vto = kmap_atomic(new_page, KM_USER1); - copy_page(vto, (void *)(address & PAGE_MASK)); - kunmap_atomic(vto, KM_USER1); - } else { - copy_user_highpage(new_page, old_page, address); - } -#endif - } + goto oom; + cow_user_page(new_page, old_page, address); + } + /* * Re-check the pte - we dropped the lock */ - ret = VM_FAULT_MINOR; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); - if (likely(pte_same(*page_table, pte))) { - if (PageAnon(old_page)) - dec_mm_counter(mm, anon_rss); - if (PageReserved(old_page)) - inc_mm_counter(mm, rss); - else + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (likely(pte_same(*page_table, orig_pte))) { + if (old_page) { page_remove_rmap(old_page); - flush_cache_page(vma, address, pfn); - break_cow(vma, new_page, address, page_table); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); + inc_mm_counter(mm, anon_rss); + } + } else + inc_mm_counter(mm, anon_rss); + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ptep_establish(vma, address, page_table, entry); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); @@ -1463,14 +1620,16 @@ new_page = old_page; ret |= VM_FAULT_WRITE; } - pte_unmap(page_table); - page_cache_release(new_page); - page_cache_release(old_page); - spin_unlock(&mm->page_table_lock); + if (new_page) + page_cache_release(new_page); + if 
(old_page) + page_cache_release(old_page); +unlock: + pte_unmap_unlock(page_table, ptl); return ret; - -no_new_page: - page_cache_release(old_page); +oom: + if (old_page) + page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1539,13 +1698,6 @@ restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - - /* - * We cannot rely on the break test in unmap_vmas: - * on the one hand, we don't want to restart our loop - * just because that broke out for the page_table_lock; - * on the other hand, it does no test when vma is small. - */ need_break = need_resched() || need_lockbreak(details->i_mmap_lock); @@ -1794,38 +1946,37 @@ } /* - * We hold the mm semaphore and the page_table_lock on entry and - * should release the pagetable lock on exit.. - */ -static int do_swap_page(struct mm_struct * mm, - struct vm_area_struct * vma, unsigned long address, - pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) -{ + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + spinlock_t *ptl; struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); + swp_entry_t entry; pte_t pte; int ret = VM_FAULT_MINOR; - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + goto out; + + entry = pte_to_swp_entry(orig_pte); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); page = read_swap_cache_async(entry, vma, address); if (!page) { /* - * Back out if somebody else faulted in this pte while - * we released the page table lock. + * Back out if somebody else faulted in this pte + * while we released the pte lock. */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; - else - ret = VM_FAULT_MINOR; - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - goto out; + goto unlock; } /* Had to read the page from swap area: Major fault */ @@ -1838,15 +1989,11 @@ lock_page(page); /* - * Back out if somebody else faulted in this pte while we - * released the page table lock. + * Back out if somebody else already faulted in this pte. */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); - if (unlikely(!pte_same(*page_table, orig_pte))) { - ret = VM_FAULT_MINOR; + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*page_table, orig_pte))) goto out_nomap; - } if (unlikely(!PageUptodate(page))) { ret = VM_FAULT_SIGBUS; @@ -1855,7 +2002,7 @@ /* The page isn't present yet, go ahead with the fault. 
*/ - inc_mm_counter(mm, rss); + inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -1873,7 +2020,7 @@ if (write_access) { if (do_wp_page(mm, vma, address, - page_table, pmd, pte) == VM_FAULT_OOM) + page_table, pmd, ptl, pte) == VM_FAULT_OOM) ret = VM_FAULT_OOM; goto out; } @@ -1881,74 +2028,76 @@ /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); lazy_mmu_prot_update(pte); - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); +unlock: + pte_unmap_unlock(page_table, ptl); out: return ret; out_nomap: - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(page_table, ptl); unlock_page(page); page_cache_release(page); - goto out; -} - -/* - * We are called with the MM semaphore and page_table_lock - * spinlock held to protect against concurrent faults in - * multithreaded programs. - */ -static int -do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - pte_t *page_table, pmd_t *pmd, int write_access, - unsigned long addr) -{ + return ret; +} + +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + struct page *page; + spinlock_t *ptl; pte_t entry; - struct page * page = ZERO_PAGE(addr); - - /* Read-only mapping of ZERO_PAGE. */ - entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); - - /* ..except if it's a write access */ + if (write_access) { /* Allocate our own private page. 
*/ pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); if (unlikely(anon_vma_prepare(vma))) - goto no_mem; - page = alloc_zeroed_user_highpage(vma, addr); + goto oom; + page = alloc_zeroed_user_highpage(vma, address); if (!page) - goto no_mem; - - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - - if (!pte_none(*page_table)) { - pte_unmap(page_table); - page_cache_release(page); - spin_unlock(&mm->page_table_lock); - goto out; - } - inc_mm_counter(mm, rss); - entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot)), - vma); + goto oom; + + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte_none(*page_table)) + goto release; + inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); SetPageReferenced(page); - page_add_anon_rmap(page, vma, addr); - } - - set_pte_at(mm, addr, page_table, entry); - pte_unmap(page_table); + page_add_anon_rmap(page, vma, address); + } else { + /* Map the ZERO_PAGE - vm_page_prot is readonly */ + page = ZERO_PAGE(address); + page_cache_get(page); + entry = mk_pte(page, vma->vm_page_prot); + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (!pte_none(*page_table)) + goto release; + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + } + + set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, entry); + update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); - spin_unlock(&mm->page_table_lock); -out: +unlock: + pte_unmap_unlock(page_table, ptl); return VM_FAULT_MINOR; -no_mem: +release: + page_cache_release(page); + goto unlock; +oom: return VM_FAULT_OOM; } @@ -1961,25 +2110,24 @@ * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * - * This is called with the MM semaphore held and the page table - * spinlock held. Exit with the spinlock released. - */ -static int -do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) -{ - struct page * new_page; + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + spinlock_t *ptl; + struct page *new_page; struct address_space *mapping = NULL; pte_t entry; unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, page_table, - pmd, write_access, address); pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + BUG_ON(vma->vm_flags & VM_PFNMAP); if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -1987,7 +2135,6 @@ smp_rmb(); /* serializes i_size against truncate_count */ } retry: - cond_resched(); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* * No smp_rmb is needed here as long as there's a full @@ -2020,19 +2167,20 @@ anon = 1; } - spin_lock(&mm->page_table_lock); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); /* * For a file-backed vma, someone could have truncated or otherwise * invalidated this page. If unmap_mapping_range got called, * retry getting the page. 
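The truncate_count handling in do_no_page() above is a sequence-retry pattern: snapshot the count, do the slow ->nopage work without locks, then re-check the count once the pte lock is held and restart if a truncate raced in between. A standalone sketch (plain C, not kernel code; names are illustrative):

#include <stdio.h>

static unsigned seq_now;		/* stands in for mapping->truncate_count */

static void *slow_lookup(void) { return "page"; }	/* placeholder for ->nopage */

static void *fault_in_page(void)
{
    void *page;
    unsigned seq;

retry:
    seq = seq_now;			/* snapshot before the slow path */
    page = slow_lookup();
    /* ...the real code takes the pte lock here... */
    if (seq != seq_now)			/* truncate raced with us? */
        goto retry;			/* drop page and lock, start over */
    return page;			/* install the pte while still valid */
}

int main(void)
{
    printf("got %s\n", (char *)fault_in_page());
    return 0;
}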
*/ if (mapping && unlikely(sequence != mapping->truncate_count)) { + pte_unmap_unlock(page_table, ptl); + page_cache_release(new_page); + cond_resched(); sequence = mapping->truncate_count; - spin_unlock(&mm->page_table_lock); - page_cache_release(new_page); + smp_rmb(); goto retry; } - page_table = pte_offset_map(pmd, address); /* * This silly early PAGE_DIRTY setting removes a race @@ -2046,68 +2194,67 @@ */ /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { - if (!PageReserved(new_page)) - inc_mm_counter(mm, rss); - flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte_at(mm, address, page_table, entry); if (anon) { + inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); - } else + } else { + inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); - pte_unmap(page_table); + } } else { /* One of our sibling threads was faster, back out. */ - pte_unmap(page_table); page_cache_release(new_page); - spin_unlock(&mm->page_table_lock); - goto out; + goto unlock; } /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); - spin_unlock(&mm->page_table_lock); -out: +unlock: + pte_unmap_unlock(page_table, ptl); return ret; oom: page_cache_release(new_page); - ret = VM_FAULT_OOM; - goto out; + return VM_FAULT_OOM; } /* * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. This enables swappable * nonlinear vmas. - */ -static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) -{ - unsigned long pgoff; + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) +{ + pgoff_t pgoff; int err; - BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); - /* - * Fall back to the linear mapping if the fs does not support - * ->populate: - */ - if (!vma->vm_ops->populate || - (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(mm, address, pte); - return do_no_page(mm, vma, address, write_access, pte, pmd); - } - - pgoff = pte_to_pgoff(*pte); - - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); - - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) + return VM_FAULT_MINOR; + + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { + /* + * Page table corrupted: show pte and kill process. + */ + print_bad_pte(vma, orig_pte, address); + return VM_FAULT_OOM; + } + /* We can then assume vm->vm_ops && vma->vm_ops->populate */ + + pgoff = pte_to_pgoff(orig_pte); + err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, + vma->vm_page_prot, pgoff, 0); if (err == -ENOMEM) return VM_FAULT_OOM; if (err) @@ -2124,56 +2271,68 @@ * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * - * Note the "page_table_lock". It is to protect against kswapd removing - * pages from under us. 
Note that kswapd only ever _removes_ pages, never - * adds them. As such, once we have noticed that the page is not present, - * we can drop the lock early. - * - * The adding of pages is protected by the MM semaphore (which we hold), - * so we don't need to worry about a page being suddenly been added into - * our VM. - * - * We enter with the pagetable spinlock held, we are supposed to - * release it when done. + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. */ static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct * vma, unsigned long address, - int write_access, pte_t *pte, pmd_t *pmd) + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, int write_access) { pte_t entry; - - entry = *pte; + pte_t old_entry; + spinlock_t *ptl; + + old_entry = entry = *pte; if (!pte_present(entry)) { - /* - * If it truly wasn't present, we know that kswapd - * and the PTE updates will not touch it later. So - * drop the lock. - */ - if (pte_none(entry)) - return do_no_page(mm, vma, address, write_access, pte, pmd); + if (pte_none(entry)) { + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); + return do_no_page(mm, vma, address, + pte, pmd, write_access); + } if (pte_file(entry)) - return do_file_page(mm, vma, address, write_access, pte, pmd); - return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); - } - + return do_file_page(mm, vma, address, + pte, pmd, write_access, entry); + return do_swap_page(mm, vma, address, + pte, pmd, write_access, entry); + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (unlikely(!pte_same(*pte, entry))) + goto unlock; if (write_access) { if (!pte_write(entry)) - return do_wp_page(mm, vma, address, pte, pmd, entry); + return do_wp_page(mm, vma, address, + pte, pmd, ptl, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - ptep_set_access_flags(vma, address, pte, entry, write_access); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); + if (!pte_same(old_entry, entry)) { + ptep_set_access_flags(vma, address, pte, entry, write_access); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + } else { + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. + */ + if (write_access) + flush_tlb_page(vma, address); + } +unlock: + pte_unmap_unlock(pte, ptl); return VM_FAULT_MINOR; } /* * By the time we get here, we already hold the mm semaphore */ -int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, +int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) { pgd_t *pgd; @@ -2188,100 +2347,78 @@ if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, write_access); - /* - * We need the page table lock to synchronize with kswapd - * and the SMP-safe atomic PTE updates. 
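The dispatch order that the reworked handle_pte_fault() uses above can be restated as a small standalone sketch (plain C); the enum and helper are illustrative, not kernel definitions:

#include <stdio.h>

enum fault_kind { ANON, NOPAGE, FILE_PTE, SWAP, WP, HW_ASSIST };

static enum fault_kind classify(int present, int none, int is_file,
                                int has_nopage, int write, int writable)
{
    if (!present) {
        if (none)
            return has_nopage ? NOPAGE : ANON;
        return is_file ? FILE_PTE : SWAP;
    }
    if (write && !writable)
        return WP;			/* copy-on-write / write-protect */
    return HW_ASSIST;			/* just mark the pte young/dirty */
}

int main(void)
{
    printf("untouched anon, write  -> %d\n", classify(0, 1, 0, 0, 1, 0));
    printf("present read-only, write -> %d\n", classify(1, 0, 0, 0, 1, 0));
    return 0;
}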
- */ pgd = pgd_offset(mm, address); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, address); if (!pud) - goto oom; - + return VM_FAULT_OOM; pmd = pmd_alloc(mm, pud, address); if (!pmd) - goto oom; - + return VM_FAULT_OOM; pte = pte_alloc_map(mm, pmd, address); if (!pte) - goto oom; - - return handle_pte_fault(mm, vma, address, write_access, pte, pmd); - - oom: + return VM_FAULT_OOM; + + return handle_pte_fault(mm, vma, address, pte, pmd, write_access); +} + +#ifndef __PAGETABLE_PUD_FOLDED +/* + * Allocate page upper directory. + * We've already handled the fast-path in-line. + */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pud_t *new = pud_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) /* Another has populated it */ + pud_free(new); + else + pgd_populate(mm, pgd, new); spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; -} - -#ifndef __PAGETABLE_PUD_FOLDED -/* - * Allocate page upper directory. - * - * We've already handled the fast-path in-line, and we own the - * page table lock. - */ -pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - pud_t *new; - + return 0; +} +#else +/* Workaround for gcc 2.96 */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + return 0; +} +#endif /* __PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_PMD_FOLDED +/* + * Allocate page middle directory. + * We've already handled the fast-path in-line. + */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + pmd_t *new = pmd_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); +#ifndef __ARCH_HAS_4LEVEL_HACK + if (pud_present(*pud)) /* Another has populated it */ + pmd_free(new); + else + pud_populate(mm, pud, new); +#else + if (pgd_present(*pud)) /* Another has populated it */ + pmd_free(new); + else + pgd_populate(mm, pud, new); +#endif /* __ARCH_HAS_4LEVEL_HACK */ spin_unlock(&mm->page_table_lock); - new = pud_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (pgd_present(*pgd)) { - pud_free(new); - goto out; - } - pgd_populate(mm, pgd, new); - out: - return pud_offset(pgd, address); -} -#endif /* __PAGETABLE_PUD_FOLDED */ - -#ifndef __PAGETABLE_PMD_FOLDED -/* - * Allocate page middle directory. - * - * We've already handled the fast-path in-line, and we own the - * page table lock. - */ -pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - pmd_t *new; - - spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. 
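The new __pud_alloc()/__pmd_alloc() above follow a common pattern: allocate outside the lock, then re-check the slot under the lock and throw the allocation away if another thread populated it first. A standalone sketch of that pattern (plain C + pthreads, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;				/* stands in for *pgd / *pud */

static int populate_slot(void)
{
    void *new = malloc(64);			/* pud_alloc_one() analogue */
    if (!new)
        return -1;

    pthread_mutex_lock(&page_table_lock);
    if (slot)					/* another thread beat us */
        free(new);
    else
        slot = new;				/* pgd_populate() analogue */
    pthread_mutex_unlock(&page_table_lock);
    return 0;
}

int main(void)
{
    populate_slot();
    populate_slot();				/* second call frees its copy */
    printf("slot %s populated\n", slot ? "is" : "is not");
    free(slot);
    return 0;
}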
- */ -#ifndef __ARCH_HAS_4LEVEL_HACK - if (pud_present(*pud)) { - pmd_free(new); - goto out; - } - pud_populate(mm, pud, new); + return 0; +} #else - if (pgd_present(*pud)) { - pmd_free(new); - goto out; - } - pgd_populate(mm, pud, new); -#endif /* __ARCH_HAS_4LEVEL_HACK */ - - out: - return pmd_offset(pud, address); +/* Workaround for gcc 2.96 */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ @@ -2346,22 +2483,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); -/* - * update_mem_hiwater - * - update per process rss and vm high water data - */ -void update_mem_hiwater(struct task_struct *tsk) -{ - if (tsk->mm) { - unsigned long rss = get_mm_counter(tsk->mm, rss); - - if (tsk->mm->hiwater_rss < rss) - tsk->mm->hiwater_rss = rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/mm/mmap.c --- a/linux-2.6-xen-sparse/mm/mmap.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/mm/mmap.c Wed Feb 1 18:00:19 2006 @@ -155,10 +155,6 @@ return -ENOMEM; } -EXPORT_SYMBOL(sysctl_overcommit_memory); -EXPORT_SYMBOL(sysctl_overcommit_ratio); -EXPORT_SYMBOL(sysctl_max_map_count); -EXPORT_SYMBOL(vm_committed_space); EXPORT_SYMBOL(__vm_enough_memory); /* @@ -181,26 +177,36 @@ } /* - * Remove one vm structure and free it. - */ -static void remove_vm_struct(struct vm_area_struct *vma) + * Unlink a file-based vm structure from its prio_tree, to hide + * vma from rmap and vmtruncate before freeing its page tables. + */ +void unlink_file_vma(struct vm_area_struct *vma) { struct file *file = vma->vm_file; - might_sleep(); if (file) { struct address_space *mapping = file->f_mapping; spin_lock(&mapping->i_mmap_lock); __remove_shared_vm_struct(vma, file, mapping); spin_unlock(&mapping->i_mmap_lock); } +} + +/* + * Close a vm structure and free it, returning the next. + */ +static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *next = vma->vm_next; + + might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (file) - fput(file); - anon_vma_unlink(vma); + if (vma->vm_file) + fput(vma->vm_file); mpol_free(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); + return next; } asmlinkage unsigned long sys_brk(unsigned long brk) @@ -605,7 +611,7 @@ * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. 
*/ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) +#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) @@ -832,7 +838,7 @@ } #ifdef CONFIG_PROC_FS -void __vm_stat_account(struct mm_struct *mm, unsigned long flags, +void vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { const unsigned long stack_flags @@ -1110,7 +1116,7 @@ } out: mm->total_vm += len >> PAGE_SHIFT; - __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); @@ -1475,15 +1481,19 @@ mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; } -#ifdef CONFIG_STACK_GROWSUP -/* - * vma is the first one with address > vma->vm_end. Have to extend vma. - */ -int expand_stack(struct vm_area_struct * vma, unsigned long address) +#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) +/* + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +#ifndef CONFIG_IA64 +static inline +#endif +int expand_upwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1520,6 +1530,13 @@ } anon_vma_unlock(vma); return error; +} +#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ + +#ifdef CONFIG_STACK_GROWSUP +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_upwards(vma, address); } struct vm_area_struct * @@ -1603,36 +1620,24 @@ } #endif -/* Normal function to fix up a mapping - * This function is the default for when an area has no specific - * function. This may be used as part of a more specific routine. +/* + * Ok - we have the memory areas we should free on the vma list, + * so release them, and do the vma updates. * - * By the time this function is called, the area struct has been - * removed from the process mapping list. - */ -static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) -{ - size_t len = area->vm_end - area->vm_start; - - area->vm_mm->total_vm -= len >> PAGE_SHIFT; - if (area->vm_flags & VM_LOCKED) - area->vm_mm->locked_vm -= len >> PAGE_SHIFT; - vm_stat_unaccount(area); - remove_vm_struct(area); -} - -/* - * Update the VMA and inode share lists. - * - * Ok - we have the memory areas we should free on the 'free' list, - * so release them, and do the vma updates. - */ -static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) -{ + * Called with the mm semaphore held. 
+ */ +static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) +{ + /* Update high watermark before we lower total_vm */ + update_hiwater_vm(mm); do { - struct vm_area_struct *next = vma->vm_next; - unmap_vma(mm, vma); - vma = next; + long nrpages = vma_pages(vma); + + mm->total_vm -= nrpages; + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= nrpages; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + vma = remove_vma(vma); } while (vma); validate_mm(mm); } @@ -1651,14 +1656,13 @@ unsigned long nr_accounted = 0; lru_add_drain(); - spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); - unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); + update_hiwater_rss(mm); + unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); tlb_finish_mmu(tlb, start, end); - spin_unlock(&mm->page_table_lock); } /* @@ -1799,7 +1803,7 @@ unmap_region(mm, vma, prev, start, end); /* Fix up all other VM information */ - unmap_vma_list(mm, vma); + remove_vma_list(mm, vma); return 0; } @@ -1821,7 +1825,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) { -#ifdef CONFIG_DEBUG_KERNEL +#ifdef CONFIG_DEBUG_VM if (unlikely(down_read_trylock(&mm->mmap_sem))) { WARN_ON(1); up_read(&mm->mmap_sem); @@ -1937,34 +1941,21 @@ #endif lru_add_drain(); - - spin_lock(&mm->page_table_lock); - flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); + /* Don't update_hiwater_rss(mm) here, do_exit already did */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); + end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); - mm->mmap = mm->mmap_cache = NULL; - mm->mm_rb = RB_ROOT; - set_mm_counter(mm, rss, 0); - mm->total_vm = 0; - mm->locked_vm = 0; - - spin_unlock(&mm->page_table_lock); - - /* - * Walk the list again, actually closing and freeing it - * without holding any MM locks. - */ - while (vma) { - struct vm_area_struct *next = vma->vm_next; - remove_vm_struct(vma); - vma = next; - } + /* + * Walk the list again, actually closing and freeing it, + * with preemption enabled, without holding any MM locks. 
+ */ + while (vma) + vma = remove_vma(vma); BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/mm/page_alloc.c --- a/linux-2.6-xen-sparse/mm/page_alloc.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/mm/page_alloc.c Wed Feb 1 18:00:19 2006 @@ -33,6 +33,7 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/memory_hotplug.h> #include <linux/nodemask.h> #include <linux/vmalloc.h> @@ -59,11 +60,13 @@ * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA - */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; + * + * TBD: should special case ZONE_DMA32 machines here - in those we normally + * don't need any ZONE_NORMAL reservation + */ +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; EXPORT_SYMBOL(totalram_pages); -EXPORT_SYMBOL(nr_swap_pages); /* * Used by page_zone() to look up the address of the struct zone whose @@ -72,27 +75,50 @@ struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; EXPORT_SYMBOL(zone_table); -static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; +static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; -/* - * Temporary debugging check for pages not lying within a given zone. - */ -static int bad_range(struct zone *zone, struct page *page) -{ - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) - return 1; - if (page_to_pfn(page) < zone->zone_start_pfn) - return 1; +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + + do { + seq = zone_span_seqbegin(zone); + if (pfn >= zone->zone_start_pfn + zone->spanned_pages) + ret = 1; + else if (pfn < zone->zone_start_pfn) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ #ifdef CONFIG_HOLES_IN_ZONE if (!pfn_valid(page_to_pfn(page))) - return 1; + return 0; #endif if (zone != page_zone(page)) + return 0; + + return 1; +} +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_outside_zone_boundaries(zone, page)) return 1; + if (!page_is_consistent(zone, page)) + return 1; + return 0; } @@ -101,7 +127,7 @@ printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", function, current->comm, page); printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, + (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); printk(KERN_EMERG "Backtrace:\n"); dump_stack(); @@ -114,17 +140,13 @@ 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback); + 1 << PG_writeback ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; add_taint(TAINT_BAD_PAGE); } -#ifndef CONFIG_HUGETLB_PAGE -#define prep_compound_page(page, order) do { } while (0) -#define destroy_compound_page(page, order) do { } while (0) -#else /* * Higher-order pages are called "compound pages". 
They are structured thusly: * @@ -153,7 +175,7 @@ struct page *p = page + i; SetPageCompound(p); - p->private = (unsigned long)page; + set_page_private(p, (unsigned long)page); } } @@ -173,12 +195,11 @@ if (!PageCompound(p)) bad_page(__FUNCTION__, page); - if (p->private != (unsigned long)page) + if (page_private(p) != (unsigned long)page) bad_page(__FUNCTION__, page); ClearPageCompound(p); } } -#endif /* CONFIG_HUGETLB_PAGE */ /* * function for dealing with page's order in buddy system. @@ -186,18 +207,18 @@ * So, we don't need atomic page->flags operations here. */ static inline unsigned long page_order(struct page *page) { - return page->private; + return page_private(page); } static inline void set_page_order(struct page *page, int order) { - page->private = order; + set_page_private(page, order); __SetPagePrivate(page); } static inline void rmv_page_order(struct page *page) { __ClearPagePrivate(page); - page->private = 0; + set_page_private(page, 0); } /* @@ -237,14 +258,13 @@ * (a) the buddy is free && * (b) the buddy is on the buddy system && * (c) a page and its buddy have the same order. - * for recording page's order, we use page->private and PG_private. + * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) { if (PagePrivate(page) && (page_order(page) == order) && - !PageReserved(page) && page_count(page) == 0) return 1; return 0; @@ -264,7 +284,7 @@ * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous * free pages of length of (1 << order) and marked with PG_Private.Page's - * order is recorded in page->private field. + * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. @@ -314,7 +334,7 @@ zone->free_area[order].nr_free++; } -static inline void free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(const char *function, struct page *page) { if ( page_mapcount(page) || page->mapping != NULL || @@ -327,10 +347,17 @@ 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ))) + 1 << PG_writeback | + 1 << PG_reserved ))) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not free the page. But we shall soon need + * to do more, for when the ZERO_PAGE count wraps negative. 
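The bookkeeping that the hunks above move behind page_private()/set_page_private() — a free block's order lives in a private word of its head page — can be sketched standalone (plain C; the struct is illustrative, not struct page, and the XOR is the classic buddy arithmetic rather than a quote from this patch):

#include <stdio.h>

struct page_sk {
    unsigned long private;		/* page_private(page) */
    int has_private;			/* PG_private analogue */
};

static void set_order(struct page_sk *p, unsigned long order)
{
    p->private = order;			/* set_page_private() */
    p->has_private = 1;
}

static unsigned long buddy_pfn(unsigned long pfn, unsigned long order)
{
    return pfn ^ (1UL << order);	/* flip the order bit to find the buddy */
}

int main(void)
{
    struct page_sk head = { 0, 0 };

    set_order(&head, 3);		/* head page of an order-3 free block */
    printf("order %lu, buddy of pfn 0x100 is 0x%lx\n",
           head.private, buddy_pfn(0x100, head.private));
    return 0;
}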
+ */ + return PageReserved(page); } /* @@ -370,11 +397,10 @@ { LIST_HEAD(list); int i; + int reserved = 0; if (arch_free_page(page, order)) return; - - mod_page_state(pgfree, 1 << order); #ifndef CONFIG_MMU if (order > 0) @@ -383,8 +409,12 @@ #endif for (i = 0 ; i < (1 << order) ; ++i) - free_pages_check(__FUNCTION__, page + i); + reserved += free_pages_check(__FUNCTION__, page + i); + if (reserved) + return; + list_add(&page->lru, &list); + mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<<order, 0); free_pages_bulk(page_zone(page), 1, &list, order); } @@ -442,7 +472,7 @@ /* * This page is about to be returned from the page allocator */ -static void prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order) { if ( page_mapcount(page) || page->mapping != NULL || @@ -456,15 +486,24 @@ 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ))) + 1 << PG_writeback | + 1 << PG_reserved ))) bad_page(__FUNCTION__, page); + + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not allocate the page: as a safety net. + */ + if (PageReserved(page)) + return 1; page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); - page->private = 0; + set_page_private(page, 0); set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); + return 0; } /* @@ -648,11 +687,14 @@ if (arch_free_page(page, 0)) return; - kernel_map_pages(page, 1, 0); - inc_page_state(pgfree); if (PageAnon(page)) page->mapping = NULL; - free_pages_check(__FUNCTION__, page); + if (free_pages_check(__FUNCTION__, page)) + return; + + inc_page_state(pgfree); + kernel_map_pages(page, 1, 0); + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); list_add(&page->lru, &pcp->list); @@ -691,12 +733,14 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; - struct page *page = NULL; + struct page *page; int cold = !!(gfp_flags & __GFP_COLD); +again: if (order == 0) { struct per_cpu_pages *pcp; + page = NULL; pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count <= pcp->low) @@ -709,9 +753,7 @@ } local_irq_restore(flags); put_cpu(); - } - - if (page == NULL) { + } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); spin_unlock_irqrestore(&zone->lock, flags); @@ -720,7 +762,8 @@ if (page != NULL) { BUG_ON(bad_range(zone, page)); mod_page_state_zone(zone, pgalloc, 1 << order); - prep_new_page(page, order); + if (prep_new_page(page, order)) + goto again; if (gfp_flags & __GFP_ZERO) prep_zero_page(page, order, gfp_flags); @@ -731,20 +774,28 @@ return page; } +#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ +#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ + /* * Return 1 if free pages are above 'mark'. This takes into account the order * of the allocation. 
*/ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int can_try_harder, int gfp_high) + int classzone_idx, int alloc_flags) { /* free_pages my go negative - that's OK */ long min = mark, free_pages = z->free_pages - (1 << order) + 1; int o; - if (gfp_high) + if (alloc_flags & ALLOC_HIGH) min -= min / 2; - if (can_try_harder) + if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) @@ -762,123 +813,127 @@ return 1; } -static inline int -should_reclaim_zone(struct zone *z, gfp_t gfp_mask) -{ - if (!z->reclaim_pages) - return 0; - if (gfp_mask & __GFP_NORECLAIM) - return 0; - return 1; -} - -/* - * This is the 'heart' of the zoned buddy allocator. - */ -struct page * fastcall -__alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist) -{ - const int wait = gfp_mask & __GFP_WAIT; - struct zone **zones, *z; - struct page *page; - struct reclaim_state reclaim_state; - struct task_struct *p = current; - int i; - int classzone_idx; - int do_retry; - int can_try_harder; - int did_some_progress; - - might_sleep_if(wait); - - /* - * The caller may dip into page reserves a bit more if the caller - * cannot run direct reclaim, or is the caller has realtime scheduling - * policy - */ - can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; - - zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ - - if (unlikely(zones[0] == NULL)) { - /* Should this ever happen?? */ - return NULL; - } - - classzone_idx = zone_idx(zones[0]); - -restart: +/* + * get_page_from_freeliest goes through the zonelist trying to allocate + * a page. + */ +static struct page * +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, int alloc_flags) +{ + struct zone **z = zonelist->zones; + struct page *page = NULL; + int classzone_idx = zone_idx(*z); + /* * Go through the zonelist once, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - int do_reclaim = should_reclaim_zone(z, gfp_mask); - - if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) + do { + if ((alloc_flags & ALLOC_CPUSET) && + !cpuset_zone_allowed(*z, gfp_mask)) continue; - /* - * If the zone is to attempt early page reclaim then this loop - * will try to reclaim pages and check the watermark a second - * time before giving up and falling back to the next zone. - */ -zone_reclaim_retry: - if (!zone_watermark_ok(z, order, z->pages_low, - classzone_idx, 0, 0)) { - if (!do_reclaim) + if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { + unsigned long mark; + if (alloc_flags & ALLOC_WMARK_MIN) + mark = (*z)->pages_min; + else if (alloc_flags & ALLOC_WMARK_LOW) + mark = (*z)->pages_low; + else + mark = (*z)->pages_high; + if (!zone_watermark_ok(*z, order, mark, + classzone_idx, alloc_flags)) continue; - else { - zone_reclaim(z, gfp_mask, order); - /* Only try reclaim once */ - do_reclaim = 0; - goto zone_reclaim_retry; - } } - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } - - for (i = 0; (z = zones[i]) != NULL; i++) - wakeup_kswapd(z, order); + page = buffered_rmqueue(*z, order, gfp_mask); + if (page) { + zone_statistics(zonelist, *z); + break; + } + } while (*(++z) != NULL); + return page; +} + +/* + * This is the 'heart' of the zoned buddy allocator. 
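The watermark arithmetic above is worth seeing with numbers: ALLOC_HIGH and ALLOC_HARDER shrink the minimum, and each order level must keep roughly min/2^o pages free. A standalone sketch (plain C, not kernel code) that mirrors the zone_watermark_ok() logic from this hunk, with made-up zone numbers:

#include <stdio.h>

#define ALLOC_HARDER 0x10
#define ALLOC_HIGH   0x20

struct zone_sketch {
    long free_pages;
    long lowmem_reserve;		/* reserve for the requested classzone */
    long nr_free[11];			/* free blocks per order, 0..10 */
};

static int watermark_ok(struct zone_sketch *z, int order,
                        long mark, int alloc_flags)
{
    long min = mark;
    long free_pages = z->free_pages - (1L << order) + 1;

    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & ALLOC_HARDER)
        min -= min / 4;

    if (free_pages <= min + z->lowmem_reserve)
        return 0;
    for (int o = 0; o < order; o++) {
        free_pages -= z->nr_free[o] << o;	/* too small to satisfy us */
        min >>= 1;				/* demand less at each order */
        if (free_pages <= min)
            return 0;
    }
    return 1;
}

int main(void)
{
    struct zone_sketch z = { .free_pages = 1000, .lowmem_reserve = 0,
                             .nr_free = { 600, 100, 20, 5 } };

    printf("order-2, mark 256:              %d\n", watermark_ok(&z, 2, 256, 0));
    printf("order-2, mark 2048:             %d\n", watermark_ok(&z, 2, 2048, 0));
    printf("order-2, mark 2048, HIGH+HARDER: %d\n",
           watermark_ok(&z, 2, 2048, ALLOC_HIGH | ALLOC_HARDER));
    return 0;
}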
+ */ +struct page * fastcall +__alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist) +{ + const gfp_t wait = gfp_mask & __GFP_WAIT; + struct zone **z; + struct page *page; + struct reclaim_state reclaim_state; + struct task_struct *p = current; + int do_retry; + int alloc_flags; + int did_some_progress; + + might_sleep_if(wait); + +restart: + z = zonelist->zones; /* the list of zones suitable for gfp_mask */ + + if (unlikely(*z == NULL)) { + /* Should this ever happen?? */ + return NULL; + } + + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (page) + goto got_pg; + + do { + wakeup_kswapd(*z, order); + } while (*(++z)); + + /* + * OK, we're below the kswapd watermark and have kicked background + * reclaim. Now things get more complex, so set up alloc_flags according + * to how we want to proceed. + * + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy. + */ + alloc_flags = ALLOC_WMARK_MIN; + if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) + alloc_flags |= ALLOC_HARDER; + if (gfp_mask & __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; + if (wait) + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations - * coming from realtime tasks to go deeper into reserves + * coming from realtime tasks go deeper into reserves. * * This is the last chance, in general, before the goto nopage. * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_min, - classzone_idx, can_try_harder, - gfp_mask & __GFP_HIGH)) - continue; - - if (wait && !cpuset_zone_allowed(z, gfp_mask)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } + page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); + if (page) + goto got_pg; /* This allocation should allow future memory freeing. 
*/ if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { if (!(gfp_mask & __GFP_NOMEMALLOC)) { +nofail_alloc: /* go through the zonelist yet again, ignoring mins */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!cpuset_zone_allowed(z, gfp_mask)) - continue; - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; + page = get_page_from_freelist(gfp_mask, order, + zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); + if (page) + goto got_pg; + if (gfp_mask & __GFP_NOFAIL) { + blk_congestion_wait(WRITE, HZ/50); + goto nofail_alloc; } } goto nopage; @@ -896,7 +951,7 @@ reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zones, gfp_mask); + did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -904,19 +959,10 @@ cond_resched(); if (likely(did_some_progress)) { - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_min, - classzone_idx, can_try_harder, - gfp_mask & __GFP_HIGH)) - continue; - - if (!cpuset_zone_allowed(z, gfp_mask)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } + page = get_page_from_freelist(gfp_mask, order, + zonelist, alloc_flags); + if (page) + goto got_pg; } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { /* * Go through the zonelist yet one more time, keep @@ -924,18 +970,10 @@ * a parallel oom killing, we must fail if we're still * under heavy pressure. */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_high, - classzone_idx, 0, 0)) - continue; - - if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); + if (page) + goto got_pg; out_of_memory(gfp_mask, order); goto restart; @@ -968,9 +1006,7 @@ dump_stack(); show_mem(); } - return NULL; got_pg: - zone_statistics(zonelist, z); return page; } @@ -998,7 +1034,7 @@ * get_zeroed_page() returns a 32-bit address, which cannot represent * a highmem page */ - BUG_ON(gfp_mask & __GFP_HIGHMEM); + BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); page = alloc_pages(gfp_mask | __GFP_ZERO, 0); if (page) @@ -1018,7 +1054,7 @@ fastcall void __free_pages(struct page *page, unsigned int order) { - if (!PageReserved(page) && put_page_testzero(page)) { + if (put_page_testzero(page)) { if (order == 0) free_hot_page(page); else @@ -1091,7 +1127,7 @@ */ unsigned int nr_free_buffer_pages(void) { - return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); + return nr_free_zone_pages(gfp_zone(GFP_USER)); } /* @@ -1099,7 +1135,7 @@ */ unsigned int nr_free_pagecache_pages(void) { - return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); + return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); } #ifdef CONFIG_HIGHMEM @@ -1307,11 +1343,8 @@ } else printk("\n"); - for (cpu = 0; cpu < NR_CPUS; ++cpu) { + for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; - - if (!cpu_possible(cpu)) - continue; pageset = zone_pcp(zone, cpu); @@ -1421,6 +1454,10 @@ zone = pgdat->node_zones + ZONE_NORMAL; if (zone->present_pages) zonelist->zones[j++] = zone; + case ZONE_DMA32: + zone = pgdat->node_zones + ZONE_DMA32; + if (zone->present_pages) + zonelist->zones[j++] = zone; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; if (zone->present_pages) @@ -1428,6 +1465,18 @@ } return j; 
+} + +static inline int highest_zone(int zone_bits) +{ + int res = ZONE_NORMAL; + if (zone_bits & (__force int)__GFP_HIGHMEM) + res = ZONE_HIGHMEM; + if (zone_bits & (__force int)__GFP_DMA32) + res = ZONE_DMA32; + if (zone_bits & (__force int)__GFP_DMA) + res = ZONE_DMA; + return res; } #ifdef CONFIG_NUMA @@ -1526,11 +1575,7 @@ zonelist = pgdat->node_zonelists + i; for (j = 0; zonelist->zones[j] != NULL; j++); - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; + k = highest_zone(i); j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); zonelist->zones[j] = NULL; @@ -1551,12 +1596,7 @@ zonelist = pgdat->node_zonelists + i; j = 0; - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - + k = highest_zone(i); j = build_zonelists_node(pgdat, zonelist, j, k); /* * Now we build the zonelist so that it contains the zones @@ -1661,7 +1701,7 @@ * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; @@ -1675,7 +1715,7 @@ continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 0); + set_page_count(page, 1); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); @@ -1722,14 +1762,13 @@ /* * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. + * size of the zone. But no more than 1/2 of a meg. * * OK, so we don't know how big the cache is. So guess. */ batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; + if (batch * PAGE_SIZE > 512 * 1024) + batch = (512 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -1744,7 +1783,8 @@ * of pages of one half of the possible page colors * and the other with pages of the other colors. */ - batch = (1 << fls(batch + batch/2)) - 1; + batch = (1 << (fls(batch + batch/2)-1)) - 1; + return batch; } @@ -1756,7 +1796,7 @@ pcp = &p->pcp[0]; /* hot */ pcp->count = 0; - pcp->low = 2 * batch; + pcp->low = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); @@ -1765,7 +1805,7 @@ pcp->count = 0; pcp->low = 0; pcp->high = 2 * batch; - pcp->batch = max(1UL, 1 * batch); + pcp->batch = max(1UL, batch/2); INIT_LIST_HEAD(&pcp->list); } @@ -1845,11 +1885,10 @@ if (process_zones(cpu)) ret = NOTIFY_BAD; break; -#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: case CPU_DEAD: free_zone_pagesets(cpu); break; -#endif default: break; } @@ -1859,7 +1898,7 @@ static struct notifier_block pageset_notifier = { &pageset_cpuup_callback, NULL, 0 }; -void __init setup_per_cpu_pageset() +void __init setup_per_cpu_pageset(void) { int err; @@ -1874,6 +1913,60 @@ #endif +static __devinit +void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int i; + struct pglist_data *pgdat = zone->zone_pgdat; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. 
+ */ + zone->wait_table_size = wait_table_size(zone_size_pages); + zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); +} + +static __devinit void zone_pcp_init(struct zone *zone) +{ + int cpu; + unsigned long batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +static __devinit void init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, unsigned long size) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + + zone_wait_table_init(zone, size); + pgdat->nr_zones = zone_idx(zone) + 1; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -1883,10 +1976,11 @@ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long i, j; - int cpu, nid = pgdat->node_id; + unsigned long j; + int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; @@ -1894,13 +1988,12 @@ for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; - unsigned long batch; realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; - if (j == ZONE_DMA || j == ZONE_NORMAL) + if (j < ZONE_HIGHMEM) nr_kernel_pages += realsize; nr_all_pages += realsize; @@ -1909,24 +2002,13 @@ zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); zone->zone_pgdat = pgdat; zone->free_pages = 0; zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - batch = zone_batchsize(zone); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; @@ -1937,32 +2019,9 @@ if (!size) continue; - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. 
- */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - zone->zone_mem_map = pfn_to_page(zone_start_pfn); - zone->zone_start_pfn = zone_start_pfn; - - memmap_init(size, nid, j, zone_start_pfn); - zonetable_add(zone, nid, j, zone_start_pfn, size); - + init_currently_empty_zone(zone, zone_start_pfn, size); zone_start_pfn += size; - - zone_init_free_lists(pgdat, zone, zone->spanned_pages); } } @@ -2362,7 +2421,7 @@ * that the pages_{min,low,high} values for each zone are set correctly * with respect to min_free_kbytes. */ -static void setup_per_zone_pages_min(void) +void setup_per_zone_pages_min(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -2376,13 +2435,18 @@ } for_each_zone(zone) { + unsigned long tmp; spin_lock_irqsave(&zone->lru_lock, flags); + tmp = (pages_min * zone->present_pages) / lowmem_pages; if (is_highmem(zone)) { /* - * Often, highmem doesn't need to reserve any pages. - * But the pages_min/low/high values are also used for - * batching up page reclaim activity so we need a - * decent value here. + * __GFP_HIGH and PF_MEMALLOC allocations usually don't + * need highmem pages, so cap pages_min to a small + * value here. + * + * The (pages_high-pages_low) and (pages_low-pages_min) + * deltas controls asynch page reclaim, and so should + * not be capped for highmem. */ int min_pages; @@ -2393,19 +2457,15 @@ min_pages = 128; zone->pages_min = min_pages; } else { - /* if it's a lowmem zone, reserve a number of pages + /* + * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = (pages_min * zone->present_pages) / - lowmem_pages; + zone->pages_min = tmp; } - /* - * When interpreting these watermarks, just keep in mind that: - * zone->pages_min == (zone->pages_min * 4) / 4; - */ - zone->pages_low = (zone->pages_min * 5) / 4; - zone->pages_high = (zone->pages_min * 6) / 4; + zone->pages_low = zone->pages_min + tmp / 4; + zone->pages_high = zone->pages_min + tmp / 2; spin_unlock_irqrestore(&zone->lru_lock, flags); } } diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/net/core/dev.c --- a/linux-2.6-xen-sparse/net/core/dev.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/net/core/dev.c Wed Feb 1 18:00:19 2006 @@ -1114,6 +1114,19 @@ return ret; } +/* Take action when hardware reception checksum errors are detected. */ +#ifdef CONFIG_BUG +void netdev_rx_csum_fault(struct net_device *dev) +{ + if (net_ratelimit()) { + printk(KERN_ERR "%s: hw csum failure.\n", + dev ? dev->name : "<unknown>"); + dump_stack(); + } +} +EXPORT_SYMBOL(netdev_rx_csum_fault); +#endif + #ifdef CONFIG_HIGHMEM /* Actually, we should eliminate this check as soon as we know, that: * 1. IOMMU is present and allows to map all the memory. 
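The netdev_rx_csum_fault() helper added in the net/core/dev.c hunk above gives the stack a single, rate-limited way to complain about a NIC whose hardware receive checksumming turned out to be wrong. A minimal sketch of a caller follows; it is not part of this changeset, my_verify_l4_csum() and my_rx_csum_check() are hypothetical names, and it assumes the matching declaration of netdev_rx_csum_fault() in <linux/netdevice.h> with CONFIG_BUG enabled.

#include <linux/config.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/*
 * Hypothetical stand-in for the caller's own software verification; a
 * real user would recompute the checksum here (for example with
 * skb_checksum()) and return non-zero when it does not match.
 */
static int my_verify_l4_csum(struct sk_buff *skb)
{
	return 0;
}

static void my_rx_csum_check(struct sk_buff *skb)
{
	/* The driver claimed the hardware already verified this packet... */
	if (skb->ip_summed == CHECKSUM_UNNECESSARY && my_verify_l4_csum(skb)) {
		/*
		 * ...but software disagrees: report the offending device
		 * (rate-limited, with a backtrace, under CONFIG_BUG) and
		 * fall back to software checksumming for this skb.
		 */
		netdev_rx_csum_fault(skb->dev);
		skb->ip_summed = CHECKSUM_NONE;
	}
}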
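Earlier in this patch, the setup_per_zone_pages_min() hunk in mm/page_alloc.c changes how the per-zone pages_low and pages_high watermarks are derived: instead of scaling the (possibly capped) pages_min by 5/4 and 6/4, they become pages_min + tmp/4 and pages_min + tmp/2, where tmp is the zone's size-proportional share of the global minimum. As an illustrative figure only: take a highmem zone whose share works out to 1024 pages while pages_min is capped at 128 as in the hunk. The old formulas gave pages_low = 160 and pages_high = 192, leaving kswapd almost no hysteresis, whereas the new ones give pages_low = 128 + 256 = 384 and pages_high = 128 + 512 = 640, so the reclaim deltas keep scaling with the zone size even though pages_min stays small.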
@@ -2767,6 +2780,20 @@ dev->name); dev->features &= ~NETIF_F_TSO; } + if (dev->features & NETIF_F_UFO) { + if (!(dev->features & NETIF_F_HW_CSUM)) { + printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " + "NETIF_F_HW_CSUM feature.\n", + dev->name); + dev->features &= ~NETIF_F_UFO; + } + if (!(dev->features & NETIF_F_SG)) { + printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " + "NETIF_F_SG feature.\n", + dev->name); + dev->features &= ~NETIF_F_UFO; + } + } /* * nil rebuild_header routine, diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/net/core/skbuff.c --- a/linux-2.6-xen-sparse/net/core/skbuff.c Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/net/core/skbuff.c Wed Feb 1 18:00:19 2006 @@ -122,6 +122,8 @@ * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask + * @fclone: allocate from fclone cache instead of head cache + * and allocate a cloned (child) skb * * Allocate a new &sk_buff. The returned buffer has no headroom and a * tail room of size bytes. The object has a reference count of one. @@ -175,6 +177,8 @@ skb_shinfo(skb)->tso_size = 0; skb_shinfo(skb)->tso_segs = 0; skb_shinfo(skb)->frag_list = NULL; + skb_shinfo(skb)->ufo_size = 0; + skb_shinfo(skb)->ip6_frag_id = 0; out: return skb; nodata: @@ -247,6 +251,8 @@ skb_shinfo(skb)->tso_size = 0; skb_shinfo(skb)->tso_segs = 0; skb_shinfo(skb)->frag_list = NULL; + skb_shinfo(skb)->ufo_size = 0; + skb_shinfo(skb)->ip6_frag_id = 0; out: return skb; nodata: @@ -354,6 +360,9 @@ } #ifdef CONFIG_NETFILTER nf_conntrack_put(skb->nfct); +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + nf_conntrack_put_reasm(skb->nfct_reasm); +#endif #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(skb->nf_bridge); #endif @@ -436,8 +445,16 @@ C(nfct); nf_conntrack_get(skb->nfct); C(nfctinfo); +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + C(nfct_reasm); + nf_conntrack_get_reasm(skb->nfct_reasm); +#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) C(ipvs_property); +#endif +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + C(nfct_reasm); + nf_conntrack_get_reasm(skb->nfct_reasm); #endif #ifdef CONFIG_BRIDGE_NETFILTER C(nf_bridge); @@ -496,6 +513,10 @@ new->nfct = old->nfct; nf_conntrack_get(old->nfct); new->nfctinfo = old->nfctinfo; +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + new->nfct_reasm = old->nfct_reasm; + nf_conntrack_get_reasm(old->nfct_reasm); +#endif #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif @@ -1718,6 +1739,78 @@ skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); return textsearch_find(config, state); +} + +/** + * skb_append_datato_frags: - append the user data to a skb + * @sk: sock structure + * @skb: skb structure to be appened with user data. 
+ * @getfrag: call back function to be used for getting the user data + * @from: pointer to user message iov + * @length: length of the iov message + * + * Description: This procedure append the user data in the fragment part + * of the skb if any page alloc fails user this procedure returns -ENOMEM + */ +int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, + int (*getfrag)(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length) +{ + int frg_cnt = 0; + skb_frag_t *frag = NULL; + struct page *page = NULL; + int copy, left; + int offset = 0; + int ret; + + do { + /* Return error if we don't have space for new frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + if (frg_cnt >= MAX_SKB_FRAGS) + return -EFAULT; + + /* allocate a new page for next frag */ + page = alloc_pages(sk->sk_allocation, 0); + + /* If alloc_page fails just return failure and caller will + * free previous allocated pages by doing kfree_skb() + */ + if (page == NULL) + return -ENOMEM; + + /* initialize the next frag */ + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; + skb_fill_page_desc(skb, frg_cnt, page, 0, 0); + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); + + /* get the new initialized frag */ + frg_cnt = skb_shinfo(skb)->nr_frags; + frag = &skb_shinfo(skb)->frags[frg_cnt - 1]; + + /* copy the user data to page */ + left = PAGE_SIZE - frag->page_offset; + copy = (length > left)? left : length; + + ret = getfrag(from, (page_address(frag->page) + + frag->page_offset + frag->size), + offset, copy, 0, skb); + if (ret < 0) + return -EFAULT; + + /* copy was successful so update the size parameters */ + sk->sk_sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + offset += copy; + length -= copy; + + } while (length > 0); + + return 0; } void __init skb_init(void) @@ -1771,3 +1864,4 @@ EXPORT_SYMBOL(skb_seq_read); EXPORT_SYMBOL(skb_abort_seq_read); EXPORT_SYMBOL(skb_find_text); +EXPORT_SYMBOL(skb_append_datato_frags); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/power/Makefile --- /dev/null Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/arch/i386/power/Makefile Wed Feb 1 18:00:19 2006 @@ -0,0 +1,4 @@ +obj-$(CONFIG_PM_LEGACY) += cpu.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += cpu.o +obj-$(CONFIG_ACPI_SLEEP) += cpu.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/lib/Makefile --- /dev/null Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/lib/Makefile Wed Feb 1 18:00:19 2006 @@ -0,0 +1,60 @@ +# +# Makefile for some libs needed in the kernel. 
+# + +lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ + bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ + idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \ + sha1.o + +lib-y += kobject.o kref.o kobject_uevent.o klist.o + +obj-y += sort.o parser.o halfmd4.o + +ifeq ($(CONFIG_DEBUG_KOBJECT),y) +CFLAGS_kobject.o += -DDEBUG +CFLAGS_kobject_uevent.o += -DDEBUG +endif + +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o +lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o +lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o +lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o +obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o +obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o + +ifneq ($(CONFIG_HAVE_DEC_LOCK),y) + lib-y += dec_and_lock.o +endif + +obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o +obj-$(CONFIG_CRC16) += crc16.o +obj-$(CONFIG_CRC32) += crc32.o +obj-$(CONFIG_LIBCRC32C) += libcrc32c.o +obj-$(CONFIG_GENERIC_IOMAP) += iomap.o +obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o + +obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ +obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ +obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ + +obj-$(CONFIG_TEXTSEARCH) += textsearch.o +obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o +obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o +obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o + +ifneq ($(CONFIG_XEN),y) +obj-$(CONFIG_SWIOTLB) += swiotlb.o +endif + +hostprogs-y := gen_crc32table +clean-files := crc32table.h + +$(obj)/crc32.o: $(obj)/crc32table.h + +quiet_cmd_crc32 = GEN $@ + cmd_crc32 = $< > $@ + +$(obj)/crc32table.h: $(obj)/gen_crc32table + $(call cmd,crc32) diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/mm/Kconfig --- /dev/null Wed Feb 1 17:06:16 2006 +++ b/linux-2.6-xen-sparse/mm/Kconfig Wed Feb 1 18:00:19 2006 @@ -0,0 +1,137 @@ +config SELECT_MEMORY_MODEL + def_bool y + depends on EXPERIMENTAL || ARCH_SELECT_MEMORY_MODEL + +choice + prompt "Memory model" + depends on SELECT_MEMORY_MODEL + default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT + default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT + default FLATMEM_MANUAL + +config FLATMEM_MANUAL + bool "Flat Memory" + depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE + help + This option allows you to change some of the ways that + Linux manages its memory internally. Most users will + only have one option here: FLATMEM. This is normal + and a correct option. + + Some users of more advanced features like NUMA and + memory hotplug may have different options here. + DISCONTIGMEM is an more mature, better tested system, + but is incompatible with memory hotplug and may suffer + decreased performance over SPARSEMEM. If unsure between + "Sparse Memory" and "Discontiguous Memory", choose + "Discontiguous Memory". + + If unsure, choose this option (Flat Memory) over any other. + +config DISCONTIGMEM_MANUAL + bool "Discontiguous Memory" + depends on ARCH_DISCONTIGMEM_ENABLE + help + This option provides enhanced support for discontiguous + memory systems, over FLATMEM. These systems have holes + in their physical address spaces, and this option provides + more efficient handling of these holes. However, the vast + majority of hardware has quite flat address spaces, and + can have degraded performance from extra overhead that + this option imposes. + + Many NUMA configurations will have this as the only option. + + If unsure, choose "Flat Memory" over this option. 
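The mm/Kconfig file being introduced here turns the memory-model decision into a user-visible choice plus derived def_bool symbols (the DISCONTIGMEM, SPARSEMEM and FLATMEM entries appear just below). As a rough illustration of how those derived symbols look from C, the sketch that follows is not part of this changeset and the helper name is invented; it only assumes that exactly one of the three symbols ends up defined for a given configuration.

#include <linux/config.h>	/* makes the CONFIG_* symbols chosen above visible */

/*
 * Invented helper, for illustration only: report which memory model the
 * Kconfig choice above resolved to.  The *_MANUAL choice is collapsed by
 * the def_bool entries below into exactly one of these three symbols.
 */
static const char *memory_model_name(void)
{
#if defined(CONFIG_SPARSEMEM)
	return "sparsemem";
#elif defined(CONFIG_DISCONTIGMEM)
	return "discontigmem";
#else	/* CONFIG_FLATMEM */
	return "flatmem";
#endif
}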
+ +config SPARSEMEM_MANUAL + bool "Sparse Memory" + depends on ARCH_SPARSEMEM_ENABLE + help + This will be the only option for some systems, including + memory hotplug systems. This is normal. + + For many other systems, this will be an alternative to + "Discontiguous Memory". This option provides some potential + performance benefits, along with decreased code complexity, + but it is newer, and more experimental. + + If unsure, choose "Discontiguous Memory" or "Flat Memory" + over this option. + +endchoice + +config DISCONTIGMEM + def_bool y + depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL + +config SPARSEMEM + def_bool y + depends on SPARSEMEM_MANUAL + +config FLATMEM + def_bool y + depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL + +config FLAT_NODE_MEM_MAP + def_bool y + depends on !SPARSEMEM + +# +# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's +# to represent different areas of memory. This variable allows +# those dependencies to exist individually. +# +config NEED_MULTIPLE_NODES + def_bool y + depends on DISCONTIGMEM || NUMA + +config HAVE_MEMORY_PRESENT + def_bool y + depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM + +# +# SPARSEMEM_EXTREME (which is the default) does some bootmem +# allocations when memory_present() is called. If this can not +# be done on your architecture, select this option. However, +# statically allocating the mem_section[] array can potentially +# consume vast quantities of .bss, so be careful. +# +# This option will also potentially produce smaller runtime code +# with gcc 3.4 and later. +# +config SPARSEMEM_STATIC + def_bool n + +# +# Architectecture platforms which require a two level mem_section in SPARSEMEM +# must select this option. This is usually for architecture platforms with +# an extremely sparse physical address space. +# +config SPARSEMEM_EXTREME + def_bool y + depends on SPARSEMEM && !SPARSEMEM_STATIC + +# eventually, we can have this option just 'select SPARSEMEM' +config MEMORY_HOTPLUG + bool "Allow for memory hot-add" + depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + +comment "Memory hotplug is currently incompatible with Software Suspend" + depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND + +# Heavily threaded applications may benefit from splitting the mm-wide +# page_table_lock, so that faults on different parts of the user address +# space can be handled with less contention: split it at this NR_CPUS. +# Default to 4 for wider testing, though 8 might be more appropriate. +# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. +# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# XEN uses the mapping field on pagetable pages to store a pointer to +# the destructor. 
+# +config SPLIT_PTLOCK_CPUS + int + default "4096" if ARM && !CPU_CACHE_VIPT + default "4096" if PARISC && !PA20 + default "4096" if XEN + default "4" diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/arch/i386/kernel/apm.c --- a/linux-2.6-xen-sparse/arch/i386/kernel/apm.c Wed Feb 1 17:06:16 2006 +++ /dev/null Wed Feb 1 18:00:19 2006 @@ -1,2420 +0,0 @@ -/* -*- linux-c -*- - * APM BIOS driver for Linux - * Copyright 1994-2001 Stephen Rothwell (sfr@xxxxxxxxxxxxxxxx) - * - * Initial development of this driver was funded by NEC Australia P/L - * and NEC Corporation - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2, or (at your option) any - * later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * October 1995, Rik Faith (faith@xxxxxxxxxx): - * Minor enhancements and updates (to the patch set) for 1.3.x - * Documentation - * January 1996, Rik Faith (faith@xxxxxxxxxx): - * Make /proc/apm easy to format (bump driver version) - * March 1996, Rik Faith (faith@xxxxxxxxxx): - * Prohibit APM BIOS calls unless apm_enabled. - * (Thanks to Ulrich Windl <Ulrich.Windl@xxxxxxxxxxxxxxxxxxxx>) - * April 1996, Stephen Rothwell (sfr@xxxxxxxxxxxxxxxx) - * Version 1.0 and 1.1 - * May 1996, Version 1.2 - * Feb 1998, Version 1.3 - * Feb 1998, Version 1.4 - * Aug 1998, Version 1.5 - * Sep 1998, Version 1.6 - * Nov 1998, Version 1.7 - * Jan 1999, Version 1.8 - * Jan 1999, Version 1.9 - * Oct 1999, Version 1.10 - * Nov 1999, Version 1.11 - * Jan 2000, Version 1.12 - * Feb 2000, Version 1.13 - * Nov 2000, Version 1.14 - * Oct 2001, Version 1.15 - * Jan 2002, Version 1.16 - * Oct 2002, Version 1.16ac - * - * History: - * 0.6b: first version in official kernel, Linux 1.3.46 - * 0.7: changed /proc/apm format, Linux 1.3.58 - * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59 - * 0.9: only call bios if bios is present, Linux 1.3.72 - * 1.0: use fixed device number, consolidate /proc/apm into this file, - * Linux 1.3.85 - * 1.1: support user-space standby and suspend, power off after system - * halted, Linux 1.3.98 - * 1.2: When resetting RTC after resume, take care so that the time - * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth - * <jtoth@xxxxxxxxxxxxx>); improve interaction between - * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4 - * 1.2a:Simple change to stop mysterious bug reports with SMP also added - * levels to the printk calls. APM is not defined for SMP machines. - * The new replacment for it is, but Linux doesn't yet support this. - * Alan Cox Linux 2.1.55 - * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's - * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by - * Dean Gaudet <dgaudet@xxxxxxxxxx>. - * C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx> Linux 2.1.87 - * 1.5: Fix segment register reloading (in case of bad segments saved - * across BIOS call). - * Stephen Rothwell - * 1.6: Cope with complier/assembler differences. - * Only try to turn off the first display device. 
- * Fix OOPS at power off with no APM BIOS by Jan Echternach - * <echter@xxxxxxxxxxxxxxxxxxxxxxxxx> - * Stephen Rothwell - * 1.7: Modify driver's cached copy of the disabled/disengaged flags - * to reflect current state of APM BIOS. - * Chris Rankin <rankinc@xxxxxxxxxxxxx> - * Reset interrupt 0 timer to 100Hz after suspend - * Chad Miller <cmiller@xxxxxxxxxxxxx> - * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE - * Richard Gooch <rgooch@xxxxxxxxxxxxx> - * Allow boot time disabling of APM - * Make boot messages far less verbose by default - * Make asm safer - * Stephen Rothwell - * 1.8: Add CONFIG_APM_RTC_IS_GMT - * Richard Gooch <rgooch@xxxxxxxxxxxxx> - * change APM_NOINTS to CONFIG_APM_ALLOW_INTS - * remove dependency on CONFIG_PROC_FS - * Stephen Rothwell - * 1.9: Fix small typo. <laslo@xxxxxxxxxxxxxx> - * Try to cope with BIOS's that need to have all display - * devices blanked and not just the first one. - * Ross Paterson <ross@xxxxxxxxxxxxxx> - * Fix segment limit setting it has always been wrong as - * the segments needed to have byte granularity. - * Mark a few things __init. - * Add hack to allow power off of SMP systems by popular request. - * Use CONFIG_SMP instead of __SMP__ - * Ignore BOUNCES for three seconds. - * Stephen Rothwell - * 1.10: Fix for Thinkpad return code. - * Merge 2.2 and 2.3 drivers. - * Remove APM dependencies in arch/i386/kernel/process.c - * Remove APM dependencies in drivers/char/sysrq.c - * Reset time across standby. - * Allow more inititialisation on SMP. - * Remove CONFIG_APM_POWER_OFF and make it boot time - * configurable (default on). - * Make debug only a boot time parameter (remove APM_DEBUG). - * Try to blank all devices on any error. - * 1.11: Remove APM dependencies in drivers/char/console.c - * Check nr_running to detect if we are idle (from - * Borislav Deianov <borislav@xxxxxxxxxxxxxxxxxxxx>) - * Fix for bioses that don't zero the top part of the - * entrypoint offset (Mario Sitta <sitta@xxxxxxxxxxxx>) - * (reported by Panos Katsaloulis <teras@xxxxxxxxxxx>). - * Real mode power off patch (Walter Hofmann - * <Walter.Hofmann@xxxxxxxxxxxxxxxxxxxxxxxxxxx>). - * 1.12: Remove CONFIG_SMP as the compiler will optimize - * the code away anyway (smp_num_cpus == 1 in UP) - * noted by Artur Skawina <skawina@xxxxxxxxxxxxx>. - * Make power off under SMP work again. - * Fix thinko with initial engaging of BIOS. - * Make sure power off only happens on CPU 0 - * (Paul "Rusty" Russell <rusty@xxxxxxxxxxxxxxx>). - * Do error notification to user mode if BIOS calls fail. - * Move entrypoint offset fix to ...boot/setup.S - * where it belongs (Cosmos <gis88564@xxxxxxxxxxxxxxx>). - * Remove smp-power-off. SMP users must now specify - * "apm=power-off" on the kernel command line. Suggested - * by Jim Avera <jima@xxxxxxx>, modified by Alan Cox - * <alan@xxxxxxxxxxxxxxxxxxx>. - * Register the /proc/apm entry even on SMP so that - * scripts that check for it before doing power off - * work (Jim Avera <jima@xxxxxxx>). - * 1.13: Changes for new pm_ interfaces (Andy Henroid - * <andy_henroid@xxxxxxxxx>). - * Modularize the code. - * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS - * is now the way life works). - * Fix thinko in suspend() (wrong return). - * Notify drivers on critical suspend. - * Make kapmd absorb more idle time (Pavel Machek <pavel@xxxxxxx> - * modified by sfr). - * Disable interrupts while we are suspended (Andy Henroid - * <andy_henroid@xxxxxxxxx> fixed by sfr). 
- * Make power off work on SMP again (Tony Hoyle - * <tmh@xxxxxxxxxxxxxxxxx> and <zlatko@xxxxxxxx>) modified by sfr. - * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore - * interval is now configurable. - * 1.14: Make connection version persist across module unload/load. - * Enable and engage power management earlier. - * Disengage power management on module unload. - * Changed to use the sysrq-register hack for registering the - * power off function called by magic sysrq based upon discussions - * in irc://irc.openprojects.net/#kernelnewbies - * (Crutcher Dunnavant <crutcher+kernel@xxxxxxxxxxxxxx>). - * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable. - * (Arjan van de Ven <arjanv@xxxxxxxxxx>) modified by sfr. - * Work around byte swap bug in one of the Vaio's BIOS's - * (Marc Boucher <marc@xxxxxxx>). - * Exposed the disable flag to dmi so that we can handle known - * broken APM (Alan Cox <alan@xxxxxxxxxx>). - * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin - * calling it - instead idle. (Alan Cox <alan@xxxxxxxxxx>) - * If an APM idle fails log it and idle sensibly - * 1.15: Don't queue events to clients who open the device O_WRONLY. - * Don't expect replies from clients who open the device O_RDONLY. - * (Idea from Thomas Hood) - * Minor waitqueue cleanups. (John Fremlin <chief@xxxxxxxxxxx>) - * 1.16: Fix idle calling. (Andreas Steinmetz <ast@xxxxxxxx> et al.) - * Notify listeners of standby or suspend events before notifying - * drivers. Return EBUSY to ioctl() if suspend is rejected. - * (Russell King <rmk@xxxxxxxxxxxxxxxx> and Thomas Hood) - * Ignore first resume after we generate our own resume event - * after a suspend (Thomas Hood) - * Daemonize now gets rid of our controlling terminal (sfr). - * CONFIG_APM_CPU_IDLE now just affects the default value of - * idle_threshold (sfr). - * Change name of kernel apm daemon (as it no longer idles) (sfr). - * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we - * make _all_ APM calls on the CPU#0. Fix unsafe sign bug. - * TODO: determine if its "boot CPU" or "CPU0" we want to lock to. - * - * APM 1.1 Reference: - * - * Intel Corporation, Microsoft Corporation. Advanced Power Management - * (APM) BIOS Interface Specification, Revision 1.1, September 1993. - * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. - * - * [This document is available free from Intel by calling 800.628.8686 (fax - * 916.356.6100) or 800.548.4725; or via anonymous ftp from - * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also - * available from Microsoft by calling 206.882.8080.] - * - * APM 1.2 Reference: - * Intel Corporation, Microsoft Corporation. Advanced Power Management - * (APM) BIOS Interface Specification, Revision 1.2, February 1996. 
- * - * [This document is available from Microsoft at: - * http://www.microsoft.com/hwdev/busbios/amp_12.htm] - */ - -#include <linux/config.h> -#include <linux/module.h> - -#include <linux/poll.h> -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/timer.h> -#include <linux/fcntl.h> -#include <linux/slab.h> -#include <linux/stat.h> -#include <linux/proc_fs.h> -#include <linux/miscdevice.h> -#include <linux/apm_bios.h> -#include <linux/init.h> -#include <linux/time.h> -#include <linux/sched.h> -#include <linux/pm.h> -#include <linux/device.h> -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/dmi.h> -#include <linux/suspend.h> - -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/desc.h> -#include <asm/i8253.h> - -#include "io_ports.h" - -extern unsigned long get_cmos_time(void); -extern void machine_real_restart(unsigned char *, int); - -#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) -extern int (*console_blank_hook)(int); -#endif - -/* - * The apm_bios device is one of the misc char devices. - * This is its minor number. - */ -#define APM_MINOR_DEV 134 - -/* - * See Documentation/Config.help for the configuration options. - * - * Various options can be changed at boot time as follows: - * (We allow underscores for compatibility with the modules code) - * apm=on/off enable/disable APM - * [no-]allow[-_]ints allow interrupts during BIOS calls - * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call - * [no-]realmode[-_]power[-_]off switch to real mode before - * powering off - * [no-]debug log some debugging messages - * [no-]power[-_]off power off on shutdown - * [no-]smp Use apm even on an SMP box - * bounce[-_]interval=<n> number of ticks to ignore suspend - * bounces - * idle[-_]threshold=<n> System idle percentage above which to - * make APM BIOS idle calls. Set it to - * 100 to disable. - * idle[-_]period=<n> Period (in 1/100s of a second) over - * which the idle percentage is - * calculated. - */ - -/* KNOWN PROBLEM MACHINES: - * - * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant - * [Confirmed by TI representative] - * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification - * [Confirmed by BIOS disassembly] - * [This may work now ...] - * P: Toshiba 1950S: battery life information only gets updated after resume - * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking - * broken in BIOS [Reported by Garst R. Reese <reese@xxxxxxx>] - * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP - * Neale Banks <neale@xxxxxxxxxxxxxxxx> December 2000 - * - * Legend: U = unusable with APM patches - * P = partially usable with APM patches - */ - -/* - * Define as 1 to make the driver always call the APM BIOS busy - * routine even if the clock was not reported as slowed by the - * idle routine. Otherwise, define as 0. - */ -#define ALWAYS_CALL_BUSY 1 - -/* - * Define to make the APM BIOS calls zero all data segment registers (so - * that an incorrect BIOS implementation will cause a kernel panic if it - * tries to write to arbitrary memory). - */ -#define APM_ZERO_SEGS - -#include "apm.h" - -/* - * Define to make all _set_limit calls use 64k limits. The APM 1.1 BIOS is - * supposed to provide limit information that it recognizes. Many machines - * do this correctly, but many others do not restrict themselves to their - * claimed limit. When this happens, they will cause a segmentation - * violation in the kernel at boot time. 
Most BIOS's, however, will - * respect a 64k limit, so we use that. If you want to be pedantic and - * hold your BIOS to its claims, then undefine this. - */ -#define APM_RELAX_SEGMENTS - -/* - * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. - * This patched by Chad Miller <cmiller@xxxxxxxxxxxxx>, original code by - * David Chen <chen@xxxxxxxxxxxxxx> - */ -#undef INIT_TIMER_AFTER_SUSPEND - -#ifdef INIT_TIMER_AFTER_SUSPEND -#include <linux/timex.h> -#include <asm/io.h> -#include <linux/delay.h> -#endif - -/* - * Need to poll the APM BIOS every second - */ -#define APM_CHECK_TIMEOUT (HZ) - -/* - * Ignore suspend events for this amount of time after a resume - */ -#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) - -/* - * Maximum number of events stored - */ -#define APM_MAX_EVENTS 20 - -/* - * The per-file APM data - */ -struct apm_user { - int magic; - struct apm_user * next; - unsigned int suser: 1; - unsigned int writer: 1; - unsigned int reader: 1; - unsigned int suspend_wait: 1; - int suspend_result; - int suspends_pending; - int standbys_pending; - int suspends_read; - int standbys_read; - int event_head; - int event_tail; - apm_event_t events[APM_MAX_EVENTS]; -}; - -/* - * The magic number in apm_user - */ -#define APM_BIOS_MAGIC 0x4101 - -/* - * idle percentage above which bios idle calls are done - */ -#ifdef CONFIG_APM_CPU_IDLE -#define DEFAULT_IDLE_THRESHOLD 95 -#else -#define DEFAULT_IDLE_THRESHOLD 100 -#endif -#define DEFAULT_IDLE_PERIOD (100 / 3) - -/* - * Local variables - */ -static struct { - unsigned long offset; - unsigned short segment; -} apm_bios_entry; -static int clock_slowed; -static int idle_threshold = DEFAULT_IDLE_THRESHOLD; -static int idle_period = DEFAULT_IDLE_PERIOD; -static int set_pm_idle; -static int suspends_pending; -static int standbys_pending; -static int ignore_sys_suspend; -static int ignore_normal_resume; -static int bounce_interval = DEFAULT_BOUNCE_INTERVAL; - -#ifdef CONFIG_APM_RTC_IS_GMT -# define clock_cmos_diff 0 -# define got_clock_diff 1 -#else -static long clock_cmos_diff; -static int got_clock_diff; -#endif -static int debug; -static int smp; -static int apm_disabled = -1; -#ifdef CONFIG_SMP -static int power_off; -#else -static int power_off = 1; -#endif -#ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; -#else -static int realmode_power_off; -#endif -static int exit_kapmd; -static int kapmd_running; -#ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; -#else -static int allow_ints; -#endif -static int broken_psr; - -static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); -static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); -static struct apm_user * user_list; -static DEFINE_SPINLOCK(user_list_lock); -static struct desc_struct bad_bios_desc = { 0, 0x00409200 }; - -static char driver_version[] = "1.16ac"; /* no spaces */ - -/* - * APM event names taken from the APM 1.2 specification. These are - * the message codes that the BIOS uses to tell us about events - */ -static char * apm_event_name[] = { - "system standby", - "system suspend", - "normal resume", - "critical resume", - "low battery", - "power status change", - "update time", - "critical suspend", - "user standby", - "user suspend", - "system standby resume", - "capabilities change" -}; -#define NR_APM_EVENT_NAME \ - (sizeof(apm_event_name) / sizeof(apm_event_name[0])) - -typedef struct lookup_t { - int key; - char * msg; -} lookup_t; - -/* - * The BIOS returns a set of standard error codes in AX when the - * carry flag is set. 
- */ - -static const lookup_t error_table[] = { -/* N/A { APM_SUCCESS, "Operation succeeded" }, */ - { APM_DISABLED, "Power management disabled" }, - { APM_CONNECTED, "Real mode interface already connected" }, - { APM_NOT_CONNECTED, "Interface not connected" }, - { APM_16_CONNECTED, "16 bit interface already connected" }, -/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */ - { APM_32_CONNECTED, "32 bit interface already connected" }, - { APM_32_UNSUPPORTED, "32 bit interface not supported" }, - { APM_BAD_DEVICE, "Unrecognized device ID" }, - { APM_BAD_PARAM, "Parameter out of range" }, - { APM_NOT_ENGAGED, "Interface not engaged" }, - { APM_BAD_FUNCTION, "Function not supported" }, - { APM_RESUME_DISABLED, "Resume timer disabled" }, - { APM_BAD_STATE, "Unable to enter requested state" }, -/* N/A { APM_NO_EVENTS, "No events pending" }, */ - { APM_NO_ERROR, "BIOS did not set a return code" }, - { APM_NOT_PRESENT, "No APM present" } -}; -#define ERROR_COUNT (sizeof(error_table)/sizeof(lookup_t)) - -/** - * apm_error - display an APM error - * @str: information string - * @err: APM BIOS return code - * - * Write a meaningful log entry to the kernel log in the event of - * an APM error. - */ - -static void apm_error(char *str, int err) -{ - int i; - - for (i = 0; i < ERROR_COUNT; i++) - if (error_table[i].key == err) break; - if (i < ERROR_COUNT) - printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); - else - printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", - str, err); -} - -/* - * Lock APM functionality to physical CPU 0 - */ - -#ifdef CONFIG_SMP - -static cpumask_t apm_save_cpus(void) -{ - cpumask_t x = current->cpus_allowed; - /* Some bioses don't like being called from CPU != 0 */ - set_cpus_allowed(current, cpumask_of_cpu(0)); - BUG_ON(smp_processor_id() != 0); - return x; -} - -static inline void apm_restore_cpus(cpumask_t mask) -{ - set_cpus_allowed(current, mask); -} - -#else - -/* - * No CPU lockdown needed on a uniprocessor - */ - -#define apm_save_cpus() (current->cpus_allowed) -#define apm_restore_cpus(x) (void)(x) - -#endif - -/* - * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and - * apm_info.allow_ints, we are being really paranoid here! Not only - * are interrupts disabled, but all the segment registers (except SS) - * are saved and zeroed this means that if the BIOS tries to reference - * any data without explicitly loading the segment registers, the kernel - * will fault immediately rather than have some unforeseen circumstances - * for the rest of the kernel. And it will be very obvious! :-) Doing - * this depends on CS referring to the same physical memory as DS so that - * DS can be zeroed before the call. Unfortunately, we can't do anything - * about the stack segment/pointer. Also, we tell the compiler that - * everything could change. - * - * Also, we KNOW that for the non error case of apm_bios_call, there - * is no useful data returned in the low order 8 bits of eax. 
- */ -#define APM_DO_CLI \ - if (apm_info.allow_ints) \ - local_irq_enable(); \ - else \ - local_irq_disable(); - -#ifdef APM_ZERO_SEGS -# define APM_DECL_SEGS \ - unsigned int saved_fs; unsigned int saved_gs; -# define APM_DO_SAVE_SEGS \ - savesegment(fs, saved_fs); savesegment(gs, saved_gs) -# define APM_DO_RESTORE_SEGS \ - loadsegment(fs, saved_fs); loadsegment(gs, saved_gs) -#else -# define APM_DECL_SEGS -# define APM_DO_SAVE_SEGS -# define APM_DO_RESTORE_SEGS -#endif - -/** - * apm_bios_call - Make an APM BIOS 32bit call - * @func: APM function to execute - * @ebx_in: EBX register for call entry - * @ecx_in: ECX register for call entry - * @eax: EAX register return - * @ebx: EBX register return - * @ecx: ECX register return - * @edx: EDX register return - * @esi: ESI register return - * - * Make an APM call using the 32bit protected mode interface. The - * caller is responsible for knowing if APM BIOS is configured and - * enabled. This call can disable interrupts for a long period of - * time on some laptops. The return value is in AH and the carry - * flag is loaded into AL. If there is an error, then the error - * code is returned in AH (bits 8-15 of eax) and this function - * returns non-zero. - */ - -static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) -{ - APM_DECL_SEGS - unsigned long flags; - cpumask_t cpus; - int cpu; - struct desc_struct save_desc_40; - - cpus = apm_save_cpus(); - - cpu = get_cpu(); - save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8]; - get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc; - - local_save_flags(flags); - APM_DO_CLI; - APM_DO_SAVE_SEGS; - apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); - APM_DO_RESTORE_SEGS; - local_irq_restore(flags); - get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40; - put_cpu(); - apm_restore_cpus(cpus); - - return *eax & 0xff; -} - -/** - * apm_bios_call_simple - make a simple APM BIOS 32bit call - * @func: APM function to invoke - * @ebx_in: EBX register value for BIOS call - * @ecx_in: ECX register value for BIOS call - * @eax: EAX register on return from the BIOS call - * - * Make a BIOS call that does only returns one value, or just status. - * If there is an error, then the error code is returned in AH - * (bits 8-15 of eax) and this function returns non-zero. This is - * used for simpler BIOS operations. This call may hold interrupts - * off for a long time on some laptops. - */ - -static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) -{ - u8 error; - APM_DECL_SEGS - unsigned long flags; - cpumask_t cpus; - int cpu; - struct desc_struct save_desc_40; - - - cpus = apm_save_cpus(); - - cpu = get_cpu(); - save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8]; - get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc; - - local_save_flags(flags); - APM_DO_CLI; - APM_DO_SAVE_SEGS; - error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); - APM_DO_RESTORE_SEGS; - local_irq_restore(flags); - get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40; - put_cpu(); - apm_restore_cpus(cpus); - return error; -} - -/** - * apm_driver_version - APM driver version - * @val: loaded with the APM version on return - * - * Retrieve the APM version supported by the BIOS. This is only - * supported for APM 1.1 or higher. An error indicates APM 1.0 is - * probably present. 
- * - * On entry val should point to a value indicating the APM driver - * version with the high byte being the major and the low byte the - * minor number both in BCD - * - * On return it will hold the BIOS revision supported in the - * same format. - */ - -static int apm_driver_version(u_short *val) -{ - u32 eax; - - if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) - return (eax >> 8) & 0xff; - *val = eax; - return APM_SUCCESS; -} - -/** - * apm_get_event - get an APM event from the BIOS - * @event: pointer to the event - * @info: point to the event information - * - * The APM BIOS provides a polled information for event - * reporting. The BIOS expects to be polled at least every second - * when events are pending. When a message is found the caller should - * poll until no more messages are present. However, this causes - * problems on some laptops where a suspend event notification is - * not cleared until it is acknowledged. - * - * Additional information is returned in the info pointer, providing - * that APM 1.2 is in use. If no messges are pending the value 0x80 - * is returned (No power management events pending). - */ - -static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) -{ - u32 eax; - u32 ebx; - u32 ecx; - u32 dummy; - - if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, - &dummy, &dummy)) - return (eax >> 8) & 0xff; - *event = ebx; - if (apm_info.connection_version < 0x0102) - *info = ~0; /* indicate info not valid */ - else - *info = ecx; - return APM_SUCCESS; -} - -/** - * set_power_state - set the power management state - * @what: which items to transition - * @state: state to transition to - * - * Request an APM change of state for one or more system devices. The - * processor state must be transitioned last of all. what holds the - * class of device in the upper byte and the device number (0xFF for - * all) for the object to be transitioned. - * - * The state holds the state to transition to, which may in fact - * be an acceptance of a BIOS requested state change. - */ - -static int set_power_state(u_short what, u_short state) -{ - u32 eax; - - if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) - return (eax >> 8) & 0xff; - return APM_SUCCESS; -} - -/** - * set_system_power_state - set system wide power state - * @state: which state to enter - * - * Transition the entire system into a new APM power state. - */ - -static int set_system_power_state(u_short state) -{ - return set_power_state(APM_DEVICE_ALL, state); -} - -/** - * apm_do_idle - perform power saving - * - * This function notifies the BIOS that the processor is (in the view - * of the OS) idle. It returns -1 in the event that the BIOS refuses - * to handle the idle request. On a success the function returns 1 - * if the BIOS did clock slowing or 0 otherwise. - */ - -static int apm_do_idle(void) -{ - u32 eax; - - if (apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax)) { - static unsigned long t; - - /* This always fails on some SMP boards running UP kernels. - * Only report the failure the first 5 times. - */ - if (++t < 5) - { - printk(KERN_DEBUG "apm_do_idle failed (%d)\n", - (eax >> 8) & 0xff); - t = jiffies; - } - return -1; - } - clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0; - return clock_slowed; -} - -/** - * apm_do_busy - inform the BIOS the CPU is busy - * - * Request that the BIOS brings the CPU back to full performance. 
- */ - -static void apm_do_busy(void) -{ - u32 dummy; - - if (clock_slowed || ALWAYS_CALL_BUSY) { - (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); - clock_slowed = 0; - } -} - -/* - * If no process has really been interested in - * the CPU for some time, we want to call BIOS - * power management - we probably want - * to conserve power. - */ -#define IDLE_CALC_LIMIT (HZ * 100) -#define IDLE_LEAKY_MAX 16 - -static void (*original_pm_idle)(void); - -extern void default_idle(void); - -/** - * apm_cpu_idle - cpu idling for APM capable Linux - * - * This is the idling function the kernel executes when APM is available. It - * tries to do BIOS powermanagement based on the average system idle time. - * Furthermore it calls the system default idle routine. - */ - -static void apm_cpu_idle(void) -{ - static int use_apm_idle; /* = 0 */ - static unsigned int last_jiffies; /* = 0 */ - static unsigned int last_stime; /* = 0 */ - - int apm_idle_done = 0; - unsigned int jiffies_since_last_check = jiffies - last_jiffies; - unsigned int bucket; - -recalc: - if (jiffies_since_last_check > IDLE_CALC_LIMIT) { - use_apm_idle = 0; - last_jiffies = jiffies; - last_stime = current->stime; - } else if (jiffies_since_last_check > idle_period) { - unsigned int idle_percentage; - - idle_percentage = current->stime - last_stime; - idle_percentage *= 100; - idle_percentage /= jiffies_since_last_check; - use_apm_idle = (idle_percentage > idle_threshold); - if (apm_info.forbid_idle) - use_apm_idle = 0; - last_jiffies = jiffies; - last_stime = current->stime; - } - - bucket = IDLE_LEAKY_MAX; - - while (!need_resched()) { - if (use_apm_idle) { - unsigned int t; - - t = jiffies; - switch (apm_do_idle()) { - case 0: apm_idle_done = 1; - if (t != jiffies) { - if (bucket) { - bucket = IDLE_LEAKY_MAX; - continue; - } - } else if (bucket) { - bucket--; - continue; - } - break; - case 1: apm_idle_done = 1; - break; - default: /* BIOS refused */ - break; - } - } - if (original_pm_idle) - original_pm_idle(); - else - default_idle(); - jiffies_since_last_check = jiffies - last_jiffies; - if (jiffies_since_last_check > idle_period) - goto recalc; - } - - if (apm_idle_done) - apm_do_busy(); -} - -/** - * apm_power_off - ask the BIOS to power off - * - * Handle the power off sequence. This is the one piece of code we - * will execute even on SMP machines. In order to deal with BIOS - * bugs we support real mode APM BIOS power off calls. We also make - * the SMP call on CPU0 as some systems will only honour this call - * on their first cpu. - */ - -static void apm_power_off(void) -{ - unsigned char po_bios_call[] = { - 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ - 0x8e, 0xd0, /* movw ax,ss */ - 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ - 0xb8, 0x07, 0x53, /* movw $0x5307,ax */ - 0xbb, 0x01, 0x00, /* movw $0x0001,bx */ - 0xb9, 0x03, 0x00, /* movw $0x0003,cx */ - 0xcd, 0x15 /* int $0x15 */ - }; - - /* Some bioses don't like being called from CPU != 0 */ - if (apm_info.realmode_power_off) - { - (void)apm_save_cpus(); - machine_real_restart(po_bios_call, sizeof(po_bios_call)); - } - else - (void) set_system_power_state(APM_STATE_OFF); -} - -#ifdef CONFIG_APM_DO_ENABLE - -/** - * apm_enable_power_management - enable BIOS APM power management - * @enable: enable yes/no - * - * Enable or disable the APM BIOS power services. 
- */ - -static int apm_enable_power_management(int enable) -{ - u32 eax; - - if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) - return APM_NOT_ENGAGED; - if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, - enable, &eax)) - return (eax >> 8) & 0xff; - if (enable) - apm_info.bios.flags &= ~APM_BIOS_DISABLED; - else - apm_info.bios.flags |= APM_BIOS_DISABLED; - return APM_SUCCESS; -} -#endif - -/** - * apm_get_power_status - get current power state - * @status: returned status - * @bat: battery info - * @life: estimated life - * - * Obtain the current power status from the APM BIOS. We return a - * status which gives the rough battery status, and current power - * source. The bat value returned give an estimate as a percentage - * of life and a status value for the battery. The estimated life - * if reported is a lifetime in secodnds/minutes at current powwer - * consumption. - */ - -static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) -{ - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 dummy; - - if (apm_info.get_power_status_broken) - return APM_32_UNSUPPORTED; - if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, - &eax, &ebx, &ecx, &edx, &dummy)) - return (eax >> 8) & 0xff; - *status = ebx; - *bat = ecx; - if (apm_info.get_power_status_swabinminutes) { - *life = swab16((u16)edx); - *life |= 0x8000; - } else - *life = edx; - return APM_SUCCESS; -} - -#if 0 -static int apm_get_battery_status(u_short which, u_short *status, - u_short *bat, u_short *life, u_short *nbat) -{ - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - - if (apm_info.connection_version < 0x0102) { - /* pretend we only have one battery. */ - if (which != 1) - return APM_BAD_DEVICE; - *nbat = 1; - return apm_get_power_status(status, bat, life); - } - - if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, - &ebx, &ecx, &edx, &esi)) - return (eax >> 8) & 0xff; - *status = ebx; - *bat = ecx; - *life = edx; - *nbat = esi; - return APM_SUCCESS; -} -#endif - -/** - * apm_engage_power_management - enable PM on a device - * @device: identity of device - * @enable: on/off - * - * Activate or deactive power management on either a specific device - * or the entire system (%APM_DEVICE_ALL). - */ - -static int apm_engage_power_management(u_short device, int enable) -{ - u32 eax; - - if ((enable == 0) && (device == APM_DEVICE_ALL) - && (apm_info.bios.flags & APM_BIOS_DISABLED)) - return APM_DISABLED; - if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax)) - return (eax >> 8) & 0xff; - if (device == APM_DEVICE_ALL) { - if (enable) - apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; - else - apm_info.bios.flags |= APM_BIOS_DISENGAGED; - } - return APM_SUCCESS; -} - -#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) - -/** - * apm_console_blank - blank the display - * @blank: on/off - * - * Attempt to blank the console, firstly by blanking just video device - * zero, and if that fails (some BIOSes don't support it) then it blanks - * all video devices. Typically the BIOS will do laptop backlight and - * monitor powerdown for us. - */ - -static int apm_console_blank(int blank) -{ - int error; - u_short state; - - state = blank ? 
APM_STATE_STANDBY : APM_STATE_READY; - /* Blank the first display device */ - error = set_power_state(0x100, state); - if ((error != APM_SUCCESS) && (error != APM_NO_ERROR)) { - /* try to blank them all instead */ - error = set_power_state(0x1ff, state); - if ((error != APM_SUCCESS) && (error != APM_NO_ERROR)) - /* try to blank device one instead */ - error = set_power_state(0x101, state); - } - if ((error == APM_SUCCESS) || (error == APM_NO_ERROR)) - return 1; - if (error == APM_NOT_ENGAGED) { - static int tried; - int eng_error; - if (tried++ == 0) { - eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1); - if (eng_error) { - apm_error("set display", error); - apm_error("engage interface", eng_error); - return 0; - } else - return apm_console_blank(blank); - } - } - apm_error("set display", error); - return 0; -} -#endif - -static int queue_empty(struct apm_user *as) -{ - return as->event_head == as->event_tail; -} - -static apm_event_t get_queued_event(struct apm_user *as) -{ - as->event_tail = (as->event_tail + 1) % APM_MAX_EVENTS; - return as->events[as->event_tail]; -} - -static void queue_event(apm_event_t event, struct apm_user *sender) -{ - struct apm_user * as; - - spin_lock(&user_list_lock); - if (user_list == NULL) - goto out; - for (as = user_list; as != NULL; as = as->next) { - if ((as == sender) || (!as->reader)) - continue; - as->event_head = (as->event_head + 1) % APM_MAX_EVENTS; - if (as->event_head == as->event_tail) { - static int notified; - - if (notified++ == 0) - printk(KERN_ERR "apm: an event queue overflowed\n"); - as->event_tail = (as->event_tail + 1) % APM_MAX_EVENTS; - } - as->events[as->event_head] = event; - if ((!as->suser) || (!as->writer)) - continue; - switch (event) { - case APM_SYS_SUSPEND: - case APM_USER_SUSPEND: - as->suspends_pending++; - suspends_pending++; - break; - - case APM_SYS_STANDBY: - case APM_USER_STANDBY: - as->standbys_pending++; - standbys_pending++; - break; - } - } - wake_up_interruptible(&apm_waitqueue); -out: - spin_unlock(&user_list_lock); -} - -static void set_time(void) -{ - if (got_clock_diff) { /* Must know time zone in order to set clock */ - xtime.tv_sec = get_cmos_time() + clock_cmos_diff; - xtime.tv_nsec = 0; - } -} - -static void get_time_diff(void) -{ -#ifndef CONFIG_APM_RTC_IS_GMT - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - got_clock_diff = 1; -#endif -} - -static void reinit_timer(void) -{ -#ifdef INIT_TIMER_AFTER_SUSPEND - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - /* set the clock to 100 Hz */ - outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8, PIT_CH0); /* MSB */ - udelay(10); - spin_unlock_irqrestore(&i8253_lock, flags); -#endif -} - -static int suspend(int vetoable) -{ - int err; - struct apm_user *as; - - if (pm_send_all(PM_SUSPEND, (void *)3)) { - /* Vetoed */ - if (vetoable) { - if (apm_info.connection_version > 0x100) - set_system_power_state(APM_STATE_REJECT); - err = -EBUSY; - ignore_sys_suspend = 0; - printk(KERN_WARNING "apm: suspend was vetoed.\n"); - goto out; - } - printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n"); - } - - device_suspend(PMSG_SUSPEND); - local_irq_disable(); - device_power_down(PMSG_SUSPEND); - - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - - /* protect against access to timer chip registers */ - 
spin_lock(&i8253_lock); - - get_time_diff(); - /* - * Irq spinlock must be dropped around set_system_power_state. - * We'll undo any timer changes due to interrupts below. - */ - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); - local_irq_enable(); - - save_processor_state(); - err = set_system_power_state(APM_STATE_SUSPEND); - ignore_normal_resume = 1; - restore_processor_state(); - - local_irq_disable(); - write_seqlock(&xtime_lock); - spin_lock(&i8253_lock); - reinit_timer(); - set_time(); - - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); - - if (err == APM_NO_ERROR) - err = APM_SUCCESS; - if (err != APM_SUCCESS) - apm_error("suspend", err); - err = (err == APM_SUCCESS) ? 0 : -EIO; - device_power_up(); - local_irq_enable(); - device_resume(); - pm_send_all(PM_RESUME, (void *)0); - queue_event(APM_NORMAL_RESUME, NULL); - out: - spin_lock(&user_list_lock); - for (as = user_list; as != NULL; as = as->next) { - as->suspend_wait = 0; - as->suspend_result = err; - } - spin_unlock(&user_list_lock); - wake_up_interruptible(&apm_suspend_waitqueue); - return err; -} - -static void standby(void) -{ - int err; - - local_irq_disable(); - device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - /* If needed, notify drivers here */ - get_time_diff(); - write_sequnlock(&xtime_lock); - local_irq_enable(); - - err = set_system_power_state(APM_STATE_STANDBY); - if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) - apm_error("standby", err); - - local_irq_disable(); - device_power_up(); - local_irq_enable(); -} - -static apm_event_t get_event(void) -{ - int error; - apm_event_t event; - apm_eventinfo_t info; - - static int notified; - - /* we don't use the eventinfo */ - error = apm_get_event(&event, &info); - if (error == APM_SUCCESS) - return event; - - if ((error != APM_NO_EVENTS) && (notified++ == 0)) - apm_error("get_event", error); - - return 0; -} - -static void check_events(void) -{ - apm_event_t event; - static unsigned long last_resume; - static int ignore_bounce; - - while ((event = get_event()) != 0) { - if (debug) { - if (event <= NR_APM_EVENT_NAME) - printk(KERN_DEBUG "apm: received %s notify\n", - apm_event_name[event - 1]); - else - printk(KERN_DEBUG "apm: received unknown " - "event 0x%02x\n", event); - } - if (ignore_bounce - && ((jiffies - last_resume) > bounce_interval)) - ignore_bounce = 0; - - switch (event) { - case APM_SYS_STANDBY: - case APM_USER_STANDBY: - queue_event(event, NULL); - if (standbys_pending <= 0) - standby(); - break; - - case APM_USER_SUSPEND: -#ifdef CONFIG_APM_IGNORE_USER_SUSPEND - if (apm_info.connection_version > 0x100) - set_system_power_state(APM_STATE_REJECT); - break; -#endif - case APM_SYS_SUSPEND: - if (ignore_bounce) { - if (apm_info.connection_version > 0x100) - set_system_power_state(APM_STATE_REJECT); - break; - } - /* - * If we are already processing a SUSPEND, - * then further SUSPEND events from the BIOS - * will be ignored. We also return here to - * cope with the fact that the Thinkpads keep - * sending a SUSPEND event until something else - * happens! 
- */ - if (ignore_sys_suspend) - return; - ignore_sys_suspend = 1; - queue_event(event, NULL); - if (suspends_pending <= 0) - (void) suspend(1); - break; - - case APM_NORMAL_RESUME: - case APM_CRITICAL_RESUME: - case APM_STANDBY_RESUME: - ignore_sys_suspend = 0; - last_resume = jiffies; - ignore_bounce = 1; - if ((event != APM_NORMAL_RESUME) - || (ignore_normal_resume == 0)) { - write_seqlock_irq(&xtime_lock); - set_time(); - write_sequnlock_irq(&xtime_lock); - device_resume(); - pm_send_all(PM_RESUME, (void *)0); - queue_event(event, NULL); - } - ignore_normal_resume = 0; - break; - - case APM_CAPABILITY_CHANGE: - case APM_LOW_BATTERY: - case APM_POWER_STATUS_CHANGE: - queue_event(event, NULL); - /* If needed, notify drivers here */ - break; - - case APM_UPDATE_TIME: - write_seqlock_irq(&xtime_lock); - set_time(); - write_sequnlock_irq(&xtime_lock); - break; - - case APM_CRITICAL_SUSPEND: - /* - * We are not allowed to reject a critical suspend. - */ - (void) suspend(0); - break; - } - } -} - -static void apm_event_handler(void) -{ - static int pending_count = 4; - int err; - - if ((standbys_pending > 0) || (suspends_pending > 0)) { - if ((apm_info.connection_version > 0x100) && - (pending_count-- <= 0)) { - pending_count = 4; - if (debug) - printk(KERN_DEBUG "apm: setting state busy\n"); - err = set_system_power_state(APM_STATE_BUSY); - if (err) - apm_error("busy", err); - } - } else - pending_count = 4; - check_events(); -} - -/* - * This is the APM thread main loop. - */ - -static void apm_mainloop(void) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&apm_waitqueue, &wait); - set_current_state(TASK_INTERRUPTIBLE); - for (;;) { - schedule_timeout(APM_CHECK_TIMEOUT); - if (exit_kapmd) - break; - /* - * Ok, check all events, check for idle (and mark us sleeping - * so as not to count towards the load average).. 
- */ - set_current_state(TASK_INTERRUPTIBLE); - apm_event_handler(); - } - remove_wait_queue(&apm_waitqueue, &wait); -} - -static int check_apm_user(struct apm_user *as, const char *func) -{ - if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { - printk(KERN_ERR "apm: %s passed bad filp\n", func); - return 1; - } - return 0; -} - -static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) -{ - struct apm_user * as; - int i; - apm_event_t event; - - as = fp->private_data; - if (check_apm_user(as, "read")) - return -EIO; - if ((int)count < sizeof(apm_event_t)) - return -EINVAL; - if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK)) - return -EAGAIN; - wait_event_interruptible(apm_waitqueue, !queue_empty(as)); - i = count; - while ((i >= sizeof(event)) && !queue_empty(as)) { - event = get_queued_event(as); - if (copy_to_user(buf, &event, sizeof(event))) { - if (i < count) - break; - return -EFAULT; - } - switch (event) { - case APM_SYS_SUSPEND: - case APM_USER_SUSPEND: - as->suspends_read++; - break; - - case APM_SYS_STANDBY: - case APM_USER_STANDBY: - as->standbys_read++; - break; - } - buf += sizeof(event); - i -= sizeof(event); - } - if (i < count) - return count - i; - if (signal_pending(current)) - return -ERESTARTSYS; - return 0; -} - -static unsigned int do_poll(struct file *fp, poll_table * wait) -{ - struct apm_user * as; - - as = fp->private_data; - if (check_apm_user(as, "poll")) - return 0; - poll_wait(fp, &apm_waitqueue, wait); - if (!queue_empty(as)) - return POLLIN | POLLRDNORM; - return 0; -} - -static int do_ioctl(struct inode * inode, struct file *filp, - u_int cmd, u_long arg) -{ - struct apm_user * as; - - as = filp->private_data; - if (check_apm_user(as, "ioctl")) - return -EIO; - if ((!as->suser) || (!as->writer)) - return -EPERM; - switch (cmd) { - case APM_IOC_STANDBY: - if (as->standbys_read > 0) { - as->standbys_read--; - as->standbys_pending--; - standbys_pending--; - } else - queue_event(APM_USER_STANDBY, as); - if (standbys_pending <= 0) - standby(); - break; - case APM_IOC_SUSPEND: - if (as->suspends_read > 0) { - as->suspends_read--; - as->suspends_pending--; - suspends_pending--; - } else - queue_event(APM_USER_SUSPEND, as); - if (suspends_pending <= 0) { - return suspend(1); - } else { - as->suspend_wait = 1; - wait_event_interruptible(apm_suspend_waitqueue, - as->suspend_wait == 0); - return as->suspend_result; - } - break; - default: - return -EINVAL; - } - return 0; -} - -static int do_release(struct inode * inode, struct file * filp) -{ - struct apm_user * as; - - as = filp->private_data; - if (check_apm_user(as, "release")) - return 0; - filp->private_data = NULL; - if (as->standbys_pending > 0) { - standbys_pending -= as->standbys_pending; - if (standbys_pending <= 0) - standby(); - } - if (as->suspends_pending > 0) { - suspends_pending -= as->suspends_pending; - if (suspends_pending <= 0) - (void) suspend(1); - } - spin_lock(&user_list_lock); - if (user_list == as) - user_list = as->next; - else { - struct apm_user * as1; - - for (as1 = user_list; - (as1 != NULL) && (as1->next != as); - as1 = as1->next) - ; - if (as1 == NULL) - printk(KERN_ERR "apm: filp not in user list\n"); - else - as1->next = as->next; - } - spin_unlock(&user_list_lock); - kfree(as); - return 0; -} - -static int do_open(struct inode * inode, struct file * filp) -{ - struct apm_user * as; - - as = (struct apm_user *)kmalloc(sizeof(*as), GFP_KERNEL); - if (as == NULL) { - printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", - sizeof(*as)); - 
return -ENOMEM; - } - as->magic = APM_BIOS_MAGIC; - as->event_tail = as->event_head = 0; - as->suspends_pending = as->standbys_pending = 0; - as->suspends_read = as->standbys_read = 0; - /* - * XXX - this is a tiny bit broken, when we consider BSD - * process accounting. If the device is opened by root, we - * instantly flag that we used superuser privs. Who knows, - * we might close the device immediately without doing a - * privileged operation -- cevans - */ - as->suser = capable(CAP_SYS_ADMIN); - as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE; - as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ; - spin_lock(&user_list_lock); - as->next = user_list; - user_list = as; - spin_unlock(&user_list_lock); - filp->private_data = as; - return 0; -} - -static int apm_get_info(char *buf, char **start, off_t fpos, int length) -{ - char * p; - unsigned short bx; - unsigned short cx; - unsigned short dx; - int error; - unsigned short ac_line_status = 0xff; - unsigned short battery_status = 0xff; - unsigned short battery_flag = 0xff; - int percentage = -1; - int time_units = -1; - char *units = "?"; - - p = buf; - - if ((num_online_cpus() == 1) && - !(error = apm_get_power_status(&bx, &cx, &dx))) { - ac_line_status = (bx >> 8) & 0xff; - battery_status = bx & 0xff; - if ((cx & 0xff) != 0xff) - percentage = cx & 0xff; - - if (apm_info.connection_version > 0x100) { - battery_flag = (cx >> 8) & 0xff; - if (dx != 0xffff) { - units = (dx & 0x8000) ? "min" : "sec"; - time_units = dx & 0x7fff; - } - } - } - /* Arguments, with symbols from linux/apm_bios.h. Information is - from the Get Power Status (0x0a) call unless otherwise noted. - - 0) Linux driver version (this will change if format changes) - 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2. - 2) APM flags from APM Installation Check (0x00): - bit 0: APM_16_BIT_SUPPORT - bit 1: APM_32_BIT_SUPPORT - bit 2: APM_IDLE_SLOWS_CLOCK - bit 3: APM_BIOS_DISABLED - bit 4: APM_BIOS_DISENGAGED - 3) AC line status - 0x00: Off-line - 0x01: On-line - 0x02: On backup power (BIOS >= 1.1 only) - 0xff: Unknown - 4) Battery status - 0x00: High - 0x01: Low - 0x02: Critical - 0x03: Charging - 0x04: Selected battery not present (BIOS >= 1.2 only) - 0xff: Unknown - 5) Battery flag - bit 0: High - bit 1: Low - bit 2: Critical - bit 3: Charging - bit 7: No system battery - 0xff: Unknown - 6) Remaining battery life (percentage of charge): - 0-100: valid - -1: Unknown - 7) Remaining battery life (time units): - Number of remaining minutes or seconds - -1: Unknown - 8) min = minutes; sec = seconds */ - - p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", - driver_version, - (apm_info.bios.version >> 8) & 0xff, - apm_info.bios.version & 0xff, - apm_info.bios.flags, - ac_line_status, - battery_status, - battery_flag, - percentage, - time_units, - units); - - return p - buf; -} - -static int apm(void *unused) -{ - unsigned short bx; - unsigned short cx; - unsigned short dx; - int error; - char * power_stat; - char * bat_stat; - - kapmd_running = 1; - - daemonize("kapmd"); - - current->flags |= PF_NOFREEZE; - -#ifdef CONFIG_SMP - /* 2002/08/01 - WT - * This is to avoid random crashes at boot time during initialization - * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. - * Some bioses don't like being called from CPU != 0. - * Method suggested by Ingo Molnar. 
- */ - set_cpus_allowed(current, cpumask_of_cpu(0)); - BUG_ON(smp_processor_id() != 0); -#endif - - if (apm_info.connection_version == 0) { - apm_info.connection_version = apm_info.bios.version; - if (apm_info.connection_version > 0x100) { - /* - * We only support BIOSs up to version 1.2 - */ - if (apm_info.connection_version > 0x0102) - apm_info.connection_version = 0x0102; - error = apm_driver_version(&apm_info.connection_version); - if (error != APM_SUCCESS) { - apm_error("driver version", error); - /* Fall back to an APM 1.0 connection. */ - apm_info.connection_version = 0x100; - } - } - } - - if (debug) - printk(KERN_INFO "apm: Connection version %d.%d\n", - (apm_info.connection_version >> 8) & 0xff, - apm_info.connection_version & 0xff); - -#ifdef CONFIG_APM_DO_ENABLE - if (apm_info.bios.flags & APM_BIOS_DISABLED) { - /* - * This call causes my NEC UltraLite Versa 33/C to hang if it - * is booted with PM disabled but not in the docking station. - * Unfortunate ... - */ - error = apm_enable_power_management(1); - if (error) { - apm_error("enable power management", error); - return -1; - } - } -#endif - - if ((apm_info.bios.flags & APM_BIOS_DISENGAGED) - && (apm_info.connection_version > 0x0100)) { - error = apm_engage_power_management(APM_DEVICE_ALL, 1); - if (error) { - apm_error("engage power management", error); - return -1; - } - } - - if (debug && (num_online_cpus() == 1 || smp )) { - error = apm_get_power_status(&bx, &cx, &dx); - if (error) - printk(KERN_INFO "apm: power status not available\n"); - else { - switch ((bx >> 8) & 0xff) { - case 0: power_stat = "off line"; break; - case 1: power_stat = "on line"; break; - case 2: power_stat = "on backup power"; break; - default: power_stat = "unknown"; break; - } - switch (bx & 0xff) { - case 0: bat_stat = "high"; break; - case 1: bat_stat = "low"; break; - case 2: bat_stat = "critical"; break; - case 3: bat_stat = "charging"; break; - default: bat_stat = "unknown"; break; - } - printk(KERN_INFO - "apm: AC %s, battery status %s, battery life ", - power_stat, bat_stat); - if ((cx & 0xff) == 0xff) - printk("unknown\n"); - else - printk("%d%%\n", cx & 0xff); - if (apm_info.connection_version > 0x100) { - printk(KERN_INFO - "apm: battery flag 0x%02x, battery life ", - (cx >> 8) & 0xff); - if (dx == 0xffff) - printk("unknown\n"); - else - printk("%d %s\n", dx & 0x7fff, - (dx & 0x8000) ? - "minutes" : "seconds"); - } - } - } - - /* Install our power off handler.. 
*/ - if (power_off) - pm_power_off = apm_power_off; - - if (num_online_cpus() == 1 || smp) { -#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) - console_blank_hook = apm_console_blank; -#endif - apm_mainloop(); -#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) - console_blank_hook = NULL; -#endif - } - kapmd_running = 0; - - return 0; -} - -#ifndef MODULE -static int __init apm_setup(char *str) -{ - int invert; - - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "off", 3) == 0) - apm_disabled = 1; - if (strncmp(str, "on", 2) == 0) - apm_disabled = 0; - if ((strncmp(str, "bounce-interval=", 16) == 0) || - (strncmp(str, "bounce_interval=", 16) == 0)) - bounce_interval = simple_strtol(str + 16, NULL, 0); - if ((strncmp(str, "idle-threshold=", 15) == 0) || - (strncmp(str, "idle_threshold=", 15) == 0)) - idle_threshold = simple_strtol(str + 15, NULL, 0); - if ((strncmp(str, "idle-period=", 12) == 0) || - (strncmp(str, "idle_period=", 12) == 0)) - idle_period = simple_strtol(str + 12, NULL, 0); - invert = (strncmp(str, "no-", 3) == 0) || - (strncmp(str, "no_", 3) == 0); - if (invert) - str += 3; - if (strncmp(str, "debug", 5) == 0) - debug = !invert; - if ((strncmp(str, "power-off", 9) == 0) || - (strncmp(str, "power_off", 9) == 0)) - power_off = !invert; - if (strncmp(str, "smp", 3) == 0) - { - smp = !invert; - idle_threshold = 100; - } - if ((strncmp(str, "allow-ints", 10) == 0) || - (strncmp(str, "allow_ints", 10) == 0)) - apm_info.allow_ints = !invert; - if ((strncmp(str, "broken-psr", 10) == 0) || - (strncmp(str, "broken_psr", 10) == 0)) - apm_info.get_power_status_broken = !invert; - if ((strncmp(str, "realmode-power-off", 18) == 0) || - (strncmp(str, "realmode_power_off", 18) == 0)) - apm_info.realmode_power_off = !invert; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("apm=", apm_setup); -#endif - -static struct file_operations apm_bios_fops = { - .owner = THIS_MODULE, - .read = do_read, - .poll = do_poll, - .ioctl = do_ioctl, - .open = do_open, - .release = do_release, -}; - -static struct miscdevice apm_device = { - APM_MINOR_DEV, - "apm_bios", - &apm_bios_fops -}; - - -/* Simple "print if true" callback */ -static int __init print_if_true(struct dmi_system_id *d) -{ - printk("%s\n", d->ident); - return 0; -} - -/* - * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was - * disabled before the suspend. Linux used to get terribly confused by that. - */ -static int __init broken_ps2_resume(struct dmi_system_id *d) -{ - printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); - return 0; -} - -/* Some bioses have a broken protected mode poweroff and need to use realmode */ -static int __init set_realmode_power_off(struct dmi_system_id *d) -{ - if (apm_info.realmode_power_off == 0) { - apm_info.realmode_power_off = 1; - printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); - } - return 0; -} - -/* Some laptops require interrupts to be enabled during APM calls */ -static int __init set_apm_ints(struct dmi_system_id *d) -{ - if (apm_info.allow_ints == 0) { - apm_info.allow_ints = 1; - printk(KERN_INFO "%s machine detected. 
Enabling interrupts during APM calls.\n", d->ident); - } - return 0; -} - -/* Some APM bioses corrupt memory or just plain do not work */ -static int __init apm_is_horked(struct dmi_system_id *d) -{ - if (apm_info.disabled == 0) { - apm_info.disabled = 1; - printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); - } - return 0; -} - -static int __init apm_is_horked_d850md(struct dmi_system_id *d) -{ - if (apm_info.disabled == 0) { - apm_info.disabled = 1; - printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); - printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); - printk(KERN_INFO "download from support.intel.com \n"); - } - return 0; -} - -/* Some APM bioses hang on APM idle calls */ -static int __init apm_likes_to_melt(struct dmi_system_id *d) -{ - if (apm_info.forbid_idle == 0) { - apm_info.forbid_idle = 1; - printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); - } - return 0; -} - -/* - * Check for clue free BIOS implementations who use - * the following QA technique - * - * [ Write BIOS Code ]<------ - * | ^ - * < Does it Compile >----N-- - * |Y ^ - * < Does it Boot Win98 >-N-- - * |Y - * [Ship It] - * - * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e) - * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000) - */ -static int __init broken_apm_power(struct dmi_system_id *d) -{ - apm_info.get_power_status_broken = 1; - printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); - return 0; -} - -/* - * This bios swaps the APM minute reporting bytes over (Many sony laptops - * have this problem). - */ -static int __init swab_apm_power_in_minutes(struct dmi_system_id *d) -{ - apm_info.get_power_status_swabinminutes = 1; - printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); - return 0; -} - -static struct dmi_system_id __initdata apm_dmi_table[] = { - { - print_if_true, - KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.", - { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), - DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), }, - }, - { /* Handle problems with APM on the C600 */ - broken_ps2_resume, "Dell Latitude C600", - { DMI_MATCH(DMI_SYS_VENDOR, "Dell"), - DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), }, - }, - { /* Allow interrupts during suspend on Dell Latitude laptops*/ - set_apm_ints, "Dell Latitude", - { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), } - }, - { /* APM crashes */ - apm_is_horked, "Dell Inspiron 2500", - { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), - DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, - }, - { /* Allow interrupts during suspend on Dell Inspiron laptops*/ - set_apm_ints, "Dell Inspiron", { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), }, - }, - { /* Handle problems with APM on Inspiron 5000e */ - broken_apm_power, "Dell Inspiron 5000e", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "A04"), - DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), }, - }, - { /* Handle problems with APM on Inspiron 2500 */ - broken_apm_power, "Dell Inspiron 2500", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "A12"), - DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), }, - }, - { 
/* APM crashes */ - apm_is_horked, "Dell Dimension 4100", - { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), - DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, - }, - { /* Allow interrupts during suspend on Compaq Laptops*/ - set_apm_ints, "Compaq 12XL125", - { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), - DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, - }, - { /* Allow interrupts during APM or the clock goes slow */ - set_apm_ints, "ASUSTeK", - { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), - DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), }, - }, - { /* APM blows on shutdown */ - apm_is_horked, "ABIT KX7-333[R]", - { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"), - DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), }, - }, - { /* APM crashes */ - apm_is_horked, "Trigem Delhi3", - { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"), - DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), }, - }, - { /* APM crashes */ - apm_is_horked, "Fujitsu-Siemens", - { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"), - DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), }, - }, - { /* APM crashes */ - apm_is_horked_d850md, "Intel D850MD", - { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), - DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), }, - }, - { /* APM crashes */ - apm_is_horked, "Intel D810EMO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), - DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), }, - }, - { /* APM crashes */ - apm_is_horked, "Dell XPS-Z", - { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), - DMI_MATCH(DMI_BIOS_VERSION, "A11"), - DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), }, - }, - { /* APM crashes */ - apm_is_horked, "Sharp PC-PJ/AX", - { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), - DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), - DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), - DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, - }, - { /* APM crashes */ - apm_is_horked, "Dell Inspiron 2500", - { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), - DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), - DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, - }, - { /* APM idle hangs */ - apm_likes_to_melt, "Jabil AMD", - { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), - DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), }, - }, - { /* APM idle hangs */ - apm_likes_to_melt, "AMI Bios", - { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), - DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0206H"), - DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-N505VX */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"), - DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-XG29 */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"), - DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies 
LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"), - DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"), - DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"), - DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"), - DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"), - DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"), - DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-F104K */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"), - DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), }, - }, - - { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"), - DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-C1VE */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"), - DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), }, - }, - { /* Handle problems with APM on Sony Vaio PCG-C1VE */ - swab_apm_power_in_minutes, "Sony VAIO", - { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"), - DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), }, - }, - { /* broken PM poweroff bios */ - set_realmode_power_off, "Award Software v4.60 PGMA", - { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."), - DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"), - DMI_MATCH(DMI_BIOS_DATE, "134526184"), }, - }, - - /* Generic per vendor APM settings */ - - { /* Allow interrupts during suspend on IBM laptops */ - set_apm_ints, "IBM", - { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, - }, - - { } -}; - -/* - * Just start the APM thread. We do NOT want to do APM BIOS - * calls from anything but the APM thread, if for no other reason - * than the fact that we don't trust the APM BIOS. This way, - * most common APM BIOS problems that lead to protection errors - * etc will have at least some level of being contained... - * - * In short, if something bad happens, at least we have a choice - * of just killing the apm thread.. 
- */ -static int __init apm_init(void) -{ - struct proc_dir_entry *apm_proc; - int ret; - int i; - - dmi_check_system(apm_dmi_table); - - if (apm_info.bios.version == 0) { - printk(KERN_INFO "apm: BIOS not found.\n"); - return -ENODEV; - } - printk(KERN_INFO - "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", - ((apm_info.bios.version >> 8) & 0xff), - (apm_info.bios.version & 0xff), - apm_info.bios.flags, - driver_version); - if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { - printk(KERN_INFO "apm: no 32 bit BIOS support\n"); - return -ENODEV; - } - - if (allow_ints) - apm_info.allow_ints = 1; - if (broken_psr) - apm_info.get_power_status_broken = 1; - if (realmode_power_off) - apm_info.realmode_power_off = 1; - /* User can override, but default is to trust DMI */ - if (apm_disabled != -1) - apm_info.disabled = apm_disabled; - - /* - * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1 - * but is reportedly a 1.0 BIOS. - */ - if (apm_info.bios.version == 0x001) - apm_info.bios.version = 0x100; - - /* BIOS < 1.2 doesn't set cseg_16_len */ - if (apm_info.bios.version < 0x102) - apm_info.bios.cseg_16_len = 0; /* 64k */ - - if (debug) { - printk(KERN_INFO "apm: entry %x:%lx cseg16 %x dseg %x", - apm_info.bios.cseg, apm_info.bios.offset, - apm_info.bios.cseg_16, apm_info.bios.dseg); - if (apm_info.bios.version > 0x100) - printk(" cseg len %x, dseg len %x", - apm_info.bios.cseg_len, - apm_info.bios.dseg_len); - if (apm_info.bios.version > 0x101) - printk(" cseg16 len %x", apm_info.bios.cseg_16_len); - printk("\n"); - } - - if (apm_info.disabled) { - printk(KERN_NOTICE "apm: disabled on user request.\n"); - return -ENODEV; - } - if ((num_online_cpus() > 1) && !power_off && !smp) { - printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); - apm_info.disabled = 1; - return -ENODEV; - } - if (PM_IS_ACTIVE()) { - printk(KERN_NOTICE "apm: overridden by ACPI.\n"); - apm_info.disabled = 1; - return -ENODEV; - } - pm_active = 1; - - /* - * Set up a segment that references the real mode segment 0x40 - * that extends up to the end of page zero (that we have reserved). - * This is for buggy BIOS's that refer to (real mode) segment 0x40 - * even though they are called in protected mode. - */ - set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); - _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); - - apm_bios_entry.offset = apm_info.bios.offset; - apm_bios_entry.segment = APM_CS; - - for (i = 0; i < NR_CPUS; i++) { - set_base(get_cpu_gdt_table(i)[APM_CS >> 3], - __va((unsigned long)apm_info.bios.cseg << 4)); - set_base(get_cpu_gdt_table(i)[APM_CS_16 >> 3], - __va((unsigned long)apm_info.bios.cseg_16 << 4)); - set_base(get_cpu_gdt_table(i)[APM_DS >> 3], - __va((unsigned long)apm_info.bios.dseg << 4)); -#ifndef APM_RELAX_SEGMENTS - if (apm_info.bios.version == 0x100) { -#endif - /* For ASUS motherboard, Award BIOS rev 110 (and others?) */ - _set_limit((char *)&get_cpu_gdt_table(i)[APM_CS >> 3], 64 * 1024 - 1); - /* For some unknown machine. */ - _set_limit((char *)&get_cpu_gdt_table(i)[APM_CS_16 >> 3], 64 * 1024 - 1); - /* For the DEC Hinote Ultra CT475 (and others?) 
*/ - _set_limit((char *)&get_cpu_gdt_table(i)[APM_DS >> 3], 64 * 1024 - 1); -#ifndef APM_RELAX_SEGMENTS - } else { - _set_limit((char *)&get_cpu_gdt_table(i)[APM_CS >> 3], - (apm_info.bios.cseg_len - 1) & 0xffff); - _set_limit((char *)&get_cpu_gdt_table(i)[APM_CS_16 >> 3], - (apm_info.bios.cseg_16_len - 1) & 0xffff); - _set_limit((char *)&get_cpu_gdt_table(i)[APM_DS >> 3], - (apm_info.bios.dseg_len - 1) & 0xffff); - /* workaround for broken BIOSes */ - if (apm_info.bios.cseg_len <= apm_info.bios.offset) - _set_limit((char *)&get_cpu_gdt_table(i)[APM_CS >> 3], 64 * 1024 -1); - if (apm_info.bios.dseg_len <= 0x40) { /* 0x40 * 4kB == 64kB */ - /* for the BIOS that assumes granularity = 1 */ - get_cpu_gdt_table(i)[APM_DS >> 3].b |= 0x800000; - printk(KERN_NOTICE "apm: we set the granularity of dseg.\n"); - } - } -#endif - } - - apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info); - if (apm_proc) - apm_proc->owner = THIS_MODULE; - - ret = kernel_thread(apm, NULL, CLONE_KERNEL | SIGCHLD); - if (ret < 0) { - printk(KERN_ERR "apm: disabled - Unable to start kernel thread.\n"); - return -ENOMEM; - } - - if (num_online_cpus() > 1 && !smp ) { - printk(KERN_NOTICE - "apm: disabled - APM is not SMP safe (power off active).\n"); - return 0; - } - - misc_register(&apm_device); - - if (HZ != 100) - idle_period = (idle_period * HZ) / 100; - if (idle_threshold < 100) { - original_pm_idle = pm_idle; - pm_idle = apm_cpu_idle; - set_pm_idle = 1; - } - - return 0; -} - -static void __exit apm_exit(void) -{ - int error; - - if (set_pm_idle) { - pm_idle = original_pm_idle; - /* - * We are about to unload the current idle thread pm callback - * (pm_idle), Wait for all processors to update cached/local - * copies of pm_idle before proceeding. - */ - cpu_idle_wait(); - } - if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) - && (apm_info.connection_version > 0x0100)) { - error = apm_engage_power_management(APM_DEVICE_ALL, 0); - if (error) - apm_error("disengage power management", error); - } - misc_deregister(&apm_device); - remove_proc_entry("apm", NULL); - if (power_off) - pm_power_off = NULL; - exit_kapmd = 1; - while (kapmd_running) - schedule(); - pm_active = 0; -} - -module_init(apm_init); -module_exit(apm_exit); - -MODULE_AUTHOR("Stephen Rothwell"); -MODULE_DESCRIPTION("Advanced Power Management"); -MODULE_LICENSE("GPL"); -module_param(debug, bool, 0644); -MODULE_PARM_DESC(debug, "Enable debug mode"); -module_param(power_off, bool, 0444); -MODULE_PARM_DESC(power_off, "Enable power off"); -module_param(bounce_interval, int, 0444); -MODULE_PARM_DESC(bounce_interval, - "Set the number of ticks to ignore suspend bounces"); -module_param(allow_ints, bool, 0444); -MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls"); -module_param(broken_psr, bool, 0444); -MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call"); -module_param(realmode_power_off, bool, 0444); -MODULE_PARM_DESC(realmode_power_off, - "Switch to real mode before powering off"); -module_param(idle_threshold, int, 0444); -MODULE_PARM_DESC(idle_threshold, - "System idle percentage above which to make APM BIOS idle calls"); -module_param(idle_period, int, 0444); -MODULE_PARM_DESC(idle_period, - "Period (in sec/100) over which to caculate the idle percentage"); -module_param(smp, bool, 0444); -MODULE_PARM_DESC(smp, - "Set this to enable APM use on an SMP platform. 
Use with caution on older systems"); -MODULE_ALIAS_MISCDEV(APM_MINOR_DEV); diff -r d609de73b9fa -r 5a63f675107c linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/io_ports.h --- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/io_ports.h Wed Feb 1 17:06:16 2006 +++ /dev/null Wed Feb 1 18:00:19 2006 @@ -1,30 +0,0 @@ -/* - * arch/i386/mach-generic/io_ports.h - * - * Machine specific IO port address definition for generic. - * Written by Osamu Tomita <tomita@xxxxxxxxxxx> - */ -#ifndef _MACH_IO_PORTS_H -#define _MACH_IO_PORTS_H - -/* i8253A PIT registers */ -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 -#define PIT_CH2 0x42 - -/* i8259A PIC registers */ -#define PIC_MASTER_CMD 0x20 -#define PIC_MASTER_IMR 0x21 -#define PIC_MASTER_ISR PIC_MASTER_CMD -#define PIC_MASTER_POLL PIC_MASTER_ISR -#define PIC_MASTER_OCW3 PIC_MASTER_ISR -#define PIC_SLAVE_CMD 0xa0 -#define PIC_SLAVE_IMR 0xa1 - -/* i8259A PIC related value */ -#define PIC_CASCADE_IR 2 -#define MASTER_ICW4_DEFAULT 0x01 -#define SLAVE_ICW4_DEFAULT 0x01 -#define PIC_ICW4_AEOI 2 - -#endif /* !_MACH_IO_PORTS_H */ _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
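
For readers skimming the removed apm.c above: the idle-call decision in apm_cpu_idle() boils down to a percentage test. Over a sampling window the driver measures how many jiffies were spent idle and only issues APM BIOS idle calls when that fraction exceeds a threshold. The following user-space C sketch is an editor's illustration, not part of this changeset; the constants and sample inputs are hypothetical stand-ins for the driver's idle_period/idle_threshold parameters, and it mirrors only the percentage test.

#include <stdio.h>

#define IDLE_PERIOD      400   /* jiffies between re-evaluations (sketch value) */
#define IDLE_THRESHOLD    95   /* idle percentage above which BIOS idle is used */

/* Return non-zero when the measured idle percentage exceeds the threshold. */
static int use_apm_idle(unsigned int jiffies_elapsed, unsigned int idle_jiffies)
{
	unsigned int idle_percentage;

	if (jiffies_elapsed < IDLE_PERIOD)
		return 0;	/* window too short to judge */

	idle_percentage = idle_jiffies * 100 / jiffies_elapsed;
	return idle_percentage > IDLE_THRESHOLD;
}

int main(void)
{
	/* Hypothetical (elapsed, idle) jiffy samples. */
	static const unsigned int samples[][2] = {
		{ 500, 490 }, { 500, 300 }, { 1000, 990 },
	};
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("elapsed=%u idle=%u -> %s\n",
		       samples[i][0], samples[i][1],
		       use_apm_idle(samples[i][0], samples[i][1]) ?
		       "call APM BIOS idle" : "use default idle");
	return 0;
}

In the real driver the measurement comes from the idle task's stime, and a leaky bucket keeps BIOS idle calls going across short bursts of activity; the sketch keeps only the threshold decision.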
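
The long comment in apm_get_info() above documents the nine space-separated fields written to /proc/apm. As a companion, here is a small user-space reader; again an editor's sketch rather than part of the changeset, it assumes a kernel with the APM driver loaded and simply follows the printf format string used by the driver.

#include <stdio.h>

int main(void)
{
	char drv[16], units[8];
	int maj, min, pct, time_units;
	unsigned int flags, ac, bat_status, bat_flag;
	FILE *f = fopen("/proc/apm", "r");

	if (f == NULL) {
		perror("/proc/apm");	/* no APM driver: nothing to read */
		return 1;
	}
	if (fscanf(f, "%15s %d.%d %x %x %x %x %d%% %d %7s",
		   drv, &maj, &min, &flags, &ac, &bat_status,
		   &bat_flag, &pct, &time_units, units) == 10)
		printf("APM %d.%d: AC 0x%02x, battery %d%%, %d %s remaining\n",
		       maj, min, ac, pct, time_units, units);
	else
		fprintf(stderr, "unexpected /proc/apm format\n");
	fclose(f);
	return 0;
}

The %x conversions accept the 0x-prefixed flag bytes as printed by the driver, and a remaining-time value of -1 means the BIOS did not report an estimate.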