[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] [linux-2.6.18-xen] Imported linux-2.6-xen-sparse from xen-unstable.hg 15200:bd3d6b4c52ec
# HG changeset patch # User Ian Campbell <ian.campbell@xxxxxxxxxxxxx> # Date 1180947928 -3600 # Node ID a533be77c572209cd133734bab614a102a61ab75 # Parent 42f6970e18c9d339a78789e1a3e50b4cb948d99a Imported linux-2.6-xen-sparse from xen-unstable.hg 15200:bd3d6b4c52ec --- arch/i386/Kconfig | 85 arch/i386/Kconfig.cpu | 6 arch/i386/Kconfig.debug | 1 arch/i386/Makefile | 24 arch/i386/boot-xen/Makefile | 19 arch/i386/kernel/Makefile | 22 arch/i386/kernel/acpi/Makefile | 4 arch/i386/kernel/acpi/boot-xen.c | 1168 ++++++++ arch/i386/kernel/apic-xen.c | 155 + arch/i386/kernel/asm-offsets.c | 7 arch/i386/kernel/cpu/Makefile | 5 arch/i386/kernel/cpu/common-xen.c | 743 +++++ arch/i386/kernel/cpu/mtrr/Makefile | 7 arch/i386/kernel/cpu/mtrr/main-xen.c | 197 + arch/i386/kernel/crash.c | 4 arch/i386/kernel/early_printk-xen.c | 2 arch/i386/kernel/entry-xen.S | 1216 ++++++++ arch/i386/kernel/fixup.c | 88 arch/i386/kernel/head-xen.S | 207 + arch/i386/kernel/init_task-xen.c | 51 arch/i386/kernel/io_apic-xen.c | 2777 ++++++++++++++++++++ arch/i386/kernel/ioport-xen.c | 122 arch/i386/kernel/irq-xen.c | 324 ++ arch/i386/kernel/ldt-xen.c | 270 + arch/i386/kernel/machine_kexec.c | 40 arch/i386/kernel/microcode-xen.c | 144 + arch/i386/kernel/mpparse-xen.c | 1185 ++++++++ arch/i386/kernel/pci-dma-xen.c | 378 ++ arch/i386/kernel/process-xen.c | 853 ++++++ arch/i386/kernel/quirks-xen.c | 47 arch/i386/kernel/setup-xen.c | 1898 +++++++++++++ arch/i386/kernel/smp-xen.c | 624 ++++ arch/i386/kernel/swiotlb.c | 726 +++++ arch/i386/kernel/sysenter.c | 18 arch/i386/kernel/time-xen.c | 1143 ++++++++ arch/i386/kernel/traps-xen.c | 1186 ++++++++ arch/i386/kernel/vm86.c | 12 arch/i386/kernel/vsyscall-note-xen.S | 32 arch/i386/mach-xen/Makefile | 5 arch/i386/mach-xen/setup.c | 147 + arch/i386/mm/Makefile | 8 arch/i386/mm/fault-xen.c | 769 +++++ arch/i386/mm/highmem-xen.c | 136 arch/i386/mm/hypervisor.c | 432 +++ arch/i386/mm/init-xen.c | 850 ++++++ arch/i386/mm/ioremap-xen.c | 443 +++ arch/i386/mm/pgtable-xen.c | 727 +++++ arch/i386/oprofile/Makefile | 7 arch/i386/oprofile/xenoprof.c | 179 + arch/i386/pci/Makefile | 9 arch/i386/pci/irq-xen.c | 1205 ++++++++ arch/i386/pci/pcifront.c | 55 arch/i386/power/Makefile | 4 arch/ia64/Kconfig | 57 arch/ia64/Makefile | 12 arch/ia64/hp/common/sba_iommu.c | 61 arch/ia64/kernel/acpi.c | 6 arch/ia64/kernel/asm-offsets.c | 25 arch/ia64/kernel/entry.S | 36 arch/ia64/kernel/fsys.S | 41 arch/ia64/kernel/gate.S | 103 arch/ia64/kernel/gate.lds.S | 14 arch/ia64/kernel/head.S | 6 arch/ia64/kernel/iosapic.c | 75 arch/ia64/kernel/irq_ia64.c | 356 ++ arch/ia64/kernel/pal.S | 5 arch/ia64/kernel/patch.c | 67 arch/ia64/kernel/perfmon.c | 89 arch/ia64/kernel/setup.c | 135 arch/ia64/kernel/time.c | 194 + arch/ia64/mm/ioremap.c | 3 arch/ia64/oprofile/Makefile | 4 arch/ia64/oprofile/init.c | 14 arch/ia64/oprofile/oprofile_perfmon.h | 28 arch/ia64/oprofile/perfmon.c | 35 arch/ia64/oprofile/xenoprof.c | 142 + arch/ia64/pci/pci.c | 16 arch/ia64/xen/Makefile | 9 arch/ia64/xen/hypercall.S | 170 + arch/ia64/xen/hypervisor.c | 1264 +++++++++ arch/ia64/xen/machvec.c | 4 arch/ia64/xen/mem.c | 75 arch/ia64/xen/swiotlb.c | 882 ++++++ arch/ia64/xen/util.c | 105 arch/ia64/xen/xcom_hcall.c | 397 ++ arch/ia64/xen/xcom_mini.c | 469 +++ arch/ia64/xen/xcom_privcmd.c | 673 ++++ arch/ia64/xen/xen_dma.c | 145 + arch/ia64/xen/xencomm.c | 263 + arch/ia64/xen/xenentry.S | 931 ++++++ arch/ia64/xen/xenhpski.c | 19 arch/ia64/xen/xenivt.S | 2177 +++++++++++++++ arch/ia64/xen/xenminstate.h | 358 ++ arch/ia64/xen/xenpal.S | 85 arch/ia64/xen/xensetup.S | 52 arch/um/kernel/physmem.c | 4 arch/x86_64/Kconfig | 68 arch/x86_64/Makefile | 20 arch/x86_64/ia32/Makefile | 20 arch/x86_64/ia32/ia32entry-xen.S | 743 +++++ arch/x86_64/ia32/syscall32-xen.c | 128 arch/x86_64/ia32/syscall32_syscall-xen.S | 28 arch/x86_64/ia32/vsyscall-int80.S | 58 arch/x86_64/ia32/vsyscall-sigreturn.S | 2 arch/x86_64/kernel/Makefile | 19 arch/x86_64/kernel/acpi/Makefile | 1 arch/x86_64/kernel/apic-xen.c | 197 + arch/x86_64/kernel/asm-offsets.c | 2 arch/x86_64/kernel/crash.c | 6 arch/x86_64/kernel/e820-xen.c | 774 +++++ arch/x86_64/kernel/early_printk-xen.c | 302 ++ arch/x86_64/kernel/entry-xen.S | 1325 +++++++++ arch/x86_64/kernel/genapic-xen.c | 143 + arch/x86_64/kernel/genapic_xen.c | 161 + arch/x86_64/kernel/head-xen.S | 203 + arch/x86_64/kernel/head64-xen.c | 162 + arch/x86_64/kernel/init_task.c | 3 arch/x86_64/kernel/io_apic-xen.c | 2269 ++++++++++++++++ arch/x86_64/kernel/ioport-xen.c | 99 arch/x86_64/kernel/irq-xen.c | 197 + arch/x86_64/kernel/ldt-xen.c | 282 ++ arch/x86_64/kernel/machine_kexec.c | 118 arch/x86_64/kernel/mpparse-xen.c | 1011 +++++++ arch/x86_64/kernel/pci-swiotlb-xen.c | 55 arch/x86_64/kernel/process-xen.c | 829 +++++ arch/x86_64/kernel/setup-xen.c | 1677 ++++++++++++ arch/x86_64/kernel/setup64-xen.c | 361 ++ arch/x86_64/kernel/smp-xen.c | 600 ++++ arch/x86_64/kernel/traps-xen.c | 1175 ++++++++ arch/x86_64/kernel/vsyscall-xen.c | 239 + arch/x86_64/kernel/xen_entry.S | 40 arch/x86_64/mm/Makefile | 10 arch/x86_64/mm/fault-xen.c | 724 +++++ arch/x86_64/mm/init-xen.c | 1238 ++++++++ arch/x86_64/mm/pageattr-xen.c | 433 +++ arch/x86_64/oprofile/Makefile | 10 arch/x86_64/pci/Makefile | 12 drivers/Makefile | 1 drivers/acpi/Kconfig | 3 drivers/char/mem.c | 6 drivers/char/tpm/Kconfig | 10 drivers/char/tpm/Makefile | 2 drivers/char/tpm/tpm.h | 15 drivers/char/tpm/tpm_vtpm.c | 542 +++ drivers/char/tpm/tpm_vtpm.h | 55 drivers/char/tpm/tpm_xen.c | 720 +++++ drivers/char/tty_io.c | 7 drivers/firmware/Kconfig | 1 drivers/pci/Kconfig | 1 drivers/serial/Kconfig | 1 drivers/video/console/Kconfig | 1 drivers/xen/Kconfig | 260 + drivers/xen/Makefile | 20 drivers/xen/balloon/Makefile | 2 drivers/xen/balloon/balloon.c | 663 ++++ drivers/xen/balloon/common.h | 58 drivers/xen/balloon/sysfs.c | 170 + drivers/xen/blkback/Makefile | 3 drivers/xen/blkback/blkback.c | 614 ++++ drivers/xen/blkback/common.h | 139 + drivers/xen/blkback/interface.c | 181 + drivers/xen/blkback/vbd.c | 118 drivers/xen/blkback/xenbus.c | 533 +++ drivers/xen/blkfront/Makefile | 5 drivers/xen/blkfront/blkfront.c | 902 ++++++ drivers/xen/blkfront/block.h | 142 + drivers/xen/blkfront/vbd.c | 372 ++ drivers/xen/blktap/Makefile | 5 drivers/xen/blktap/blktap.c | 1641 +++++++++++ drivers/xen/blktap/common.h | 121 drivers/xen/blktap/interface.c | 174 + drivers/xen/blktap/xenbus.c | 477 +++ drivers/xen/char/Makefile | 2 drivers/xen/char/mem.c | 203 + drivers/xen/console/Makefile | 2 drivers/xen/console/console.c | 721 +++++ drivers/xen/console/xencons_ring.c | 143 + drivers/xen/core/Makefile | 12 drivers/xen/core/cpu_hotplug.c | 172 + drivers/xen/core/evtchn.c | 1015 +++++++ drivers/xen/core/features.c | 34 drivers/xen/core/gnttab.c | 753 +++++ drivers/xen/core/hypervisor_sysfs.c | 59 drivers/xen/core/machine_kexec.c | 189 + drivers/xen/core/machine_reboot.c | 241 + drivers/xen/core/reboot.c | 249 + drivers/xen/core/smpboot.c | 452 +++ drivers/xen/core/xen_proc.c | 23 drivers/xen/core/xen_sysfs.c | 378 ++ drivers/xen/evtchn/Makefile | 2 drivers/xen/evtchn/evtchn.c | 469 +++ drivers/xen/fbfront/Makefile | 2 drivers/xen/fbfront/xenfb.c | 752 +++++ drivers/xen/fbfront/xenkbd.c | 333 ++ drivers/xen/gntdev/Makefile | 1 drivers/xen/gntdev/gntdev.c | 973 +++++++ drivers/xen/netback/Makefile | 5 drivers/xen/netback/common.h | 165 + drivers/xen/netback/interface.c | 336 ++ drivers/xen/netback/loopback.c | 320 ++ drivers/xen/netback/netback.c | 1606 +++++++++++ drivers/xen/netback/xenbus.c | 453 +++ drivers/xen/netfront/Makefile | 4 drivers/xen/netfront/netfront.c | 2133 +++++++++++++++ drivers/xen/pciback/Makefile | 15 drivers/xen/pciback/conf_space.c | 426 +++ drivers/xen/pciback/conf_space.h | 126 drivers/xen/pciback/conf_space_capability.c | 71 drivers/xen/pciback/conf_space_capability.h | 23 drivers/xen/pciback/conf_space_capability_pm.c | 128 drivers/xen/pciback/conf_space_capability_vpd.c | 42 drivers/xen/pciback/conf_space_header.c | 309 ++ drivers/xen/pciback/conf_space_quirks.c | 126 drivers/xen/pciback/conf_space_quirks.h | 35 drivers/xen/pciback/passthrough.c | 157 + drivers/xen/pciback/pci_stub.c | 929 ++++++ drivers/xen/pciback/pciback.h | 93 drivers/xen/pciback/pciback_ops.c | 95 drivers/xen/pciback/slot.c | 151 + drivers/xen/pciback/vpci.c | 204 + drivers/xen/pciback/xenbus.c | 454 +++ drivers/xen/pcifront/Makefile | 7 drivers/xen/pcifront/pci.c | 46 drivers/xen/pcifront/pci_op.c | 268 + drivers/xen/pcifront/pcifront.h | 40 drivers/xen/pcifront/xenbus.c | 295 ++ drivers/xen/privcmd/Makefile | 2 drivers/xen/privcmd/privcmd.c | 284 ++ drivers/xen/tpmback/Makefile | 4 drivers/xen/tpmback/common.h | 85 drivers/xen/tpmback/interface.c | 167 + drivers/xen/tpmback/tpmback.c | 944 ++++++ drivers/xen/tpmback/xenbus.c | 289 ++ drivers/xen/util.c | 70 drivers/xen/xenbus/Makefile | 9 drivers/xen/xenbus/xenbus_backend_client.c | 147 + drivers/xen/xenbus/xenbus_client.c | 283 ++ drivers/xen/xenbus/xenbus_comms.c | 232 + drivers/xen/xenbus/xenbus_comms.h | 46 drivers/xen/xenbus/xenbus_dev.c | 404 ++ drivers/xen/xenbus/xenbus_probe.c | 1087 +++++++ drivers/xen/xenbus/xenbus_probe.h | 75 drivers/xen/xenbus/xenbus_probe_backend.c | 287 ++ drivers/xen/xenbus/xenbus_xs.c | 880 ++++++ drivers/xen/xenoprof/xenoprofile.c | 500 +++ fs/Kconfig | 1 include/asm-i386/apic.h | 2 include/asm-i386/kexec.h | 14 include/asm-i386/mach-xen/asm/agp.h | 37 include/asm-i386/mach-xen/asm/desc.h | 164 + include/asm-i386/mach-xen/asm/dma-mapping.h | 157 + include/asm-i386/mach-xen/asm/fixmap.h | 155 + include/asm-i386/mach-xen/asm/floppy.h | 147 + include/asm-i386/mach-xen/asm/highmem.h | 80 include/asm-i386/mach-xen/asm/hw_irq.h | 72 include/asm-i386/mach-xen/asm/hypercall.h | 407 ++ include/asm-i386/mach-xen/asm/hypervisor.h | 258 + include/asm-i386/mach-xen/asm/io.h | 390 ++ include/asm-i386/mach-xen/asm/irqflags.h | 127 include/asm-i386/mach-xen/asm/maddr.h | 193 + include/asm-i386/mach-xen/asm/mmu.h | 29 include/asm-i386/mach-xen/asm/mmu_context.h | 108 include/asm-i386/mach-xen/asm/page.h | 235 + include/asm-i386/mach-xen/asm/param.h | 23 include/asm-i386/mach-xen/asm/pci.h | 146 + include/asm-i386/mach-xen/asm/pgalloc.h | 59 include/asm-i386/mach-xen/asm/pgtable-2level-defs.h | 20 include/asm-i386/mach-xen/asm/pgtable-2level.h | 120 include/asm-i386/mach-xen/asm/pgtable-3level-defs.h | 24 include/asm-i386/mach-xen/asm/pgtable-3level.h | 209 + include/asm-i386/mach-xen/asm/pgtable.h | 531 +++ include/asm-i386/mach-xen/asm/processor.h | 741 +++++ include/asm-i386/mach-xen/asm/ptrace.h | 90 include/asm-i386/mach-xen/asm/scatterlist.h | 22 include/asm-i386/mach-xen/asm/segment.h | 117 include/asm-i386/mach-xen/asm/setup.h | 81 include/asm-i386/mach-xen/asm/smp.h | 103 include/asm-i386/mach-xen/asm/spinlock.h | 202 + include/asm-i386/mach-xen/asm/swiotlb.h | 43 include/asm-i386/mach-xen/asm/synch_bitops.h | 145 + include/asm-i386/mach-xen/asm/system.h | 488 +++ include/asm-i386/mach-xen/asm/tlbflush.h | 101 include/asm-i386/mach-xen/asm/vga.h | 20 include/asm-i386/mach-xen/asm/xenoprof.h | 48 include/asm-i386/mach-xen/irq_vectors.h | 125 include/asm-i386/mach-xen/mach_traps.h | 33 include/asm-i386/mach-xen/setup_arch.h | 5 include/asm-ia64/agp.h | 31 include/asm-ia64/dma-mapping.h | 23 include/asm-ia64/gcc_intrin.h | 68 include/asm-ia64/hw_irq.h | 10 include/asm-ia64/hypercall.h | 416 ++ include/asm-ia64/hypervisor.h | 226 + include/asm-ia64/intel_intrin.h | 66 include/asm-ia64/io.h | 37 include/asm-ia64/iosapic.h | 2 include/asm-ia64/irq.h | 33 include/asm-ia64/machvec.h | 2 include/asm-ia64/machvec_xen.h | 43 include/asm-ia64/maddr.h | 107 include/asm-ia64/meminit.h | 5 include/asm-ia64/page.h | 23 include/asm-ia64/pal.h | 1 include/asm-ia64/pgalloc.h | 4 include/asm-ia64/privop.h | 60 include/asm-ia64/processor.h | 1 include/asm-ia64/sal.h | 21 include/asm-ia64/swiotlb.h | 41 include/asm-ia64/synch_bitops.h | 61 include/asm-ia64/system.h | 4 include/asm-ia64/uaccess.h | 20 include/asm-ia64/xen/privop.h | 310 ++ include/asm-ia64/xen/xcom_hcall.h | 88 include/asm-ia64/xen/xencomm.h | 60 include/asm-ia64/xenoprof.h | 48 include/asm-um/page.h | 2 include/asm-x86_64/apic.h | 2 include/asm-x86_64/kexec.h | 13 include/asm-x86_64/mach-xen/asm/agp.h | 35 include/asm-x86_64/mach-xen/asm/arch_hooks.h | 27 include/asm-x86_64/mach-xen/asm/bootsetup.h | 42 include/asm-x86_64/mach-xen/asm/desc.h | 263 + include/asm-x86_64/mach-xen/asm/dma-mapping.h | 207 + include/asm-x86_64/mach-xen/asm/e820.h | 66 include/asm-x86_64/mach-xen/asm/fixmap.h | 112 include/asm-x86_64/mach-xen/asm/floppy.h | 206 + include/asm-x86_64/mach-xen/asm/hw_irq.h | 136 include/asm-x86_64/mach-xen/asm/hypercall.h | 406 ++ include/asm-x86_64/mach-xen/asm/hypervisor.h | 2 include/asm-x86_64/mach-xen/asm/io.h | 330 ++ include/asm-x86_64/mach-xen/asm/irq.h | 38 include/asm-x86_64/mach-xen/asm/irqflags.h | 139 + include/asm-x86_64/mach-xen/asm/maddr.h | 161 + include/asm-x86_64/mach-xen/asm/mmu.h | 38 include/asm-x86_64/mach-xen/asm/mmu_context.h | 136 include/asm-x86_64/mach-xen/asm/msr.h | 399 ++ include/asm-x86_64/mach-xen/asm/nmi.h | 93 include/asm-x86_64/mach-xen/asm/page.h | 217 + include/asm-x86_64/mach-xen/asm/pci.h | 166 + include/asm-x86_64/mach-xen/asm/pgalloc.h | 204 + include/asm-x86_64/mach-xen/asm/pgtable.h | 575 ++++ include/asm-x86_64/mach-xen/asm/processor.h | 506 +++ include/asm-x86_64/mach-xen/asm/ptrace.h | 127 include/asm-x86_64/mach-xen/asm/smp.h | 150 + include/asm-x86_64/mach-xen/asm/synch_bitops.h | 2 include/asm-x86_64/mach-xen/asm/system.h | 262 + include/asm-x86_64/mach-xen/asm/timer.h | 67 include/asm-x86_64/mach-xen/asm/tlbflush.h | 103 include/asm-x86_64/mach-xen/asm/vga.h | 20 include/asm-x86_64/mach-xen/asm/xenoprof.h | 1 include/asm-x86_64/mach-xen/asm/xor.h | 328 ++ include/asm-x86_64/mach-xen/irq_vectors.h | 123 include/asm-x86_64/mach-xen/mach_time.h | 111 include/asm-x86_64/mach-xen/mach_timer.h | 50 include/asm-x86_64/mach-xen/setup_arch_post.h | 63 include/asm-x86_64/mach-xen/setup_arch_pre.h | 5 include/linux/gfp.h | 6 include/linux/highmem.h | 6 include/linux/interrupt.h | 6 include/linux/kexec.h | 13 include/linux/mm.h | 14 include/linux/page-flags.h | 14 include/linux/skbuff.h | 8 include/xen/balloon.h | 57 include/xen/blkif.h | 97 include/xen/cpu_hotplug.h | 44 include/xen/driver_util.h | 14 include/xen/evtchn.h | 126 include/xen/features.h | 20 include/xen/gnttab.h | 167 + include/xen/hvm.h | 23 include/xen/hypercall.h | 24 include/xen/hypervisor_sysfs.h | 32 include/xen/interface/COPYING | 38 include/xen/interface/acm.h | 228 + include/xen/interface/acm_ops.h | 159 + include/xen/interface/arch-ia64.h | 628 ++++ include/xen/interface/arch-powerpc.h | 125 include/xen/interface/arch-x86/xen-x86_32.h | 168 + include/xen/interface/arch-x86/xen-x86_64.h | 211 + include/xen/interface/arch-x86/xen.h | 204 + include/xen/interface/arch-x86_32.h | 27 include/xen/interface/arch-x86_64.h | 27 include/xen/interface/callback.h | 92 include/xen/interface/dom0_ops.h | 120 include/xen/interface/domctl.h | 478 +++ include/xen/interface/elfnote.h | 233 + include/xen/interface/elfstructs.h | 527 +++ include/xen/interface/event_channel.h | 264 + include/xen/interface/features.h | 71 include/xen/interface/foreign/Makefile | 37 include/xen/interface/foreign/mkchecker.py | 58 include/xen/interface/foreign/mkheader.py | 167 + include/xen/interface/foreign/reference.size | 18 include/xen/interface/foreign/structs.py | 58 include/xen/interface/grant_table.h | 422 +++ include/xen/interface/hvm/e820.h | 47 include/xen/interface/hvm/hvm_info_table.h | 41 include/xen/interface/hvm/hvm_op.h | 73 include/xen/interface/hvm/ioreq.h | 122 include/xen/interface/hvm/params.h | 60 include/xen/interface/hvm/save.h | 462 +++ include/xen/interface/hvm/vmx_assist.h | 116 include/xen/interface/io/blkif.h | 128 include/xen/interface/io/console.h | 51 include/xen/interface/io/fbif.h | 138 include/xen/interface/io/kbdif.h | 130 include/xen/interface/io/netif.h | 184 + include/xen/interface/io/pciif.h | 83 include/xen/interface/io/protocols.h | 21 include/xen/interface/io/ring.h | 299 ++ include/xen/interface/io/tpmif.h | 77 include/xen/interface/io/xenbus.h | 73 include/xen/interface/io/xs_wire.h | 117 include/xen/interface/kexec.h | 137 include/xen/interface/libelf.h | 241 + include/xen/interface/memory.h | 281 ++ include/xen/interface/nmi.h | 78 include/xen/interface/physdev.h | 169 + include/xen/interface/platform.h | 143 + include/xen/interface/sched.h | 121 include/xen/interface/sysctl.h | 182 + include/xen/interface/trace.h | 119 include/xen/interface/vcpu.h | 192 + include/xen/interface/version.h | 91 include/xen/interface/xen-compat.h | 51 include/xen/interface/xen.h | 610 ++++ include/xen/interface/xencomm.h | 41 include/xen/interface/xenoprof.h | 132 include/xen/pcifront.h | 77 include/xen/public/evtchn.h | 88 include/xen/public/gntdev.h | 105 include/xen/public/privcmd.h | 79 include/xen/xen_proc.h | 12 include/xen/xenbus.h | 302 ++ include/xen/xencons.h | 19 include/xen/xenoprof.h | 42 kernel/Kconfig.preempt | 1 kernel/fork.c | 3 kernel/irq/spurious.c | 3 kernel/kexec.c | 73 lib/Makefile | 6 mm/Kconfig | 3 mm/highmem.c | 11 mm/memory.c | 129 mm/mmap.c | 4 mm/page_alloc.c | 24 net/core/dev.c | 66 net/core/skbuff.c | 5 scripts/Makefile.xen | 14 450 files changed, 100652 insertions(+), 189 deletions(-) diff -r 42f6970e18c9 -r a533be77c572 arch/i386/Kconfig --- a/arch/i386/Kconfig Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/Kconfig Mon Jun 04 10:05:28 2007 +0100 @@ -16,6 +16,7 @@ config X86_32 config GENERIC_TIME bool + depends on !X86_XEN default y config LOCKDEP_SUPPORT @@ -102,6 +103,15 @@ config X86_PC bool "PC-compatible" help Choose this option if your computer is a standard PC or compatible. + +config X86_XEN + bool "Xen-compatible" + select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST + select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST + select SWIOTLB + help + Choose this option if you plan to run this kernel on top of the + Xen Hypervisor. config X86_ELAN bool "AMD Elan" @@ -213,6 +223,7 @@ source "arch/i386/Kconfig.cpu" config HPET_TIMER bool "HPET Timer Support" + depends on !X86_XEN help This enables the use of the HPET for the kernel's internal timer. HPET is the next generation timer replacing legacy 8254s. @@ -263,7 +274,7 @@ source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on !SMP && !(X86_VISWS || X86_VOYAGER) + depends on !SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST) help A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -288,12 +299,12 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC bool - depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) + depends on X86_UP_APIC || ((X86_VISWS || SMP) && !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) default y config X86_IO_APIC bool - depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) + depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER || XEN_UNPRIVILEGED_GUEST)) default y config X86_VISWS_APIC @@ -303,7 +314,7 @@ config X86_VISWS_APIC config X86_MCE bool "Machine Check Exception" - depends on !X86_VOYAGER + depends on !(X86_VOYAGER || X86_XEN) ---help--- Machine Check Exception support allows the processor to notify the kernel if it detects a problem (e.g. overheating, component failure). @@ -402,6 +413,7 @@ config X86_REBOOTFIXUPS config MICROCODE tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- If you say Y here and also to "/dev file system support" in the 'File systems' section, you will be able to update the microcode on @@ -419,6 +431,7 @@ config MICROCODE config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" + depends on !X86_XEN help This device gives privileged processes access to the x86 Model-Specific Registers (MSRs). It is a character device with @@ -433,6 +446,10 @@ config X86_CPUID be executed on a specific processor. It is a character device with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. + +config SWIOTLB + bool + default n source "drivers/firmware/Kconfig" @@ -616,6 +633,7 @@ config HIGHPTE config MATH_EMULATION bool "Math emulation" + depends on !X86_XEN ---help--- Linux can emulate a math coprocessor (used for floating point operations) if you don't have one. 486DX and Pentium processors have @@ -641,6 +659,8 @@ config MATH_EMULATION config MTRR bool "MTRR (Memory Type Range Register) support" + depends on !XEN_UNPRIVILEGED_GUEST + default y if X86_XEN ---help--- On Intel P6 family processors (Pentium Pro, Pentium II and later) the Memory Type Range Registers (MTRRs) may be used to control @@ -675,7 +695,7 @@ config MTRR config EFI bool "Boot from EFI support" - depends on ACPI + depends on ACPI && !X86_XEN default n ---help--- This enables the the kernel to boot on EFI platforms using @@ -693,7 +713,7 @@ config EFI config IRQBALANCE bool "Enable kernel irq balancing" - depends on SMP && X86_IO_APIC + depends on SMP && X86_IO_APIC && !X86_XEN default y help The default yes will allow the kernel to do irq load balancing. @@ -741,7 +761,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on EXPERIMENTAL && !XEN_UNPRIVILEGED_GUEST help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -793,6 +813,7 @@ config HOTPLUG_CPU config COMPAT_VDSO bool "Compat VDSO support" + depends on !X86_XEN default y help Map the VDSO to the predictable old-style address too. @@ -810,18 +831,20 @@ config ARCH_ENABLE_MEMORY_HOTPLUG depends on HIGHMEM menu "Power management options (ACPI, APM)" - depends on !X86_VOYAGER - + depends on !(X86_VOYAGER || XEN_UNPRIVILEGED_GUEST) + +if !X86_XEN source kernel/power/Kconfig +endif source "drivers/acpi/Kconfig" menu "APM (Advanced Power Management) BIOS Support" -depends on PM && !X86_VISWS +depends on PM && !(X86_VISWS || X86_XEN) config APM tristate "APM (Advanced Power Management) BIOS support" - depends on PM + depends on PM && PM_LEGACY ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -1006,6 +1029,7 @@ choice config PCI_GOBIOS bool "BIOS" + depends on !X86_XEN config PCI_GOMMCONFIG bool "MMConfig" @@ -1013,6 +1037,13 @@ config PCI_GODIRECT config PCI_GODIRECT bool "Direct" +config PCI_GOXEN_FE + bool "Xen PCI Frontend" + depends on X86_XEN + help + The PCI device frontend driver allows the kernel to import arbitrary + PCI devices from a PCI backend to support PCI driver domains. + config PCI_GOANY bool "Any" @@ -1020,7 +1051,7 @@ endchoice config PCI_BIOS bool - depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) + depends on !(X86_VISWS || X86_XEN) && PCI && (PCI_GOBIOS || PCI_GOANY) default y config PCI_DIRECT @@ -1032,6 +1063,18 @@ config PCI_MMCONFIG bool depends on PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) default y + +config XEN_PCIDEV_FRONTEND + bool + depends on PCI && X86_XEN && (PCI_GOXEN_FE || PCI_GOANY) + default y + +config XEN_PCIDEV_FE_DEBUG + bool "Xen PCI Frontend Debugging" + depends on XEN_PCIDEV_FRONTEND + default n + help + Enables some debug statements within the PCI Frontend. source "drivers/pci/pcie/Kconfig" @@ -1043,7 +1086,7 @@ config ISA_DMA_API config ISA bool "ISA support" - depends on !(X86_VOYAGER || X86_VISWS) + depends on !(X86_VOYAGER || X86_VISWS || X86_XEN) help Find out whether you have ISA slots on your motherboard. ISA is the name of a bus system, i.e. the way the CPU talks to the other stuff @@ -1070,7 +1113,7 @@ source "drivers/eisa/Kconfig" source "drivers/eisa/Kconfig" config MCA - bool "MCA support" if !(X86_VISWS || X86_VOYAGER) + bool "MCA support" if !(X86_VISWS || X86_VOYAGER || X86_XEN) default y if X86_VOYAGER help MicroChannel Architecture is found in some IBM PS/2 machines and @@ -1146,6 +1189,8 @@ source "security/Kconfig" source "crypto/Kconfig" +source "drivers/xen/Kconfig" + source "lib/Kconfig" # @@ -1171,7 +1216,7 @@ config X86_SMP config X86_HT bool - depends on SMP && !(X86_VISWS || X86_VOYAGER) + depends on SMP && !(X86_VISWS || X86_VOYAGER || X86_XEN) default y config X86_BIOS_REBOOT @@ -1184,6 +1229,16 @@ config X86_TRAMPOLINE depends on X86_SMP || (X86_VOYAGER && SMP) default y +config X86_NO_TSS + bool + depends on X86_XEN + default y + +config X86_NO_IDT + bool + depends on X86_XEN + default y + config KTIME_SCALAR bool default y diff -r 42f6970e18c9 -r a533be77c572 arch/i386/Kconfig.cpu --- a/arch/i386/Kconfig.cpu Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/Kconfig.cpu Mon Jun 04 10:05:28 2007 +0100 @@ -251,7 +251,7 @@ config X86_PPRO_FENCE config X86_F00F_BUG bool - depends on M586MMX || M586TSC || M586 || M486 || M386 + depends on (M586MMX || M586TSC || M586 || M486 || M386) && !X86_NO_IDT default y config X86_WP_WORKS_OK @@ -311,5 +311,5 @@ config X86_OOSTORE config X86_TSC bool - depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ - default y + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1 || MGEODE_LX) && !X86_NUMAQ && !X86_XEN + default y diff -r 42f6970e18c9 -r a533be77c572 arch/i386/Kconfig.debug --- a/arch/i386/Kconfig.debug Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/Kconfig.debug Mon Jun 04 10:05:28 2007 +0100 @@ -79,6 +79,7 @@ config DOUBLEFAULT config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EMBEDDED + depends on !X86_NO_TSS help This option allows trapping of rare doublefault exceptions that would otherwise cause a system to silently reboot. Disabling this diff -r 42f6970e18c9 -r a533be77c572 arch/i386/Makefile --- a/arch/i386/Makefile Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/Makefile Mon Jun 04 10:05:28 2007 +0100 @@ -48,6 +48,11 @@ CFLAGS += $(shell if [ $(call cc-vers CFLAGS += $(cflags-y) +cppflags-$(CONFIG_XEN) += \ + -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) + +CPPFLAGS += $(cppflags-y) + # Default subarch .c files mcore-y := mach-default @@ -70,6 +75,10 @@ mcore-$(CONFIG_X86_BIGSMP) := mach-defau #Summit subarch support mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit mcore-$(CONFIG_X86_SUMMIT) := mach-default + +# Xen subarch support +mflags-$(CONFIG_X86_XEN) := -Iinclude/asm-i386/mach-xen +mcore-$(CONFIG_X86_XEN) := mach-xen # generic subarchitecture mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic @@ -105,6 +114,19 @@ PHONY += zImage bzImage compressed zlilo PHONY += zImage bzImage compressed zlilo bzlilo \ zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install +ifdef CONFIG_XEN +CPPFLAGS := -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS) +head-y := arch/i386/kernel/head-xen.o arch/i386/kernel/init_task-xen.o +boot := arch/i386/boot-xen +.PHONY: vmlinuz +all: vmlinuz + +vmlinuz: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $@ + +install: + $(Q)$(MAKE) $(build)=$(boot) $@ +else all: bzImage # KBUILD_IMAGE specify target image being built @@ -127,6 +149,7 @@ fdimage fdimage144 fdimage288 isoimage: install: $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install +endif archclean: $(Q)$(MAKE) $(clean)=arch/i386/boot @@ -145,3 +168,4 @@ CLEAN_FILES += arch/$(ARCH)/boot/fdimage CLEAN_FILES += arch/$(ARCH)/boot/fdimage \ arch/$(ARCH)/boot/image.iso \ arch/$(ARCH)/boot/mtools.conf +CLEAN_FILES += vmlinuz vmlinux-stripped diff -r 42f6970e18c9 -r a533be77c572 arch/i386/boot-xen/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/boot-xen/Makefile Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,19 @@ + +OBJCOPYFLAGS := -g --strip-unneeded + +vmlinuz: vmlinux-stripped FORCE + $(call if_changed,gzip) + +vmlinux-stripped: vmlinux FORCE + $(call if_changed,objcopy) + +INSTALL_ROOT := $(patsubst %/boot,%,$(INSTALL_PATH)) + +XINSTALL_NAME ?= $(KERNELRELEASE) +install: + mkdir -p $(INSTALL_ROOT)/boot + install -m0644 vmlinuz $(INSTALL_ROOT)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) + install -m0644 vmlinux $(INSTALL_ROOT)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX) + install -m0664 .config $(INSTALL_ROOT)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX) + install -m0664 System.map $(INSTALL_ROOT)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX) + ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_ROOT)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(shell [ -e $(objtree)/localversion-xen ] && cat $(objtree)/localversion-xen)$(INSTALL_SUFFIX) diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/Makefile --- a/arch/i386/kernel/Makefile Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/kernel/Makefile Mon Jun 04 10:05:28 2007 +0100 @@ -44,6 +44,12 @@ EXTRA_AFLAGS := -traditional obj-$(CONFIG_SCx200) += scx200.o +ifdef CONFIG_XEN +vsyscall_note := vsyscall-note-xen.o +else +vsyscall_note := vsyscall-note.o +endif + # vsyscall.o contains the vsyscall DSO images as __initdata. # We must build both images before we can assemble it. # Note: kbuild does not track this dependency due to usage of .incbin @@ -65,7 +71,7 @@ SYSCFLAGS_vsyscall-int80.so = $(vsyscall $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \ $(obj)/vsyscall-%.so: $(src)/vsyscall.lds \ - $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE + $(obj)/vsyscall-%.o $(obj)/$(vsyscall_note) FORCE $(call if_changed,syscall) # We also create a special relocatable object that should mirror the symbol @@ -77,8 +83,20 @@ extra-y += vsyscall-syms.o SYSCFLAGS_vsyscall-syms.o = -r $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ - $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE + $(obj)/vsyscall-sysenter.o $(obj)/$(vsyscall_note) FORCE $(call if_changed,syscall) k8-y += ../../x86_64/kernel/k8.o +ifdef CONFIG_XEN +include $(srctree)/scripts/Makefile.xen + +obj-y += fixup.o +microcode-$(subst m,y,$(CONFIG_MICROCODE)) := microcode-xen.o +n-obj-xen := i8259.o timers/ reboot.o smpboot.o trampoline.o + +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) +obj-y := $(call cherrypickxen, $(obj-y)) +extra-y := $(call cherrypickxen, $(extra-y)) +%/head-xen.o %/head-xen.s: EXTRA_AFLAGS := +endif diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/acpi/Makefile --- a/arch/i386/kernel/acpi/Makefile Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/kernel/acpi/Makefile Mon Jun 04 10:05:28 2007 +0100 @@ -6,3 +6,7 @@ obj-y += cstate.o processor.o obj-y += cstate.o processor.o endif +ifdef CONFIG_XEN +include $(srctree)/scripts/Makefile.xen +obj-y := $(call cherrypickxen, $(obj-y), $(src)) +endif diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/acpi/boot-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/acpi/boot-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,1168 @@ +/* + * boot.c - Architecture-Specific Low-Level ACPI Boot Support + * + * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@xxxxxxxxx> + * Copyright (C) 2001 Jun Nakajima <jun.nakajima@xxxxxxxxx> + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/efi.h> +#include <linux/module.h> +#include <linux/dmi.h> +#include <linux/irq.h> + +#include <asm/pgtable.h> +#include <asm/io_apic.h> +#include <asm/apic.h> +#include <asm/io.h> +#include <asm/mpspec.h> + +#ifdef CONFIG_X86_64 + +extern void __init clustered_apic_check(void); + +extern int gsi_irq_sharing(int gsi); +#include <asm/proto.h> + +static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; } + + +#else /* X86 */ + +#ifdef CONFIG_X86_LOCAL_APIC +#include <mach_apic.h> +#include <mach_mpparse.h> +#endif /* CONFIG_X86_LOCAL_APIC */ + +static inline int gsi_irq_sharing(int gsi) { return gsi; } + +#endif /* X86 */ + +#define BAD_MADT_ENTRY(entry, end) ( \ + (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ + ((acpi_table_entry_header *)entry)->length < sizeof(*entry)) + +#define PREFIX "ACPI: " + +int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ +int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */ +int acpi_ht __initdata = 1; /* enable HT */ + +int acpi_lapic; +int acpi_ioapic; +int acpi_strict; +EXPORT_SYMBOL(acpi_strict); + +acpi_interrupt_flags acpi_sci_flags __initdata; +int acpi_sci_override_gsi __initdata; +int acpi_skip_timer_override __initdata; + +#ifdef CONFIG_X86_LOCAL_APIC +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +#endif + +#ifndef __HAVE_ARCH_CMPXCHG +#warning ACPI uses CMPXCHG, i486 and later hardware +#endif + +#define MAX_MADT_ENTRIES 256 +u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] = + {[0 ... MAX_MADT_ENTRIES - 1] = 0xff }; +EXPORT_SYMBOL(x86_acpiid_to_apicid); + +/* -------------------------------------------------------------------------- + Boot-time Configuration + -------------------------------------------------------------------------- */ + +/* + * The default interrupt routing model is PIC (8259). This gets + * overriden if IOAPICs are enumerated (below). + */ +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; + +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + +/* rely on all ACPI tables being in the direct mapping */ +char *__acpi_map_table(unsigned long phys_addr, unsigned long size) +{ + if (!phys_addr || !size) + return NULL; + + if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE) + return __va(phys_addr); + + return NULL; +} + +#else + +/* + * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, + * to map the target physical address. The problem is that set_fixmap() + * provides a single page, and it is possible that the page is not + * sufficient. + * By using this area, we can map up to MAX_IO_APICS pages temporarily, + * i.e. until the next __va_range() call. + * + * Important Safety Note: The fixed I/O APIC page numbers are *subtracted* + * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and + * count idx down while incrementing the phys address. + */ +char *__acpi_map_table(unsigned long phys, unsigned long size) +{ + unsigned long base, offset, mapped_size; + int idx; + +#ifndef CONFIG_XEN + if (phys + size < 8 * 1024 * 1024) + return __va(phys); +#endif + + offset = phys & (PAGE_SIZE - 1); + mapped_size = PAGE_SIZE - offset; + set_fixmap(FIX_ACPI_END, phys); + base = fix_to_virt(FIX_ACPI_END); + + /* + * Most cases can be covered by the below. + */ + idx = FIX_ACPI_END; + while (mapped_size < size) { + if (--idx < FIX_ACPI_BEGIN) + return NULL; /* cannot handle this */ + phys += PAGE_SIZE; + set_fixmap(idx, phys); + mapped_size += PAGE_SIZE; + } + + return ((unsigned char *)base + offset); +} +#endif + +#ifdef CONFIG_PCI_MMCONFIG +/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ +struct acpi_table_mcfg_config *pci_mmcfg_config; +int pci_mmcfg_config_num; + +int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_mcfg *mcfg; + unsigned long i; + int config_size; + + if (!phys_addr || !size) + return -EINVAL; + + mcfg = (struct acpi_table_mcfg *)__acpi_map_table(phys_addr, size); + if (!mcfg) { + printk(KERN_WARNING PREFIX "Unable to map MCFG\n"); + return -ENODEV; + } + + /* how many config structures do we have */ + pci_mmcfg_config_num = 0; + i = size - sizeof(struct acpi_table_mcfg); + while (i >= sizeof(struct acpi_table_mcfg_config)) { + ++pci_mmcfg_config_num; + i -= sizeof(struct acpi_table_mcfg_config); + }; + if (pci_mmcfg_config_num == 0) { + printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); + return -ENODEV; + } + + config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); + pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); + if (!pci_mmcfg_config) { + printk(KERN_WARNING PREFIX + "No memory for MCFG config tables\n"); + return -ENOMEM; + } + + memcpy(pci_mmcfg_config, &mcfg->config, config_size); + for (i = 0; i < pci_mmcfg_config_num; ++i) { + if (mcfg->config[i].base_reserved) { + printk(KERN_ERR PREFIX + "MMCONFIG not in low 4GB of memory\n"); + kfree(pci_mmcfg_config); + pci_mmcfg_config_num = 0; + return -ENODEV; + } + } + + return 0; +} +#endif /* CONFIG_PCI_MMCONFIG */ + +#ifdef CONFIG_X86_LOCAL_APIC +static int __init acpi_parse_madt(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_madt *madt = NULL; + + if (!phys_addr || !size || !cpu_has_apic) + return -EINVAL; + + madt = (struct acpi_table_madt *)__acpi_map_table(phys_addr, size); + if (!madt) { + printk(KERN_WARNING PREFIX "Unable to map MADT\n"); + return -ENODEV; + } + + if (madt->lapic_address) { + acpi_lapic_addr = (u64) madt->lapic_address; + + printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", + madt->lapic_address); + } + + acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); + + return 0; +} + +static int __init +acpi_parse_lapic(acpi_table_entry_header * header, const unsigned long end) +{ + struct acpi_table_lapic *processor = NULL; + + processor = (struct acpi_table_lapic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + /* Record local apic id only when enabled */ + if (processor->flags.enabled) + x86_acpiid_to_apicid[processor->acpi_id] = processor->id; + + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. This allows us to size + * cpus_possible_map more accurately, to permit + * to not preallocating memory for all NR_CPUS + * when we use CPU hotplug. + */ + mp_register_lapic(processor->id, /* APIC ID */ + processor->flags.enabled); /* Enabled? */ + + return 0; +} + +static int __init +acpi_parse_lapic_addr_ovr(acpi_table_entry_header * header, + const unsigned long end) +{ + struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL; + + lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr *)header; + + if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) + return -EINVAL; + + acpi_lapic_addr = lapic_addr_ovr->address; + + return 0; +} + +static int __init +acpi_parse_lapic_nmi(acpi_table_entry_header * header, const unsigned long end) +{ + struct acpi_table_lapic_nmi *lapic_nmi = NULL; + + lapic_nmi = (struct acpi_table_lapic_nmi *)header; + + if (BAD_MADT_ENTRY(lapic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + if (lapic_nmi->lint != 1) + printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + + return 0; +} + +#endif /*CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC + +static int __init +acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end) +{ + struct acpi_table_ioapic *ioapic = NULL; + + ioapic = (struct acpi_table_ioapic *)header; + + if (BAD_MADT_ENTRY(ioapic, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + mp_register_ioapic(ioapic->id, + ioapic->address, ioapic->global_irq_base); + + return 0; +} + +/* + * Parse Interrupt Source Override for the ACPI SCI + */ +static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) +{ + if (trigger == 0) /* compatible SCI trigger is level */ + trigger = 3; + + if (polarity == 0) /* compatible SCI polarity is low */ + polarity = 3; + + /* Command-line over-ride via acpi_sci= */ + if (acpi_sci_flags.trigger) + trigger = acpi_sci_flags.trigger; + + if (acpi_sci_flags.polarity) + polarity = acpi_sci_flags.polarity; + + /* + * mp_config_acpi_legacy_irqs() already setup IRQs < 16 + * If GSI is < 16, this will update its flags, + * else it will create a new mp_irqs[] entry. + */ + mp_override_legacy_irq(gsi, polarity, trigger, gsi); + + /* + * stash over-ride to indicate we've been here + * and for later update of acpi_fadt + */ + acpi_sci_override_gsi = gsi; + return; +} + +static int __init +acpi_parse_int_src_ovr(acpi_table_entry_header * header, + const unsigned long end) +{ + struct acpi_table_int_src_ovr *intsrc = NULL; + + intsrc = (struct acpi_table_int_src_ovr *)header; + + if (BAD_MADT_ENTRY(intsrc, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + if (intsrc->bus_irq == acpi_fadt.sci_int) { + acpi_sci_ioapic_setup(intsrc->global_irq, + intsrc->flags.polarity, + intsrc->flags.trigger); + return 0; + } + + if (acpi_skip_timer_override && + intsrc->bus_irq == 0 && intsrc->global_irq == 2) { + printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); + return 0; + } + + mp_override_legacy_irq(intsrc->bus_irq, + intsrc->flags.polarity, + intsrc->flags.trigger, intsrc->global_irq); + + return 0; +} + +static int __init +acpi_parse_nmi_src(acpi_table_entry_header * header, const unsigned long end) +{ + struct acpi_table_nmi_src *nmi_src = NULL; + + nmi_src = (struct acpi_table_nmi_src *)header; + + if (BAD_MADT_ENTRY(nmi_src, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + /* TBD: Support nimsrc entries? */ + + return 0; +} + +#endif /* CONFIG_X86_IO_APIC */ + +/* + * acpi_pic_sci_set_trigger() + * + * use ELCR to set PIC-mode trigger type for SCI + * + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's + * it may require Edge Trigger -- use "acpi_sci=edge" + * + * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers + * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. + * ECLR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0) + * ECLR2 is IRQ's 8-15 (IRQ 8, 13 must be 0) + */ + +void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) +{ + unsigned int mask = 1 << irq; + unsigned int old, new; + + /* Real old ELCR mask */ + old = inb(0x4d0) | (inb(0x4d1) << 8); + + /* + * If we use ACPI to set PCI irq's, then we should clear ELCR + * since we will set it correctly as we enable the PCI irq + * routing. + */ + new = acpi_noirq ? old : 0; + + /* + * Update SCI information in the ELCR, it isn't in the PCI + * routing tables.. + */ + switch (trigger) { + case 1: /* Edge - clear */ + new &= ~mask; + break; + case 3: /* Level - set */ + new |= mask; + break; + } + + if (old == new) + return; + + printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old); + outb(new, 0x4d0); + outb(new >> 8, 0x4d1); +} + +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) +{ +#ifdef CONFIG_X86_IO_APIC + if (use_pci_vector() && !platform_legacy_irq(gsi)) + *irq = IO_APIC_VECTOR(gsi); + else +#endif + *irq = gsi_irq_sharing(gsi); + return 0; +} + +/* + * success: return IRQ number (>=0) + * failure: return < 0 + */ +int acpi_register_gsi(u32 gsi, int triggering, int polarity) +{ + unsigned int irq; + unsigned int plat_gsi = gsi; + +#ifdef CONFIG_PCI + /* + * Make sure all (legacy) PCI IRQs are set as level-triggered. + */ + if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { + extern void eisa_set_level_irq(unsigned int irq); + + if (triggering == ACPI_LEVEL_SENSITIVE) + eisa_set_level_irq(gsi); + } +#endif + +#ifdef CONFIG_X86_IO_APIC + if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { + plat_gsi = mp_register_gsi(gsi, triggering, polarity); + } +#endif + acpi_gsi_to_irq(plat_gsi, &irq); + return irq; +} + +EXPORT_SYMBOL(acpi_register_gsi); + +/* + * ACPI based hotplug support for CPU + */ +#ifdef CONFIG_ACPI_HOTPLUG_CPU +int acpi_map_lsapic(acpi_handle handle, int *pcpu) +{ + /* TBD */ + return -EINVAL; +} + +EXPORT_SYMBOL(acpi_map_lsapic); + +int acpi_unmap_lsapic(int cpu) +{ + /* TBD */ + return -EINVAL; +} + +EXPORT_SYMBOL(acpi_unmap_lsapic); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + +int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) +{ + /* TBD */ + return -EINVAL; +} + +EXPORT_SYMBOL(acpi_register_ioapic); + +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) +{ + /* TBD */ + return -EINVAL; +} + +EXPORT_SYMBOL(acpi_unregister_ioapic); + +static unsigned long __init +acpi_scan_rsdp(unsigned long start, unsigned long length) +{ + unsigned long offset = 0; + unsigned long sig_len = sizeof("RSD PTR ") - 1; + unsigned long vstart = (unsigned long)isa_bus_to_virt(start); + + /* + * Scan all 16-byte boundaries of the physical memory region for the + * RSDP signature. + */ + for (offset = 0; offset < length; offset += 16) { + if (strncmp((char *)(vstart + offset), "RSD PTR ", sig_len)) + continue; + return (start + offset); + } + + return 0; +} + +static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size) +{ + struct acpi_table_sbf *sb; + + if (!phys_addr || !size) + return -EINVAL; + + sb = (struct acpi_table_sbf *)__acpi_map_table(phys_addr, size); + if (!sb) { + printk(KERN_WARNING PREFIX "Unable to map SBF\n"); + return -ENODEV; + } + + sbf_port = sb->sbf_cmos; /* Save CMOS port */ + + return 0; +} + +#ifdef CONFIG_HPET_TIMER + +static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) +{ + struct acpi_table_hpet *hpet_tbl; + + if (!phys || !size) + return -EINVAL; + + hpet_tbl = (struct acpi_table_hpet *)__acpi_map_table(phys, size); + if (!hpet_tbl) { + printk(KERN_WARNING PREFIX "Unable to map HPET\n"); + return -ENODEV; + } + + if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) { + printk(KERN_WARNING PREFIX "HPET timers must be located in " + "memory.\n"); + return -1; + } +#ifdef CONFIG_X86_64 + vxtime.hpet_address = hpet_tbl->addr.addrl | + ((long)hpet_tbl->addr.addrh << 32); + + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, vxtime.hpet_address); +#else /* X86 */ + { + extern unsigned long hpet_address; + + hpet_address = hpet_tbl->addr.addrl; + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, hpet_address); + } +#endif /* X86 */ + + return 0; +} +#else +#define acpi_parse_hpet NULL +#endif + +#ifdef CONFIG_X86_PM_TIMER +extern u32 pmtmr_ioport; +#endif + +static int __init acpi_parse_fadt(unsigned long phys, unsigned long size) +{ + struct fadt_descriptor *fadt = NULL; + + fadt = (struct fadt_descriptor *)__acpi_map_table(phys, size); + if (!fadt) { + printk(KERN_WARNING PREFIX "Unable to map FADT\n"); + return 0; + } + /* initialize sci_int early for INT_SRC_OVR MADT parsing */ + acpi_fadt.sci_int = fadt->sci_int; + + /* initialize rev and apic_phys_dest_mode for x86_64 genapic */ + acpi_fadt.revision = fadt->revision; + acpi_fadt.force_apic_physical_destination_mode = + fadt->force_apic_physical_destination_mode; + +#if defined(CONFIG_X86_PM_TIMER) && !defined(CONFIG_XEN) + /* detect the location of the ACPI PM Timer */ + if (fadt->revision >= FADT2_REVISION_ID) { + /* FADT rev. 2 */ + if (fadt->xpm_tmr_blk.address_space_id != + ACPI_ADR_SPACE_SYSTEM_IO) + return 0; + + pmtmr_ioport = fadt->xpm_tmr_blk.address; + /* + * "X" fields are optional extensions to the original V1.0 + * fields, so we must selectively expand V1.0 fields if the + * corresponding X field is zero. + */ + if (!pmtmr_ioport) + pmtmr_ioport = fadt->V1_pm_tmr_blk; + } else { + /* FADT rev. 1 */ + pmtmr_ioport = fadt->V1_pm_tmr_blk; + } + if (pmtmr_ioport) + printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", + pmtmr_ioport); +#endif + return 0; +} + +unsigned long __init acpi_find_rsdp(void) +{ + unsigned long rsdp_phys = 0; + + if (efi_enabled) { + if (efi.acpi20 != EFI_INVALID_TABLE_ADDR) + return efi.acpi20; + else if (efi.acpi != EFI_INVALID_TABLE_ADDR) + return efi.acpi; + } + /* + * Scan memory looking for the RSDP signature. First search EBDA (low + * memory) paragraphs and then search upper memory (E0000-FFFFF). + */ + rsdp_phys = acpi_scan_rsdp(0, 0x400); + if (!rsdp_phys) + rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000); + + return rsdp_phys; +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Parse LAPIC entries in MADT + * returns 0 on success, < 0 on error + */ +static int __init acpi_parse_madt_lapic_entries(void) +{ + int count; + + if (!cpu_has_apic) + return -ENODEV; + + /* + * Note that the LAPIC address is obtained from the MADT (32-bit value) + * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). + */ + + count = + acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, + acpi_parse_lapic_addr_ovr, 0); + if (count < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC address override entry\n"); + return count; + } + + mp_register_lapic_address(acpi_lapic_addr); + + count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic, + MAX_APICS); + if (!count) { + printk(KERN_ERR PREFIX "No LAPIC entries present\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return -ENODEV; + } else if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + count = + acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0); + if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +/* + * Parse IOAPIC related entries in MADT + * returns 0 on success, < 0 on error + */ +static int __init acpi_parse_madt_ioapic_entries(void) +{ + int count; + + /* + * ACPI interpreter is required to complete interrupt setup, + * so if it is off, don't enumerate the io-apics with ACPI. + * If MPS is present, it will handle them, + * otherwise the system will stay in PIC mode + */ + if (acpi_disabled || acpi_noirq) { + return -ENODEV; + } + + if (!cpu_has_apic) + return -ENODEV; + + /* + * if "noapic" boot option, don't look for IO-APICs + */ + if (skip_ioapic_setup) { + printk(KERN_INFO PREFIX "Skipping IOAPIC probe " + "due to 'noapic' option.\n"); + return -ENODEV; + } + + count = + acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic, + MAX_IO_APICS); + if (!count) { + printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); + return -ENODEV; + } else if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); + return count; + } + + count = + acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, + NR_IRQ_VECTORS); + if (count < 0) { + printk(KERN_ERR PREFIX + "Error parsing interrupt source overrides entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + /* + * If BIOS did not supply an INT_SRC_OVR for the SCI + * pretend we got one so we can set the SCI flags. + */ + if (!acpi_sci_override_gsi) + acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0); + + /* Fill in identity legacy mapings where no override */ + mp_config_acpi_legacy_irqs(); + + count = + acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, + NR_IRQ_VECTORS); + if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + return 0; +} +#else +static inline int acpi_parse_madt_ioapic_entries(void) +{ + return -1; +} +#endif /* !CONFIG_X86_IO_APIC */ + +static void __init acpi_process_madt(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int count, error; + + count = acpi_table_parse(ACPI_APIC, acpi_parse_madt); + if (count >= 1) { + + /* + * Parse MADT LAPIC entries + */ + error = acpi_parse_madt_lapic_entries(); + if (!error) { + acpi_lapic = 1; + +#ifdef CONFIG_X86_GENERICARCH + generic_bigsmp_probe(); +#endif + /* + * Parse MADT IO-APIC entries + */ + error = acpi_parse_madt_ioapic_entries(); + if (!error) { + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; + acpi_irq_balance_set(NULL); + acpi_ioapic = 1; + + smp_found_config = 1; + clustered_apic_check(); + } + } + if (error == -EINVAL) { + /* + * Dell Precision Workstation 410, 610 come here. + */ + printk(KERN_ERR PREFIX + "Invalid BIOS MADT, disabling ACPI\n"); + disable_acpi(); + } + } +#endif + return; +} + +extern int acpi_force; + +#ifdef __i386__ + +static int __init disable_acpi_irq(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", + d->ident); + acpi_noirq_set(); + } + return 0; +} + +static int __init disable_acpi_pci(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", + d->ident); + acpi_disable_pci(); + } + return 0; +} + +static int __init dmi_disable_acpi(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: acpi off\n", d->ident); + disable_acpi(); + } else { + printk(KERN_NOTICE + "Warning: DMI blacklist says broken, but acpi forced\n"); + } + return 0; +} + +/* + * Limit ACPI to CPU enumeration for HT + */ +static int __init force_acpi_ht(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", + d->ident); + disable_acpi(); + acpi_ht = 1; + } else { + printk(KERN_NOTICE + "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); + } + return 0; +} + +/* + * If your system is blacklisted here, but you find that acpi=force + * works for you, please contact acpi-devel@xxxxxxxxxxxxxxx + */ +static struct dmi_system_id __initdata acpi_dmi_table[] = { + /* + * Boxes that need ACPI disabled + */ + { + .callback = dmi_disable_acpi, + .ident = "IBM Thinkpad", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2629H1G"), + }, + }, + + /* + * Boxes that need acpi=ht + */ + { + .callback = force_acpi_ht, + .ident = "FSC Primergy T850", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "DELL GX240", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "HP VISUALIZE NT Workstation", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "Compaq Workstation W8000", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), + DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ASUS P4B266", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4B266"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ASUS P2B-DS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ASUS CUR-DLS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ABIT i440BX-W83977", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), + DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM Bladecenter", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM eServer xSeries 360", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM eserver xSeries 330", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM eserver xSeries 440", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), + }, + }, + + /* + * Boxes that need ACPI PCI IRQ routing disabled + */ + { + .callback = disable_acpi_irq, + .ident = "ASUS A7V", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), + DMI_MATCH(DMI_BOARD_NAME, "<A7V>"), + /* newer BIOS, Revision 1011, does work */ + DMI_MATCH(DMI_BIOS_VERSION, + "ASUS A7V ACPI BIOS Revision 1007"), + }, + }, + + /* + * Boxes that need ACPI PCI IRQ routing and PCI scan disabled + */ + { /* _BBN 0 bug */ + .callback = disable_acpi_pci, + .ident = "ASUS PR-DLS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"), + DMI_MATCH(DMI_BIOS_VERSION, + "ASUS PR-DLS ACPI BIOS Revision 1010"), + DMI_MATCH(DMI_BIOS_DATE, "03/21/2003") + }, + }, + { + .callback = disable_acpi_pci, + .ident = "Acer TravelMate 36x Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), + }, + }, + {} +}; + +#endif /* __i386__ */ + +/* + * acpi_boot_table_init() and acpi_boot_init() + * called from setup_arch(), always. + * 1. checksums all tables + * 2. enumerates lapics + * 3. enumerates io-apics + * + * acpi_table_init() is separate to allow reading SRAT without + * other side effects. + * + * side effects of acpi_boot_init: + * acpi_lapic = 1 if LAPIC found + * acpi_ioapic = 1 if IOAPIC found + * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; + * if acpi_blacklisted() acpi_disabled = 1; + * acpi_irq_model=... + * ... + * + * return value: (currently ignored) + * 0: success + * !0: failure + */ + +int __init acpi_boot_table_init(void) +{ + int error; + +#ifdef __i386__ + dmi_check_system(acpi_dmi_table); +#endif + + /* + * If acpi_disabled, bail out + * One exception: acpi=ht continues far enough to enumerate LAPICs + */ + if (acpi_disabled && !acpi_ht) + return 1; + + /* + * Initialize the ACPI boot-time table parser. + */ + error = acpi_table_init(); + if (error) { + disable_acpi(); + return error; + } + + acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); + + /* + * blacklist may disable ACPI entirely + */ + error = acpi_blacklisted(); + if (error) { + if (acpi_force) { + printk(KERN_WARNING PREFIX "acpi=force override\n"); + } else { + printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); + disable_acpi(); + return error; + } + } + + return 0; +} + +int __init acpi_boot_init(void) +{ + /* + * If acpi_disabled, bail out + * One exception: acpi=ht continues far enough to enumerate LAPICs + */ + if (acpi_disabled && !acpi_ht) + return 1; + + acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); + + /* + * set sci_int and PM timer address + */ + acpi_table_parse(ACPI_FADT, acpi_parse_fadt); + + /* + * Process the Multiple APIC Description Table (MADT), if present + */ + acpi_process_madt(); + + acpi_table_parse(ACPI_HPET, acpi_parse_hpet); + + return 0; +} diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/apic-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/apic-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,155 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/interrupt.h> +#include <linux/mc146818rtc.h> +#include <linux/kernel_stat.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <linux/module.h> + +#include <asm/atomic.h> +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/desc.h> +#include <asm/arch_hooks.h> +#include <asm/hpet.h> +#include <asm/i8253.h> +#include <asm/nmi.h> + +#include <mach_apic.h> +#include <mach_apicdef.h> +#include <mach_ipi.h> + +#include "io_ports.h" + +#ifndef CONFIG_XEN +/* + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as + * IPIs in place of local APIC timers + */ +static cpumask_t timer_bcast_ipi; +#endif + +/* + * Knob to control our willingness to enable the local APIC. + */ +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + +/* + * Debug level + */ +int apic_verbosity; + +#ifndef CONFIG_XEN +static int modern_apic(void) +{ + unsigned int lvr, version; + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + lvr = apic_read(APIC_LVR); + version = GET_APIC_VERSION(lvr); + return version >= 0x14; +} +#endif /* !CONFIG_XEN */ + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); +} + +int get_physical_broadcast(void) +{ + return 0xff; +} + +#ifndef CONFIG_XEN +#ifndef CONFIG_SMP +static void up_apic_timer_interrupt_call(struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + + /* + * the NMI deadlock-detector uses this. + */ + per_cpu(irq_stat, cpu).apic_timer_irqs++; + + smp_local_timer_interrupt(regs); +} +#endif + +void smp_send_timer_broadcast_ipi(struct pt_regs *regs) +{ + cpumask_t mask; + + cpus_and(mask, cpu_online_map, timer_bcast_ipi); + if (!cpus_empty(mask)) { +#ifdef CONFIG_SMP + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#else + /* + * We can directly call the apic timer interrupt handler + * in UP case. Minus all irq related functions + */ + up_apic_timer_interrupt_call(regs); +#endif + } +} +#endif + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor (void) +{ +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config) + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + + return 0; +} diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/asm-offsets.c --- a/arch/i386/kernel/asm-offsets.c Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/kernel/asm-offsets.c Mon Jun 04 10:05:28 2007 +0100 @@ -66,9 +66,14 @@ void foo(void) OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); +#ifndef CONFIG_X86_NO_TSS /* Offset from the sysenter stack to tss.esp0 */ - DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, esp0) - + DEFINE(SYSENTER_stack_esp0, offsetof(struct tss_struct, esp0) - sizeof(struct tss_struct)); +#else + /* sysenter stack points directly to esp0 */ + DEFINE(SYSENTER_stack_esp0, 0); +#endif DEFINE(PAGE_SIZE_asm, PAGE_SIZE); DEFINE(VDSO_PRELINK, VDSO_PRELINK); diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/cpu/Makefile --- a/arch/i386/kernel/cpu/Makefile Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/kernel/cpu/Makefile Mon Jun 04 10:05:28 2007 +0100 @@ -17,3 +17,8 @@ obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +ifdef CONFIG_XEN +include $(srctree)/scripts/Makefile.xen +obj-y := $(call cherrypickxen, $(obj-y), $(src)) +endif diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/cpu/common-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/cpu/common-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,743 @@ +#include <linux/init.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/smp.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/bootmem.h> +#include <asm/semaphore.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/msr.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/mtrr.h> +#include <asm/mce.h> +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <mach_apic.h> +#else +#ifdef CONFIG_XEN +#define phys_pkg_id(a,b) a +#endif +#endif +#include <asm/hypervisor.h> + +#include "cpu.h" + +DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); +EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); + +#ifndef CONFIG_XEN +DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); +EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); +#endif + +static int cachesize_override __cpuinitdata = -1; +static int disable_x86_fxsr __cpuinitdata; +static int disable_x86_serial_nr __cpuinitdata = 1; +static int disable_x86_sep __cpuinitdata; + +struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; + +extern int disable_pse; + +static void default_init(struct cpuinfo_x86 * c) +{ + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +} + +static struct cpu_dev default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", +}; +static struct cpu_dev * this_cpu = &default_cpu; + +static int __init cachesize_setup(char *str) +{ + get_option (&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +int __cpuinit get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + char *p, *q; + + if (cpuid_eax(0x80000000) < 0x80000004) + return 0; + + v = (unsigned int *) c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + + /* Intel chips right-justify this string for some dumb reason; + undo that brain damage */ + p = q = &c->x86_model_id[0]; + while ( *p == ' ' ) + p++; + if ( p != q ) { + while ( *p ) + *q++ = *p++; + while ( q <= &c->x86_model_id[48] ) + *q++ = '\0'; /* Zero-pad the rest */ + } + + return 1; +} + + +void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, ecx, edx, l2size; + + n = cpuid_eax(0x80000000); + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size=(ecx>>24)+(edx>>24); + } + + if (n < 0x80000006) /* Some chips just has a large L1. */ + return; + + ecx = cpuid_ecx(0x80000006); + l2size = ecx >> 16; + + /* do processor-specific cache resizing */ + if (this_cpu->c_size_cache) + l2size = this_cpu->c_size_cache(c,l2size); + + /* Allow user to override all this if necessary. */ + if (cachesize_override != -1) + l2size = cachesize_override; + + if ( l2size == 0 ) + return; /* Again, no L2 cache is possible */ + + c->x86_cache_size = l2size; + + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", + l2size, ecx & 0xFF); +} + +/* Naming convention should be: <Name> [(<Codename>)] */ +/* This table only is used unless init_<vendor>() below doesn't set it; */ +/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ + +/* Look up CPU names by table lookup. */ +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if ( c->x86_model >= 16 ) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + + +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) +{ + char *v = c->x86_vendor_id; + int i; + static int printed; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (cpu_devs[i]) { + if (!strcmp(v,cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v,cpu_devs[i]->c_ident[1]))) { + c->x86_vendor = i; + if (!early) + this_cpu = cpu_devs[i]; + return; + } + } + } + if (!printed) { + printed++; + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); + printk(KERN_ERR "CPU: Your system may be unstable.\n"); + } + c->x86_vendor = X86_VENDOR_UNKNOWN; + this_cpu = &default_cpu; +} + + +static int __init x86_fxsr_setup(char * s) +{ + disable_x86_fxsr = 1; + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + + +static int __init x86_sep_setup(char * s) +{ + disable_x86_sep = 1; + return 1; +} +__setup("nosep", x86_sep_setup); + + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + asm("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + + +/* Probe for the CPUID instruction */ +static int __cpuinit have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +/* Do minimum CPU detection early. + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. + The others are not touched to avoid unwanted side effects. + + WARNING: this function is only called on the BP. Don't add code here + that is supposed to run on all CPUs. */ +static void __init early_cpu_detect(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + c->x86_cache_alignment = 32; + + if (!have_cpuid_p()) + return; + + /* Get vendor name */ + cpuid(0x00000000, &c->cpuid_level, + (int *)&c->x86_vendor_id[0], + (int *)&c->x86_vendor_id[8], + (int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c, 1); + + c->x86 = 4; + if (c->cpuid_level >= 0x00000001) { + u32 junk, tfms, cap0, misc; + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); + c->x86 = (tfms >> 8) & 15; + c->x86_model = (tfms >> 4) & 15; + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xF) << 4; + c->x86_mask = tfms & 15; + if (cap0 & (1<<19)) + c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; + } +} + +void __cpuinit generic_identify(struct cpuinfo_x86 * c) +{ + u32 tfms, xlvl; + int ebx; + + if (have_cpuid_p()) { + /* Get vendor name */ + cpuid(0x00000000, &c->cpuid_level, + (int *)&c->x86_vendor_id[0], + (int *)&c->x86_vendor_id[8], + (int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c, 0); + /* Initialize the standard set of capabilities */ + /* Note that the vendor-specific code below might override */ + + /* Intel-defined flags: level 0x00000001 */ + if ( c->cpuid_level >= 0x00000001 ) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + c->x86 = (tfms >> 8) & 15; + c->x86_model = (tfms >> 4) & 15; + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xF) << 4; + c->x86_mask = tfms & 15; +#ifdef CONFIG_X86_HT + c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); +#else + c->apicid = (ebx >> 24) & 0xFF; +#endif + } else { + /* Have CPUID level 0 only - unheard of */ + c->x86 = 4; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + if ( (xlvl & 0xffff0000) == 0x80000000 ) { + if ( xlvl >= 0x80000001 ) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + if ( xlvl >= 0x80000004 ) + get_model_name(c); /* Default name */ + } + } + + early_intel_workaround(c); + +#ifdef CONFIG_X86_HT + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; +#endif +} + +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { + /* Disable processor serial number */ + unsigned long lo,hi; + rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi); + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_bit(X86_FEATURE_PN, c->x86_capability); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); + } +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); + + + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_max_cores = 1; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + + if (!have_cpuid_p()) { + /* First of all, decide if this is a 486 or higher */ + /* It's a 486 if we can modify the AC flag */ + if ( flag_is_changeable_p(X86_EFLAGS_AC) ) + c->x86 = 4; + else + c->x86 = 3; + } + + generic_identify(c); + + printk(KERN_DEBUG "CPU: After generic identify, caps:"); + for (i = 0; i < NCAPINTS; i++) + printk(" %08lx", c->x86_capability[i]); + printk("\n"); + + if (this_cpu->c_identify) { + this_cpu->c_identify(c); + + printk(KERN_DEBUG "CPU: After vendor identify, caps:"); + for (i = 0; i < NCAPINTS; i++) + printk(" %08lx", c->x86_capability[i]); + printk("\n"); + } + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + + /* + * The vendor-specific functions might have changed features. Now + * we do "generic changes." + */ + + /* TSC disabled? */ + if ( tsc_disable ) + clear_bit(X86_FEATURE_TSC, c->x86_capability); + + /* FXSR disabled? */ + if (disable_x86_fxsr) { + clear_bit(X86_FEATURE_FXSR, c->x86_capability); + clear_bit(X86_FEATURE_XMM, c->x86_capability); + } + + /* SEP disabled? */ + if (disable_x86_sep) + clear_bit(X86_FEATURE_SEP, c->x86_capability); + + if (disable_pse) + clear_bit(X86_FEATURE_PSE, c->x86_capability); + + /* If the model name is still unset, do table lookup. */ + if ( !c->x86_model_id[0] ) { + char *p; + p = table_lookup_model(c); + if ( p ) + strcpy(c->x86_model_id, p); + else + /* Last resort... */ + sprintf(c->x86_model_id, "%02x/%02x", + c->x86, c->x86_model); + } + + /* Now the feature flags better reflect actual CPU features! */ + + printk(KERN_DEBUG "CPU: After all inits, caps:"); + for (i = 0; i < NCAPINTS; i++) + printk(" %08lx", c->x86_capability[i]); + printk("\n"); + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if ( c != &boot_cpu_data ) { + /* AND the already accumulated flags with these */ + for ( i = 0 ; i < NCAPINTS ; i++ ) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + } + + /* Init Machine Check Exception if available. */ + mcheck_init(c); + + if (c == &boot_cpu_data) + sysenter_setup(); + enable_sep_cpu(); + + if (c == &boot_cpu_data) + mtrr_bp_init(); + else + mtrr_ap_init(); +} + +#ifdef CONFIG_X86_HT +void __cpuinit detect_ht(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) + return; + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1 ) { + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of the " + "siblings %d", smp_num_siblings); + smp_num_siblings = 1; + return; + } + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); + + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings) ; + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & + ((1 << core_bits) - 1); + + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + } +} +#endif + +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) +{ + char *vendor = NULL; + + if (c->x86_vendor < X86_VENDOR_NUM) + vendor = this_cpu->c_vendor; + else if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + + if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) + printk("%s ", vendor); + + if (!c->x86_model_id[0]) + printk("%d86", c->x86); + else + printk("%s", c->x86_model_id); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(" stepping %02x\n", c->x86_mask); + else + printk("\n"); +} + +cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; + +/* This is hacky. :) + * We're emulating future behavior. + * In the future, the cpu-specific init functions will be called implicitly + * via the magic of initcalls. + * They will insert themselves into the cpu_devs structure. + * Then, when cpu_init() is called, we can just iterate over that array. + */ + +extern int intel_cpu_init(void); +extern int cyrix_init_cpu(void); +extern int nsc_init_cpu(void); +extern int amd_init_cpu(void); +extern int centaur_init_cpu(void); +extern int transmeta_init_cpu(void); +extern int rise_init_cpu(void); +extern int nexgen_init_cpu(void); +extern int umc_init_cpu(void); + +void __init early_cpu_init(void) +{ + intel_cpu_init(); + cyrix_init_cpu(); + nsc_init_cpu(); + amd_init_cpu(); + centaur_init_cpu(); + transmeta_init_cpu(); + rise_init_cpu(); + nexgen_init_cpu(); + umc_init_cpu(); + early_cpu_detect(); + +#ifdef CONFIG_DEBUG_PAGEALLOC + /* pse is not compatible with on-the-fly unmapping, + * disable it even if the cpus claim to support it. + */ + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; +#endif +} + +void __cpuinit cpu_gdt_init(struct Xgt_desc_struct *gdt_descr) +{ + unsigned long frames[16]; + unsigned long va; + int f; + + for (va = gdt_descr->address, f = 0; + va < gdt_descr->address + gdt_descr->size; + va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly( + (void *)va, XENFEAT_writable_descriptor_tables); + } + if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8)) + BUG(); +} + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + */ +void __cpuinit cpu_init(void) +{ + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct * t = &per_cpu(init_tss, cpu); +#endif + struct thread_struct *thread = ¤t->thread; + struct desc_struct *gdt; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + + if (cpu_test_and_set(cpu, cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) local_irq_enable(); + } + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + if (tsc_disable && cpu_has_tsc) { + printk(KERN_NOTICE "Disabling TSC...\n"); + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + set_in_cr4(X86_CR4_TSD); + } + +#ifndef CONFIG_XEN + /* The CPU hotplug case */ + if (cpu_gdt_descr->address) { + gdt = (struct desc_struct *)cpu_gdt_descr->address; + memset(gdt, 0, PAGE_SIZE); + goto old_gdt; + } + /* + * This is a horrible hack to allocate the GDT. The problem + * is that cpu_init() is called really early for the boot CPU + * (and hence needs bootmem) but much later for the secondary + * CPUs, when bootmem will have gone away + */ + if (NODE_DATA(0)->bdata->node_bootmem_map) { + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); + /* alloc_bootmem_pages panics on failure, so no check */ + memset(gdt, 0, PAGE_SIZE); + } else { + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); + if (unlikely(!gdt)) { + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); + for (;;) + local_irq_enable(); + } + } +old_gdt: + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + memcpy(gdt, cpu_gdt_table, GDT_SIZE); + + /* Set up GDT entry for 16bit stack */ + *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= + ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | + (CPU_16BIT_STACK_SIZE - 1); + + cpu_gdt_descr->size = GDT_SIZE - 1; + cpu_gdt_descr->address = (unsigned long)gdt; +#else + if (cpu == 0 && cpu_gdt_descr->address == 0) { + gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); + /* alloc_bootmem_pages panics on failure, so no check */ + memset(gdt, 0, PAGE_SIZE); + + memcpy(gdt, cpu_gdt_table, GDT_SIZE); + + cpu_gdt_descr->size = GDT_SIZE; + cpu_gdt_descr->address = (unsigned long)gdt; + } +#endif + + cpu_gdt_init(cpu_gdt_descr); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + if (current->mm) + BUG(); + enter_lazy_tlb(&init_mm, current); + + load_esp0(t, thread); + + load_LDT(&init_mm.context); + +#ifdef CONFIG_DOUBLEFAULT + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#endif + + /* Clear %fs and %gs. */ + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + + /* Clear all 6 debug registers: */ + set_debugreg(0, 0); + set_debugreg(0, 1); + set_debugreg(0, 2); + set_debugreg(0, 3); + set_debugreg(0, 6); + set_debugreg(0, 7); + + /* + * Force FPU initialization: + */ + current_thread_info()->status = 0; + clear_used_math(); + mxcsr_feature_mask_init(); +} + +#ifdef CONFIG_HOTPLUG_CPU +void __cpuinit cpu_uninit(void) +{ + int cpu = raw_smp_processor_id(); + cpu_clear(cpu, cpu_initialized); + + /* lazy TLB state */ + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} +#endif diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/cpu/mtrr/Makefile --- a/arch/i386/kernel/cpu/mtrr/Makefile Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/kernel/cpu/mtrr/Makefile Mon Jun 04 10:05:28 2007 +0100 @@ -3,3 +3,10 @@ obj-y += cyrix.o obj-y += cyrix.o obj-y += centaur.o +ifdef CONFIG_XEN +include $(srctree)/scripts/Makefile.xen +n-obj-xen := generic.o state.o amd.o cyrix.o centaur.o + +obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) +obj-y := $(call cherrypickxen, $(obj-y)) +endif diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/cpu/mtrr/main-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/cpu/mtrr/main-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,197 @@ +#include <linux/init.h> +#include <linux/proc_fs.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <asm/uaccess.h> +#include <linux/mutex.h> + +#include <asm/mtrr.h> +#include "mtrr.h" + +static DEFINE_MUTEX(mtrr_mutex); + +void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned int *size, mtrr_type * type) +{ + struct xen_platform_op op; + + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = reg; + (void)HYPERVISOR_platform_op(&op); + + *size = op.u.read_memtype.nr_mfns; + *base = op.u.read_memtype.mfn; + *type = op.u.read_memtype.type; +} + +struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, + .get = generic_get_mtrr, +}; + +struct mtrr_ops *mtrr_if = &generic_mtrr_ops; +unsigned int num_var_ranges; +unsigned int *usage_table; + +static void __init set_num_var_ranges(void) +{ + struct xen_platform_op op; + + for (num_var_ranges = 0; ; num_var_ranges++) { + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = num_var_ranges; + if (HYPERVISOR_platform_op(&op) != 0) + break; + } +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) + == NULL) { + printk(KERN_ERR "mtrr: could not allocate\n"); + return; + } + for (i = 0; i < max; i++) + usage_table[i] = 0; +} + +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, char increment) +{ + int error; + struct xen_platform_op op; + + mutex_lock(&mtrr_mutex); + + op.cmd = XENPF_add_memtype; + op.u.add_memtype.mfn = base; + op.u.add_memtype.nr_mfns = size; + op.u.add_memtype.type = type; + error = HYPERVISOR_platform_op(&op); + if (error) { + mutex_unlock(&mtrr_mutex); + BUG_ON(error > 0); + return error; + } + + if (increment) + ++usage_table[op.u.add_memtype.reg]; + + mutex_unlock(&mtrr_mutex); + + return op.u.add_memtype.reg; +} + +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk(KERN_WARNING + "mtrr: size and base must be multiples of 4 kiB\n"); + printk(KERN_DEBUG + "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + +int +mtrr_add(unsigned long base, unsigned long size, unsigned int type, + char increment) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} + +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + unsigned i; + mtrr_type ltype; + unsigned long lbase; + unsigned int lsize; + int error = -EINVAL; + struct xen_platform_op op; + + mutex_lock(&mtrr_mutex); + + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, + size); + goto out; + } + } + if (usage_table[reg] < 1) { + printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); + goto out; + } + if (--usage_table[reg] < 1) { + op.cmd = XENPF_del_memtype; + op.u.del_memtype.handle = 0; + op.u.del_memtype.reg = reg; + error = HYPERVISOR_platform_op(&op); + if (error) { + BUG_ON(error > 0); + goto out; + } + } + error = reg; + out: + mutex_unlock(&mtrr_mutex); + return error; +} + +int +mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} + +EXPORT_SYMBOL(mtrr_add); +EXPORT_SYMBOL(mtrr_del); + +void __init mtrr_bp_init(void) +{ +} + +void mtrr_ap_init(void) +{ +} + +static int __init mtrr_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!is_initial_xendomain()) + return -ENODEV; + + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; + + set_num_var_ranges(); + init_table(); + + return 0; +} + +subsys_initcall(mtrr_init); diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/crash.c --- a/arch/i386/kernel/crash.c Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/kernel/crash.c Mon Jun 04 10:05:28 2007 +0100 @@ -90,6 +90,7 @@ static void crash_save_self(struct pt_re crash_save_this_cpu(regs, cpu); } +#ifndef CONFIG_XEN #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static atomic_t waiting_for_crash_ipi; @@ -154,6 +155,7 @@ static void nmi_shootdown_cpus(void) /* There are no cpus to shootdown */ } #endif +#endif /* CONFIG_XEN */ void machine_crash_shutdown(struct pt_regs *regs) { @@ -170,10 +172,12 @@ void machine_crash_shutdown(struct pt_re /* Make a note of crashing cpu. Will be used in NMI callback.*/ crashing_cpu = smp_processor_id(); +#ifndef CONFIG_XEN nmi_shootdown_cpus(); lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); #endif +#endif /* CONFIG_XEN */ crash_save_self(regs); } diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/early_printk-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/early_printk-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,2 @@ + +#include "../../x86_64/kernel/early_printk-xen.c" diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/entry-xen.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/entry-xen.S Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,1216 @@ +/* + * linux/arch/i386/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'ret_from_system_call': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - orig_eax + * 28(%esp) - %eip + * 2C(%esp) - %cs + * 30(%esp) - %eflags + * 34(%esp) - %oldesp + * 38(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include <linux/linkage.h> +#include <asm/thread_info.h> +#include <asm/irqflags.h> +#include <asm/errno.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/page.h> +#include <asm/desc.h> +#include <asm/dwarf2.h> +#include "irq_vectors.h" +#include <xen/interface/xen.h> + +#define nr_syscalls ((syscall_table_size)/4) + +EBX = 0x00 +ECX = 0x04 +EDX = 0x08 +ESI = 0x0C +EDI = 0x10 +EBP = 0x14 +EAX = 0x18 +DS = 0x1C +ES = 0x20 +ORIG_EAX = 0x24 +EIP = 0x28 +CS = 0x2C +EFLAGS = 0x30 +OLDESP = 0x34 +OLDSS = 0x38 + +CF_MASK = 0x00000001 +TF_MASK = 0x00000100 +IF_MASK = 0x00000200 +DF_MASK = 0x00000400 +NT_MASK = 0x00004000 +VM_MASK = 0x00020000 +/* Pseudo-eflags. */ +NMI_MASK = 0x80000000 + +#ifndef CONFIG_XEN +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#else +/* Offsets into shared_info_t. */ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#define sizeof_vcpu_shift 6 + +#ifdef CONFIG_SMP +#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \ + shl $sizeof_vcpu_shift,%esi ; \ + addl HYPERVISOR_shared_info,%esi +#else +#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi +#endif + +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi) +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi) +#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \ + __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \ + __ENABLE_INTERRUPTS +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi) +#endif + +#ifdef CONFIG_PREEMPT +#define preempt_stop cli; TRACE_IRQS_OFF +#else +#define preempt_stop +#define resume_kernel restore_nocheck +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $IF_MASK,EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + +#define SAVE_ALL \ + cld; \ + pushl %es; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET es, 0;*/\ + pushl %ds; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET ds, 0;*/\ + pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET eax, 0;\ + pushl %ebp; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebp, 0;\ + pushl %edi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edi, 0;\ + pushl %esi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET esi, 0;\ + pushl %edx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edx, 0;\ + pushl %ecx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ecx, 0;\ + pushl %ebx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebx, 0;\ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; + +#define RESTORE_INT_REGS \ + popl %ebx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebx;\ + popl %ecx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ecx;\ + popl %edx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edx;\ + popl %esi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE esi;\ + popl %edi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edi;\ + popl %ebp; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebp;\ + popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE eax + +#define RESTORE_REGS \ + RESTORE_INT_REGS; \ +1: popl %ds; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE ds;*/\ +2: popl %es; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE es;*/\ +.section .fixup,"ax"; \ +3: movl $0,(%esp); \ + jmp 1b; \ +4: movl $0,(%esp); \ + jmp 2b; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,3b; \ + .long 2b,4b; \ +.previous + +#define RING0_INT_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, 3*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_EC_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, 4*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_PTREGS_FRAME \ + CFI_STARTPROC simple;\ + CFI_DEF_CFA esp, OLDESP-EBX;\ + /*CFI_OFFSET cs, CS-OLDESP;*/\ + CFI_OFFSET eip, EIP-OLDESP;\ + /*CFI_OFFSET es, ES-OLDESP;*/\ + /*CFI_OFFSET ds, DS-OLDESP;*/\ + CFI_OFFSET eax, EAX-OLDESP;\ + CFI_OFFSET ebp, EBP-OLDESP;\ + CFI_OFFSET edi, EDI-OLDESP;\ + CFI_OFFSET esi, ESI-OLDESP;\ + CFI_OFFSET edx, EDX-OLDESP;\ + CFI_OFFSET ecx, ECX-OLDESP;\ + CFI_OFFSET ebx, EBX-OLDESP + +ENTRY(ret_from_fork) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + pushl $0x0202 # Reset kernel eflags + CFI_ADJUST_CFA_OFFSET 4 + popfl + CFI_ADJUST_CFA_OFFSET -4 + jmp syscall_exit + CFI_ENDPROC + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN + RING0_PTREGS_FRAME +ret_from_exception: + preempt_stop +ret_from_intr: + GET_THREAD_INFO(%ebp) +check_userspace: + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 2), %eax + jz resume_kernel +ENTRY(resume_userspace) + DISABLE_INTERRUPTS # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + cli + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_nocheck +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +#endif + CFI_ENDPROC + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(sysenter_entry) + CFI_STARTPROC simple + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp + movl SYSENTER_stack_esp0(%esp),%esp +sysenter_past_esp: + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ + sti + pushl $(__USER_DS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ss, 0*/ + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esp, 0 + pushfl + CFI_ADJUST_CFA_OFFSET 4 + pushl $(__USER_CS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 + +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) + DISABLE_INTERRUPTS + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx + jne syscall_exit_work +/* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx + movl OLDESP(%esp), %ecx + xorl %ebp,%ebp +#ifdef CONFIG_XEN + TRACE_IRQS_ON + __ENABLE_INTERRUPTS +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ + __TEST_PENDING + jnz 14f # process more events if necessary... + movl ESI(%esp), %esi + sysexit +14: __DISABLE_INTERRUPTS + TRACE_IRQS_OFF +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ + push %esp + call evtchn_do_upcall + add $4,%esp + jmp ret_from_intr +#else + TRACE_IRQS_ON + sti + sysexit +#endif /* !CONFIG_XEN */ + CFI_ENDPROC + + + # system call handler stub +ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + testl $TF_MASK,EFLAGS(%esp) + jz no_singlestep + orl $_TIF_SINGLESTEP,TI_flags(%ebp) +no_singlestep: + # system call tracing in operation / emulation + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) # store the return value +syscall_exit: + DISABLE_INTERRUPTS # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work + +restore_all: +#ifndef CONFIG_XEN + movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. + movb OLDSS(%esp), %ah + movb CS(%esp), %al + andl $(VM_MASK | (4 << 8) | 3), %eax + cmpl $((4 << 8) | 3), %eax + CFI_REMEMBER_STATE + je ldt_ss # returning to user-space with LDT SS +restore_nocheck: +#else +restore_nocheck: + movl EFLAGS(%esp), %eax + testl $(VM_MASK|NMI_MASK), %eax + CFI_REMEMBER_STATE + jnz hypervisor_iret + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + GET_VCPU_INFO + andb evtchn_upcall_mask(%esi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + CFI_REMEMBER_STATE + jnz restore_all_enable_events # != 0 => enable event delivery +#endif + TRACE_IRQS_IRET +restore_nocheck_notrace: + RESTORE_REGS + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 +1: iret +.section .fixup,"ax" +iret_exc: +#ifndef CONFIG_XEN + TRACE_IRQS_ON + sti +#endif + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + + CFI_RESTORE_STATE +#ifndef CONFIG_XEN +ldt_ss: + larl OLDSS(%esp), %eax + jnz restore_nocheck + testl $0x00400000, %eax # returning to 32bit stack? + jnz restore_nocheck # allright, normal return + /* If returning to userspace with 16bit stack, + * try to fix the higher word of ESP, as the CPU + * won't restore it. + * This is an "official" bug of all the x86-compatible + * CPUs, which we can try to work around to make + * dosemu and wine happy. */ + subl $8, %esp # reserve space for switch16 pointer + CFI_ADJUST_CFA_OFFSET 8 + cli + TRACE_IRQS_OFF + movl %esp, %eax + /* Set up the 16bit stack frame with switch32 pointer on top, + * and a switch16 pointer on top of the current frame. */ + call setup_x86_bogus_stack + CFI_ADJUST_CFA_OFFSET -8 # frame has moved + TRACE_IRQS_IRET + RESTORE_REGS + lss 20+4(%esp), %esp # switch to 16bit stack +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +#else + ALIGN +restore_all_enable_events: + TRACE_IRQS_ON + __ENABLE_INTERRUPTS +scrit: /**** START OF CRITICAL REGION ****/ + __TEST_PENDING + jnz 14f # process more events if necessary... + RESTORE_REGS + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +14: __DISABLE_INTERRUPTS + TRACE_IRQS_OFF + jmp 11f +ecrit: /**** END OF CRITICAL REGION ****/ + + CFI_RESTORE_STATE +hypervisor_iret: + andl $~NMI_MASK, EFLAGS(%esp) + RESTORE_REGS + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 + jmp hypercall_page + (__HYPERVISOR_iret * 32) +#endif + CFI_ENDPROC + + # perform work that needs to be done immediately before resumption + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + DISABLE_INTERRUPTS # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests + testl $VM_MASK, EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig + + ALIGN +work_notifysig_v86: +#ifdef CONFIG_VM86 + pushl %ecx # save ti_flags for do_notify_resume + CFI_ADJUST_CFA_OFFSET 4 + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + movl %eax, %esp + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig +#endif + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,EAX(%esp) + movl %esp, %eax + xorl %edx,%edx + call do_syscall_trace + cmpl $0, %eax + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall + movl ORIG_EAX(%esp), %eax + cmpl $(nr_syscalls), %eax + jnae syscall_call + jmp syscall_exit + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS # could let do_syscall_trace() call + # schedule() instead + movl %esp, %eax + movl $1, %edx + call do_syscall_trace + jmp resume_userspace + CFI_ENDPROC + + RING0_INT_FRAME # can't unwind into user space anyway +syscall_fault: + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + movl $-EFAULT,EAX(%esp) + jmp resume_userspace + +syscall_badsys: + movl $-ENOSYS,EAX(%esp) + jmp resume_userspace + CFI_ENDPROC + +#ifndef CONFIG_XEN +#define FIXUP_ESPFIX_STACK \ + movl %esp, %eax; \ + /* switch to 32bit stack using the pointer on top of 16bit stack */ \ + lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ + /* copy data from 16bit stack to 32bit stack */ \ + call fixup_x86_bogus_stack; \ + /* put ESP to the proper location */ \ + movl %eax, %esp; +#define UNWIND_ESPFIX_STACK \ + pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4; \ + movl %ss, %eax; \ + /* see if on 16bit stack */ \ + cmpw $__ESPFIX_SS, %ax; \ + je 28f; \ +27: popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4; \ +.section .fixup,"ax"; \ +28: movl $__KERNEL_DS, %eax; \ + movl %eax, %ds; \ + movl %eax, %es; \ + /* switch to 32bit stack */ \ + FIXUP_ESPFIX_STACK; \ + jmp 27b; \ +.previous + +/* + * Build the entry stubs and pointer table with + * some assembler magic. + */ +.data +ENTRY(interrupt) +.text + +vector=0 +ENTRY(irq_entries_start) + RING0_INT_FRAME +.rept NR_IRQS + ALIGN + .if vector + CFI_ADJUST_CFA_OFFSET -4 + .endif +1: pushl $~(vector) + CFI_ADJUST_CFA_OFFSET 4 + jmp common_interrupt +.data + .long 1b +.text +vector=vector+1 +.endr + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + ALIGN +common_interrupt: + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr + CFI_ENDPROC + +#define BUILD_INTERRUPT(name, nr) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + pushl $~(nr); \ + CFI_ADJUST_CFA_OFFSET 4; \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call smp_/**/name; \ + jmp ret_from_intr; \ + CFI_ENDPROC + +/* The include is where all of the SMP etc. interrupts come from */ +#include "entry_arch.h" +#else +#define UNWIND_ESPFIX_STACK +#endif + +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + xorl %eax, %eax + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + decl %eax # eax = -1 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl ES(%esp), %edi # get the function address + movl ORIG_EAX(%esp), %edx # get the error code + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + /*CFI_REL_OFFSET es, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC + +#ifdef CONFIG_XEN +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +# +# The sysexit critical region is slightly different. sysexit +# atomically removes the entire stack frame. If we interrupt in the +# critical region we know that the entire frame is present and correct +# so we can simply throw away the new one. +ENTRY(hypervisor_callback) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + movl EIP(%esp),%eax + cmpl $scrit,%eax + jb 11f + cmpl $ecrit,%eax + jb critical_region_fixup + cmpl $sysexit_scrit,%eax + jb 11f + cmpl $sysexit_ecrit,%eax + ja 11f + addl $OLDESP,%esp # Remove eflags...ebx from stack frame. +11: push %esp + CFI_ADJUST_CFA_OFFSET 4 + call evtchn_do_upcall + add $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_intr + CFI_ENDPROC + +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +critical_region_fixup: + movzbl critical_fixup_table-scrit(%eax),%ecx # %eax contains num bytes popped + cmpb $0xff,%cl # 0xff => vcpu_info critical region + jne 15f + xorl %ecx,%ecx +15: leal (%esp,%ecx),%esi # %esi points at end of src region + leal OLDESP(%esp),%edi # %edi points at end of dst region + shrl $2,%ecx # convert words to bytes + je 17f # skip loop if nothing to copy +16: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 16b +17: movl %edi,%esp # final %edi is top of merged stack + jmp 11b + +.section .rodata,"a" +critical_fixup_table: + .byte 0xff,0xff,0xff # testb $0xff,(%esi) = __TEST_PENDING + .byte 0xff,0xff # jnz 14f + .byte 0x00 # pop %ebx + .byte 0x04 # pop %ecx + .byte 0x08 # pop %edx + .byte 0x0c # pop %esi + .byte 0x10 # pop %edi + .byte 0x14 # pop %ebp + .byte 0x18 # pop %eax + .byte 0x1c # pop %ds + .byte 0x20 # pop %es + .byte 0x24,0x24,0x24 # add $4,%esp + .byte 0x28 # iret + .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi) + .byte 0x00,0x00 # jmp 11b +.previous + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(failsafe_callback) + pushl %eax + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + jz 5f + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) + jmp iret_exc +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) + RING0_INT_FRAME + pushl $0 + SAVE_ALL + jmp ret_from_exception +.section .fixup,"ax"; \ +6: xorl %eax,%eax; \ + movl %eax,4(%esp); \ + jmp 1b; \ +7: xorl %eax,%eax; \ + movl %eax,8(%esp); \ + jmp 2b; \ +8: xorl %eax,%eax; \ + movl %eax,12(%esp); \ + jmp 3b; \ +9: xorl %eax,%eax; \ + movl %eax,16(%esp); \ + jmp 4b; \ +.previous; \ +.section __ex_table,"a"; \ + .align 4; \ + .long 1b,6b; \ + .long 2b,7b; \ + .long 3b,8b; \ + .long 4b,9b; \ +.previous +#endif + CFI_ENDPROC + +ENTRY(coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(simd_coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_simd_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(device_not_available) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL +#ifndef CONFIG_XEN + movl %cr0, %eax + testl $0x4, %eax # EM (math emulation bit) + je device_available_emulate + pushl $0 # temporary storage for ORIG_EIP + CFI_ADJUST_CFA_OFFSET 4 + call math_emulate + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_exception +device_available_emulate: +#endif + preempt_stop + call math_state_restore + jmp ret_from_exception + CFI_ENDPROC + +#ifndef CONFIG_XEN +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "SYSENTER_stack_esp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl SYSENTER_stack_esp0+offset(%esp),%esp; \ + pushfl; \ + pushl $__KERNEL_CS; \ + pushl $sysenter_past_esp +#endif /* CONFIG_XEN */ + +KPROBE_ENTRY(debug) + RING0_INT_FRAME +#ifndef CONFIG_XEN + cmpl $sysenter_entry,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: +#endif /* !CONFIG_XEN */ + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC + .previous .text +#ifndef CONFIG_XEN +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_16bit_stack + cmpl $sysenter_entry,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $sysenter_entry,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct +nmi_debug_stack_check: + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_16bit_stack: + RING0_INT_FRAME + /* create the pointer to lss back */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + movzwl %sp, %esp + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to 16bit stack +1: iret + CFI_ENDPROC +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +#else +ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + orl $NMI_MASK, EFLAGS(%esp) + jmp restore_all + CFI_ENDPROC +#endif + +KPROBE_ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC + .previous .text + +ENTRY(overflow) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_overflow + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(bounds) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_bounds + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(invalid_op) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_invalid_op + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_segment_overrun + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(invalid_TSS) + RING0_EC_FRAME + pushl $do_invalid_TSS + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(segment_not_present) + RING0_EC_FRAME + pushl $do_segment_not_present + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +ENTRY(stack_segment) + RING0_EC_FRAME + pushl $do_stack_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +KPROBE_ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + .previous .text + +ENTRY(alignment_check) + RING0_EC_FRAME + pushl $do_alignment_check + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + .previous .text + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl machine_check_vector + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +#endif + +#ifndef CONFIG_XEN +ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_spurious_interrupt_bug + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +#endif /* !CONFIG_XEN */ + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, EBX(%edx) + xorl %ebx, %ebx + movl %ebx, ECX(%edx) + movl %ebx, EDX(%edx) + movl %esi, ESI(%edx) + movl %edi, EDI(%edx) + movl %ebp, EBP(%edx) + movl %ebx, EAX(%edx) + movl $__USER_DS, DS(%edx) + movl $__USER_DS, ES(%edx) + movl %ebx, ORIG_EAX(%edx) + movl %ecx, EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, CS(%edx) + movl %ebx, EFLAGS(%edx) + movl %eax, OLDESP(%edx) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl EBX(%edx), %ebx + movl $__KERNEL_DS, OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif + +ENTRY(fixup_4gb_segment) + RING0_EC_FRAME + pushl $do_fixup_4gb_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC + +.section .rodata,"a" +.align 4 +#include "syscall_table.S" + +syscall_table_size=(.-sys_call_table) diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/fixup.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/fixup.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,88 @@ +/****************************************************************************** + * fixup.c + * + * Binary-rewriting of certain IA32 instructions, on notification by Xen. + * Used to avoid repeated slow emulation of common instructions used by the + * user-space TLS (Thread-Local Storage) libraries. + * + * **** NOTE **** + * Issues with the binary rewriting have caused it to be removed. Instead + * we rely on Xen's emulator to boot the kernel, and then print a banner + * message recommending that the user disables /lib/tls. + * + * Copyright (c) 2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/version.h> + +#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) + +fastcall void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +{ + static unsigned long printed = 0; + char info[100]; + int i; + + /* Ignore statically-linked init. */ + if (current->tgid == 1) + return; + + HYPERVISOR_vm_assist( + VMASST_CMD_disable, VMASST_TYPE_4gb_segments_notify); + + if (test_and_set_bit(0, &printed)) + return; + + sprintf(info, "%s (pid=%d)", current->comm, current->tgid); + + DP(""); + DP("***************************************************************"); + DP("***************************************************************"); + DP("** WARNING: Currently emulating unsupported memory accesses **"); + DP("** in /lib/tls glibc libraries. The emulation is **"); + DP("** slow. To ensure full performance you should **"); + DP("** install a 'xen-friendly' (nosegneg) version of **"); + DP("** the library, or disable tls support by executing **"); + DP("** the following as root: **"); + DP("** mv /lib/tls /lib/tls.disabled **"); + DP("** Offending process: %-38.38s **", info); + DP("***************************************************************"); + DP("***************************************************************"); + DP(""); + + for (i = 5; i > 0; i--) { + touch_softlockup_watchdog(); + printk("Pausing... %d", i); + mdelay(1000); + printk("\b\b\b\b\b\b\b\b\b\b\b\b"); + } + + printk("Continuing...\n\n"); +} + +static int __init fixup_init(void) +{ + HYPERVISOR_vm_assist( + VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); + return 0; +} +__initcall(fixup_init); diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/head-xen.S --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/head-xen.S Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,207 @@ + + +.text +#include <linux/elfnote.h> +#include <linux/threads.h> +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/page.h> +#include <asm/cache.h> +#include <asm/thread_info.h> +#include <asm/asm-offsets.h> +#include <asm/dwarf2.h> +#include <xen/interface/xen.h> +#include <xen/interface/elfnote.h> + +/* + * References to members of the new_cpu_data structure. + */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +#define VIRT_ENTRY_OFFSET 0x0 +.org VIRT_ENTRY_OFFSET +ENTRY(startup_32) + movl %esi,xen_start_info + cld + + /* Set up the stack pointer */ + movl $(init_thread_union+THREAD_SIZE),%esp + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + XEN_CPUID + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + movl $1,%eax # Use the CPUID instruction to get CPU type + XEN_CPUID + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + + movb $1,X86_HARD_MATH + + xorl %eax,%eax # Clear FS/GS and LDT + movl %eax,%fs + movl %eax,%gs + cld # gcc2 wants the direction flag cleared at all times + + pushl %eax # fake return address + jmp start_kernel + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + CFI_STARTPROC +.skip 0x1000 + CFI_ENDPROC + +/* + * Real beginning of normal "text" segment + */ +ENTRY(stext) +ENTRY(_stext) + +/* + * BSS section + */ +.section ".bss.page_aligned","w" +ENTRY(empty_zero_page) + .fill 4096,1,0 + +/* + * This starts the data section. + */ +.data + +/* + * The Global Descriptor Table contains 28 quadwords, per-CPU. + */ + .align L1_CACHE_BYTES +ENTRY(cpu_gdt_table) + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x0000000000000000 /* 0x0b reserved */ + .quad 0x0000000000000000 /* 0x13 reserved */ + .quad 0x0000000000000000 /* 0x1b reserved */ + .quad 0x0000000000000000 /* 0x20 unused */ + .quad 0x0000000000000000 /* 0x28 unused */ + .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ + .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ + .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ + .quad 0x0000000000000000 /* 0x4b reserved */ + .quad 0x0000000000000000 /* 0x53 reserved */ + .quad 0x0000000000000000 /* 0x5b reserved */ + + .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ + .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ + .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ + .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ + + .quad 0x0000000000000000 /* 0x80 TSS descriptor */ + .quad 0x0000000000000000 /* 0x88 LDT descriptor */ + + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + .quad 0x0000000000000000 /* 0x90 32-bit code */ + .quad 0x0000000000000000 /* 0x98 16-bit code */ + .quad 0x0000000000000000 /* 0xa0 16-bit data */ + .quad 0x0000000000000000 /* 0xa8 16-bit data */ + .quad 0x0000000000000000 /* 0xb0 16-bit data */ + + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. + */ + .quad 0x0000000000000000 /* 0xb8 APM CS code */ + .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ + .quad 0x0000000000000000 /* 0xc8 APM DS data */ + + .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */ + .quad 0x0000000000000000 /* 0xd8 - unused */ + .quad 0x0000000000000000 /* 0xe0 - unused */ + .quad 0x0000000000000000 /* 0xe8 - unused */ + .quad 0x0000000000000000 /* 0xf0 - unused */ + .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoa value + .if (\value) < 0 || (\value) >= 0x10 + utoa (((\value)>>4)&0x0fffffff) + .endif + .if ((\value) & 0xf) < 10 + .byte '0' + ((\value) & 0xf) + .else + .byte 'A' + ((\value) & 0xf) - 10 + .endif +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoa __PAGE_OFFSET + .ascii ",ELF_PADDR_OFFSET=0x" + utoa __PAGE_OFFSET + .ascii ",VIRT_ENTRY=0x" + utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET) + .ascii ",HYPERCALL_PAGE=0x" + utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|pae_pgdir_above_4gb" + .ascii "|supervisor_mode_kernel" +#ifdef CONFIG_X86_PAE + .ascii ",PAE=yes[extended-cr3]" +#else + .ascii ",PAE=no" +#endif + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long, 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_32) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long, HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_page_tables|writable_descriptor_tables|auto_translated_physmap|pae_pgdir_above_4gb|supervisor_mode_kernel") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad, _PAGE_PRESENT,_PAGE_PRESENT) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, _PAGE_PRESENT,_PAGE_PRESENT) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 1) diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/init_task-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/init_task-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,51 @@ +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/fs.h> +#include <linux/mqueue.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/desc.h> + +static struct fs_struct init_fs = INIT_FS; +static struct files_struct init_files = INIT_FILES; +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); + +#define swapper_pg_dir ((pgd_t *)NULL) +struct mm_struct init_mm = INIT_MM(init_mm); +#undef swapper_pg_dir + +EXPORT_SYMBOL(init_mm); + +/* + * Initial thread structure. + * + * We need to make sure that this is THREAD_SIZE aligned due to the + * way process stacks are handled. This is done by having a special + * "init_task" linker map entry.. + */ +union thread_union init_thread_union + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task) }; + +/* + * Initial task structure. + * + * All other task structs will be allocated on slabs in fork.c + */ +struct task_struct init_task = INIT_TASK(init_task); + +EXPORT_SYMBOL(init_task); + +#ifndef CONFIG_X86_NO_TSS +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. + */ +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; +#endif + diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/io_apic-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/io_apic-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,2777 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku <yaku@xxxxxxxxxxxxxxxxxxxx> and + * Hidemi Kishimoto <kisimoto@xxxxxxxxxxxxxxxxxxxx>, + * further tested and cleaned up by Zach Brown <zab@xxxxxxxxxx> + * and Ingo Molnar <mingo@xxxxxxxxxx> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/smp_lock.h> +#include <linux/mc146818rtc.h> +#include <linux/compiler.h> +#include <linux/acpi.h> +#include <linux/module.h> +#include <linux/sysdev.h> + +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/desc.h> +#include <asm/timer.h> +#include <asm/i8259.h> +#include <asm/nmi.h> + +#include <mach_apic.h> + +#include "io_ports.h" + +#ifdef CONFIG_XEN + +#include <xen/interface/xen.h> +#include <xen/interface/physdev.h> + +/* Fake i8259 */ +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) +#define disable_8259A_irq(_irq) ((void)0) +#define i8259A_irq_pending(_irq) (0) + +unsigned long io_apic_irqs; + +static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) +{ + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (ret) + return ret; + return apic_op.value; +} + +static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct physdev_apic apic_op; + + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr; + apic_op.reg = reg; + apic_op.value = value; + HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op); +} + +#define io_apic_read(a,r) xen_io_apic_read(a,r) +#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) + +#endif /* CONFIG_XEN */ + +int (*ioapic_renumber_irq)(int ioapic, int irq); +atomic_t irq_mis_count; + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + +int timer_over_8254 __initdata = 1; + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +int disable_timer_pin_1 __initdata; + +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +static struct irq_pin_list { + int apic, pin, next; +} irq_2_pin[PIN_MAP_SIZE]; + +int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; +#ifdef CONFIG_PCI_MSI +#define vector_to_irq(vector) \ + (platform_legacy_irq(vector) ? vector : vector_irq[vector]) +#else +#define vector_to_irq(vector) (vector) +#endif + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + static int first_free_entry = NR_IRQS; + struct irq_pin_list *entry = irq_2_pin + irq; + + while (entry->next) + entry = irq_2_pin + entry->next; + + if (entry->pin != -1) { + entry->next = first_free_entry; + entry = irq_2_pin + entry->next; + if (++first_free_entry >= PIN_MAP_SIZE) + panic("io_apic.c: whoops"); + } + entry->apic = apic; + entry->pin = pin; +} + +#ifdef CONFIG_XEN +#define clear_IO_APIC() ((void)0) +#else +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq(unsigned int irq, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + + while (1) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + } + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int pin, reg; + + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg &= ~disable; + reg |= enable; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 1 */ +static void __mask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, 0x00010000); +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); +} + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + spin_lock_irqsave(&ioapic_lock, flags); + *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (entry.delivery_mode == dest_SMI) + return; + + /* + * Disable it in the IO-APIC irq-routing table: + */ + memset(&entry, 0, sizeof(entry)); + entry.mask = 1; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +#ifdef CONFIG_SMP +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) +{ + unsigned long flags; + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int apicid_value; + cpumask_t tmp; + + cpus_and(tmp, cpumask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(cpumask, tmp, CPU_MASK_ALL); + + apicid_value = cpu_mask_to_apicid(cpumask); + /* Prepare to do the io_apic_write */ + apicid_value = apicid_value << 24; + spin_lock_irqsave(&ioapic_lock, flags); + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + set_irq_info(irq, cpumask); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#if defined(CONFIG_IRQBALANCE) +# include <asm/processor.h> /* kernel_thread() */ +# include <linux/kernel_stat.h> /* kstat */ +# include <linux/slab.h> /* kmalloc() */ +# include <linux/timer.h> /* time_after() */ + +#ifdef CONFIG_BALANCED_IRQ_DEBUG +# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) +# define Dprintk(x...) do { TDprintk(x); } while (0) +# else +# define TDprintk(x...) +# define Dprintk(x...) +# endif + +#define IRQBALANCE_CHECK_ARCH -999 +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) +#define BALANCED_IRQ_MORE_DELTA (HZ/10) +#define BALANCED_IRQ_LESS_DELTA (HZ) + +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; +static int physical_balance __read_mostly; +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; + +static struct irq_cpu_info { + unsigned long * last_irq; + unsigned long * irq_delta; + unsigned long irq; +} irq_cpu_data[NR_CPUS]; + +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) + +#define IDLE_ENOUGH(cpu,now) \ + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) + +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) + +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) + +static cpumask_t balance_irq_affinity[NR_IRQS] = { + [0 ... NR_IRQS-1] = CPU_MASK_ALL +}; + +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) +{ + balance_irq_affinity[irq] = mask; +} + +static unsigned long move(int curr_cpu, cpumask_t allowed_mask, + unsigned long now, int direction) +{ + int search_idle = 1; + int cpu = curr_cpu; + + goto inside; + + do { + if (unlikely(cpu == curr_cpu)) + search_idle = 0; +inside: + if (direction == 1) { + cpu++; + if (cpu >= NR_CPUS) + cpu = 0; + } else { + cpu--; + if (cpu == -1) + cpu = NR_CPUS-1; + } + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || + (search_idle && !IDLE_ENOUGH(cpu,now))); + + return cpu; +} + +static inline void balance_irq(int cpu, int irq) +{ + unsigned long now = jiffies; + cpumask_t allowed_mask; + unsigned int new_cpu; + + if (irqbalance_disabled) + return; + + cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); + new_cpu = move(cpu, allowed_mask, now, 1); + if (cpu != new_cpu) { + set_pending_irq(irq, cpumask_of_cpu(new_cpu)); + } +} + +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) +{ + int i, j; + Dprintk("Rotating IRQs among CPUs.\n"); + for_each_online_cpu(i) { + for (j = 0; j < NR_IRQS; j++) { + if (!irq_desc[j].action) + continue; + /* Is it a significant load ? */ + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < + useful_load_threshold) + continue; + balance_irq(i, j); + } + } + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; +} + +static void do_irq_balance(void) +{ + int i, j; + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); + unsigned long move_this_load = 0; + int max_loaded = 0, min_loaded = 0; + int load; + unsigned long useful_load_threshold = balanced_irq_interval + 10; + int selected_irq; + int tmp_loaded, first_attempt = 1; + unsigned long tmp_cpu_irq; + unsigned long imbalance = 0; + cpumask_t allowed_mask, target_cpu_mask, tmp; + + for_each_possible_cpu(i) { + int package_index; + CPU_IRQ(i) = 0; + if (!cpu_online(i)) + continue; + package_index = CPU_TO_PACKAGEINDEX(i); + for (j = 0; j < NR_IRQS; j++) { + unsigned long value_now, delta; + /* Is this an active IRQ? */ + if (!irq_desc[j].action) + continue; + if ( package_index == i ) + IRQ_DELTA(package_index,j) = 0; + /* Determine the total count per processor per IRQ */ + value_now = (unsigned long) kstat_cpu(i).irqs[j]; + + /* Determine the activity per processor per IRQ */ + delta = value_now - LAST_CPU_IRQ(i,j); + + /* Update last_cpu_irq[][] for the next time */ + LAST_CPU_IRQ(i,j) = value_now; + + /* Ignore IRQs whose rate is less than the clock */ + if (delta < useful_load_threshold) + continue; + /* update the load for the processor or package total */ + IRQ_DELTA(package_index,j) += delta; + + /* Keep track of the higher numbered sibling as well */ + if (i != package_index) + CPU_IRQ(i) += delta; + /* + * We have sibling A and sibling B in the package + * + * cpu_irq[A] = load for cpu A + load for cpu B + * cpu_irq[B] = load for cpu B + */ + CPU_IRQ(package_index) += delta; + } + } + /* Find the least loaded processor package */ + for_each_online_cpu(i) { + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (min_cpu_irq > CPU_IRQ(i)) { + min_cpu_irq = CPU_IRQ(i); + min_loaded = i; + } + } + max_cpu_irq = ULONG_MAX; + +tryanothercpu: + /* Look for heaviest loaded processor. + * We may come back to get the next heaviest loaded processor. + * Skip processors with trivial loads. + */ + tmp_cpu_irq = 0; + tmp_loaded = -1; + for_each_online_cpu(i) { + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (max_cpu_irq <= CPU_IRQ(i)) + continue; + if (tmp_cpu_irq < CPU_IRQ(i)) { + tmp_cpu_irq = CPU_IRQ(i); + tmp_loaded = i; + } + } + + if (tmp_loaded == -1) { + /* In the case of small number of heavy interrupt sources, + * loading some of the cpus too much. We use Ingo's original + * approach to rotate them around. + */ + if (!first_attempt && imbalance >= useful_load_threshold) { + rotate_irqs_among_cpus(useful_load_threshold); + return; + } + goto not_worth_the_effort; + } + + first_attempt = 0; /* heaviest search */ + max_cpu_irq = tmp_cpu_irq; /* load */ + max_loaded = tmp_loaded; /* processor */ + imbalance = (max_cpu_irq - min_cpu_irq) / 2; + + Dprintk("max_loaded cpu = %d\n", max_loaded); + Dprintk("min_loaded cpu = %d\n", min_loaded); + Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); + Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); + Dprintk("load imbalance = %lu\n", imbalance); + + /* if imbalance is less than approx 10% of max load, then + * observe diminishing returns action. - quit + */ + if (imbalance < (max_cpu_irq >> 3)) { + Dprintk("Imbalance too trivial\n"); + goto not_worth_the_effort; + } + +tryanotherirq: + /* if we select an IRQ to move that can't go where we want, then + * see if there is another one to try. + */ + move_this_load = 0; + selected_irq = -1; + for (j = 0; j < NR_IRQS; j++) { + /* Is this an active IRQ? */ + if (!irq_desc[j].action) + continue; + if (imbalance <= IRQ_DELTA(max_loaded,j)) + continue; + /* Try to find the IRQ that is closest to the imbalance + * without going over. + */ + if (move_this_load < IRQ_DELTA(max_loaded,j)) { + move_this_load = IRQ_DELTA(max_loaded,j); + selected_irq = j; + } + } + if (selected_irq == -1) { + goto tryanothercpu; + } + + imbalance = move_this_load; + + /* For physical_balance case, we accumlated both load + * values in the one of the siblings cpu_irq[], + * to use the same code for physical and logical processors + * as much as possible. + * + * NOTE: the cpu_irq[] array holds the sum of the load for + * sibling A and sibling B in the slot for the lowest numbered + * sibling (A), _AND_ the load for sibling B in the slot for + * the higher numbered sibling. + * + * We seek the least loaded sibling by making the comparison + * (A+B)/2 vs B + */ + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } + + cpus_and(allowed_mask, + cpu_online_map, + balance_irq_affinity[selected_irq]); + target_cpu_mask = cpumask_of_cpu(min_loaded); + cpus_and(tmp, target_cpu_mask, allowed_mask); + + if (!cpus_empty(tmp)) { + + Dprintk("irq = %d moved to cpu = %d\n", + selected_irq, min_loaded); + /* mark for change destination */ + set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); + + /* Since we made a change, come back sooner to + * check for more variation. + */ + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; + } + goto tryanotherirq; + +not_worth_the_effort: + /* + * if we did not find an IRQ to move, then adjust the time interval + * upward + */ + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); + Dprintk("IRQ worth rotating not found\n"); + return; +} + +static int balanced_irq(void *unused) +{ + int i; + unsigned long prev_balance_time = jiffies; + long time_remaining = balanced_irq_interval; + + daemonize("kirqd"); + + /* push everything to CPU 0 to give us a starting point. */ + for (i = 0 ; i < NR_IRQS ; i++) { + irq_desc[i].pending_mask = cpumask_of_cpu(0); + set_pending_irq(i, cpumask_of_cpu(0)); + } + + for ( ; ; ) { + time_remaining = schedule_timeout_interruptible(time_remaining); + try_to_freeze(); + if (time_after(jiffies, + prev_balance_time+balanced_irq_interval)) { + preempt_disable(); + do_irq_balance(); + prev_balance_time = jiffies; + time_remaining = balanced_irq_interval; + preempt_enable(); + } + } + return 0; +} + +static int __init balanced_irq_init(void) +{ + int i; + struct cpuinfo_x86 *c; + cpumask_t tmp; + + cpus_shift_right(tmp, cpu_online_map, 2); + c = &boot_cpu_data; + /* When not overwritten by the command line ask subarchitecture. */ + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) + irqbalance_disabled = NO_BALANCE_IRQ; + if (irqbalance_disabled) + return 0; + + /* disable irqbalance completely if there is only one processor online */ + if (num_online_cpus() < 2) { + irqbalance_disabled = 1; + return 0; + } + /* + * Enable physical balance only if more than 1 physical processor + * is present + */ + if (smp_num_siblings > 1 && !cpus_empty(tmp)) + physical_balance = 1; + + for_each_online_cpu(i) { + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { + printk(KERN_ERR "balanced_irq_init: out of memory"); + goto failed; + } + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); + } + + printk(KERN_INFO "Starting balanced_irq\n"); + if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) + return 0; + else + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); +failed: + for_each_possible_cpu(i) { + kfree(irq_cpu_data[i].irq_delta); + irq_cpu_data[i].irq_delta = NULL; + kfree(irq_cpu_data[i].last_irq); + irq_cpu_data[i].last_irq = NULL; + } + return 0; +} + +int __init irqbalance_disable(char *str) +{ + irqbalance_disabled = 1; + return 1; +} + +__setup("noirqbalance", irqbalance_disable); + +late_initcall(balanced_irq_init); +#endif /* CONFIG_IRQBALANCE */ +#endif /* CONFIG_SMP */ +#endif + +#ifndef CONFIG_SMP +void fastcall send_IPI_self(int vector) +{ +#ifndef CONFIG_XEN + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write_around(APIC_ICR, cfg); +#endif +} +#endif /* !CONFIG_SMP */ + + +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; +int skip_ioapic_setup; + +static int __init ioapic_setup(char *str) +{ + skip_ioapic_setup = 1; + return 1; +} + +__setup("noapic", ioapic_setup); + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == type && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && + mp_irqs[i].mpc_dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA || + mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + ) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + + return mp_irqs[i].mpc_dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA || + mp_bus_id_to_type[lbus] == MP_BUS_NEC98 + ) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + break; + } + if (i < mp_irq_entries) { + int apic; + for(apic = 0; apic < nr_ioapics; apic++) { + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + return apic; + } + } + + return -1; +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " + "slot:%d, pin:%d.\n", bus, slot, pin); + if (mp_bus_id_to_pci_bus[bus] == -1) { + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + break; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + !mp_irqs[i].mpc_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +#ifndef CONFIG_XEN +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} +#endif /* !CONFIG_XEN */ +#endif + +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_polarity(idx) (0) + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) (0) + +/* NEC98 interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_NEC98_trigger(idx) (0) +#define default_NEC98_polarity(idx) (0) + +static int __init MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mpc_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + polarity = default_ISA_polarity(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + polarity = default_EISA_polarity(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + polarity = default_PCI_polarity(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + polarity = default_MCA_polarity(idx); + break; + } + case MP_BUS_NEC98: /* NEC 98 pin */ + { + polarity = default_NEC98_polarity(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + break; + } + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + trigger = default_ISA_trigger(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + trigger = default_PCI_trigger(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + case MP_BUS_NEC98: /* NEC 98 pin */ + { + trigger = default_NEC98_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } + break; + } + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mpc_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mpc_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + case MP_BUS_EISA: + case MP_BUS_MCA: + case MP_BUS_NEC98: + { + irq = mp_irqs[idx].mpc_srcbusirq; + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); + + break; + } + default: + { + printk(KERN_ERR "unknown bus type %d.\n",bus); + irq = 0; + break; + } + } + + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } + return irq; +} + +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic,pin,mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ +u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */ + +int assign_irq_vector(int irq) +{ + unsigned long flags; + int vector; + struct physdev_irq irq_op; + + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + + spin_lock_irqsave(&vector_lock, flags); + + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { + spin_unlock_irqrestore(&vector_lock, flags); + return IO_APIC_VECTOR(irq); + } + + irq_op.irq = irq; + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { + spin_unlock_irqrestore(&vector_lock, flags); + return -ENOSPC; + } + + vector = irq_op.vector; + vector_irq[vector] = irq; + if (irq != AUTO_ASSIGN) + IO_APIC_VECTOR(irq) = vector; + + spin_unlock_irqrestore(&vector_lock, flags); + + return vector; +} + +#ifndef CONFIG_XEN +static struct hw_interrupt_type ioapic_level_type; +static struct hw_interrupt_type ioapic_edge_type; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +static void ioapic_register_intr(int irq, int vector, unsigned long trigger) +{ + unsigned idx; + + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[idx].chip = &ioapic_level_type; + else + irq_desc[idx].chip = &ioapic_edge_type; + set_intr_gate(vector, interrupt[idx]); +} +#else +#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) +#endif + +static void __init setup_IO_APIC_irqs(void) +{ + struct IO_APIC_route_entry entry; + int apic, pin, idx, irq, first_notcon = 1, vector; + unsigned long flags; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* enable IRQ */ + entry.dest.logical.logical_dest = + cpu_mask_to_apicid(TARGET_CPUS); + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + " IO-APIC (apicid-pin) %d-%d", + mp_ioapics[apic].mpc_apicid, + pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", + mp_ioapics[apic].mpc_apicid, pin); + continue; + } + + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); + + if (irq_trigger(idx)) { + entry.trigger = 1; + entry.mask = 1; + } + + irq = pin_2_irq(idx, apic, pin); + /* + * skip adding the timer int on secondary nodes, which causes + * a small but painful rift in the time-space continuum + */ + if (multi_timer_check(apic, irq)) + continue; + else + add_pin_to_irq(irq, apic, pin); + + if (/*!apic &&*/ !IO_APIC_IRQ(irq)) + continue; + + if (IO_APIC_IRQ(irq)) { + vector = assign_irq_vector(irq); + entry.vector = vector; + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(irq, TARGET_CPUS); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE, " not connected.\n"); +} + +/* + * Set up the 8259A-master output pin: + */ +#ifndef CONFIG_XEN +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry,0,sizeof(entry)); + + disable_8259A_irq(0); + + /* mask LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* unmask IRQ now */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we have a 8259A-master in AEOI mode ... + */ + irq_desc[0].chip = &ioapic_edge_type; + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + enable_8259A_irq(0); +} + +static inline void UNEXPECTED_IO_APIC(void) +{ +} + +void __init print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + if (reg_00.bits.ID >= get_physical_broadcast()) + UNEXPECTED_IO_APIC(); + if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ + (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ + (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ + (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ + (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ + (reg_01.bits.entries != 0x2E) && + (reg_01.bits.entries != 0x3F) + ) + UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ + (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ + (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ + (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ + (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ + ) + UNEXPECTED_IO_APIC(); + if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) + UNEXPECTED_IO_APIC(); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + if (reg_03.bits.__reserved_1) + UNEXPECTED_IO_APIC(); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" + " Stat Dest Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG " %02x %03X %02X ", + i, + entry.dest.logical.logical_dest, + entry.dest.physical.physical_dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + if (use_pci_vector()) + printk(KERN_INFO "Using vector-based indexing\n"); + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for (i = 0; i < NR_IRQS; i++) { + struct irq_pin_list *entry = irq_2_pin + i; + if (entry->pin < 0) + continue; + if (use_pci_vector() && !platform_legacy_irq(i)) + printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); + else + printk(KERN_DEBUG "IRQ%d ", i); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +#if 0 + +static void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1<<j)) + printk("1"); + else + printk("0"); + } + printk("\n"); + } +} + +void /*__init*/ print_local_APIC(void * dummy) +{ + unsigned int v, ver, maxlvt; + + if (apic_verbosity == APIC_QUIET) + return; + + printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); + v = apic_read(APIC_LVR); + printk(KERN_INFO "... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } + + v = apic_read(APIC_EOI); + printk(KERN_DEBUG "... APIC EOI: %08x\n", v); + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + v = apic_read(APIC_LDR); + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + v = apic_read(APIC_SPIV); + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); + + printk(KERN_DEBUG "... APIC ISR field:\n"); + print_APIC_bitfield(APIC_ISR); + printk(KERN_DEBUG "... APIC TMR field:\n"); + print_APIC_bitfield(APIC_TMR); + printk(KERN_DEBUG "... APIC IRR field:\n"); + print_APIC_bitfield(APIC_IRR); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + v = apic_read(APIC_ICR); + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); + v = apic_read(APIC_ICR2); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + on_each_cpu(print_local_APIC, NULL, 1, 1); +} + +void /*__init*/ print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} + +#endif /* 0 */ + +#else +void __init print_IO_APIC(void) { } +#endif /* !CONFIG_XEN */ + +static void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; + int i8259_apic, i8259_pin; + int i, apic; + unsigned long flags; + + for (i = 0; i < PIN_MAP_SIZE; i++) { + irq_2_pin[i].pin = -1; + irq_2_pin[i].next = 0; + } + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + +#ifndef CONFIG_XEN + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = + GET_APIC_ID(apic_read(APIC_ID)); + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, + *(((int *)&entry)+1)); + io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, + *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +#endif +} + +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch <Matt_Domsch@xxxxxxxx> Tue Dec 21 12:25:05 CST 1999 + */ + +#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mpc_apicid; + + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mpc_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mpc_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mpc_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mpc_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mpc_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mpc_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_dstapic == old_id) + mp_irqs[i].mpc_dstapic + = mp_ioapics[apic].mpc_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mpc_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#else +static void __init setup_ioapic_ids_from_mpc(void) { } +#endif + +#ifndef CONFIG_XEN +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + if (jiffies - t1 > 4) + return 1; + + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ +static unsigned int startup_edge_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +/* + * Once we have recorded IRQ_PENDING already, we can mask the + * interrupt for real. This prevents IRQ storms from unhandled + * devices. + */ +static void ack_edge_ioapic_irq(unsigned int irq) +{ + move_irq(irq); + if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) + == (IRQ_PENDING | IRQ_DISABLED)) + mask_IO_APIC_irq(irq); + ack_APIC_irq(); +} + +/* + * Level triggered interrupts can just be masked, + * and shutting down and starting up the interrupt + * is the same as enabling and disabling them -- except + * with a startup need to return a "was pending" value. + * + * Level triggered interrupts are special because we + * do not touch any IO-APIC register while handling + * them. We ack the APIC in the end-IRQ handler, not + * in the start-IRQ-handler. Protection against reentrance + * from the same interrupt is still provided, both by the + * generic IRQ layer and by the fact that an unacked local + * APIC does not accept IRQs. + */ +static unsigned int startup_level_ioapic_irq (unsigned int irq) +{ + unmask_IO_APIC_irq(irq); + + return 0; /* don't check for pending */ +} + +static void end_level_ioapic_irq (unsigned int irq) +{ + unsigned long v; + int i; + + move_irq(irq); +/* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = IO_APIC_VECTOR(irq); + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} + +#ifdef CONFIG_PCI_MSI +static unsigned int startup_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_edge_ioapic_irq(irq); +} + +static void ack_edge_ioapic_vector(unsigned int vector) +{ + int irq = vector_to_irq(vector); + + move_native_irq(vector); + ack_edge_ioapic_irq(irq); +} + +static unsigned int startup_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + return startup_level_ioapic_irq (irq); +} + +static void end_level_ioapic_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + move_native_irq(vector); + end_level_ioapic_irq(irq); +} + +static void mask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + mask_IO_APIC_irq(irq); +} + +static void unmask_IO_APIC_vector (unsigned int vector) +{ + int irq = vector_to_irq(vector); + + unmask_IO_APIC_irq(irq); +} + +#ifdef CONFIG_SMP +static void set_ioapic_affinity_vector (unsigned int vector, + cpumask_t cpu_mask) +{ + int irq = vector_to_irq(vector); + + set_native_irq_info(vector, cpu_mask); + set_ioapic_affinity_irq(irq, cpu_mask); +} +#endif +#endif + +static int ioapic_retrigger(unsigned int irq) +{ + send_IPI_self(IO_APIC_VECTOR(irq)); + + return 1; +} + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ +static struct hw_interrupt_type ioapic_edge_type __read_mostly = { + .typename = "IO-APIC-edge", + .startup = startup_edge_ioapic, + .shutdown = shutdown_edge_ioapic, + .enable = enable_edge_ioapic, + .disable = disable_edge_ioapic, + .ack = ack_edge_ioapic, + .end = end_edge_ioapic, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity, +#endif + .retrigger = ioapic_retrigger, +}; + +static struct hw_interrupt_type ioapic_level_type __read_mostly = { + .typename = "IO-APIC-level", + .startup = startup_level_ioapic, + .shutdown = shutdown_level_ioapic, + .enable = enable_level_ioapic, + .disable = disable_level_ioapic, + .ack = mask_and_ack_level_ioapic, + .end = end_level_ioapic, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity, +#endif + .retrigger = ioapic_retrigger, +}; +#endif /* !CONFIG_XEN */ + +static inline void init_IO_APIC_traps(void) +{ + int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for (irq = 0; irq < NR_IRQS ; irq++) { + int tmp = irq; + if (use_pci_vector()) { + if (!platform_legacy_irq(tmp)) + if ((tmp = vector_to_irq(tmp)) == -1) + continue; + } + if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); +#ifndef CONFIG_XEN + else + /* Strange. Oh, well.. */ + irq_desc[irq].chip = &no_irq_type; +#endif + } + } +} + +#ifndef CONFIG_XEN +static void enable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void disable_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static void end_lapic_irq (unsigned int i) { /* nothing */ } + +static struct hw_interrupt_type lapic_irq_type __read_mostly = { + .typename = "local-APIC-edge", + .startup = NULL, /* startup_irq() not used for IRQ0 */ + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ + .enable = enable_lapic_irq, + .disable = disable_lapic_irq, + .ack = ack_lapic_irq, + .end = end_lapic_irq +}; + +static void setup_nmi (void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); + + on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); + + apic_printk(APIC_VERBOSE, " done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + unsigned long flags; + + pin = find_isa_irq_pin(8, mp_INT); + apic = find_isa_irq_apic(8, mp_INT); + if (pin == -1) + return; + + spin_lock_irqsave(&ioapic_lock, flags); + *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); + *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); + io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +int timer_uses_ioapic_pin_0; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + */ +static inline void check_timer(void) +{ + int apic1, pin1, apic2, pin2; + int vector; + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + vector = assign_irq_vector(0); + set_intr_gate(vector, interrupt[0]); + + /* + * Subtle, code in do_timer_interrupt() expects an AEOI + * mode for the 8259A whenever interrupts are routed + * through I/O APICs. Also IRQ0 has to be enabled in + * the 8259A which implies the virtual wire has to be + * disabled in the local APIC. + */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); + timer_ack = 1; + if (timer_over_8254 > 0) + enable_8259A_irq(0); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + if (pin1 == 0) + timer_uses_ioapic_pin_0 = 1; + + printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", + vector, apic1, pin1, apic2, pin2); + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + return; + } + clear_IO_APIC_pin(apic1, pin1); + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " + "IO-APIC\n"); + } + + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); + if (pin2 != -1) { + printk("\n..... (found pin %d) ...", pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(apic2, pin2, vector); + if (timer_irq_works()) { + printk("works.\n"); + if (pin1 != -1) + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + else + add_pin_to_irq(0, apic2, pin2); + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + } + return; + } + /* + * Cleanup, just in case ... + */ + clear_IO_APIC_pin(apic2, pin2); + } + printk(" failed.\n"); + + if (nmi_watchdog == NMI_IO_APIC) { + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = 0; + } + + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + + disable_8259A_irq(0); + irq_desc[0].chip = &lapic_irq_type; + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + printk(" failed.\n"); + + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + + timer_ack = 0; + init_8259A(0); + make_8259A_irq(0); + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + printk(" failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option"); +} +#else +int timer_uses_ioapic_pin_0 = 0; +#define check_timer() ((void)0) +#endif + +/* + * + * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. + * Linux doesn't really care, as it's not actually used + * for any interrupt handling anyway. + */ +#define PIC_IRQS (1 << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + enable_IO_APIC(); + + if (acpi_ioapic) + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ + else + io_apic_irqs = ~PIC_IRQS; + + printk("ENABLING IO-APIC IRQs\n"); + + /* + * Set up IO-APIC IRQ routing. + */ + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#ifndef CONFIG_XEN + sync_Arb_IDs(); +#endif + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); + if (!acpi_ioapic) + print_IO_APIC(); +} + +static int __init setup_disable_8254_timer(char *s) +{ + timer_over_8254 = -1; + return 1; +} +static int __init setup_enable_8254_timer(char *s) +{ + timer_over_8254 = 2; + return 1; +} + +__setup("disable_8254_timer", setup_disable_8254_timer); +__setup("enable_8254_timer", setup_enable_8254_timer); + +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if(sis_apic_bug == -1) + sis_apic_bug = 0; + if (is_initial_xendomain()) { + struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; + op.u.platform_quirk.quirk_id = sis_apic_bug ? + QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; + HYPERVISOR_platform_op(&op); + } + return 0; +} + +late_initcall(io_apic_bug_finalize); + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + spin_lock_irqsave(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); + *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { + io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); + io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + set_kset_name("ioapic"), + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error = 0; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + memset(mp_ioapic_data[i], 0, size); + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +int __init io_apic_get_unique_id (int ioapic, int apic_id) +{ +#ifndef CONFIG_XEN + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); +#endif /* !CONFIG_XEN */ + + return apic_id; +} + + +int __init io_apic_get_version (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + if (!IO_APIC_IRQ(irq)) { + printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. + * Note that we mask (disable) IRQs now -- these get enabled when the + * corresponding device driver registers for this IRQ. + */ + + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.trigger = edge_level; + entry.polarity = active_high_low; + entry.mask = 1; + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + entry.vector = assign_irq_vector(irq); + + apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + edge_level, active_high_low); + + ioapic_register_intr(irq, entry.vector, edge_level); + + if (!ioapic && (irq < 16)) + disable_8259A_irq(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +#endif /* CONFIG_ACPI */ diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/ioport-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/ioport-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,122 @@ +/* + * linux/arch/i386/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/ioport.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/thread_info.h> +#include <xen/interface/physdev.h> + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +{ + unsigned long mask; + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); + unsigned int low_index = base & (BITS_PER_LONG-1); + int length = low_index + extent; + + if (low_index != 0) { + mask = (~0UL << low_index); + if (length < BITS_PER_LONG) + mask &= ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + length -= BITS_PER_LONG; + } + + mask = (new_value ? ~0UL : 0UL); + while (length >= BITS_PER_LONG) { + *bitmap_base++ = mask; + length -= BITS_PER_LONG; + } + + if (length > 0) { + mask = ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + } +} + + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct * t = ¤t->thread; + unsigned long *bitmap; + struct physdev_set_iobitmap set_iobitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); + set_iobitmap.nr_ports = IO_BITMAP_BITS; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &set_iobitmap); + } + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ + +asmlinkage long sys_iopl(unsigned long unused) +{ + volatile struct pt_regs * regs = (struct pt_regs *) &unused; + unsigned int level = regs->ebx; + struct thread_struct *t = ¤t->thread; + unsigned int old = (t->iopl >> 12) & 3; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? */ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + t->iopl = level << 12; + set_iopl_mask(t->iopl); + return 0; +} diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/irq-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/irq-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,324 @@ +/* + * linux/arch/i386/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86-specific interrupt + * entry, irq-stacks and irq statistics code. All the remaining + * irq logic is done by the generic kernel/irq/ code and + * by the x86-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include <asm/uaccess.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/delay.h> + +DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; +EXPORT_PER_CPU_SYMBOL(irq_stat); + +#ifndef CONFIG_X86_LOCAL_APIC +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); +} +#endif + +#ifdef CONFIG_4KSTACKS +/* + * per-CPU IRQ handling contexts (thread information and stack) + */ +union irq_ctx { + struct thread_info tinfo; + u32 stack[THREAD_SIZE/sizeof(u32)]; +}; + +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +#endif + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +fastcall unsigned int do_IRQ(struct pt_regs *regs) +{ + /* high bit used in ret_from_ code */ + int irq = ~regs->orig_eax; +#ifdef CONFIG_4KSTACKS + union irq_ctx *curctx, *irqctx; + u32 *isp; +#endif + + if (unlikely((unsigned)irq >= NR_IRQS)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", + __FUNCTION__, irq); + BUG(); + } + + irq_enter(); +#ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 1KB free? */ + { + long esp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (esp) : "0" (THREAD_SIZE - 1)); + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + printk("do_IRQ: stack overflow: %ld\n", + esp - sizeof(struct thread_info)); + dump_stack(); + } + } +#endif + +#ifdef CONFIG_4KSTACKS + + curctx = (union irq_ctx *) current_thread_info(); + irqctx = hardirq_ctx[smp_processor_id()]; + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (curctx != irqctx) { + int arg1, arg2, ebx; + + /* build the stack frame on the IRQ stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* + * Copy the softirq bits in preempt_count so that the + * softirq checks work in the hardirq context. + */ + irqctx->tinfo.preempt_count = + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | + (curctx->tinfo.preempt_count & SOFTIRQ_MASK); + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call __do_IRQ \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (ebx) + : "0" (irq), "1" (regs), "2" (isp) + : "memory", "cc", "ecx" + ); + } else +#endif + __do_IRQ(irq, regs); + + irq_exit(); + + return 1; +} + +#ifdef CONFIG_4KSTACKS + +/* + * These should really be __section__(".bss.page_aligned") as well, but + * gcc's 3.0 and earlier don't handle that correctly. + */ +static char softirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__aligned__(THREAD_SIZE))); + +static char hardirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__aligned__(THREAD_SIZE))); + +/* + * allocate per-cpu stacks for hardirq and for softirq processing + */ +void irq_ctx_init(int cpu) +{ + union irq_ctx *irqctx; + + if (hardirq_ctx[cpu]) + return; + + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + hardirq_ctx[cpu] = irqctx; + + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = 0; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + softirq_ctx[cpu] = irqctx; + + printk("CPU %u irqstacks, hard=%p soft=%p\n", + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); +} + +void irq_ctx_exit(int cpu) +{ + hardirq_ctx[cpu] = NULL; +} + +extern asmlinkage void __do_softirq(void); + +asmlinkage void do_softirq(void) +{ + unsigned long flags; + struct thread_info *curctx; + union irq_ctx *irqctx; + u32 *isp; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + if (local_softirq_pending()) { + curctx = current_thread_info(); + irqctx = softirq_ctx[smp_processor_id()]; + irqctx->tinfo.task = curctx->task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* build the stack frame on the softirq stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call __do_softirq \n" + " movl %%ebx,%%esp \n" + : "=b"(isp) + : "0"(isp) + : "memory", "cc", "edx", "ecx", "eax" + ); + /* + * Shouldnt happen, we returned above if in_interrupt(): + */ + WARN_ON_ONCE(softirq_count()); + } + + local_irq_restore(flags); +} + +EXPORT_SYMBOL(do_softirq); +#endif + +/* + * Interrupt statistics: + */ + +atomic_t irq_err_count; + +/* + * /proc/interrupts printing: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d",j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", nmi_count(j)); + seq_putc(p, '\n'); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).apic_timer_irqs); + seq_putc(p, '\n'); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + } + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_desc[irq].affinity, map); + if (any_online_cpu(mask) == NR_CPUS) { + /*printk("Breaking affinity for irq %i\n", irq);*/ + mask = map; + } + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + +#if 0 + barrier(); + /* Ingo Molnar says: "after the IO-APIC masks have been redirected + [note the nop - the interrupt-enable boundary on x86 is two + instructions from sti] - to flush out pending hardirqs and + IPIs. After this point nothing is supposed to reach this CPU." */ + __asm__ __volatile__("sti; nop; cli"); + barrier(); +#else + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +#endif +} +#endif + diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/ldt-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/ldt-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,270 @@ +/* + * linux/kernel/ldt.c + * + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx> + */ + +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/smp_lock.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/mmu_context.h> + +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt; + void *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + else + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + preempt_disable(); +#endif + make_pages_readonly( + pc->ldt, + (pc->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + load_LDT(pc); +#ifdef CONFIG_SMP + mask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#endif + } + if (oldsize) { + make_pages_writable( + oldldt, + (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + make_pages_readonly( + new->ldt, + (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct * old_mm; + int retval = 0; + + init_MUTEX(&mm->context.sem); + mm->context.size = 0; + mm->context.has_foreign_mappings = 0; + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); + retval = copy_ldt(&mm->context, &old_mm->context); + up(&old_mm->context.sem); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + if (mm == current->active_mm) + clear_LDT(); + make_pages_writable( + mm->context.ldt, + (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE, + XENFEAT_writable_descriptor_tables); + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + kfree(mm->context.ldt); + mm->context.size = 0; + } +} + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + + down(&mm->context.sem); + size = mm->context.size*LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + up(&mm->context.sem); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr+size, bytecount-size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + void *address; + + err = 0; + address = &default_ldt[0]; + size = 5*sizeof(struct desc_struct); + if (size > bytecount) + size = bytecount; + + err = size; + if (copy_to_user(ptr, address, size)) + err = -EFAULT; + + return err; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct * mm = current->mm; + __u32 entry_1, entry_2; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + down(&mm->context.sem); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } + + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); + + /* Install the new entry ... */ +install: + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, + entry_1, entry_2); + +out_unlock: + up(&mm->context.sem); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/machine_kexec.c --- a/arch/i386/kernel/machine_kexec.c Mon Jun 04 10:05:24 2007 +0100 +++ b/arch/i386/kernel/machine_kexec.c Mon Jun 04 10:05:28 2007 +0100 @@ -19,6 +19,10 @@ #include <asm/desc.h> #include <asm/system.h> +#ifdef CONFIG_XEN +#include <xen/interface/kexec.h> +#endif + #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) static u32 kexec_pgd[1024] PAGE_ALIGNED; #ifdef CONFIG_X86_PAE @@ -27,6 +31,40 @@ static u32 kexec_pmd1[1024] PAGE_ALIGNED #endif static u32 kexec_pte0[1024] PAGE_ALIGNED; static u32 kexec_pte1[1024] PAGE_ALIGNED; + +#ifdef CONFIG_XEN + +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) + +#if PAGES_NR > KEXEC_XEN_NO_PAGES +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break +#endif + +#if PA_CONTROL_PAGE != 0 +#error PA_CONTROL_PAGE is non zero - Xen support will break +#endif + +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) +{ + void *control_page; + + memset(xki->page_list, 0, sizeof(xki->page_list)); + + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); + xki->page_list[PA_PGD] = __ma(kexec_pgd); +#ifdef CONFIG_X86_PAE + xki->page_list[PA_PMD_0] = __ma(kexec_pmd0); + xki->page_list[PA_PMD_1] = __ma(kexec_pmd1); +#endif + xki->page_list[PA_PTE_0] = __ma(kexec_pte0); + xki->page_list[PA_PTE_1] = __ma(kexec_pte1); + +} + +#endif /* CONFIG_XEN */ /* * A architecture hook called to validate the @@ -54,6 +92,7 @@ void machine_kexec_cleanup(struct kimage { } +#ifndef CONFIG_XEN /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. @@ -87,3 +126,4 @@ NORET_TYPE void machine_kexec(struct kim relocate_kernel((unsigned long)image->head, (unsigned long)page_list, image->start, cpu_has_pae); } +#endif diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/microcode-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/microcode-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,144 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2004 Tigran Aivazian + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, + * Order Number 245472 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/245472.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +//#define DEBUG /* pr_debug */ +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/syscalls.h> + +#include <asm/msr.h> +#include <asm/uaccess.h> +#include <asm/processor.h> + +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@xxxxxxxxxxx>"); +MODULE_LICENSE("GPL"); + +static int verbose; +module_param(verbose, int, 0644); + +#define MICROCODE_VERSION "1.14a-xen" + +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ +#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DEFINE_MUTEX(microcode_mutex); + +static int microcode_open (struct inode *unused1, struct file *unused2) +{ + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + + +static int do_microcode_update (const void __user *ubuf, size_t len) +{ + int err; + void *kbuf; + + kbuf = vmalloc(len); + if (!kbuf) + return -ENOMEM; + + if (copy_from_user(kbuf, ubuf, len) == 0) { + struct xen_platform_op op; + + op.cmd = XENPF_microcode_update; + set_xen_guest_handle(op.u.microcode.data, kbuf); + op.u.microcode.length = len; + err = HYPERVISOR_platform_op(&op); + } else + err = -EFAULT; + + vfree(kbuf); + + return err; +} + +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) +{ + ssize_t ret; + + if (len < MC_HEADER_SIZE) { + printk(KERN_ERR "microcode: not enough data\n"); + return -EINVAL; + } + + mutex_lock(µcode_mutex); + + ret = do_microcode_update(buf, len); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(µcode_mutex); + + return ret; +} + +static struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, +}; + +static int __init microcode_init (void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + printk(KERN_INFO + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@xxxxxxxxxxx>\n"); + return 0; +} + +static void __exit microcode_exit (void) +{ + misc_deregister(µcode_dev); +} + +module_init(microcode_init) +module_exit(microcode_exit) +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/mpparse-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/mpparse-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,1185 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@xxxxxxxxxx> + * (c) 1998, 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx> + * + * Fixes + * Erich Boleyn : MP v1.4 and additional changes. + * Alan Cox : Added EBDA scanning + * Ingo Molnar : various cleanups and rewrites + * Maciej W. Rozycki: Bits for default MP configurations + * Paul Diefenbaugh: Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/smp_lock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/bitops.h> + +#include <asm/smp.h> +#include <asm/acpi.h> +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/io_apic.h> + +#include <mach_apic.h> +#include <mach_mpparse.h> +#include <bios_ebda.h> + +/* Have we found an MP table */ +int smp_found_config; +unsigned int __initdata maxcpus = NR_CPUS; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +int apic_version [MAX_APICS]; +int mp_bus_id_to_type [MAX_MP_BUSSES]; +int mp_bus_id_to_node [MAX_MP_BUSSES]; +int mp_bus_id_to_local [MAX_MP_BUSSES]; +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +static int mp_current_pci_id; + +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* MP IRQ source entries */ +int mp_irq_entries; + +int nr_ioapics; + +int pic_mode; +unsigned long mp_lapic_addr; + +unsigned int def_to_bigsmp = 0; + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +/* Internal processor count */ +static unsigned int __devinitdata num_processors; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map; + +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +/* + * Intel MP BIOS table parsing routines: + */ + + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... + */ + +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata; + +#ifndef CONFIG_XEN +static void __devinit MP_processor_info (struct mpc_config_processor *m) +{ + int ver, apicid; + physid_mask_t phys_cpu; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + apicid = mpc_apic_id(m, translation_table[mpc_record]); + + if (m->mpc_featureflag&(1<<0)) + Dprintk(" Floating point unit present.\n"); + if (m->mpc_featureflag&(1<<7)) + Dprintk(" Machine Exception supported.\n"); + if (m->mpc_featureflag&(1<<8)) + Dprintk(" 64 bit compare & exchange supported.\n"); + if (m->mpc_featureflag&(1<<9)) + Dprintk(" Internal APIC present.\n"); + if (m->mpc_featureflag&(1<<11)) + Dprintk(" SEP present.\n"); + if (m->mpc_featureflag&(1<<12)) + Dprintk(" MTRR present.\n"); + if (m->mpc_featureflag&(1<<13)) + Dprintk(" PGE present.\n"); + if (m->mpc_featureflag&(1<<14)) + Dprintk(" MCA present.\n"); + if (m->mpc_featureflag&(1<<15)) + Dprintk(" CMOV present.\n"); + if (m->mpc_featureflag&(1<<16)) + Dprintk(" PAT present.\n"); + if (m->mpc_featureflag&(1<<17)) + Dprintk(" PSE present.\n"); + if (m->mpc_featureflag&(1<<18)) + Dprintk(" PSN present.\n"); + if (m->mpc_featureflag&(1<<19)) + Dprintk(" Cache Line Flush Instruction present.\n"); + /* 20 Reserved */ + if (m->mpc_featureflag&(1<<21)) + Dprintk(" Debug Trace and EMON Store present.\n"); + if (m->mpc_featureflag&(1<<22)) + Dprintk(" ACPI Thermal Throttle Registers present.\n"); + if (m->mpc_featureflag&(1<<23)) + Dprintk(" MMX present.\n"); + if (m->mpc_featureflag&(1<<24)) + Dprintk(" FXSR present.\n"); + if (m->mpc_featureflag&(1<<25)) + Dprintk(" XMM present.\n"); + if (m->mpc_featureflag&(1<<26)) + Dprintk(" Willamette New Instructions present.\n"); + if (m->mpc_featureflag&(1<<27)) + Dprintk(" Self Snoop present.\n"); + if (m->mpc_featureflag&(1<<28)) + Dprintk(" HT present.\n"); + if (m->mpc_featureflag&(1<<29)) + Dprintk(" Thermal Monitor present.\n"); + /* 30, 31 Reserved */ + + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + Dprintk(" Bootup CPU\n"); + boot_cpu_physical_apicid = m->mpc_apicid; + } + + ver = m->mpc_apicver; + + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; + + phys_cpu = apicid_to_cpu_present(apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); + + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + if (num_processors >= maxcpus) { + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." + " Processor ignored.\n", maxcpus); + return; + } + + cpu_set(num_processors, cpu_possible_map); + num_processors++; + + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion. + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj <ashok.raj@xxxxxxxxx> + */ + if (num_processors > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(ver)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; +} +#else +void __init MP_processor_info (struct mpc_config_processor *m) +{ + num_processors++; +} +#endif /* CONFIG_XEN */ + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + + mpc_oem_bus_info(m, str, translation_table[mpc_record]); + + if (m->mpc_busid >= MAX_MP_BUSSES) { + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " + " is too large, max. supported is %d\n", + m->mpc_busid, str, MAX_MP_BUSSES - 1); + return; + } + + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { + mpc_oem_pci_bus(m, translation_table[mpc_record]); + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98; + } else { + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); + } +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", + MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); + } + if (!m->mpc_apicaddr) { + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" + " found in MP table, skipping!\n"); + return; + } + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); + /* + * Well it seems all SMP boards in existence + * use ExtINT/LVT1 == LINT0 and + * NMI/LVT2 == LINT1 - the following check + * will show us if this assumptions is false. + * Until then we do not have to add baggage. + */ + if ((m->mpc_irqtype == mp_ExtINT) && + (m->mpc_destapiclint != 0)) + BUG(); + if ((m->mpc_irqtype == mp_NMI) && + (m->mpc_destapiclint != 1)) + BUG(); +} + +#ifdef CONFIG_X86_NUMAQ +static void __init MP_translation_info (struct mpc_config_translation *m) +{ + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ + unsigned short oemsize) +{ + int count = sizeof (*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable)+count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) + { + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], + oemtable->oem_signature[1], + oemtable->oem_signature[2], + oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) + { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m= + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); + return; + } + } + } +} + +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning! May not be a NUMA-Q system!\n"); + if (mpc->mpc_oemptr) + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, + mpc->mpc_oemsize); +} +#endif /* CONFIG_X86_NUMAQ */ + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + char oem[10]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", + *(u32 *)mpc->mpc_signature); + return 0; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { + printk(KERN_ERR "SMP mptable: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + return 0; + } + memcpy(oem,mpc->mpc_oem,8); + oem[8]=0; + printk(KERN_INFO "OEM ID: %s ",oem); + + memcpy(str,mpc->mpc_productid,12); + str[12]=0; + printk("Product ID: %s ",str); + + mps_oem_check(mpc, oem, str); + + printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + + /* + * Save the local APIC address (it might be non-default) -- but only + * if we're not using ACPI. + */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. + */ + mpc_record = 0; + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + default: + { + count = mpc->mpc_length; + break; + } + } + ++mpc_record; + } + clustered_apic_check(); + if (!num_processors) + printk(KERN_ERR "SMP mptable: no processors registered!\n"); + return num_processors; +} + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); + else { + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; + struct mpc_config_ioapic ioapic; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | + boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk("???\n"); + printk(KERN_ERR "Unknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.mpc_bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.mpc_bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); + + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init get_smp_config (void) +{ + struct intel_mp_floating *mpf = mpf_found; + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); + return; + } + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + if (mpf->mpf_feature2 & (1<<7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } + + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { + + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); + construct_default_ISA_mptable(mpf->mpf_feature1); + + } else if (mpf->mpf_physptr) { + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { + smp_found_config = 0; + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); + return; + } + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_config_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + memcpy(bus.mpc_bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } + + } else + BUG(); + + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +static int __init smp_scan_config (unsigned long base, unsigned long length) +{ + unsigned long *bp = isa_bus_to_virt(base); + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + if (sizeof(*mpf) != 16) + printk("Error: MPF size\n"); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4)) ) { + + smp_found_config = 1; +#ifndef CONFIG_XEN + printk(KERN_INFO "found SMP MP-table at %08lx\n", + virt_to_phys(mpf)); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + if (mpf->mpf_physptr) { + /* + * We cannot access to MPC table to compute + * table size yet, as only few megabytes from + * the bottom is mapped now. + * PC-9800's MPC table places on the very last + * of physical memory; so that simply reserving + * PAGE_SIZE from mpg->mpf_physptr yields BUG() + * in reserve_bootmem. + */ + unsigned long size = PAGE_SIZE; + unsigned long end = max_low_pfn * PAGE_SIZE; + if (mpf->mpf_physptr + size > end) + size = end - mpf->mpf_physptr; + reserve_bootmem(mpf->mpf_physptr, size); + } +#else + printk(KERN_INFO "found SMP MP-table at %08lx\n", + ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base); +#endif + + mpf_found = mpf; + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init find_smp_config (void) +{ +#ifndef CONFIG_XEN + unsigned int address; +#endif + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0,0x400) || + smp_scan_config(639*0x400,0x400) || + smp_scan_config(0xF0000,0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + +#ifndef CONFIG_XEN + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +#endif +} + +int es7000_plat; + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +void __init mp_register_lapic_address ( + u64 address) +{ +#ifndef CONFIG_XEN + mp_lapic_addr = (unsigned long) address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); + + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); +#endif +} + + +void __devinit mp_register_lapic ( + u8 id, + u8 enabled) +{ + struct mpc_config_processor processor; + int boot_cpu = 0; + + if (MAX_APICS - id <= 0) { + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + +#ifndef CONFIG_XEN + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; +#endif + + MP_processor_info(&processor); +} + +#ifdef CONFIG_X86_IO_APIC + +#define MP_ISA_BUS 0 +#define MP_MAX_IOAPIC_PIN 127 + +static struct mp_ioapic_routing { + int apic_id; + int gsi_base; + int gsi_end; + u32 pin_programmed[4]; +} mp_ioapic_routing[MAX_IO_APICS]; + + +static int mp_find_ioapic ( + int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_base) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + + return -1; +} + + +void __init mp_register_ioapic ( + u8 id, + u32 address, + u32 gsi_base) +{ + int idx = 0; + int tmpid; + + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in MADT table, skipping!\n"); + return; + } + + idx = nr_ioapics++; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + +#ifndef CONFIG_XEN + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); +#endif + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + tmpid = io_apic_get_unique_id(idx, id); + else + tmpid = id; + if (tmpid == -1) { + nr_ioapics--; + return; + } + mp_ioapics[idx].mpc_apicid = tmpid; + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_base = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); + + return; +} + + +void __init mp_override_legacy_irq ( + u8 bus_irq, + u8 polarity, + u8 trigger, + u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + + return; +} + +void __init mp_config_acpi_legacy_irqs (void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Older generations of ES7000 have no legacy identity mappings + */ + if (es7000_plat == 1) + return; + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overriden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, + intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + } +} + +#define MAX_GSI_NUM 4096 + +int mp_register_gsi (u32 gsi, int triggering, int polarity) +{ + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrups, which + * represent all possible interrupts, and IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_fadt.sci_int == gsi) + return gsi; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + if (ioapic_renumber_irq) + gsi = ioapic_renumber_irq(ioapic, gsi); + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + bit = ioapic_pin % 32; + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); + if (idx > 3) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { + Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + return gsi_to_irq[gsi]; + } + + mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); + + if (triggering == ACPI_LEVEL_SENSITIVE) { + /* + * For PCI devices assign IRQs in order, avoiding gaps + * due to unused I/O APIC pins. + */ + int irq = gsi; + if (gsi < MAX_GSI_NUM) { + /* + * Retain the VIA chipset work-around (gsi > 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision. + */ + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) + gsi = pci_irq++; + /* + * Don't assign IRQ used by ACPI SCI + */ + if (gsi == acpi_fadt.sci_int) + gsi = pci_irq++; + gsi_to_irq[irq] = gsi; + } else { + printk(KERN_ERR "GSI %u is too high\n", gsi); + return gsi; + } + } + + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; +} + +#endif /* CONFIG_X86_IO_APIC */ +#endif /* CONFIG_ACPI */ diff -r 42f6970e18c9 -r a533be77c572 arch/i386/kernel/pci-dma-xen.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/arch/i386/kernel/pci-dma-xen.c Mon Jun 04 10:05:28 2007 +0100 @@ -0,0 +1,378 @@ +/* + * Dynamic DMA mapping support. + * + * On i386 there is no hardware dynamic DMA address translation, + * so consistent alloc/free are merely page allocation/freeing. + * The rest of the dynamic DMA mapping interface is implemented + * in asm/pci.h. + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/module.h> +#include <linux/version.h> +#include <asm/io.h> +#include <xen/balloon.h> +#include <xen/gnttab.h> +#include <asm/swiotlb.h> +#include <asm/tlbflush.h> +#include <asm-i386/mach-xen/asm/swiotlb.h> +#include <asm/bug.h> + +#ifdef __x86_64__ +#include <asm/proto.h> + +int iommu_merge __read_mostly = 0; +EXPORT_SYMBOL(iommu_merge); + +dma_addr_t bad_dma_address __read_mostly; +EXPORT_SYMBOL(bad_dma_address); + +/* This tells the BIO block layer to assume merging. Default to off + because we cannot guarantee merging later. */ +int iommu_bio_merge __read_mostly = 0; +EXPORT_SYMBOL(iommu_bio_merge); + +int force_iommu __read_mostly= 0; + +__init int iommu_setup(char *p) +{ + return 1; +} + +void __init pci_iommu_alloc(void) +{ +#ifdef CONFIG_SWIOTLB + pci_swiotlb_init(); +#endif +} + +static int __init pci_iommu_init(void) +{ + no_iommu_init(); + return 0; +} + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); +#endif + +struct dma_coherent_mem { + void *virt_base; + u32 device_base; + int size; + int flags; + unsigned long *bitmap; +}; + +#define IOMMU_BUG_ON(test) \ +do { \ + if (unlikely(test)) { \ + printk(KERN_ALERT "Fatal DMA error! " \ + "Please use 'swiotlb=force'\n"); \ + BUG(); \ + } \ +} while (0) + +int +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + int i, rc; + + if (direction == DMA_NONE) + BUG(); + WARN_ON(nents == 0 || sg[0].length == 0); + + if (swiotlb) { + rc = swiotlb_map_sg(hwdev, sg, nents, direction); + } else { + for (i = 0; i < nents; i++ ) { + sg[i].dma_address = + gnttab_dma_map_page(sg[i].page) + sg[i].offset; + sg[i].dma_length = sg[i].length; + BUG_ON(!sg[i].page); + IOMMU_BUG_ON(address_needs_mapping( + hwdev, sg[i].dma_address)); + } + rc = nents; + } + + flush_write_buffers(); + return rc; +} +EXPORT_SYMBOL(dma_map_sg); + +void +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + int i; + + BUG_ON(direction == DMA_NONE); + if (swiotlb) + swiotlb_unmap_sg(hwdev, sg, nents, direction); + else { + for (i = 0; i < nents; i++ ) + gnttab_dma_unmap_page(sg[i].dma_address); + } +} +EXPORT_SYMBOL(dma_unmap_sg); + +#ifdef CONFIG_HIGHMEM +dma_addr_t +dma_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction direction) +{ + dma_addr_t dma_addr; + + BUG_ON(direction == DMA_NONE); + + if (swiotlb) { + dma_addr = swiotlb_map_page( + dev, page, offset, size, direction); + } else { + dma_addr = gnttab_dma_map_page(page) + offset; + IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr)); + } + + return dma_addr; +} +EXPORT_SYMBOL(dma_map_page); + +void +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + if (swiotlb) + swiotlb_unmap_page(dev, dma_address, size, direction); + else + gnttab_dma_unmap_page(dma_address); +} +EXPORT_SYMBOL(dma_unmap_page); +#endif /* CONFIG_HIGHMEM */ + +int +dma_mapping_error(dma_addr_t dma_addr) +{ + if (swiotlb) + return swiotlb_dma_mapping_error(dma_addr); + return 0; +} +EXPORT_SYMBOL(dma_mapping_error); + +int +dma_supported(struct device *dev, u64 mask) +{ + if (swiotlb) + return swiotlb_dma_supported(dev, mask); + /* + * By default we'll BUG when an infeasible DMA is requested, and + * request swiotlb=force (see IOMMU_BUG_ON). + */ + return 1; +} +EXPORT_SYMBOL(dma_supported); + +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + void *ret; + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + unsigned int order = get_order(size); + unsigned long vstart; + u64 mask; + + /* ignore region specifiers */ + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + ret = mem->virt_base + (page << PAGE_SHIFT); + memset(ret, 0, size); + return ret; + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + return NULL; + } + + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) + gfp |= GFP_DMA; + + vstart = __get_free_pages(gfp, order); + ret = (void *)vstart; + + if (dev != NULL && dev->coherent_dma_mask) + mask = dev->coherent_dma_mask; + else + mask = 0xffffffff; + + if (ret != NULL) { + if (xen_create_contiguous_region(vstart, order, + fls64(mask)) != 0) { + free_pages(vstart, order); + return NULL; + } + memset(ret, 0, size); + *dma_handle = virt_to_bus(ret); + } + return ret; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |